From d90feabe05f2f97da4a63665e4af002bfbbbff9c Mon Sep 17 00:00:00 2001
From: Georgi Pavlov <georgi.pavlov@sap.com>
Date: Thu, 5 Nov 2020 20:49:07 +0000
Subject: [PATCH] fix wrong handling of MD links with src=

---
 pkg/reactor/content_processor.go      |  58 +++++++-------
 pkg/reactor/content_processor_test.go | 111 ++++++++++++++++++++++++++
 2 files changed, 139 insertions(+), 30 deletions(-)
diff --git a/pkg/reactor/content_processor.go b/pkg/reactor/content_processor.go
index 816f8f38..38093a01 100644
--- a/pkg/reactor/content_processor.go
+++ b/pkg/reactor/content_processor.go
@@ -23,10 +23,17 @@ import (
 )
 
 var (
-	htmlLinksRegexList = []*regexp.Regexp{
-		regexp.MustCompile(`href=["\']?([^"\'>]+)["\']?`),
-		regexp.MustCompile(`src=["\']?([^"\'>]+)["\']?`),
-	}
+	// htmlLinksRegexList is, despite its name, a single expression. It matches
+	// the start of an HTML tag up to and including the value of the first href
+	// or src attribute in it (the name is matched case-insensitively). The
+	// value may be double-quoted, single-quoted or unquoted; submatch 1 is the
+	// attribute name and submatch 2 the value with its quotes, if any.
+	// htmlLinkRegex matches a link-like run of text: an optional http, https,
+	// ftp or mailto scheme followed by "://", one or more segments of word
+	// characters, dots and dashes that may each start with "." and/or "/", an
+	// optional trailing "/" and at most one of the characters #, ?, = or &.
+	htmlLinksRegexList = regexp.MustCompile(`<\b[^>]*?\b((?i)href|(?i)src)\s*=\s*(\"([^"]*\")|'[^']*'|([^'">\s]+))`)
+	htmlLinkRegex      = regexp.MustCompile(`((http|https|ftp|mailto):\/\/)?(\.?\/?[\w\.\-]+)+\/?([#?=&])?`)
 )
 
 // NodeContentProcessor operates on documents content to reconcile links and
@@ -163,36 +161,58 @@ func (c *nodeContentProcessor) reconcileMDLinks(ctx context.Context, docNode *ap
 // replace html raw links of any sorts.
+// Only the link found inside the value of an href or src attribute is
+// passed to resolveLink and replaced by the resolved destination; the rest
+// of the matched text, including quotes and padding, is kept verbatim.
+// A match is returned unchanged when no link is found in the value, when
+// there is no resolved destination, or when resolving the link or
+// scheduling its download fails. Failures are accumulated and returned as
+// a single error together with the processed document.
 func (c *nodeContentProcessor) reconcileHTMLLinks(ctx context.Context, docNode *api.Node, documentBytes []byte, contentSourcePath string) ([]byte, error) {
 	var errors *multierror.Error
-	for _, regex := range htmlLinksRegexList {
-		documentBytes = regex.ReplaceAllFunc(documentBytes, func(match []byte) []byte {
-			attr := strings.Split(string(match), "=")
-			name := attr[0]
-			url := attr[1]
-			if len(url) > 0 {
-				url = strings.TrimPrefix(url, "\"")
-				url = strings.TrimSuffix(url, "\"")
-			}
-			destination, _, _, download, err := c.resolveLink(ctx, docNode, url, contentSourcePath)
-			if docNode != nil && destination != nil {
-				if url != *destination {
-					recordLinkStats(docNode, "Links", fmt.Sprintf("%s -> %s", url, *destination))
-				} else {
-					recordLinkStats(docNode, "Links", "")
-				}
-			}
-			if download != nil {
-				if err := c.schedule(ctx, download, contentSourcePath); err != nil {
-					errors = multierror.Append(err)
-					return match
-				}
-			}
-			if err != nil {
-				errors = multierror.Append(err)
-				return match
-			}
-			return []byte(fmt.Sprintf("%s=%s", name, *destination))
-		})
-	}
+	documentBytes = htmlLinksRegexList.ReplaceAllFunc(documentBytes, func(match []byte) []byte {
+		// Submatch 2 of htmlLinksRegexList is the attribute value. Searching
+		// for the link only inside it, rather than after the last "=" of the
+		// match, keeps a "=" in the value (e.g. in a query string) from
+		// moving the search into the middle of the link.
+		loc := htmlLinksRegexList.FindSubmatchIndex(match)
+		if len(loc) < 6 || loc[4] < 0 {
+			return match
+		}
+		tag := string(match)
+		span := htmlLinkRegex.FindStringIndex(tag[loc[4]:loc[5]])
+		if span == nil {
+			// nothing in the attribute value looks like a link
+			return match
+		}
+		// Cut the tag by offsets instead of splitting it on the link text,
+		// which loses content when that text occurs more than once.
+		start, end := loc[4]+span[0], loc[4]+span[1]
+		prefix, url, suffix := tag[:start], tag[start:end], tag[end:]
+		destination, _, _, download, err := c.resolveLink(ctx, docNode, url, contentSourcePath)
+		if docNode != nil && destination != nil {
+			if url != *destination {
+				recordLinkStats(docNode, "Links", fmt.Sprintf("%s -> %s", url, *destination))
+			} else {
+				recordLinkStats(docNode, "Links", "")
+			}
+		}
+		if download != nil {
+			if err := c.schedule(ctx, download, contentSourcePath); err != nil {
+				// Append to the accumulated errors; Append(err) alone would
+				// discard the ones collected from earlier matches.
+				errors = multierror.Append(errors, err)
+				return match
+			}
+		}
+		if err != nil {
+			errors = multierror.Append(errors, err)
+			return match
+		}
+		if destination == nil {
+			// no error but nothing to substitute: keep the original text
+			return match
+		}
+		return []byte(prefix + *destination + suffix)
+	})
 	return documentBytes, errors.ErrorOrNil()
 }
 
diff --git a/pkg/reactor/content_processor_test.go b/pkg/reactor/content_processor_test.go
index 7058f7c6..2948eee8 100644
--- a/pkg/reactor/content_processor_test.go
+++ b/pkg/reactor/content_processor_test.go
@@ -391,3 +391,122 @@ func Test_processLink(t *testing.T) {
 		})
 	}
 }
+
+// Test_matchHTMLLinks checks that reconcileHTMLLinks rewrites the link in the
+// href or src attribute of raw HTML tags and returns tags that the HTML link
+// pattern does not match unchanged.
+func Test_matchHTMLLinks(t *testing.T) {
+	type link struct {
+		in   string
+		want string
+	}
+	testCases := []struct {
+		name  string
+		links []link
+	}{
+		{
+			name: "rewrites href and src values",
+			links: []link{
+				{in: `<script src="abc/a.js" />`, want: `<script src="cde" />`},
+				{in: `<script src=   "abc/a.js" />`, want: `<script src=   "cde" />`},
+				{in: `<script src="   abc/a.js" />`, want: `<script src="   cde" />`},
+				{in: `<script SRC = "   abc/a.js" />`, want: `<script SRC = "   cde" />`},
+				{in: `<script SRC = '   abc/a.js' />`, want: `<script SRC = '   cde' />`},
+				{in: `<script SRC = '   abc/a.js  ' title="test" />`, want: `<script SRC = '   cde  ' title="test" />`},
+				{in: `<script src= abc/a.js />`, want: `<script src= cde />`},
+				{in: `<script SRC = "   abc/a.js" `, want: `<script SRC = "   cde" `},
+				{in: `<img src="abc/a.js">`, want: `<img src="cde">`},
+				{in: `<a href="abc/a.js">A</a>`, want: `<a href="cde">A</a>`},
+			},
+		},
+		{
+			name: "leaves malformed tags unchanged",
+			links: []link{
+				{in: `< src="abc/a.js" />`, want: `< src="abc/a.js" />`},
+				{in: `<script src="   abc/a.js' />`, want: `<script src="   abc/a.js' />`},
+				// FIXME: unbalanced quotation marks break the regex
+				// {in: `<script SRC = "   abc/a.js   title="test" />`, want: `<script SRC = "   abc/a.js   title="test" />`},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			c := &nodeContentProcessor{
+				resourceAbsLinks: make(map[string]string),
+				resourcesRoot:    "/__resources",
+				resourceHandlers: resourcehandlers.NewRegistry(&testResourceHandler{}),
+				rewriteEmbedded:  true,
+				globalLinksConfig: &api.Links{
+					Rewrites: map[string]*api.LinkRewriteRule{
+						"abc": {
+							Destination: tests.StrPtr("cde"),
+						},
+					},
+				},
+			}
+			node := &api.Node{
+				Name:   "node_A.md",
+				Source: "https://github.com/gardener/gardener/blob/v1.10.0/docs/README.md",
+			}
+			for _, l := range tc.links {
+				// Pass a non-nil Context; the context package forbids nil ones.
+				got, err := c.reconcileHTMLLinks(context.TODO(), node, []byte(l.in), "")
+				if err != nil {
+					t.Fatal(err)
+				}
+				assert.Equal(t, l.want, string(got))
+			}
+		})
+	}
+}
+
+// testResourceHandler is a stub resource handler for tests: it accepts every
+// URI, returns links unchanged and yields zero values everywhere else.
+type testResourceHandler struct {
+	hitCounter int
+}
+
+// Accept reports every URI as handled.
+func (rh *testResourceHandler) Accept(uri string) bool {
+	return true
+}
+
+// ResolveNodeSelector does nothing and reports success.
+func (rh *testResourceHandler) ResolveNodeSelector(ctx context.Context, node *api.Node, excludePaths []string, frontMatter map[string]interface{}, excludeFrontMatter map[string]interface{}, depth int32) error {
+	return nil
+}
+
+// Read returns no content and no error.
+func (rh *testResourceHandler) Read(ctx context.Context, uri string) ([]byte, error) {
+	return nil, nil
+}
+
+// ReadGitInfo returns no git info and no error.
+func (rh *testResourceHandler) ReadGitInfo(ctx context.Context, uri string) ([]byte, error) {
+	return nil, nil
+}
+
+// Name always returns the empty string.
+func (rh *testResourceHandler) Name(uri string) string {
+	return ""
+}
+
+// ResourceName always returns a pair of empty strings.
+func (rh *testResourceHandler) ResourceName(uri string) (string, string) {
+	return "", ""
+}
+
+// BuildAbsLink returns relLink unchanged.
+func (rh *testResourceHandler) BuildAbsLink(source, relLink string) (string, error) {
+	return relLink, nil
+}
+
+// SetVersion returns link unchanged.
+func (rh *testResourceHandler) SetVersion(link, version string) (string, error) {
+	return link, nil
+}
+
+// GetRawFormatLink returns absLink unchanged.
+func (rh *testResourceHandler) GetRawFormatLink(absLink string) (string, error) {
+	return absLink, nil
+}