From d90feabe05f2f97da4a63665e4af002bfbbbff9c Mon Sep 17 00:00:00 2001 From: Georgi Pavlov Date: Thu, 5 Nov 2020 20:49:07 +0000 Subject: [PATCH] fix wrong handling of MD links w/h src= --- pkg/reactor/content_processor.go | 58 +++++++------- pkg/reactor/content_processor_test.go | 111 ++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 30 deletions(-) diff --git a/pkg/reactor/content_processor.go b/pkg/reactor/content_processor.go index 816f8f38..38093a01 100644 --- a/pkg/reactor/content_processor.go +++ b/pkg/reactor/content_processor.go @@ -23,10 +23,8 @@ import ( ) var ( - htmlLinksRegexList = []*regexp.Regexp{ - regexp.MustCompile(`href=["\']?([^"\'>]+)["\']?`), - regexp.MustCompile(`src=["\']?([^"\'>]+)["\']?`), - } + htmlLinksRegexList = regexp.MustCompile(`<\b[^>]*?\b((?i)href|(?i)src)\s*=\s*(\"([^"]*\")|'[^']*'|([^'">\s]+))`) + htmlLinkRegex = regexp.MustCompile(`((http|https|ftp|mailto):\/\/)?(\.?\/?[\w\.\-]+)+\/?([#?=&])?`) ) // NodeContentProcessor operates on documents content to reconcile links and @@ -163,36 +161,36 @@ func (c *nodeContentProcessor) reconcileMDLinks(ctx context.Context, docNode *ap // replace html raw links of any sorts. func (c *nodeContentProcessor) reconcileHTMLLinks(ctx context.Context, docNode *api.Node, documentBytes []byte, contentSourcePath string) ([]byte, error) { var errors *multierror.Error - for _, regex := range htmlLinksRegexList { - documentBytes = regex.ReplaceAllFunc(documentBytes, func(match []byte) []byte { - attr := strings.Split(string(match), "=") - name := attr[0] - url := attr[1] - if len(url) > 0 { - url = strings.TrimPrefix(url, "\"") - url = strings.TrimSuffix(url, "\"") - } - destination, _, _, download, err := c.resolveLink(ctx, docNode, url, contentSourcePath) - if docNode != nil && destination != nil { - if url != *destination { - recordLinkStats(docNode, "Links", fmt.Sprintf("%s -> %s", url, *destination)) - } else { - recordLinkStats(docNode, "Links", "") - } - } - if download != nil { - if err := c.schedule(ctx, download, contentSourcePath); err != nil { - errors = multierror.Append(err) - return match - } + documentBytes = htmlLinksRegexList.ReplaceAllFunc(documentBytes, func(match []byte) []byte { + var prefix, suffix string + attrs := strings.SplitAfter(string(match), "=") + url := attrs[len(attrs)-1] + url = htmlLinkRegex.FindString(url) + splits := strings.Split(string(match), url) + prefix = splits[0] + if len(splits) > 1 { + suffix = strings.Split(string(match), url)[1] + } + destination, _, _, download, err := c.resolveLink(ctx, docNode, url, contentSourcePath) + if docNode != nil && destination != nil { + if url != *destination { + recordLinkStats(docNode, "Links", fmt.Sprintf("%s -> %s", url, *destination)) + } else { + recordLinkStats(docNode, "Links", "") } - if err != nil { + } + if download != nil { + if err := c.schedule(ctx, download, contentSourcePath); err != nil { errors = multierror.Append(err) return match } - return []byte(fmt.Sprintf("%s=%s", name, *destination)) - }) - } + } + if err != nil { + errors = multierror.Append(err) + return match + } + return []byte(fmt.Sprintf("%s%s%s", prefix, *destination, suffix)) + }) return documentBytes, errors.ErrorOrNil() } diff --git a/pkg/reactor/content_processor_test.go b/pkg/reactor/content_processor_test.go index 7058f7c6..2948eee8 100644 --- a/pkg/reactor/content_processor_test.go +++ b/pkg/reactor/content_processor_test.go @@ -391,3 +391,114 @@ func Test_processLink(t *testing.T) { }) } } + +func Test_matchHTMLLinks(t *testing.T) { + testCases := []struct { + in []string + want []string + }{ + { + in: []string{ + `