Skip to content

Commit

Permalink
fix wrong handling of MD links w/h src=
Browse files Browse the repository at this point in the history
  • Loading branch information
g-pavlov committed Nov 5, 2020
1 parent e496924 commit d90feab
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 30 deletions.
58 changes: 28 additions & 30 deletions pkg/reactor/content_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,8 @@ import (
)

// Regular expressions used to find link attributes in raw HTML embedded in
// documents.
//
// NOTE: this span is a rendered diff, so htmlLinksRegexList appears twice. Per
// the hunk header (-23,10 +23,8) the slice form is the definition the commit
// removes and the single-pattern form is the one it adds.
var (
// Removed definition: one pattern per attribute. Each matches the attribute
// name, "=", and a value with optional surrounding double or single quotes.
htmlLinksRegexList = []*regexp.Regexp{
regexp.MustCompile(`href=["\']?([^"\'>]+)["\']?`),
regexp.MustCompile(`src=["\']?([^"\'>]+)["\']?`),
}
// Added definition: a single pattern matching an opening tag ("<" followed
// directly by a word character) up to and including an href or src attribute
// (case-insensitive), optional whitespace around "=", and a value that is
// double-quoted, single-quoted, or an unquoted run without quotes, ">" or
// whitespace.
// NOTE(review): despite the "List" suffix this is now a single compiled
// pattern, not a slice.
htmlLinksRegexList = regexp.MustCompile(`<\b[^>]*?\b((?i)href|(?i)src)\s*=\s*(\"([^"]*\")|'[^']*'|([^'">\s]+))`)
// htmlLinkRegex extracts the URL portion from an attribute value: an optional
// http/https/ftp/mailto scheme followed by "://", one or more path segments
// made of word characters, dots and dashes, an optional trailing "/", and an
// optional single "#", "?", "=" or "&".
htmlLinkRegex = regexp.MustCompile(`((http|https|ftp|mailto):\/\/)?(\.?\/?[\w\.\-]+)+\/?([#?=&])?`)
)

// NodeContentProcessor operates on documents content to reconcile links and
Expand Down Expand Up @@ -163,36 +161,36 @@ func (c *nodeContentProcessor) reconcileMDLinks(ctx context.Context, docNode *ap
// reconcileHTMLLinks replaces the link targets of raw HTML href/src attributes
// found in documentBytes. Every regex match is resolved through c.resolveLink;
// when a download is returned it is scheduled via c.schedule. On a resolve or
// schedule error the match is returned unchanged and the error is recorded.
// It returns the rewritten bytes together with errors.ErrorOrNil().
//
// NOTE: this span is a rendered diff. The pre-commit body (a loop over a list
// of patterns) and the post-commit body (a single ReplaceAllFunc pass) are
// interleaved below without +/- markers, so the lines do not form one
// compilable function as shown.
func (c *nodeContentProcessor) reconcileHTMLLinks(ctx context.Context, docNode *api.Node, documentBytes []byte, contentSourcePath string) ([]byte, error) {
var errors *multierror.Error
// Pre-commit variant (appears to be the removed side of the diff): iterate
// over the per-attribute patterns and split each match on "=" into the
// attribute name and its value, trimming surrounding double quotes.
for _, regex := range htmlLinksRegexList {
documentBytes = regex.ReplaceAllFunc(documentBytes, func(match []byte) []byte {
attr := strings.Split(string(match), "=")
name := attr[0]
url := attr[1]
if len(url) > 0 {
url = strings.TrimPrefix(url, "\"")
url = strings.TrimSuffix(url, "\"")
}
destination, _, _, download, err := c.resolveLink(ctx, docNode, url, contentSourcePath)
if docNode != nil && destination != nil {
if url != *destination {
recordLinkStats(docNode, "Links", fmt.Sprintf("%s -> %s", url, *destination))
} else {
recordLinkStats(docNode, "Links", "")
}
}
if download != nil {
if err := c.schedule(ctx, download, contentSourcePath); err != nil {
errors = multierror.Append(err)
return match
}
// Post-commit variant (appears to be the added side of the diff): one pass
// with the combined tag pattern. The text after the last "=" in the match is
// narrowed to a URL with htmlLinkRegex, and the match is split around that
// URL so the surrounding tag text (prefix/suffix) is preserved verbatim.
documentBytes = htmlLinksRegexList.ReplaceAllFunc(documentBytes, func(match []byte) []byte {
var prefix, suffix string
// NOTE(review): taking the segment after the LAST "=" means a URL that itself
// contains "=" (e.g. a query string) would be cut short — confirm intended.
attrs := strings.SplitAfter(string(match), "=")
url := attrs[len(attrs)-1]
url = htmlLinkRegex.FindString(url)
// NOTE(review): FindString returns "" when nothing matches; strings.Split
// with an empty separator splits after every UTF-8 sequence, so prefix and
// suffix would then be single characters — verify this case cannot occur.
splits := strings.Split(string(match), url)
prefix = splits[0]
if len(splits) > 1 {
// Only the piece right after the first occurrence of url is kept as suffix.
suffix = strings.Split(string(match), url)[1]
}
destination, _, _, download, err := c.resolveLink(ctx, docNode, url, contentSourcePath)
// Record link statistics: the rewrite mapping when the destination differs
// from the original URL, otherwise an empty entry.
if docNode != nil && destination != nil {
if url != *destination {
recordLinkStats(docNode, "Links", fmt.Sprintf("%s -> %s", url, *destination))
} else {
recordLinkStats(docNode, "Links", "")
}
if err != nil {
}
if download != nil {
if err := c.schedule(ctx, download, contentSourcePath); err != nil {
// NOTE(review): Append is called with only err, so the value assigned to
// errors does not include whatever errors held before — verify against
// go-multierror's Append(err, errs...) signature; likely should pass errors
// as the first argument.
errors = multierror.Append(err)
return match
}
return []byte(fmt.Sprintf("%s=%s", name, *destination))
})
}
}
if err != nil {
errors = multierror.Append(err)
return match
}
// NOTE(review): destination is dereferenced here without a nil check, although
// the statistics branch above treats a nil destination as possible — confirm
// resolveLink never returns (nil, ..., nil error).
return []byte(fmt.Sprintf("%s%s%s", prefix, *destination, suffix))
})
return documentBytes, errors.ErrorOrNil()
}

Expand Down
111 changes: 111 additions & 0 deletions pkg/reactor/content_processor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -391,3 +391,114 @@ func Test_processLink(t *testing.T) {
})
}
}

// Test_matchHTMLLinks checks that reconcileHTMLLinks rewrites the URL inside
// href/src attributes of raw HTML tags according to the configured rewrite
// rule ("abc" -> "cde") while keeping the rest of the tag byte-for-byte, and
// that tags the link pattern does not match are returned unchanged.
func Test_matchHTMLLinks(t *testing.T) {
	testCases := []struct {
		name string
		in   []string
		want []string
	}{
		{
			name: "well-formed tags are rewritten",
			in: []string{
				`<script src="abc/a.js" />`,
				`<script src= "abc/a.js" />`,
				`<script src=" abc/a.js" />`,
				`<script SRC = " abc/a.js" />`,
				`<script SRC = ' abc/a.js' />`,
				`<script SRC = ' abc/a.js ' title="test" />`,
				`<script src= abc/a.js />`,
				`<script SRC = " abc/a.js" `,
				`<img src="abc/a.js">`,
				`<a href="abc/a.js">A</a>`,
			},
			want: []string{
				`<script src="cde" />`,
				`<script src= "cde" />`,
				`<script src=" cde" />`,
				`<script SRC = " cde" />`,
				`<script SRC = ' cde' />`,
				`<script SRC = ' cde ' title="test" />`,
				`<script src= cde />`,
				`<script SRC = " cde" `,
				`<img src="cde">`,
				`<a href="cde">A</a>`,
			},
		},
		{
			name: "malformed tags are left untouched",
			in: []string{
				`< src="abc/a.js" />`,
				`<script src=" abc/a.js' />`,
				// FIXME: unbalanced quotation marks break the regex
				// `<script SRC = " abc/a.js title="test" />`,
			},
			want: []string{
				`< src="abc/a.js" />`,
				`<script src=" abc/a.js' />`,
				// `<script SRC = " abc/a.js title="test" />`,
			},
		},
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Guard against a mis-edited table; indexing want[i] below would
			// otherwise panic instead of failing with a useful message.
			if len(tc.in) != len(tc.want) {
				t.Fatalf("test table mismatch: %d inputs, %d expectations", len(tc.in), len(tc.want))
			}
			c := &nodeContentProcessor{
				resourceAbsLinks: make(map[string]string),
				resourcesRoot:    "/__resources",
				resourceHandlers: resourcehandlers.NewRegistry(&testResourceHandler{}),
				rewriteEmbedded:  true,
				globalLinksConfig: &api.Links{
					Rewrites: map[string]*api.LinkRewriteRule{
						"abc": {
							Destination: tests.StrPtr("cde"),
						},
					},
				},
			}
			node := &api.Node{
				Name:   "node_A.md",
				Source: "https://github.com/gardener/gardener/blob/v1.10.0/docs/README.md",
			}
			for i, in := range tc.in {
				// A nil Context must not be passed; context.TODO() is the
				// documented placeholder when no real context is available.
				got, err := c.reconcileHTMLLinks(context.TODO(), node, []byte(in), "")
				if err != nil {
					t.Fatalf("reconcileHTMLLinks(%q): %v", in, err)
				}
				assert.Equal(t, tc.want[i], string(got))
			}
		})
	}
}

// testResourceHandler is a stub resource handler for tests. It accepts every
// URI, performs no reads, and hands links back unchanged.
type testResourceHandler struct {
	// hitCounter is not touched by any of the methods defined here.
	hitCounter int
}

// Accept reports true for every uri.
func (*testResourceHandler) Accept(uri string) bool { return true }

// ResolveNodeSelector does nothing and always returns nil.
func (*testResourceHandler) ResolveNodeSelector(ctx context.Context, node *api.Node, excludePaths []string, frontMatter map[string]interface{}, excludeFrontMatter map[string]interface{}, depth int32) error {
	return nil
}

// Read returns no content and no error.
func (*testResourceHandler) Read(ctx context.Context, uri string) ([]byte, error) { return nil, nil }

// ReadGitInfo returns no content and no error.
func (*testResourceHandler) ReadGitInfo(ctx context.Context, uri string) ([]byte, error) {
	return nil, nil
}

// Name returns the empty string for every uri.
func (*testResourceHandler) Name(uri string) string { return "" }

// ResourceName returns a pair of empty strings for every uri.
func (*testResourceHandler) ResourceName(uri string) (string, string) { return "", "" }

// BuildAbsLink returns relLink as given, ignoring source.
func (*testResourceHandler) BuildAbsLink(source, relLink string) (string, error) {
	return relLink, nil
}

// SetVersion returns link as given, ignoring version.
func (*testResourceHandler) SetVersion(link, version string) (string, error) { return link, nil }

// GetRawFormatLink returns absLink as given.
func (*testResourceHandler) GetRawFormatLink(absLink string) (string, error) { return absLink, nil }

0 comments on commit d90feab

Please sign in to comment.