Skip to content

Commit

Permalink
chore(scraper): improve data-src rewriting rule
Browse files (browse the repository at this point in the history)
  • Loading branch information
ncarlier committed Oct 2, 2024
1 parent 18202fa commit ab0f6b5
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 24 deletions.
2 changes: 1 addition & 1 deletion pkg/sanitizer/sanitizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ func NewSanitizer(blockList *BlockList) *Sanitizer {
policy := bluemonday.UGCPolicy()
policy.AddTargetBlankToFullyQualifiedLinks(true)
policy.AllowAttrs("width", "height", "src", "allowfullscreen", "sandbox").OnElements("iframe")
policy.AllowAttrs("srcset", "sizes", "data-src").OnElements("img", "source")
policy.AllowAttrs("srcset", "sizes").OnElements("img", "source")
policy.AllowElements("picture", "source")

if blockList != nil {
Expand Down
4 changes: 2 additions & 2 deletions pkg/scraper/html-rewriter/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ func newHTMLAttribute(key string) html.Attribute {
}
}

func findHTMLAttribute(attributes []html.Attribute, key string) *html.Attribute {
for _, attr := range attributes {
func findHTMLAttribute(node *html.Node, key string) *html.Attribute {
for _, attr := range node.Attr {
if attr.Key == key {
return &attr
}
Expand Down
27 changes: 10 additions & 17 deletions pkg/scraper/html-rewriter/img-rewriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,19 @@ import (
)

func RewriteDataSrcToSrcAttribute(node *html.Node) {
if node.DataAtom != atom.Img {
if node.DataAtom != atom.Img && node.DataAtom != atom.Source {
return
}

attrs := []html.Attribute{}
srcAttr := newHTMLAttribute("src")
dataSrcAttr := newHTMLAttribute("data-src")
for _, attr := range node.Attr {
switch attr.Key {
case srcAttr.Key:
srcAttr = attr
case dataSrcAttr.Key:
dataSrcAttr = attr
default:
attrs = append(attrs, attr)
keys := []string{"src", "srcset"}
for _, key := range keys {
realAttr := findHTMLAttribute(node, key)
if realAttr == nil || realAttr.Val == "" {
dataAttr := findHTMLAttribute(node, "data-"+key)
if dataAttr != nil && dataAttr.Val != "" {
dataAttr.Key = key
replaceHTMLAttribute(node, *dataAttr)
}
}
}
if dataSrcAttr.Val != "" && srcAttr.Val == "" {
srcAttr.Val = dataSrcAttr.Val
attrs = append(attrs, srcAttr)
node.Attr = attrs
}
}
4 changes: 2 additions & 2 deletions pkg/scraper/html-rewriter/picture-rewriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ func RewritePictureWithoutImgSrcAttribute(node *html.Node) {
return
}

srcAttr := findHTMLAttribute(imgNode.Attr, "src")
srcAttr := findHTMLAttribute(imgNode, "src")
if srcAttr != nil && srcAttr.Val != "" {
return
}
srcsetAttr := findHTMLAttribute(sourceNode.Attr, "srcset")
srcsetAttr := findHTMLAttribute(sourceNode, "srcset")
if srcsetAttr == nil || srcsetAttr.Val == "" {
return
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/scraper/html-rewriter/test/rewrite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ var testCases = []struct {
}{
{
input: "<html><body><img data-src=\"foo\" /></body></html>",
expected: "<html><head></head><body><img src=\"foo\"/></body></html>",
expected: "<html><head></head><body><img data-src=\"foo\" src=\"foo\"/></body></html>",
},
{
input: "<html><body><img data-src=\"foo\" src=\"bar\" /></body></html>",
Expand Down
2 changes: 1 addition & 1 deletion pkg/scraper/test/web-scraper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func TestSimpleWebScraping(t *testing.T) {
assert.Equal(t, "readflow", page.Title)
assert.Equal(t, "read your Internet article flow in one place with complete peace of mind and freedom", page.Text)
assert.Contains(t, page.HTML, "relax.png")
assert.Equal(t, "https://about.readflow.app/images/readflow.png", page.Image)
assert.Equal(t, "https://about.readflow.app/img/readflow.png", page.Image)
assert.Equal(t, "https://about.readflow.app/favicon.png", page.Favicon)
}

Expand Down

0 comments on commit ab0f6b5

Please sign in to comment.