Skip to content

Commit

Permalink
internal/sanitizer: address jba's comments on CL 543858
Browse files Browse the repository at this point in the history
For #61399

Change-Id: Ia4c4103a0d649172a82b482b6947b4ba2f2e785a
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/546315
kokoro-CI: kokoro <[email protected]>
Reviewed-by: Roland Shoemaker <[email protected]>
Reviewed-by: Jonathan Amsterdam <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
  • Loading branch information
matloob committed Dec 5, 2023
1 parent cd7aa4a commit 47bf8d7
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 9 deletions.
21 changes: 12 additions & 9 deletions internal/sanitizer/sanitizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ package sanitizer

import (
"bytes"
"fmt"
"net/url"
"regexp"
"strings"
Expand All @@ -19,14 +18,15 @@ import (
// SanitizeBytes returns a sanitized version of the input.
// It throws out any attributes or tags that are not explicitly
// allowed in allowElems or allowAttributes below, including
// any child nodes of elements that are not allowed.
// any child nodes of elements that are not allowed. It returns
// the empty string if there was an error parsing the input.
func SanitizeBytes(b []byte) []byte {
// TODO(matloob): We want to sanitize a fragment that would
// appear in the body. Can we call ParseFragment without
// creating the body node here?
document, err := html.Parse(strings.NewReader("<html><head></head><body></body></html>"))
if err != nil {
panic(fmt.Errorf("error parsing document: %v", err))
return nil
}
body := document.FirstChild.LastChild // document.FirstChild is the <html> node

Expand All @@ -47,10 +47,6 @@ func SanitizeBytes(b []byte) []byte {
// of parent-less nodes the node should be replaced with.
func sanitize(n *html.Node) ([]*html.Node, bool) {
switch n.Type {
case html.CommentNode:
return nil, false
case html.DoctypeNode:
return nil, false
case html.TextNode:
return nil, true // Assume text nodes are safe
case html.ElementNode:
Expand Down Expand Up @@ -119,7 +115,7 @@ func sanitize(n *html.Node) ([]*html.Node, bool) {
}
return nil, true
default:
return extractSanitizedChildren(n), false
return nil, false
}
}

Expand All @@ -146,6 +142,9 @@ func sanitizeNodes(nodes []*html.Node) []*html.Node {
return keepNodes
}

// addRelNoFollow adds a rel="nofollow" attribute to the attributes
// if the href attribute is present. If there's already a rel
// attribute present its value is replaced with "nofollow".
func addRelNoFollow(attrs []html.Attribute) []html.Attribute {
hasHref := false
for _, attr := range attrs {
Expand Down Expand Up @@ -382,10 +381,14 @@ func re(rx string) func(string) bool {
return regexp.MustCompile(rx).MatchString
}

// validURL returns true if the URL is a valid url according to the
// following rules: it must be url.Parsable when the spaces are trimmed,
// it can not contain interior newlines, tabs, or spaces, and its scheme,
// if present must be mailto, http, or https.
func validURL(rawurl string) bool {
rawurl = strings.TrimSpace(rawurl)

if strings.ContainsAny(rawurl, " \t\n") {
if strings.ContainsAny(rawurl, " \t") {
return false
}

Expand Down
71 changes: 71 additions & 0 deletions internal/sanitizer/sanitizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
package sanitizer

import (
"reflect"
"testing"

"golang.org/x/net/html"
)

func TestSanitizeBytes(t *testing.T) {
Expand Down Expand Up @@ -152,3 +155,71 @@ func TestSanitizeBytes(t *testing.T) {
}
}
}

func TestAddRelNoFollow(t *testing.T) {
testCases := []struct {
input []html.Attribute
want []html.Attribute
}{
{
[]html.Attribute{},
[]html.Attribute{},
},
{
[]html.Attribute{{Key: "id", Val: "foo"}},
[]html.Attribute{{Key: "id", Val: "foo"}},
},
{
[]html.Attribute{{Key: "href", Val: "https://golang.org"}},
[]html.Attribute{{Key: "href", Val: "https://golang.org"}, {Key: "rel", Val: "nofollow"}},
},
{
[]html.Attribute{{Key: "href", Val: "https://golang.org"}, {Key: "rel", Val: "nofollow"}},
[]html.Attribute{{Key: "href", Val: "https://golang.org"}, {Key: "rel", Val: "nofollow"}},
},
{
[]html.Attribute{{Key: "href", Val: "https://golang.org"}, {Key: "rel", Val: "canonical"}},
[]html.Attribute{{Key: "href", Val: "https://golang.org"}, {Key: "rel", Val: "nofollow"}},
},
{
[]html.Attribute{{Key: "id", Val: "foo"}, {Key: "rel", Val: "canonical"}},
[]html.Attribute{{Key: "id", Val: "foo"}, {Key: "rel", Val: "canonical"}},
},
}

for _, tc := range testCases {
got := addRelNoFollow(append([]html.Attribute{}, tc.input...))
if !reflect.DeepEqual(got, tc.want) {
t.Errorf("addRelNoFollow(%v): got %v, want %v", tc.input, got, tc.want)
}
}
}

func TestValidURL(t *testing.T) {
testCases := []struct {
input string
want bool
}{
{"", true},
{"#", true},
{"https://golang.org", true},
{"http://golang.org", true},
{"mailto:golang.org", true},
{"unsupported:golang.org", false},
{" https://golang.org ", true},
{"\thttps://golang.org ", true},
{" https://golang.org/my file", false},
{" https://golang.org/my\tfile", false},
{" https://golang.org/my\nfile", false},
{"%", false},
{" % ", false},
{" %\t% ", false},
}

for _, tc := range testCases {
got := validURL(tc.input)
if got != tc.want {
t.Errorf("validURL(%q): got %v, want %v", tc.input, got, tc.want)
}
}
}

0 comments on commit 47bf8d7

Please sign in to comment.