diff --git a/transformer/transformer.go b/transformer/transformer.go index 22d4bb796..5809585e1 100644 --- a/transformer/transformer.go +++ b/transformer/transformer.go @@ -48,6 +48,7 @@ var transformerFunctionMap = map[string]func(*transformers.Context) error{ "reorderhead": transformers.ReorderHead, "serversiderendering": transformers.ServerSideRendering, "stripjs": transformers.StripJS, + "stripscriptcomments": transformers.StripScriptComments, "transformedidentifier": transformers.TransformedIdentifier, "unusedextensions": transformers.UnusedExtensions, "urlrewrite": transformers.URLRewrite, @@ -60,6 +61,7 @@ var configMap = map[rpb.Request_TransformersConfig][]func(*transformers.Context) // NodeCleanup should be first. transformers.NodeCleanup, transformers.StripJS, + transformers.StripScriptComments, transformers.LinkTag, transformers.AbsoluteURL, transformers.AMPBoilerplate, diff --git a/transformer/transformer_test.go b/transformer/transformer_test.go index 046ba9663..1b50ae320 100644 --- a/transformer/transformer_test.go +++ b/transformer/transformer_test.go @@ -26,7 +26,7 @@ func TestProcess(t *testing.T) { config rpb.Request_TransformersConfig expectedLen int }{ - {rpb.Request_DEFAULT, 12}, + {rpb.Request_DEFAULT, 13}, {rpb.Request_NONE, 0}, {rpb.Request_VALIDATION, 1}, {rpb.Request_CUSTOM, 0}, diff --git a/transformer/transformers/stripscriptcomments.go b/transformer/transformers/stripscriptcomments.go new file mode 100644 index 000000000..d68099141 --- /dev/null +++ b/transformer/transformers/stripscriptcomments.go @@ -0,0 +1,174 @@ +package transformers + +import ( + "strings" +) + +import ( + "github.com/ampproject/amppackager/transformer/internal/htmlnode" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// Strips html style comments from all the transforms to empty script node. +// transforms to empty +// script tag. +// +// Attributes and any content outside comment are preserved. +// +// transforms to +// +// +// Any comment that looks like a comment but is a string variable is preserved. +// is not modified. +func StripScriptComments(e *Context) error { + for n := e.DOM.RootNode; n != nil; n = htmlnode.Next(n) { + if n.Type != html.ElementNode { + continue + } + + if n.DataAtom == atom.Script { + typeVal, typeOk := htmlnode.GetAttributeVal(n, "", "type") + if typeOk && (typeVal == "application/json" || + typeVal == "application/ld+json") { + contents := n.FirstChild + if contents == nil || len(contents.Data) == 0 { + return nil + } + + contents.Data = stripComments(&contents.Data) + } + } + } + + return nil +} + +type commentStripper struct { + Content string + index int + insideQuotes bool + quoteChar byte +} + +func (r *commentStripper) readNextByte() byte { + r.index = r.index + 1 + if r.index >= len(r.Content) { + return 0 + } + return r.Content[r.index] +} + +func (r *commentStripper) readByteAt(i int) byte { + if i < 0 || i >= len(r.Content) { + return 0 + } + + return r.Content[i] +} + +func (r *commentStripper) isPreviousEscapeChar() bool { + previous := r.index - 1 + c := r.readByteAt(previous) + if c != '\\' { + return false + } + + // Check if '\' is escaped by a preceding '\', consume all preceding '\'. + for { + previous = previous - 1 + c := r.readByteAt(previous) + if c != '\\' { + break + } + } + + return (r.index - previous) % 2 != 0 +} + +func (r *commentStripper) isQuoteChar(c byte) bool { + return c == '"' || c == '\'' +} + +func (r *commentStripper) skipComment() bool { + cdataStyleComment := false + // Checks initial !-- characters. + if r.readByteAt(r.index + 1) != '!' && r.readByteAt(r.index + 2) != '-' && r.readByteAt(r.index + 3) != '-' { + return false + } + + // Skips the '!--' chars. + r.index = r.index + 3 + + // Checks if comment is of [CDATA] format. + // + if r.readByteAt(r.index + 1) == '[' { + cdataStyleComment = true + } + + // Skip until we encounter --> + for { + c := r.readNextByte() + if c == 0 { + // Skip everything until eof. + return true + } + + if c == '-' { + if cdataStyleComment { + if r.readByteAt(r.index - 1) != ']' { + continue + } + } + + if r.readByteAt(r.index + 1) == '-' && r.readByteAt(r.index + 2) == '>' { + r.index = r.index + 2 + return true + } + } + } +} + +func (r *commentStripper) stripComments() string { + var buffer strings.Builder + for { + c := r.readNextByte() + if c == 0 { + break + } + + if r.insideQuotes { + if r.quoteChar == c && !r.isPreviousEscapeChar() { + r.insideQuotes = false + r.quoteChar = 0 + } + buffer.WriteByte(c) + continue + + } + + if r.isQuoteChar(c) { + r.insideQuotes = true + r.quoteChar = c + buffer.WriteByte(c) + continue + } + + // Normal character. + if c == '<' && r.skipComment() { + continue + } + + buffer.WriteByte(c) + } + + return buffer.String() +} + +func stripComments(ss *string) string { + cs := commentStripper{Content: *ss, insideQuotes: false, index: -1} + return cs.stripComments() +} diff --git a/transformer/transformers/stripscriptcomments_test.go b/transformer/transformers/stripscriptcomments_test.go new file mode 100644 index 000000000..3d3ea1e55 --- /dev/null +++ b/transformer/transformers/stripscriptcomments_test.go @@ -0,0 +1,142 @@ +package transformers_test + +import ( + "strings" + "testing" +) + +import ( + "github.com/ampproject/amppackager/transformer/internal/amphtml" + "github.com/ampproject/amppackager/transformer/transformers" + "golang.org/x/net/html" + tt "github.com/ampproject/amppackager/transformer/internal/testing" +) + +// TODO(amaltas): WIP, Add lot of tests. +func TestStripScriptComments(t *testing.T) { + tcs := []tt.TestCase{ + { + Desc: "No comments in script", + Input: "", + Expected: "", + }, + { + Desc: "Basic html comment in script", + Input: "", + Expected: "", + }, + { + Desc: "Basic html comment with condition", + Input: "", + Expected: "", + }, + { + Desc: "Basic html comment with IE 10 condition", + Input: "", + Expected: "", + }, + { + Desc: "Basic html comment with different syntax", + Input: "", + Expected: "", + }, + { + Desc: "Script with comment inside string variable", + Input: "", + Expected: "", + }, + { + Desc: "Script with comment inside string variable escaped slashes", + Input: "", + Expected: "", + }, + { + Desc: "Script with comment containing quotes", + Input: "", + Expected: "", + }, + { + Desc: "Script with comment and some valid content", + Input: "", + Expected: "", + }, + { + Desc: "Script with empty comment", + Input: "", + Expected: "", + }, + { + Desc: "Script with empty comment and variables", + Input: "", + Expected: "", + }, + { + Desc: "Script with too many comments", + Input: "", + Expected: "", + }, + { + Desc: "Too many comments V2", + Input: "", + Expected: "", + }, + { + Desc: "Simple JSON", + Input: "", + Expected: "", + }, + { + Desc: "JSON with comment as string variable", + Input: "", + Expected: "", + }, + { + Desc: "JSON with double quoted values", + Input: "", + Expected: "", + }, + { + Desc: "JSON with too many escape chars", + Input: "", + Expected: "", + }, + { + Desc: "JSON with multiple comments", + Input: "", + Expected: "", + }, + } + + for _, tc := range tcs { + inputDoc, err := html.Parse(strings.NewReader(tc.Input)) + if err != nil { + t.Errorf("%s: html.Parse failed %q", tc.Input, err) + continue + } + inputDOM, err := amphtml.NewDOM(inputDoc) + if err != nil { + t.Errorf("%s\namphtml.NewDOM for %s failed %q", tc.Input, err) + continue + } + transformers.StripScriptComments(&transformers.Context{DOM: inputDOM}) + var input strings.Builder + if err = html.Render(&input, inputDoc); err != nil { + t.Errorf("%s: html.Render failed %q", tc.Input, err) + continue + } + expectedDoc, err := html.Parse(strings.NewReader(tc.Expected)) + if err != nil { + t.Errorf("%s: html.Parse failed %q", tc.Expected, err) + continue + } + var expected strings.Builder + err = html.Render(&expected, expectedDoc) + if err != nil { + t.Errorf("%s: html.Render failed %q", tc.Expected, err) + continue + } + if input.String() != expected.String() { + t.Errorf("%s: Transform\n%q\nwant=\n%q", tc.Desc, &input, &expected) + } + } +}