Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync from Google #341

Merged
merged 2 commits into from
Sep 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions transformer/transformer.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ var transformerFunctionMap = map[string]func(*transformers.Context) error{
"reorderhead": transformers.ReorderHead,
"serversiderendering": transformers.ServerSideRendering,
"stripjs": transformers.StripJS,
"stripscriptcomments": transformers.StripScriptComments,
"transformedidentifier": transformers.TransformedIdentifier,
"unusedextensions": transformers.UnusedExtensions,
"urlrewrite": transformers.URLRewrite,
Expand All @@ -60,6 +61,7 @@ var configMap = map[rpb.Request_TransformersConfig][]func(*transformers.Context)
// NodeCleanup should be first.
transformers.NodeCleanup,
transformers.StripJS,
transformers.StripScriptComments,
transformers.LinkTag,
transformers.AbsoluteURL,
transformers.AMPBoilerplate,
Expand Down
2 changes: 1 addition & 1 deletion transformer/transformer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func TestProcess(t *testing.T) {
config rpb.Request_TransformersConfig
expectedLen int
}{
{rpb.Request_DEFAULT, 12},
{rpb.Request_DEFAULT, 13},
{rpb.Request_NONE, 0},
{rpb.Request_VALIDATION, 1},
{rpb.Request_CUSTOM, 0},
Expand Down
174 changes: 174 additions & 0 deletions transformer/transformers/stripscriptcomments.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
package transformers

import (
"strings"
)

import (
"github.com/ampproject/amppackager/transformer/internal/htmlnode"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

// Strips html style comments from all the <script> elements that are child
// nodes of the given node. Example:
// <script><!-- var a = 1; --></script> transforms to empty script node.
// <script><!--[if IE]-->var a=1;<![endif]--></script> transforms to empty
// script tag.
//
// Attributes and any content outside comment are preserved.
// <script type="text/javascript"><!-- hello -->var a = 0;</script>
// transforms to
// <script type="text/javascript">var a = 0;</script>
//
// Any comment that looks like a comment but is a string variable is preserved.
// <script>var comment = "<!-- comment -->";</script> is not modified.
func StripScriptComments(e *Context) error {
for n := e.DOM.RootNode; n != nil; n = htmlnode.Next(n) {
if n.Type != html.ElementNode {
continue
}

if n.DataAtom == atom.Script {
typeVal, typeOk := htmlnode.GetAttributeVal(n, "", "type")
if typeOk && (typeVal == "application/json" ||
typeVal == "application/ld+json") {
contents := n.FirstChild
if contents == nil || len(contents.Data) == 0 {
return nil
}

contents.Data = stripComments(&contents.Data)
}
}
}

return nil
}

type commentStripper struct {
Content string
index int
insideQuotes bool
quoteChar byte
}

func (r *commentStripper) readNextByte() byte {
r.index = r.index + 1
if r.index >= len(r.Content) {
return 0
}
return r.Content[r.index]
}

func (r *commentStripper) readByteAt(i int) byte {
if i < 0 || i >= len(r.Content) {
return 0
}

return r.Content[i]
}

func (r *commentStripper) isPreviousEscapeChar() bool {
previous := r.index - 1
c := r.readByteAt(previous)
if c != '\\' {
return false
}

// Check if '\' is escaped by a preceding '\', consume all preceding '\'.
for {
previous = previous - 1
c := r.readByteAt(previous)
if c != '\\' {
break
}
}

return (r.index - previous) % 2 != 0
}

func (r *commentStripper) isQuoteChar(c byte) bool {
return c == '"' || c == '\''
}

func (r *commentStripper) skipComment() bool {
cdataStyleComment := false
// Checks initial !-- characters.
if r.readByteAt(r.index + 1) != '!' && r.readByteAt(r.index + 2) != '-' && r.readByteAt(r.index + 3) != '-' {
return false
}

// Skips the '!--' chars.
r.index = r.index + 3

// Checks if comment is of [CDATA] format.
// <!--[condition]>, we are interested in <!--[ in which case comment
// will end with ]-->
if r.readByteAt(r.index + 1) == '[' {
cdataStyleComment = true
}

// Skip until we encounter -->
for {
c := r.readNextByte()
if c == 0 {
// Skip everything until eof.
return true
}

if c == '-' {
if cdataStyleComment {
if r.readByteAt(r.index - 1) != ']' {
continue
}
}

if r.readByteAt(r.index + 1) == '-' && r.readByteAt(r.index + 2) == '>' {
r.index = r.index + 2
return true
}
}
}
}

func (r *commentStripper) stripComments() string {
var buffer strings.Builder
for {
c := r.readNextByte()
if c == 0 {
break
}

if r.insideQuotes {
if r.quoteChar == c && !r.isPreviousEscapeChar() {
r.insideQuotes = false
r.quoteChar = 0
}
buffer.WriteByte(c)
continue

}

if r.isQuoteChar(c) {
r.insideQuotes = true
r.quoteChar = c
buffer.WriteByte(c)
continue
}

// Normal character.
if c == '<' && r.skipComment() {
continue
}

buffer.WriteByte(c)
}

return buffer.String()
}

func stripComments(ss *string) string {
cs := commentStripper{Content: *ss, insideQuotes: false, index: -1}
return cs.stripComments()
}
141 changes: 141 additions & 0 deletions transformer/transformers/stripscriptcomments_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package transformers_test

import (
"strings"
"testing"
)

import (
"github.com/ampproject/amppackager/transformer/internal/amphtml"
"github.com/ampproject/amppackager/transformer/transformers"
"golang.org/x/net/html"
tt "github.com/ampproject/amppackager/transformer/internal/testing"
)

func TestStripScriptComments(t *testing.T) {
tcs := []tt.TestCase{
{
Desc: "No comments in script",
Input: "<html><head><script type=\"application/json\">alert('hi');</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">alert('hi');</script></head><body></body></html>",
},
{
Desc: "Basic html comment in script",
Input: "<html><head><script type=\"application/json\"><!--alert('hi');--></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Basic html comment with condition",
Input: "<html><head><script type=\"application/json\"><!--[if IE]>alert('hi');<![endif]--></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Basic html comment with IE 10 condition",
Input: "<html><head><script type=\"application/json\"><!--[if gte IE 10]>alert('hi');<!--<![endif]--></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Basic html comment with different syntax",
Input: "<html><head><script type=\"application/json\"><!--[if gte IE 10]><!-->alert('hi');<!--<![endif]--></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Script with comment inside string variable",
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert('hi');<!-- <![endif]-->\";</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert('hi');<!-- <![endif]-->\";</script></head><body></body></html>",
},
{
Desc: "Script with comment inside string variable escaped slashes",
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\\\"hi\\\");<!-- <![endif]-->\";</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\\\"hi\\\");<!-- <![endif]-->\";</script></head><body></body></html>",
},
{
Desc: "Script with comment containing quotes",
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\"hi\");<!-- <![endif]-->\";</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\"hi\");<!-- <![endif]-->\";</script></head><body></body></html>",
},
{
Desc: "Script with comment and some valid content",
Input: "<html><head><script type=\"application/json\"><!--[if !IE]>alert('hi');<![endif]-->var a = 1; var k = \"hi\";</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var a = 1; var k = \"hi\";</script></head><body></body></html>",
},
{
Desc: "Script with empty comment",
Input: "<html><head><script type=\"application/json\"><!-- --></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Script with empty comment and variables",
Input: "<html><head><script type=\"application/json\"><!-- -->var a = 1;</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var a = 1;</script></head><body></body></html>",
},
{
Desc: "Script with too many comments",
Input: "<html><head><script type=\"application/json\"><!-- comment -->var a = 1;<!--[if IE]>It's an IE<![endif]-->var b = \"hello\";var c = false;</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var a = 1;var b = \"hello\";var c = false;</script></head><body></body></html>",
},
{
Desc: "Too many comments V2",
Input: "<html><head><script type=\"application/json\"><!-- comment -->var a = 1;<!--[if IE]>It's an IE<![endif]-->var b = \"hello\";var c = false;<!-- hello -->var d = \"foo\";<!--[if !IE]>One more conditional comment<![endif]-->var f = \"<!-- comment variable -->\";</script>",
Expected: "<html><head><script type=application/json>var a = 1;var b = \"hello\";var c = false;var d = \"foo\";var f = \"<!-- comment variable -->\";</script></head><body></body></html>",
},
{
Desc: "Simple JSON",
Input: "<html><head><script type=\"application/json\"><!-- comment -->{'a': 'b', 'c': true, 'd': 123}</script>",
Expected: "<html><head><script type=application/json>{'a': 'b', 'c': true, 'd': 123}</script></head><body></body></html>",
},
{
Desc: "JSON with comment as string variable",
Input: "<html><head><script type=\"application/ld+json\"><!-- comment -->{'a': 'b', 'c': true, 'd': '<!-- should preserve -->'}</script>",
Expected: "<html><head><script type=application/ld+json>{'a': 'b', 'c': true, 'd': '<!-- should preserve -->'}</script></head><body></body></html>",
},
{
Desc: "JSON with double quoted values",
Input: "<script type=\"application/ld+json\"><!-- comment -->{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script>",
Expected: "<html><head><script type=application/ld+json>{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script></head><body></body></html>",
},
{
Desc: "JSON with too many escape chars",
Input: "<html><head><script type=\"application/ld+json\"><!-- comment -->{\"a\": \"\\\\\"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script>",
Expected: "<html><head><script type=application/ld+json>{\"a\": \"\\\\\"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script></head><body></body></html>",
},
{
Desc: "JSON with multiple comments",
Input: "<html><head><script type=\"application/json\"><!-- comment -->{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\", <!-- more comments --> e: 123}</script>",
Expected: "<html><head><script type=application/json>{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\", e: 123}</script></head><body></body></html>",
},
}

for _, tc := range tcs {
inputDoc, err := html.Parse(strings.NewReader(tc.Input))
if err != nil {
t.Errorf("%s: html.Parse failed %q", tc.Input, err)
continue
}
inputDOM, err := amphtml.NewDOM(inputDoc)
if err != nil {
t.Errorf("%s\namphtml.NewDOM for %s failed %q", tc.Desc, tc.Input, err)
continue
}
transformers.StripScriptComments(&transformers.Context{DOM: inputDOM})
var input strings.Builder
if err = html.Render(&input, inputDoc); err != nil {
t.Errorf("%s: html.Render failed %q", tc.Input, err)
continue
}
expectedDoc, err := html.Parse(strings.NewReader(tc.Expected))
if err != nil {
t.Errorf("%s: html.Parse failed %q", tc.Expected, err)
continue
}
var expected strings.Builder
err = html.Render(&expected, expectedDoc)
if err != nil {
t.Errorf("%s: html.Render failed %q", tc.Expected, err)
continue
}
if input.String() != expected.String() {
t.Errorf("%s: Transform\n%q\nwant=\n%q", tc.Desc, &input, &expected)
}
}
}