Skip to content

Commit

Permalink
Transformer to strip HTML style comments from script tags.
Browse files Browse the repository at this point in the history
Examples:
<script><!-- Hello --></script> == <script></script>

<script><!--[if IE]>var ie = true;<![endif]--></script> == <script></script>

PiperOrigin-RevId: 263786190
  • Loading branch information
amaltas authored and honeybadgerdontcare committed Sep 4, 2019
1 parent 2b545a4 commit 0cb645c
Show file tree
Hide file tree
Showing 4 changed files with 319 additions and 1 deletion.
2 changes: 2 additions & 0 deletions transformer/transformer.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ var transformerFunctionMap = map[string]func(*transformers.Context) error{
"reorderhead": transformers.ReorderHead,
"serversiderendering": transformers.ServerSideRendering,
"stripjs": transformers.StripJS,
"stripscriptcomments": transformers.StripScriptComments,
"transformedidentifier": transformers.TransformedIdentifier,
"unusedextensions": transformers.UnusedExtensions,
"urlrewrite": transformers.URLRewrite,
Expand All @@ -60,6 +61,7 @@ var configMap = map[rpb.Request_TransformersConfig][]func(*transformers.Context)
// NodeCleanup should be first.
transformers.NodeCleanup,
transformers.StripJS,
transformers.StripScriptComments,
transformers.LinkTag,
transformers.AbsoluteURL,
transformers.AMPBoilerplate,
Expand Down
2 changes: 1 addition & 1 deletion transformer/transformer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func TestProcess(t *testing.T) {
config rpb.Request_TransformersConfig
expectedLen int
}{
{rpb.Request_DEFAULT, 12},
{rpb.Request_DEFAULT, 13},
{rpb.Request_NONE, 0},
{rpb.Request_VALIDATION, 1},
{rpb.Request_CUSTOM, 0},
Expand Down
174 changes: 174 additions & 0 deletions transformer/transformers/stripscriptcomments.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
package transformers

import (
"strings"
)

import (
"github.com/ampproject/amppackager/transformer/internal/htmlnode"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

// Strips html style comments from all the <script> elements that are child
// nodes of the given node. Example:
// <script><!-- var a = 1; --></script> transforms to empty script node.
// <script><!--[if IE]-->var a=1;<![endif]--></script> transforms to empty
// script tag.
//
// Attributes and any content outside comment are preserved.
// <script type="text/javascript"><!-- hello -->var a = 0;</script>
// transforms to
// <script type="text/javascript">var a = 0;</script>
//
// Any comment that looks like a comment but is a string variable is preserved.
// <script>var comment = "<!-- comment -->";</script> is not modified.
func StripScriptComments(e *Context) error {
for n := e.DOM.RootNode; n != nil; n = htmlnode.Next(n) {
if n.Type != html.ElementNode {
continue
}

if n.DataAtom == atom.Script {
typeVal, typeOk := htmlnode.GetAttributeVal(n, "", "type")
if typeOk && (typeVal == "application/json" ||
typeVal == "application/ld+json") {
contents := n.FirstChild
if contents == nil || len(contents.Data) == 0 {
return nil
}

contents.Data = stripComments(&contents.Data)
}
}
}

return nil
}

type commentStripper struct {
Content string
index int
insideQuotes bool
quoteChar byte
}

func (r *commentStripper) readNextByte() byte {
r.index = r.index + 1
if r.index >= len(r.Content) {
return 0
}
return r.Content[r.index]
}

func (r *commentStripper) readByteAt(i int) byte {
if i < 0 || i >= len(r.Content) {
return 0
}

return r.Content[i]
}

func (r *commentStripper) isPreviousEscapeChar() bool {
previous := r.index - 1
c := r.readByteAt(previous)
if c != '\\' {
return false
}

// Check if '\' is escaped by a preceding '\', consume all preceding '\'.
for {
previous = previous - 1
c := r.readByteAt(previous)
if c != '\\' {
break
}
}

return (r.index - previous) % 2 != 0
}

func (r *commentStripper) isQuoteChar(c byte) bool {
return c == '"' || c == '\''
}

func (r *commentStripper) skipComment() bool {
cdataStyleComment := false
// Checks initial !-- characters.
if r.readByteAt(r.index + 1) != '!' && r.readByteAt(r.index + 2) != '-' && r.readByteAt(r.index + 3) != '-' {
return false
}

// Skips the '!--' chars.
r.index = r.index + 3

// Checks if comment is of [CDATA] format.
// <!--[condition]>, we are interested in <!--[ in which case comment
// will end with ]-->
if r.readByteAt(r.index + 1) == '[' {
cdataStyleComment = true
}

// Skip until we encounter -->
for {
c := r.readNextByte()
if c == 0 {
// Skip everything until eof.
return true
}

if c == '-' {
if cdataStyleComment {
if r.readByteAt(r.index - 1) != ']' {
continue
}
}

if r.readByteAt(r.index + 1) == '-' && r.readByteAt(r.index + 2) == '>' {
r.index = r.index + 2
return true
}
}
}
}

func (r *commentStripper) stripComments() string {
var buffer strings.Builder
for {
c := r.readNextByte()
if c == 0 {
break
}

if r.insideQuotes {
if r.quoteChar == c && !r.isPreviousEscapeChar() {
r.insideQuotes = false
r.quoteChar = 0
}
buffer.WriteByte(c)
continue

}

if r.isQuoteChar(c) {
r.insideQuotes = true
r.quoteChar = c
buffer.WriteByte(c)
continue
}

// Normal character.
if c == '<' && r.skipComment() {
continue
}

buffer.WriteByte(c)
}

return buffer.String()
}

func stripComments(ss *string) string {
cs := commentStripper{Content: *ss, insideQuotes: false, index: -1}
return cs.stripComments()
}
142 changes: 142 additions & 0 deletions transformer/transformers/stripscriptcomments_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
package transformers_test

import (
"strings"
"testing"
)

import (
"github.com/ampproject/amppackager/transformer/internal/amphtml"
"github.com/ampproject/amppackager/transformer/transformers"
"golang.org/x/net/html"
tt "github.com/ampproject/amppackager/transformer/internal/testing"
)

// TODO(amaltas): WIP, Add lot of tests.
func TestStripScriptComments(t *testing.T) {
tcs := []tt.TestCase{
{
Desc: "No comments in script",
Input: "<html><head><script type=\"application/json\">alert('hi');</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">alert('hi');</script></head><body></body></html>",
},
{
Desc: "Basic html comment in script",
Input: "<html><head><script type=\"application/json\"><!--alert('hi');--></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Basic html comment with condition",
Input: "<html><head><script type=\"application/json\"><!--[if IE]>alert('hi');<![endif]--></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Basic html comment with IE 10 condition",
Input: "<html><head><script type=\"application/json\"><!--[if gte IE 10]>alert('hi');<!--<![endif]--></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Basic html comment with different syntax",
Input: "<html><head><script type=\"application/json\"><!--[if gte IE 10]><!-->alert('hi');<!--<![endif]--></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Script with comment inside string variable",
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert('hi');<!-- <![endif]-->\";</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert('hi');<!-- <![endif]-->\";</script></head><body></body></html>",
},
{
Desc: "Script with comment inside string variable escaped slashes",
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\\\"hi\\\");<!-- <![endif]-->\";</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\\\"hi\\\");<!-- <![endif]-->\";</script></head><body></body></html>",
},
{
Desc: "Script with comment containing quotes",
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\"hi\");<!-- <![endif]-->\";</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\"hi\");<!-- <![endif]-->\";</script></head><body></body></html>",
},
{
Desc: "Script with comment and some valid content",
Input: "<html><head><script type=\"application/json\"><!--[if !IE]>alert('hi');<![endif]-->var a = 1; var k = \"hi\";</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var a = 1; var k = \"hi\";</script></head><body></body></html>",
},
{
Desc: "Script with empty comment",
Input: "<html><head><script type=\"application/json\"><!-- --></script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>",
},
{
Desc: "Script with empty comment and variables",
Input: "<html><head><script type=\"application/json\"><!-- -->var a = 1;</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var a = 1;</script></head><body></body></html>",
},
{
Desc: "Script with too many comments",
Input: "<html><head><script type=\"application/json\"><!-- comment -->var a = 1;<!--[if IE]>It's an IE<![endif]-->var b = \"hello\";var c = false;</script></head><body></body></html>",
Expected: "<html><head><script type=\"application/json\">var a = 1;var b = \"hello\";var c = false;</script></head><body></body></html>",
},
{
Desc: "Too many comments V2",
Input: "<html><head><script type=\"application/json\"><!-- comment -->var a = 1;<!--[if IE]>It's an IE<![endif]-->var b = \"hello\";var c = false;<!-- hello -->var d = \"foo\";<!--[if !IE]>One more conditional comment<![endif]-->var f = \"<!-- comment variable -->\";</script>",
Expected: "<html><head><script type=application/json>var a = 1;var b = \"hello\";var c = false;var d = \"foo\";var f = \"<!-- comment variable -->\";</script></head><body></body></html>",
},
{
Desc: "Simple JSON",
Input: "<html><head><script type=\"application/json\"><!-- comment -->{'a': 'b', 'c': true, 'd': 123}</script>",
Expected: "<html><head><script type=application/json>{'a': 'b', 'c': true, 'd': 123}</script></head><body></body></html>",
},
{
Desc: "JSON with comment as string variable",
Input: "<html><head><script type=\"application/ld+json\"><!-- comment -->{'a': 'b', 'c': true, 'd': '<!-- should preserve -->'}</script>",
Expected: "<html><head><script type=application/ld+json>{'a': 'b', 'c': true, 'd': '<!-- should preserve -->'}</script></head><body></body></html>",
},
{
Desc: "JSON with double quoted values",
Input: "<script type=\"application/ld+json\"><!-- comment -->{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script>",
Expected: "<html><head><script type=application/ld+json>{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script></head><body></body></html>",
},
{
Desc: "JSON with too many escape chars",
Input: "<html><head><script type=\"application/ld+json\"><!-- comment -->{\"a\": \"\\\\\"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script>",
Expected: "<html><head><script type=application/ld+json>{\"a\": \"\\\\\"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script></head><body></body></html>",
},
{
Desc: "JSON with multiple comments",
Input: "<html><head><script type=\"application/json\"><!-- comment -->{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\", <!-- more comments --> e: 123}</script>",
Expected: "<html><head><script type=application/json>{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\", e: 123}</script></head><body></body></html>",
},
}

for _, tc := range tcs {
inputDoc, err := html.Parse(strings.NewReader(tc.Input))
if err != nil {
t.Errorf("%s: html.Parse failed %q", tc.Input, err)
continue
}
inputDOM, err := amphtml.NewDOM(inputDoc)
if err != nil {
t.Errorf("%s\namphtml.NewDOM for %s failed %q", tc.Input, err)
continue
}
transformers.StripScriptComments(&transformers.Context{DOM: inputDOM})
var input strings.Builder
if err = html.Render(&input, inputDoc); err != nil {
t.Errorf("%s: html.Render failed %q", tc.Input, err)
continue
}
expectedDoc, err := html.Parse(strings.NewReader(tc.Expected))
if err != nil {
t.Errorf("%s: html.Parse failed %q", tc.Expected, err)
continue
}
var expected strings.Builder
err = html.Render(&expected, expectedDoc)
if err != nil {
t.Errorf("%s: html.Render failed %q", tc.Expected, err)
continue
}
if input.String() != expected.String() {
t.Errorf("%s: Transform\n%q\nwant=\n%q", tc.Desc, &input, &expected)
}
}
}

0 comments on commit 0cb645c

Please sign in to comment.