-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Transformer to strip HTML style comments from script tags.
Examples: <script><!-- Hello --></script> == <script></script> <script><!--[if IE]>var ie = true;<![endif]--></script> == <script></script> PiperOrigin-RevId: 263786190
- Loading branch information
1 parent
3860a2a
commit 514f73a
Showing
4 changed files
with
319 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
package transformers | ||
|
||
import ( | ||
"strings" | ||
) | ||
|
||
import ( | ||
"github.com/ampproject/amppackager/transformer/internal/htmlnode" | ||
"golang.org/x/net/html" | ||
"golang.org/x/net/html/atom" | ||
) | ||
|
||
// Strips html style comments from all the <script> elements that are child | ||
// nodes of the given node. Example: | ||
// <script><!-- var a = 1; --></script> transforms to empty script node. | ||
// <script><!--[if IE]-->var a=1;<![endif]--></script> transforms to empty | ||
// script tag. | ||
// | ||
// Attributes and any content outside comment are preserved. | ||
// <script type="text/javascript"><!-- hello -->var a = 0;</script> | ||
// transforms to | ||
// <script type="text/javascript">var a = 0;</script> | ||
// | ||
// Any comment that looks like a comment but is a string variable is preserved. | ||
// <script>var comment = "<!-- comment -->";</script> is not modified. | ||
func StripScriptComments(e *Context) error { | ||
for n := e.DOM.RootNode; n != nil; n = htmlnode.Next(n) { | ||
if n.Type != html.ElementNode { | ||
continue | ||
} | ||
|
||
if n.DataAtom == atom.Script { | ||
typeVal, typeOk := htmlnode.GetAttributeVal(n, "", "type") | ||
if typeOk && (typeVal == "application/json" || | ||
typeVal == "application/ld+json") { | ||
contents := n.FirstChild | ||
if contents == nil || len(contents.Data) == 0 { | ||
return nil | ||
} | ||
|
||
contents.Data = stripComments(&contents.Data) | ||
} | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
type commentStripper struct { | ||
Content string | ||
index int | ||
insideQuotes bool | ||
quoteChar byte | ||
} | ||
|
||
func (r *commentStripper) readNextByte() byte { | ||
r.index = r.index + 1 | ||
if r.index >= len(r.Content) { | ||
return 0 | ||
} | ||
return r.Content[r.index] | ||
} | ||
|
||
func (r *commentStripper) readByteAt(i int) byte { | ||
if i < 0 || i >= len(r.Content) { | ||
return 0 | ||
} | ||
|
||
return r.Content[i] | ||
} | ||
|
||
func (r *commentStripper) isPreviousEscapeChar() bool { | ||
previous := r.index - 1 | ||
c := r.readByteAt(previous) | ||
if c != '\\' { | ||
return false | ||
} | ||
|
||
// Check if '\' is escaped by a preceding '\', consume all preceding '\'. | ||
for { | ||
previous = previous - 1 | ||
c := r.readByteAt(previous) | ||
if c != '\\' { | ||
break | ||
} | ||
} | ||
|
||
return (r.index - previous) % 2 != 0 | ||
} | ||
|
||
func (r *commentStripper) isQuoteChar(c byte) bool { | ||
return c == '"' || c == '\'' | ||
} | ||
|
||
func (r *commentStripper) skipComment() bool { | ||
cdataStyleComment := false | ||
// Checks initial !-- characters. | ||
if r.readByteAt(r.index + 1) != '!' && r.readByteAt(r.index + 2) != '-' && r.readByteAt(r.index + 3) != '-' { | ||
return false | ||
} | ||
|
||
// Skips the '!--' chars. | ||
r.index = r.index + 3 | ||
|
||
// Checks if comment is of [CDATA] format. | ||
// <!--[condition]>, we are interested in <!--[ in which case comment | ||
// will end with ]--> | ||
if r.readByteAt(r.index + 1) == '[' { | ||
cdataStyleComment = true | ||
} | ||
|
||
// Skip until we encounter --> | ||
for { | ||
c := r.readNextByte() | ||
if c == 0 { | ||
// Skip everything until eof. | ||
return true | ||
} | ||
|
||
if c == '-' { | ||
if cdataStyleComment { | ||
if r.readByteAt(r.index - 1) != ']' { | ||
continue | ||
} | ||
} | ||
|
||
if r.readByteAt(r.index + 1) == '-' && r.readByteAt(r.index + 2) == '>' { | ||
r.index = r.index + 2 | ||
return true | ||
} | ||
} | ||
} | ||
} | ||
|
||
func (r *commentStripper) stripComments() string { | ||
var buffer strings.Builder | ||
for { | ||
c := r.readNextByte() | ||
if c == 0 { | ||
break | ||
} | ||
|
||
if r.insideQuotes { | ||
if r.quoteChar == c && !r.isPreviousEscapeChar() { | ||
r.insideQuotes = false | ||
r.quoteChar = 0 | ||
} | ||
buffer.WriteByte(c) | ||
continue | ||
|
||
} | ||
|
||
if r.isQuoteChar(c) { | ||
r.insideQuotes = true | ||
r.quoteChar = c | ||
buffer.WriteByte(c) | ||
continue | ||
} | ||
|
||
// Normal character. | ||
if c == '<' && r.skipComment() { | ||
continue | ||
} | ||
|
||
buffer.WriteByte(c) | ||
} | ||
|
||
return buffer.String() | ||
} | ||
|
||
func stripComments(ss *string) string { | ||
cs := commentStripper{Content: *ss, insideQuotes: false, index: -1} | ||
return cs.stripComments() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
package transformers_test | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
) | ||
|
||
import ( | ||
"github.com/ampproject/amppackager/transformer/internal/amphtml" | ||
"github.com/ampproject/amppackager/transformer/transformers" | ||
"golang.org/x/net/html" | ||
tt "github.com/ampproject/amppackager/transformer/internal/testing" | ||
) | ||
|
||
// TODO(amaltas): WIP, Add lot of tests. | ||
func TestStripScriptComments(t *testing.T) { | ||
tcs := []tt.TestCase{ | ||
{ | ||
Desc: "No comments in script", | ||
Input: "<html><head><script type=\"application/json\">alert('hi');</script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\">alert('hi');</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Basic html comment in script", | ||
Input: "<html><head><script type=\"application/json\"><!--alert('hi');--></script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Basic html comment with condition", | ||
Input: "<html><head><script type=\"application/json\"><!--[if IE]>alert('hi');<![endif]--></script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Basic html comment with IE 10 condition", | ||
Input: "<html><head><script type=\"application/json\"><!--[if gte IE 10]>alert('hi');<!--<![endif]--></script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Basic html comment with different syntax", | ||
Input: "<html><head><script type=\"application/json\"><!--[if gte IE 10]><!-->alert('hi');<!--<![endif]--></script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Script with comment inside string variable", | ||
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert('hi');<!-- <![endif]-->\";</script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert('hi');<!-- <![endif]-->\";</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Script with comment inside string variable escaped slashes", | ||
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\\\"hi\\\");<!-- <![endif]-->\";</script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\\\"hi\\\");<!-- <![endif]-->\";</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Script with comment containing quotes", | ||
Input: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\"hi\");<!-- <![endif]-->\";</script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\">var k = \"<!--[if !IE]> -->alert(\"hi\");<!-- <![endif]-->\";</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Script with comment and some valid content", | ||
Input: "<html><head><script type=\"application/json\"><!--[if !IE]>alert('hi');<![endif]-->var a = 1; var k = \"hi\";</script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\">var a = 1; var k = \"hi\";</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Script with empty comment", | ||
Input: "<html><head><script type=\"application/json\"><!-- --></script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\"></script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Script with empty comment and variables", | ||
Input: "<html><head><script type=\"application/json\"><!-- -->var a = 1;</script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\">var a = 1;</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Script with too many comments", | ||
Input: "<html><head><script type=\"application/json\"><!-- comment -->var a = 1;<!--[if IE]>It's an IE<![endif]-->var b = \"hello\";var c = false;</script></head><body></body></html>", | ||
Expected: "<html><head><script type=\"application/json\">var a = 1;var b = \"hello\";var c = false;</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Too many comments V2", | ||
Input: "<html><head><script type=\"application/json\"><!-- comment -->var a = 1;<!--[if IE]>It's an IE<![endif]-->var b = \"hello\";var c = false;<!-- hello -->var d = \"foo\";<!--[if !IE]>One more conditional comment<![endif]-->var f = \"<!-- comment variable -->\";</script>", | ||
Expected: "<html><head><script type=application/json>var a = 1;var b = \"hello\";var c = false;var d = \"foo\";var f = \"<!-- comment variable -->\";</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "Simple JSON", | ||
Input: "<html><head><script type=\"application/json\"><!-- comment -->{'a': 'b', 'c': true, 'd': 123}</script>", | ||
Expected: "<html><head><script type=application/json>{'a': 'b', 'c': true, 'd': 123}</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "JSON with comment as string variable", | ||
Input: "<html><head><script type=\"application/ld+json\"><!-- comment -->{'a': 'b', 'c': true, 'd': '<!-- should preserve -->'}</script>", | ||
Expected: "<html><head><script type=application/ld+json>{'a': 'b', 'c': true, 'd': '<!-- should preserve -->'}</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "JSON with double quoted values", | ||
Input: "<script type=\"application/ld+json\"><!-- comment -->{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script>", | ||
Expected: "<html><head><script type=application/ld+json>{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "JSON with too many escape chars", | ||
Input: "<html><head><script type=\"application/ld+json\"><!-- comment -->{\"a\": \"\\\\\"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script>", | ||
Expected: "<html><head><script type=application/ld+json>{\"a\": \"\\\\\"b\", \"c\": true, \"d\": \"<!-- should preserve -->\"}</script></head><body></body></html>", | ||
}, | ||
{ | ||
Desc: "JSON with multiple comments", | ||
Input: "<html><head><script type=\"application/json\"><!-- comment -->{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\", <!-- more comments --> e: 123}</script>", | ||
Expected: "<html><head><script type=application/json>{\"a\": \"b\", \"c\": true, \"d\": \"<!-- should preserve -->\", e: 123}</script></head><body></body></html>", | ||
}, | ||
} | ||
|
||
for _, tc := range tcs { | ||
inputDoc, err := html.Parse(strings.NewReader(tc.Input)) | ||
if err != nil { | ||
t.Errorf("%s: html.Parse failed %q", tc.Input, err) | ||
continue | ||
} | ||
inputDOM, err := amphtml.NewDOM(inputDoc) | ||
if err != nil { | ||
t.Errorf("%s\namphtml.NewDOM for %s failed %q", tc.Input, err) | ||
continue | ||
} | ||
transformers.StripScriptComments(&transformers.Context{DOM: inputDOM}) | ||
var input strings.Builder | ||
if err = html.Render(&input, inputDoc); err != nil { | ||
t.Errorf("%s: html.Render failed %q", tc.Input, err) | ||
continue | ||
} | ||
expectedDoc, err := html.Parse(strings.NewReader(tc.Expected)) | ||
if err != nil { | ||
t.Errorf("%s: html.Parse failed %q", tc.Expected, err) | ||
continue | ||
} | ||
var expected strings.Builder | ||
err = html.Render(&expected, expectedDoc) | ||
if err != nil { | ||
t.Errorf("%s: html.Render failed %q", tc.Expected, err) | ||
continue | ||
} | ||
if input.String() != expected.String() { | ||
t.Errorf("%s: Transform\n%q\nwant=\n%q", tc.Desc, &input, &expected) | ||
} | ||
} | ||
} |