-
Notifications
You must be signed in to change notification settings - Fork 285
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
squash! Fix #72: rewrite the sanitizer to be a treewalker filter only.
Undoes deletion of the testsuite
- Loading branch information
Showing
1 changed file
with
132 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,134 @@ | ||
from __future__ import absolute_import, division, unicode_literals | ||
|
||
import json | ||
|
||
from .support import get_data_files | ||
|
||
from html5lib import parseFragment, serialize | ||
|
||
|
||
def runSanitizerTest(name, input, expected):
    """Sanitize ``input`` through the serializer and compare with ``expected``.

    ``input`` is parsed as a fragment and serialized with ``sanitize=True``
    and a fixed set of formatting options. ``name`` is not used by the check
    itself. Raises ``AssertionError`` with a message showing the input, the
    expected output and the received output when the two differ.
    """
    serializer_options = dict(
        sanitize=True,
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values=True,
        quote_char="'",
    )
    received = serialize(parseFragment(input), **serializer_options)
    # The report is assembled before the assertion, one labelled section per
    # value.
    report_parts = ["\n\nInput:", input,
                    "\nExpected:", expected,
                    "\nReceived:", received]
    failure_report = "\n".join(report_parts)
    assert expected == received, failure_report
|
||
|
||
def testSanitizer():
    """Yield one ``runSanitizerTest`` case per entry in the sanitizer data files.

    Every file returned by ``get_data_files('sanitizer', '*.dat')`` is loaded
    as JSON; each entry supplies the ``name``, ``input`` and ``output`` values
    passed on to ``runSanitizerTest``.
    """
    for data_path in get_data_files('sanitizer', '*.dat'):
        with open(data_path) as data_file:
            cases = json.load(data_file)
        for case in cases:
            yield runSanitizerTest, case["name"], case["input"], case["output"]
try: | ||
import json | ||
except ImportError: | ||
import simplejson as json | ||
|
||
from html5lib import html5parser, sanitizer, constants, treebuilders | ||
|
||
|
||
def toxmlFactory():
    """Return a function that renders an etree element as a text string.

    The returned callable serializes its argument with the ``tostring``
    function of the "etree" tree builder's implementation.
    """
    etree_builder = treebuilders.getTreeBuilder("etree")

    def toxml(element):
        # Serialize to UTF-8 bytes and decode again; this roundtrip is
        # required for Python 2.6 compatibility.
        encoded = etree_builder.implementation.tostring(element,
                                                        encoding="utf-8")
        return encoded.decode("utf-8")

    return toxml
|
||
|
||
def runSanitizerTest(name, expected, input, toxml=None):
    """Assert that sanitizing ``input`` yields the XML form of ``expected``.

    name: label of the generated test; only used in the failure message.
    expected: markup that is parsed with a plain ``HTMLParser`` and rendered
        with ``toxml`` to obtain the reference string.
    input: markup that is run through ``sanitize_html``.
    toxml: element-to-string function; one is created with ``toxmlFactory``
        when omitted.

    Raises ``AssertionError`` describing both strings when they differ.
    """
    if toxml is None:
        toxml = toxmlFactory()
    expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
                        parseFragment(expected)])
    # Round-trip through JSON, as the original check did, before comparing.
    expected = json.loads(json.dumps(expected))
    # Reuse the same serializer for the sanitized side instead of letting
    # sanitize_html build a second one.
    received = sanitize_html(input, toxml)
    assert expected == received, (
        "%s\nInput: %r\nExpected: %r\nReceived: %r"
        % (name, input, expected, received))
|
||
|
||
def sanitize_html(stream, toxml=None):
    """Parse ``stream`` with the sanitizing tokenizer and return it as a string.

    ``stream`` is parsed as a fragment by an ``HTMLParser`` whose tokenizer is
    ``sanitizer.HTMLSanitizer``; every item of the parsed fragment is rendered
    with ``toxml`` (created via ``toxmlFactory`` when not supplied) and the
    pieces are concatenated.
    """
    if toxml is None:
        toxml = toxmlFactory()
    sanitizing_parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = sanitizing_parser.parseFragment(stream)
    rendered = []
    for node in fragment:
        rendered.append(toxml(node))
    return ''.join(rendered)
|
||
|
||
def test_should_handle_astral_plane_characters():
    """Characters outside the Basic Multilingual Plane pass through sanitizing."""
    expected = '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>'
    sanitized = sanitize_html("<p>𝒵 𝔸</p>")
    assert expected == sanitized
|
||
|
||
def test_should_allow_relative_uris():
    """A relative ``href`` value is kept by the sanitizer."""
    expected = '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>'
    sanitized = sanitize_html('<p><a href="/example.com"></a></p>')
    assert expected == sanitized
|
||
|
||
def test_sanitizer():
    """Yield generated sanitizer checks for tags, attributes and URI protocols.

    Every yielded tuple has the form
    ``(runSanitizerTest, name, expected, input, toxml)``; a single ``toxml``
    function is created up front and shared by all cases.

    NOTE(review): several expected strings contain the same literal ``<bad>``
    markup as their inputs; confirm against the upstream test file that this
    is intended and not an artifact of how this text was extracted.
    """
    toxml = toxmlFactory()

    # Elements excluded from the allow-tag checks (marked TODO upstream).
    skipped_tags = ('caption', 'col', 'colgroup', 'optgroup', 'option',
                    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr')

    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        if tag_name in skipped_tags:
            continue  # TODO
        if tag_name != tag_name.lower():
            continue  # TODO
        # Only the expected markup differs per kind of element; the input is
        # always the same template.
        if tag_name == 'image':
            expected = "<img title=\"1\"/>foo <bad>bar</bad> baz"
        elif tag_name == 'br':
            expected = "<br title=\"1\"/>foo <bad>bar</bad> baz<br/>"
        elif tag_name in constants.voidElements:
            expected = "<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name
        else:
            expected = ("<%s title=\"1\">foo <bad>bar</bad> baz</%s>"
                        % (tag_name, tag_name))
        yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
               expected,
               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               toxml)

    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        tag_name = tag_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
               "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               toxml)

    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        if attribute_name != attribute_name.lower():
            continue  # TODO
        if attribute_name == 'style':
            continue
        attribute_value = 'foo'
        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
            # URI-valued attributes get a URI built from the first allowed
            # protocol.
            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
               "<p %s=\"%s\">foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
               toxml)

    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        attribute_name = attribute_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
               "<p>foo <bad>bar</bad> baz</p>",
               "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
               toxml)

    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        # These cases use the protocol as listed, so the label must not claim
        # "uppercase" (that label belongs to the final loop below).
        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
               toxml)

    yield (runSanitizerTest, "test_invalid_data_uri",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:foobar\"></audio>",
           toxml)

    yield (runSanitizerTest, "test_invalid_ipv6_url",
           "<a>",
           "<a href=\"h://]\">",
           toxml)

    yield (runSanitizerTest, "test_data_uri_disallowed_type",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>",
           toxml)

    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        protocol = protocol.upper()
        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
               toxml)