diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index afe2e0e2..23f6befe 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -184,6 +184,12 @@ def serialize(self, treewalker, encoding=None): if encoding and self.inject_meta_charset: from ..filters.inject_meta_charset import Filter treewalker = Filter(treewalker, encoding) + # Alphabetical attributes is here under the assumption that none of + # the later filters add or change order of attributes; it needs to be + # before the sanitizer so escaped elements come out correctly + if self.alphabetical_attributes: + from ..filters.alphabeticalattributes import Filter + treewalker = Filter(treewalker) # WhitespaceFilter should be used before OptionalTagFilter # for maximum efficiently of this latter filter if self.strip_whitespace: @@ -195,11 +201,6 @@ def serialize(self, treewalker, encoding=None): if self.omit_optional_tags: from ..filters.optionaltags import Filter treewalker = Filter(treewalker) - # Alphabetical attributes must be last, as other filters - # could add attributes and alter the order - if self.alphabetical_attributes: - from ..filters.alphabeticalattributes import Filter - treewalker = Filter(treewalker) for token in treewalker: type = token["type"] diff --git a/html5lib/tests/conftest.py b/html5lib/tests/conftest.py index 811aebbf..dceb94cc 100644 --- a/html5lib/tests/conftest.py +++ b/html5lib/tests/conftest.py @@ -2,11 +2,13 @@ from .tree_construction import TreeConstructionFile from .tokenizer import TokenizerFile +from .sanitizer import SanitizerFile _dir = os.path.abspath(os.path.dirname(__file__)) _testdata = os.path.join(_dir, "testdata") _tree_construction = os.path.join(_testdata, "tree-construction") _tokenizer = os.path.join(_testdata, "tokenizer") +_sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata") def pytest_collectstart(): @@ -24,3 +26,6 @@ def pytest_collect_file(path, parent): elif dir == _tokenizer: if path.ext == ".test": return TokenizerFile(path, parent) + elif dir == _sanitizer_testdata: + if path.ext == ".dat": + return SanitizerFile(path, parent) diff --git a/html5lib/tests/sanitizer-testdata/tests1.dat b/html5lib/tests/sanitizer-testdata/tests1.dat new file mode 100644 index 00000000..74e88336 --- /dev/null +++ b/html5lib/tests/sanitizer-testdata/tests1.dat @@ -0,0 +1,433 @@ +[ + { + "name": "IE_Comments", + "input": "", + "output": "" + }, + + { + "name": "IE_Comments_2", + "input": "", + "output": "<script>alert('XSS');</script>" + }, + + { + "name": "allow_colons_in_path_component", + "input": "foo", + "output": "foo" + }, + + { + "name": "background_attribute", + "input": "
", + "output": "
" + }, + + { + "name": "bgsound", + "input": "", + "output": "<bgsound src=\"javascript:alert('XSS');\"></bgsound>" + }, + + { + "name": "div_background_image_unicode_encoded", + "input": "
foo
", + "output": "
foo
" + }, + + { + "name": "div_expression", + "input": "
foo
", + "output": "
foo
" + }, + + { + "name": "double_open_angle_brackets", + "input": "", + "output": "" + }, + + { + "name": "img_dynsrc_lowsrc", + "input": "", + "output": "" + }, + + { + "name": "img_vbscript", + "input": "", + "output": "" + }, + + { + "name": "input_image", + "input": "", + "output": "" + }, + + { + "name": "link_stylesheets", + "input": "", + "output": "<link href=\"javascript:alert('XSS');\" rel=\"stylesheet\">" + }, + + { + "name": "link_stylesheets_2", + "input": "", + "output": "<link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\">" + }, + + { + "name": "list_style_image", + "input": "
  • foo
  • ", + "output": "
  • foo
  • " + }, + + { + "name": "no_closing_script_tags", + "input": "", + "output": "<script src=\"http://ha.ckers.org/xss.js\" xss=\"\"></script>" + }, + + { + "name": "non_alpha_non_digit_2", + "input": "foo", + "output": "foo" + }, + + { + "name": "non_alpha_non_digit_3", + "input": "", + "output": "" + }, + + { + "name": "non_alpha_non_digit_II", + "input": "foo", + "output": "foo" + }, + + { + "name": "non_alpha_non_digit_III", + "input": "foo", + "output": "foo" + }, + + { + "name": "platypus", + "input": "never trust your upstream platypus", + "output": "never trust your upstream platypus" + }, + + { + "name": "protocol_resolution_in_script_tag", + "input": "", + "output": "<script src=\"//ha.ckers.org/.j\"></script>" + }, + + { + "name": "should_allow_anchors", + "input": "", + "output": "<script>baz</script>" + }, + + { + "name": "should_allow_image_alt_attribute", + "input": "foo", + "output": "foo" + }, + + { + "name": "should_allow_image_height_attribute", + "input": "", + "output": "" + }, + + { + "name": "should_allow_image_src_attribute", + "input": "", + "output": "" + }, + + { + "name": "should_allow_image_width_attribute", + "input": "", + "output": "" + }, + + { + "name": "should_handle_blank_text", + "input": "", + "output": "" + }, + + { + "name": "should_handle_malformed_image_tags", + "input": "\">", + "output": "<script>alert(\"XSS\")</script>\">" + }, + + { + "name": "should_handle_non_html", + "input": "abc", + "output": "abc" + }, + + { + "name": "should_not_fall_for_ridiculous_hack", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_0", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_1", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_10", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_11", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_12", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_13", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_14", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_2", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_3", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_4", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_5", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_6", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_7", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_8", + "input": "", + "output": "" + }, + + { + "name": "should_not_fall_for_xss_image_hack_9", + "input": "", + "output": "" + }, + + { + "name": "should_sanitize_half_open_scripts", + "input": "", + "output": "<script src=\"http://ha.ckers.org/xss.js\" xss=\"\"></script>" + }, + + { + "name": "should_sanitize_script_tag_with_multiple_open_brackets", + "input": "<", + "output": "<<script>alert(\"XSS\");//<</script>" + }, + + { + "name": "should_sanitize_script_tag_with_multiple_open_brackets_2", + "input": "