diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index a71da81..36205e9 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -581,22 +581,27 @@ def _has_sneaky_javascript(self, style): that and remove only the Javascript from the style; this catches more sneaky attempts. """ - style = self._substitute_comments('', style) - style = style.replace('\\', '') style = _substitute_whitespace('', style) style = style.lower() - if _has_javascript_scheme(style): - return True - if 'expression(' in style: - return True - if '@import' in style: - return True - if '' - return True + + for with_comments in True, False: + if not with_comments: + style = self._substitute_comments('', style) + + style = style.replace('\\', '') + + if _has_javascript_scheme(style): + return True + if 'expression(' in style: + return True + if '@import' in style: + return True + if '' + return True return False def clean_html(self, html): diff --git a/tests/test_clean.py b/tests/test_clean.py index 8c9bc20..5e844b9 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -127,6 +127,23 @@ def test_sneaky_js_in_math_style(self): b'', lxml.html.tostring(clean_html(s))) + def test_sneaky_js_in_style_comment_math_svg(self): + for tag in "svg", "math": + html = f'<{tag}>'.encode(), + lxml.html.tostring(clean_html(s))) + + def test_sneaky_js_in_style_comment_noscript(self): + html = '', + lxml.html.tostring(clean_html(s))) + def test_sneaky_import_in_style(self): # Prevent "@@importimport" -> "@import" replacement etc. style_codes = [