Skip to content

Commit

Permalink
Also scan for JS code in CSS comments
Browse files Browse the repository at this point in the history
The `Cleaner()` now also scans CSS comments for hidden JavaScript code.
In certain contexts, such as inside `<svg>` or `<math>` tags, a `<style>`
tag may lose its intended function, so the content of a comment like
`/* foo */` can be parsed as markup and potentially executed by the
browser.
  • Loading branch information
frenzymadness committed Nov 14, 2024
1 parent dcbc163 commit 3b644e9
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 14 deletions.
33 changes: 19 additions & 14 deletions lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,22 +581,27 @@ def _has_sneaky_javascript(self, style):
that and remove only the Javascript from the style; this catches
more sneaky attempts.
"""
style = self._substitute_comments('', style)
style = style.replace('\\', '')
style = _substitute_whitespace('', style)
style = style.lower()
if _has_javascript_scheme(style):
return True
if 'expression(' in style:
return True
if '@import' in style:
return True
if '</noscript' in style:
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
return True
if _looks_like_tag_content(style):
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
return True

for with_comments in True, False:
if not with_comments:
style = self._substitute_comments('', style)

style = style.replace('\\', '')

if _has_javascript_scheme(style):
return True
if 'expression(' in style:
return True
if '@import' in style:
return True
if '</noscript' in style:
# e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
return True
if _looks_like_tag_content(style):
# e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
return True
return False

def clean_html(self, html):
Expand Down
17 changes: 17 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,23 @@ def test_sneaky_js_in_math_style(self):
b'<math><style>/* deleted */</style></math>',
lxml.html.tostring(clean_html(s)))

def test_sneaky_js_in_style_comment_math_svg(self):
    # A payload hidden inside a CSS comment must be stripped when the
    # <style> element is nested in <svg> or <math>; the cleaner is expected
    # to replace the whole style content with the "/* deleted */" marker.
    for tag in ("svg", "math"):
        markup = f'<{tag}><style>/*<img src onerror=alert(origin)>*/'
        fragment = lxml.html.fragment_fromstring(markup)

        expected = f'<{tag}><style>/* deleted */</style></{tag}>'.encode()
        cleaned = lxml.html.tostring(clean_html(fragment))

        self.assertEqual(expected, cleaned)

def test_sneaky_js_in_style_comment_noscript(self):
    # A "</noscript" sequence smuggled inside a CSS comment must also cause
    # the style content to be replaced with the "/* deleted */" marker.
    markup = '<noscript><style>/*</noscript><img src onerror=alert(origin)>*/'
    fragment = lxml.html.fragment_fromstring(markup)

    expected = b'<noscript><style>/* deleted */</style></noscript>'
    cleaned = lxml.html.tostring(clean_html(fragment))

    self.assertEqual(expected, cleaned)

def test_sneaky_import_in_style(self):
# Prevent "@@importimport" -> "@import" replacement etc.
style_codes = [
Expand Down

0 comments on commit 3b644e9

Please sign in to comment.