Skip to content

Commit

Permalink
Raise warning for unstable URL parsing
Browse files Browse the repository at this point in the history
`urllib.parse` does not perform any input validation so
its output might be invalid as well. Because host_whitelist
functionality relies on hostnames parsed from URLs, the result of
`urlsplit` is newly compared with the result of simple regex
parser and If they differ, a warning is raised.
  • Loading branch information
frenzymadness committed Oct 9, 2024
1 parent 8ce436d commit 8b3c612
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 3 deletions.
4 changes: 3 additions & 1 deletion lxml_html_clean/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,7 @@ from .clean import (
autolink as autolink,
autolink_html as autolink_html,
word_break as word_break,
word_break_html as word_break_html
word_break_html as word_break_html,
LXMLHTMLCleanWarning as LXMLHTMLCleanWarning,
AmbiguousURLWarning as AmbiguousURLWarning,
)
38 changes: 37 additions & 1 deletion lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import re
from collections import deque
from urllib.parse import urlsplit, unquote_plus
import warnings

from lxml import etree
from lxml.html import defs
Expand All @@ -18,7 +19,7 @@


__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
'word_break', 'word_break_html']
'word_break', 'word_break_html', 'LXMLHTMLCleanWarning', 'AmbiguousURLWarning']

# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
# Particularly the CSS cleaning; most of the tag cleaning is integrated now
Expand Down Expand Up @@ -100,6 +101,33 @@ def fromstring(string):
return lxml_fromstring(_ascii_control_characters.sub("", string))


# This regular expression is inspired by the one in urllib3.
_URI_RE = re.compile(
r"^(?:(?P<scheme>[a-zA-Z][a-zA-Z0-9+.-]*[a-zA-Z0-9]):)?"
r"(?://(?P<authority>[^\\/?#]*))?"
r"(?P<path>[^?#]*)"
r"(?:\?(?P<query_string>[^#]*))?"
r"(?:#(?P<fragment>.*))?$",
re.UNICODE,
)


def _get_authority_from_url(url):
match = _URI_RE.match(url)
if match:
return match.group("authority")
else:
return None


class LXMLHTMLCleanWarning(Warning):
pass


class AmbiguousURLWarning(LXMLHTMLCleanWarning):
pass


class Cleaner:
"""
Instances cleans the document of each of the possible offending
Expand Down Expand Up @@ -499,6 +527,14 @@ def allow_embedded_url(self, el, url):
parts = urlsplit(url)
if parts.scheme not in ('http', 'https'):
return False

authority = _get_authority_from_url(url)
if (parts.netloc or authority) and parts.netloc != authority:
warnings.warn(f"It's impossible to parse the hostname from URL: '{url}'! "
"URL is not allowed because parsers returned ambiguous results.",
AmbiguousURLWarning)
return False

if parts.hostname in self.host_whitelist:
return True
return False
Expand Down
11 changes: 11 additions & 0 deletions lxml_html_clean/clean.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ _DT = TypeVar("_DT", str, bytes, HtmlElement)
_ET_DT = TypeVar("_ET_DT", str, bytes, HtmlElement, _ElementTree[HtmlElement])


def _get_authority_from_url(url: str) -> str | None: ...


class LXMLHTMLCleanWarning(Warning):
pass


class AmbiguousURLWarning(LXMLHTMLCleanWarning):
pass


class Cleaner:
@overload # allow_tags present, remove_unknown_tags must be False
def __init__(
Expand Down
15 changes: 14 additions & 1 deletion tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import gzip
import io
import unittest
import warnings

import lxml.html
from lxml_html_clean import Cleaner, clean_html
from lxml_html_clean import AmbiguousURLWarning, Cleaner, clean_html, LXMLHTMLCleanWarning
from .utils import peak_memory_usage


Expand Down Expand Up @@ -346,3 +347,15 @@ def test_memory_usage_many_elements_with_long_tails(self):
mem = peak_memory_usage(cleaner.clean_html, html)

self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")

def test_possibly_invalid_url(self):
cleaner = Cleaner(host_whitelist=['google.com'])
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
result = cleaner.clean_html(r"<p><iframe src='http://example.com:\@google.com'> </iframe></p>")
self.assertGreaterEqual(len(w), 1)
self.assertIs(w[-1].category, AmbiguousURLWarning)
self.assertTrue(issubclass(w[-1].category, LXMLHTMLCleanWarning))
self.assertIn("impossible to parse the hostname", str(w[-1].message))
self.assertNotIn("google.com", result)
self.assertNotIn("example.com", result)

0 comments on commit 8b3c612

Please sign in to comment.