diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 0f5b7cc5..507a3799 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -9,7 +9,12 @@ from xml.sax.saxutils import unescape from bleach import html5lib_shim -from bleach.utils import alphabetize_attributes, force_unicode +from bleach.utils import ( + _is_valid_netloc_and_port, + _parse_uri_scheme, + alphabetize_attributes, + force_unicode, +) #: List of allowed tags @@ -447,9 +452,20 @@ def sanitize_characters(self, token): return new_tokens def sanitize_uri_value(self, value, allowed_protocols): - """Checks a uri value to see if it's allowed + """Checks a URI value to see if it's allowed + + ``urllib.parse.urlparse`` must be able to parse the URI. + + The URI scheme must be in ``allowed_protocols`` or not have a + scheme and begin with a ``#`` indicating a relative URI by + fragment. + + When ``"http"`` is in ``allowed_protocols`` (the default), + ``sanitize_uri_value`` also allows relative URIs matching an + IP address or hostname and port (e.g. ``localhost:8000``) and + relative URIs without a scheme (e.g. ``/path``). 
- :arg value: the uri value to sanitize + :arg value: the URI value to sanitize :arg allowed_protocols: list of allowed protocols :returns: allowed value or None @@ -473,33 +489,35 @@ def sanitize_uri_value(self, value, allowed_protocols): new_value = new_value.lower() try: - # Drop attributes with uri values that have protocols that aren't - # allowed - parsed = urlparse(new_value) + _ = urlparse(new_value) except ValueError: # URI is impossible to parse, therefore it's not allowed return None - if parsed.scheme: - # If urlparse found a scheme, check that - if parsed.scheme in allowed_protocols: - return value + # If there's no protocol/scheme specified, then assume it's "http" + # and see if that's allowed + implicit_http_allowed = "http" in allowed_protocols + # Drop attributes with uri values that have protocols that aren't + # allowed + scheme = _parse_uri_scheme(new_value) + if scheme: + if scheme in allowed_protocols: + return value + elif implicit_http_allowed and _is_valid_netloc_and_port(scheme): + return value + else: + # parsed a disallowed protocol/scheme + # or implicit protocols are allowed and it's an invalid netloc:port + return None else: - # Allow uris that are just an anchor if new_value.startswith("#"): + # Allow uris that are just an anchor return value - - # Handle protocols that urlparse doesn't recognize like "myprotocol" - if ":" in new_value and new_value.split(":")[0] in allowed_protocols: - return value - - # If there's no protocol/scheme specified, then assume it's "http" - # and see if that's allowed - if "http" in allowed_protocols: + elif implicit_http_allowed: return value - - return None + else: + return None def allow_token(self, token): """Handles the case where we're allowing the tag""" diff --git a/bleach/utils.py b/bleach/utils.py index ad780d52..ac3aea57 100644 --- a/bleach/utils.py +++ b/bleach/utils.py @@ -1,7 +1,10 @@ from collections import OrderedDict +import re import six +from bleach._vendor.django.core.validators 
# Anchored pattern: the whole string must be URLValidator's netloc pattern
# followed by its port pattern (e.g. ``localhost:8000``). The exact shapes
# of both fragments are defined by the vendored Django validator.
netloc_port_re = re.compile(
    "^" + URLValidator.netloc_re + URLValidator.port_re + "$", re.IGNORECASE
)


# Characters valid in scheme names
scheme_chars = (
    "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789" "+-."
)


def _is_valid_netloc_and_port(netloc):
    """
    Checks whether a string is a valid network location and optional port

    The string is valid when ``netloc_port_re`` matches it and the matched
    text is shorter than 254 characters.

    :arg str/unicode netloc: the candidate netloc (and port) to check

    :returns: bool

    """
    # The maximum length of a full host name is 253 characters per RFC 1034
    # section 3.1. It's defined to be 255 bytes or less, but this includes
    # one byte for the length of the name and one byte for the trailing dot
    # that's used to indicate absolute names in DNS.
    match = netloc_port_re.match(netloc)
    return bool(match and len(match.group(0)) < 254)


def _parse_uri_scheme(uri):
    """
    Returns the scheme for a URI or None when the URI has no valid scheme

    The scheme is the text before the first ``:``. It must be non-empty
    and consist only of characters in ``scheme_chars``; otherwise the URI
    is treated as having no scheme.

    :arg str/unicode uri: the URI to parse

    :returns: text or None

    """
    # replicate Python 3.9 urlparse scheme parsing for older Python versions
    colon = uri.find(":")
    if colon <= 0:
        # no colon at all, or a leading colon: nothing that can be a scheme
        return None

    candidate = uri[:colon]
    # A prefix containing any character that is not valid in a scheme name
    # (e.g. "/", "?", "[") is not a scheme; the colon belongs to the rest
    # of the URI, so report "no scheme" instead of returning the prefix.
    if all(char in scheme_chars for char in candidate):
        return candidate

    return None
@pytest.mark.parametrize( "data, kwargs, expected", [ + # invalid URI (urlparse raises a ValueError: Invalid IPv6 URL) + # is not allowed by default + ( + 'text', + {"protocols": ALLOWED_PROTOCOLS}, + "text", + ), + # data protocol is not allowed by default + ( + 'foo', + {"protocols": ALLOWED_PROTOCOLS}, + "foo", + ), # javascript: is not allowed by default - ("xss", {}, "xss"), + ( + "xss", + {"protocols": ALLOWED_PROTOCOLS}, + "xss", + ), # File protocol is not allowed by default - ('foo', {}, "foo"), + ( + 'foo', + {"protocols": ALLOWED_PROTOCOLS}, + "foo", + ), # Specified protocols are allowed ( 'allowed href', @@ -494,6 +511,11 @@ def test_attributes_list(): 'foo', ), # Allow implicit http if allowed + ( + 'valid', + {"protocols": ["http"]}, + 'valid', + ), ( 'valid', {"protocols": ["http"]}, @@ -524,6 +546,11 @@ def test_attributes_list(): {"protocols": ["http"]}, 'valid', ), + ( + 'valid', + {"protocols": ["http"]}, + 'valid', + ), # Disallow implicit http if disallowed ('foo', {"protocols": []}, "foo"), ('foo', {"protocols": []}, "foo"),