diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py
index 0f5b7cc5..507a3799 100644
--- a/bleach/sanitizer.py
+++ b/bleach/sanitizer.py
@@ -9,7 +9,12 @@
from xml.sax.saxutils import unescape
from bleach import html5lib_shim
-from bleach.utils import alphabetize_attributes, force_unicode
+from bleach.utils import (
+ _is_valid_netloc_and_port,
+ _parse_uri_scheme,
+ alphabetize_attributes,
+ force_unicode,
+)
#: List of allowed tags
@@ -447,9 +452,20 @@ def sanitize_characters(self, token):
return new_tokens
def sanitize_uri_value(self, value, allowed_protocols):
- """Checks a uri value to see if it's allowed
+ """Checks a URI value to see if it's allowed
+
+ ``urllib.parse.urlparse`` must be able to parse the URI.
+
+ The URI scheme must be in ``allowed_protocols``, or the URI must
+ have no scheme and begin with a ``#``, indicating a relative URI
+ that consists only of a fragment.
+
+ When ``"http"`` is in ``allowed_protocols`` (the default),
+ ``sanitize_uri_value`` also allows relative URIs matching an
+ IP address or hostname and port (e.g. ``localhost:8000``) and
+ relative URIs without a scheme (e.g. ``/path``).
- :arg value: the uri value to sanitize
+ :arg value: the URI value to sanitize
:arg allowed_protocols: list of allowed protocols
:returns: allowed value or None
@@ -473,33 +489,35 @@ def sanitize_uri_value(self, value, allowed_protocols):
new_value = new_value.lower()
try:
- # Drop attributes with uri values that have protocols that aren't
- # allowed
- parsed = urlparse(new_value)
+ _ = urlparse(new_value)
except ValueError:
# URI is impossible to parse, therefore it's not allowed
return None
- if parsed.scheme:
- # If urlparse found a scheme, check that
- if parsed.scheme in allowed_protocols:
- return value
+ # If there's no protocol/scheme specified, then assume it's "http"
+ # and see if that's allowed
+ implicit_http_allowed = "http" in allowed_protocols
+ # Drop attributes with uri values that have protocols that aren't
+ # allowed
+ scheme = _parse_uri_scheme(new_value)
+ if scheme:
+ if scheme in allowed_protocols:
+ return value
+ elif implicit_http_allowed and _is_valid_netloc_and_port(scheme):
+ return value
+ else:
+ # parsed a disallowed protocol/scheme
+ # or implicit protocols are allowed and it's an invalid netloc:port
+ return None
else:
- # Allow uris that are just an anchor
if new_value.startswith("#"):
+ # Allow uris that are just an anchor
return value
-
- # Handle protocols that urlparse doesn't recognize like "myprotocol"
- if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
- return value
-
- # If there's no protocol/scheme specified, then assume it's "http"
- # and see if that's allowed
- if "http" in allowed_protocols:
+ elif implicit_http_allowed:
return value
-
- return None
+ else:
+ return None
def allow_token(self, token):
"""Handles the case where we're allowing the tag"""
diff --git a/bleach/utils.py b/bleach/utils.py
index ad780d52..ac3aea57 100644
--- a/bleach/utils.py
+++ b/bleach/utils.py
@@ -1,7 +1,10 @@
from collections import OrderedDict
+import re
import six
+from bleach._vendor.django.core.validators import URLValidator
+
def _attr_key(attr):
"""Returns appropriate key for sorting attribute names
@@ -40,3 +43,52 @@ def force_unicode(text):
# If not, convert it
return six.text_type(text, "utf-8", "strict")
+
+
+netloc_port_re = re.compile(
+ "^" + URLValidator.netloc_re + URLValidator.port_re + "$", re.IGNORECASE
+)
+
+
+# Characters valid in URI scheme names (letters, digits, "+", "-", "."; see RFC 3986 section 3.1)
+scheme_chars = (
+ "abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789" "+-."
+)
+
+
+def _is_valid_netloc_and_port(netloc):
+ """
+ Returns True when the text matches ``netloc_port_re`` and is under 254 chars
+
+ :arg str/unicode netloc: the text to check
+
+ :returns: bool
+
+ """
+ # The maximum length of a full host name is 253 characters per RFC 1034
+ # section 3.1. It's defined to be 255 bytes or less, but this includes
+ # one byte for the length of the name and one byte for the trailing dot
+ # that's used to indicate absolute names in DNS.
+ netloc = netloc_port_re.match(netloc)
+ return bool(netloc and len(netloc.group(0)) < 254)
+
+
+def _parse_uri_scheme(uri):
+    """
+    Returns the scheme for a URI or None when it has no valid scheme
+
+    :arg str/unicode uri: the URI to parse
+
+    :returns: text or None
+
+    """
+    # replicate Python 3.9 urlparse scheme parsing for older Python versions
+    i = uri.find(":")
+    if i > 0:
+        scheme = uri[:i]
+        # a character outside scheme_chars means the text before the colon
+        # is not a scheme (e.g. "/path:x"), so fall through to None
+        if all(c in scheme_chars for c in scheme):
+            return scheme
+
+    return None
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 7c565750..7fa7ec33 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -6,7 +6,7 @@
from bleach import clean
from bleach.html5lib_shim import Filter
-from bleach.sanitizer import Cleaner
+from bleach.sanitizer import ALLOWED_PROTOCOLS, Cleaner
from bleach._vendor.html5lib.constants import rcdataElements
@@ -58,10 +58,6 @@ def test_html_is_lowercased():
)
-def test_invalid_uri_does_not_raise_error():
- assert clean('text') == "text"
-
-
@pytest.mark.parametrize(
"data, should_strip, expected",
[
@@ -471,10 +467,31 @@ def test_attributes_list():
@pytest.mark.parametrize(
"data, kwargs, expected",
[
+ # invalid URI (urlparse raises a ValueError: Invalid IPv6 URL)
+ # is not allowed by default
+ (
+ 'text',
+ {"protocols": ALLOWED_PROTOCOLS},
+ "text",
+ ),
+ # data protocol is not allowed by default
+ (
+ 'foo',
+ {"protocols": ALLOWED_PROTOCOLS},
+ "foo",
+ ),
# javascript: is not allowed by default
- ("xss", {}, "xss"),
+ (
+ "xss",
+ {"protocols": ALLOWED_PROTOCOLS},
+ "xss",
+ ),
# File protocol is not allowed by default
- ('foo', {}, "foo"),
+ (
+ 'foo',
+ {"protocols": ALLOWED_PROTOCOLS},
+ "foo",
+ ),
# Specified protocols are allowed
(
'allowed href',
@@ -494,6 +511,11 @@ def test_attributes_list():
'foo',
),
# Allow implicit http if allowed
+ (
+ 'valid',
+ {"protocols": ["http"]},
+ 'valid',
+ ),
(
'valid',
{"protocols": ["http"]},
@@ -524,6 +546,11 @@ def test_attributes_list():
{"protocols": ["http"]},
'valid',
),
+ (
+ 'valid',
+ {"protocols": ["http"]},
+ 'valid',
+ ),
# Disallow implicit http if disallowed
('foo', {"protocols": []}, "foo"),
('foo', {"protocols": []}, "foo"),