Skip to content

Commit

Permalink
sanitizer: update sanitize_uri_value for Python 3.9 urlparse
Browse files Browse the repository at this point in the history
Use Python 3.9 urlparse scheme parsing behavior for all Python
versions

Changes:

* add utils._parse_uri_scheme to match Python 3.9 urlparse behavior
* add utils._is_valid_netloc_and_port with Django URL validator
* in test_uri_value_allowed_protocols:
  * add test case for implicit http for IP and port with path and fragment
  * add test case for data: scheme
  * add test case for relative path URI
  * test "is not allowed by default" test cases against default
    ALLOWED_PROTOCOLS
  * convert test_invalid_uri_does_not_raise_error into a test case
  • Loading branch information
Greg Guthe committed Feb 1, 2021
1 parent c405b7b commit 7f4a376
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 28 deletions.
60 changes: 39 additions & 21 deletions bleach/sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
from bleach.utils import alphabetize_attributes, force_unicode
from bleach.utils import (
_is_valid_netloc_and_port,
_parse_uri_scheme,
alphabetize_attributes,
force_unicode,
)


#: List of allowed tags
Expand Down Expand Up @@ -447,9 +452,20 @@ def sanitize_characters(self, token):
return new_tokens

def sanitize_uri_value(self, value, allowed_protocols):
"""Checks a uri value to see if it's allowed
"""Checks a URI value to see if it's allowed
``urllib.parse.urlparse`` must be able to parse the URI.
The URI scheme must be in ``allowed_protocols`` or not have a
scheme and begin with a ``#`` indicating a relative URI by
fragment.
When ``"http"`` is in ``allowed_protocols`` (the default),
``sanitize_uri_value`` also allows relative URIs matching an
IP address or hostname and port (e.g. ``localhost:8000``) and
relative URIs without a scheme (e.g. ``/path``).
:arg value: the uri value to sanitize
:arg value: the URI value to sanitize
:arg allowed_protocols: list of allowed protocols
:returns: allowed value or None
Expand All @@ -473,33 +489,35 @@ def sanitize_uri_value(self, value, allowed_protocols):
new_value = new_value.lower()

try:
# Drop attributes with uri values that have protocols that aren't
# allowed
parsed = urlparse(new_value)
_ = urlparse(new_value)
except ValueError:
# URI is impossible to parse, therefore it's not allowed
return None

if parsed.scheme:
# If urlparse found a scheme, check that
if parsed.scheme in allowed_protocols:
return value
# If there's no protocol/scheme specified, then assume it's "http"
# and see if that's allowed
implicit_http_allowed = "http" in allowed_protocols

# Drop attributes with uri values that have protocols that aren't
# allowed
scheme = _parse_uri_scheme(new_value)
if scheme:
if scheme in allowed_protocols:
return value
elif implicit_http_allowed and _is_valid_netloc_and_port(scheme):
return value
else:
# parsed a disallowed protocol/scheme
# or implicit protocols are allowed and it's an invalid netloc:port
return None
else:
# Allow uris that are just an anchor
if new_value.startswith("#"):
# Allow uris that are just an anchor
return value

# Handle protocols that urlparse doesn't recognize like "myprotocol"
if ":" in new_value and new_value.split(":")[0] in allowed_protocols:
return value

# If there's no protocol/scheme specified, then assume it's "http"
# and see if that's allowed
if "http" in allowed_protocols:
elif implicit_http_allowed:
return value

return None
else:
return None

def allow_token(self, token):
"""Handles the case where we're allowing the tag"""
Expand Down
52 changes: 52 additions & 0 deletions bleach/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from collections import OrderedDict
import re

import six

from bleach._vendor.django.core.validators import URLValidator


def _attr_key(attr):
"""Returns appropriate key for sorting attribute names
Expand Down Expand Up @@ -40,3 +43,52 @@ def force_unicode(text):

# If not, convert it
return six.text_type(text, "utf-8", "strict")


# Anchored, case-insensitive pattern built from the vendored Django
# URLValidator netloc and port fragments; the whole string must match.
# NOTE(review): callers pass the text before the first ":" (no port), so
# this presumably relies on URLValidator.port_re being optional -- confirm.
netloc_port_re = re.compile(
    "^" + URLValidator.netloc_re + URLValidator.port_re + "$", re.IGNORECASE
)


# Characters valid in scheme names
scheme_chars = (
"abcdefghijklmnopqrstuvwxyz" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "0123456789" "+-."
)


def _is_valid_netloc_and_port(netloc):
"""
Returns the scheme for a URI or None when parsing the URI fails
:arg str/unicode netloc:
:returns: bool
"""
# The maximum length of a full host name is 253 characters per RFC 1034
# section 3.1. It's defined to be 255 bytes or less, but this includes
# one byte for the length of the name and one byte for the trailing dot
# that's used to indicate absolute names in DNS.
netloc = netloc_port_re.match(netloc)
return bool(netloc and len(netloc.group(0)) < 254)


def _parse_uri_scheme(uri):
"""
Returns the scheme for a URI or None when parsing the URI fails
:arg str/unicode text:
:returns: text or None
"""
# replicate Python 3.9 urlparse scheme parsing for older Python versions
i = uri.find(":")
if i > 0:
scheme = uri[:i]
for c in uri[:i]:
if c not in scheme_chars:
break
return scheme

return None
41 changes: 34 additions & 7 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from bleach import clean
from bleach.html5lib_shim import Filter
from bleach.sanitizer import Cleaner
from bleach.sanitizer import ALLOWED_PROTOCOLS, Cleaner
from bleach._vendor.html5lib.constants import rcdataElements


Expand Down Expand Up @@ -58,10 +58,6 @@ def test_html_is_lowercased():
)


def test_invalid_uri_does_not_raise_error():
    """Cleaning a link with an unparseable href drops the href, not raises."""
    cleaned = clean('<a href="http://example.com]">text</a>')
    assert cleaned == "<a>text</a>"


@pytest.mark.parametrize(
"data, should_strip, expected",
[
Expand Down Expand Up @@ -471,10 +467,31 @@ def test_attributes_list():
@pytest.mark.parametrize(
"data, kwargs, expected",
[
# invalid URI (urlparse raises a ValueError: Invalid IPv6 URL)
# is not allowed by default
(
'<a href="http://example.com]">text</a>',
{"protocols": ALLOWED_PROTOCOLS},
"<a>text</a>",
),
# data protocol is not allowed by default
(
'<a href="data:text/javascript,prompt(1)">foo</a>',
{"protocols": ALLOWED_PROTOCOLS},
"<a>foo</a>",
),
# javascript: is not allowed by default
("<a href=\"javascript:alert('XSS')\">xss</a>", {}, "<a>xss</a>"),
(
"<a href=\"javascript:alert('XSS')\">xss</a>",
{"protocols": ALLOWED_PROTOCOLS},
"<a>xss</a>",
),
# File protocol is not allowed by default
('<a href="file:///tmp/foo">foo</a>', {}, "<a>foo</a>"),
(
'<a href="file:///tmp/foo">foo</a>',
{"protocols": ALLOWED_PROTOCOLS},
"<a>foo</a>",
),
# Specified protocols are allowed
(
'<a href="myprotocol://more_text">allowed href</a>',
Expand All @@ -494,6 +511,11 @@ def test_attributes_list():
'<a href="#example.com">foo</a>',
),
# Allow implicit http if allowed
(
'<a href="/path">valid</a>',
{"protocols": ["http"]},
'<a href="/path">valid</a>',
),
(
'<a href="example.com">valid</a>',
{"protocols": ["http"]},
Expand Down Expand Up @@ -524,6 +546,11 @@ def test_attributes_list():
{"protocols": ["http"]},
'<a href="192.168.100.100:8000">valid</a>',
),
(
'<a href="192.168.100.100:8000/foo#bar">valid</a>',
{"protocols": ["http"]},
'<a href="192.168.100.100:8000/foo#bar">valid</a>',
),
# Disallow implicit http if disallowed
('<a href="example.com">foo</a>', {"protocols": []}, "<a>foo</a>"),
('<a href="example.com:8000">foo</a>', {"protocols": []}, "<a>foo</a>"),
Expand Down

0 comments on commit 7f4a376

Please sign in to comment.