Skip to content

Commit

Permalink
Fixed 35467 -- Replaced urlparse with urlsplit where appropriate.
Browse files Browse the repository at this point in the history
This work should not generate any change of functionality, and
`urlsplit` is approximately 6x faster.

Most use cases of `urlparse` didn't touch the path, so they can be
converted to `urlsplit` without any issue. Most of those which do use
`.path`, simply parse the URL, mutate the querystring, then put them
back together, which is also fine (so long as urlunsplit is used).
  • Loading branch information
RealOrangeOne authored May 29, 2024
1 parent 02dab94 commit ff308a0
Show file tree
Hide file tree
Showing 17 changed files with 61 additions and 72 deletions.
4 changes: 2 additions & 2 deletions django/contrib/admin/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from functools import partial, update_wrapper
from urllib.parse import parse_qsl
from urllib.parse import quote as urlquote
from urllib.parse import urlparse
from urllib.parse import urlsplit

from django import forms
from django.conf import settings
Expand Down Expand Up @@ -1384,7 +1384,7 @@ def render_change_form(
)

def _get_preserved_qsl(self, request, preserved_filters):
query_string = urlparse(request.build_absolute_uri()).query
query_string = urlsplit(request.build_absolute_uri()).query
return parse_qsl(query_string.replace(preserved_filters, ""))

def response_add(self, request, obj, post_url_continue=None):
Expand Down
10 changes: 5 additions & 5 deletions django/contrib/admin/templatetags/admin_urls.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from urllib.parse import parse_qsl, unquote, urlparse, urlunparse
from urllib.parse import parse_qsl, unquote, urlsplit, urlunsplit

from django import template
from django.contrib.admin.utils import quote
Expand All @@ -24,8 +24,8 @@ def add_preserved_filters(context, url, popup=False, to_field=None):
preserved_filters = context.get("preserved_filters")
preserved_qsl = context.get("preserved_qsl")

parsed_url = list(urlparse(url))
parsed_qs = dict(parse_qsl(parsed_url[4]))
parsed_url = list(urlsplit(url))
parsed_qs = dict(parse_qsl(parsed_url[3]))
merged_qs = {}

if preserved_qsl:
Expand Down Expand Up @@ -66,5 +66,5 @@ def add_preserved_filters(context, url, popup=False, to_field=None):

merged_qs.update(parsed_qs)

parsed_url[4] = urlencode(merged_qs)
return urlunparse(parsed_url)
parsed_url[3] = urlencode(merged_qs)
return urlunsplit(parsed_url)
6 changes: 3 additions & 3 deletions django/contrib/auth/decorators.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import asyncio
from functools import wraps
from urllib.parse import urlparse
from urllib.parse import urlsplit

from asgiref.sync import async_to_sync, sync_to_async

Expand All @@ -25,8 +25,8 @@ def _redirect_to_login(request):
resolved_login_url = resolve_url(login_url or settings.LOGIN_URL)
# If the login url is the same scheme and net location then just
# use the path as the "next" url.
login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
current_scheme, current_netloc = urlparse(path)[:2]
login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
current_scheme, current_netloc = urlsplit(path)[:2]
if (not login_scheme or login_scheme == current_scheme) and (
not login_netloc or login_netloc == current_netloc
):
Expand Down
6 changes: 3 additions & 3 deletions django/contrib/auth/middleware.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from functools import partial
from urllib.parse import urlparse
from urllib.parse import urlsplit

from django.conf import settings
from django.contrib import auth
Expand Down Expand Up @@ -74,8 +74,8 @@ def handle_no_permission(self, request, view_func):
resolved_login_url = resolve_url(self.get_login_url(view_func))
# If the login url is the same scheme and net location then use the
# path as the "next" url.
login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
current_scheme, current_netloc = urlparse(path)[:2]
login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
current_scheme, current_netloc = urlsplit(path)[:2]
if (not login_scheme or login_scheme == current_scheme) and (
not login_netloc or login_netloc == current_netloc
):
Expand Down
6 changes: 3 additions & 3 deletions django/contrib/auth/mixins.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from urllib.parse import urlparse
from urllib.parse import urlsplit

from django.conf import settings
from django.contrib.auth import REDIRECT_FIELD_NAME
Expand Down Expand Up @@ -51,8 +51,8 @@ def handle_no_permission(self):
resolved_login_url = resolve_url(self.get_login_url())
# If the login url is the same scheme and net location then use the
# path as the "next" url.
login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
current_scheme, current_netloc = urlparse(path)[:2]
login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
current_scheme, current_netloc = urlsplit(path)[:2]
if (not login_scheme or login_scheme == current_scheme) and (
not login_netloc or login_netloc == current_netloc
):
Expand Down
10 changes: 5 additions & 5 deletions django/contrib/auth/views.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from urllib.parse import urlparse, urlunparse
from urllib.parse import urlsplit, urlunsplit

from django.conf import settings

Expand Down Expand Up @@ -183,13 +183,13 @@ def redirect_to_login(next, login_url=None, redirect_field_name=REDIRECT_FIELD_N
"""
resolved_url = resolve_url(login_url or settings.LOGIN_URL)

login_url_parts = list(urlparse(resolved_url))
login_url_parts = list(urlsplit(resolved_url))
if redirect_field_name:
querystring = QueryDict(login_url_parts[4], mutable=True)
querystring = QueryDict(login_url_parts[3], mutable=True)
querystring[redirect_field_name] = next
login_url_parts[4] = querystring.urlencode(safe="/")
login_url_parts[3] = querystring.urlencode(safe="/")

return HttpResponseRedirect(urlunparse(login_url_parts))
return HttpResponseRedirect(urlunsplit(login_url_parts))


# Class-based password reset views
Expand Down
4 changes: 2 additions & 2 deletions django/contrib/staticfiles/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,13 @@ def _should_handle(self, path):
* the host is provided as part of the base_url
* the request's path isn't under the media path (or equal)
"""
return path.startswith(self.base_url[2]) and not self.base_url[1]
return path.startswith(self.base_url.path) and not self.base_url.netloc

def file_path(self, url):
"""
Return the relative path to the media file on disk for the given URL.
"""
relative_url = url.removeprefix(self.base_url[2])
relative_url = url.removeprefix(self.base_url.path)
return url2pathname(relative_url)

def serve(self, request):
Expand Down
4 changes: 2 additions & 2 deletions django/forms/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,13 +792,13 @@ def __init__(self, *, assume_scheme=None, **kwargs):
def to_python(self, value):
def split_url(url):
"""
Return a list of url parts via urlparse.urlsplit(), or raise
Return a list of url parts via urlsplit(), or raise
ValidationError for some malformed URLs.
"""
try:
return list(urlsplit(url))
except ValueError:
# urlparse.urlsplit can raise a ValueError with some
# urlsplit can raise a ValueError with some
# misformatted URLs.
raise ValidationError(self.error_messages["invalid"], code="invalid")

Expand Down
4 changes: 2 additions & 2 deletions django/http/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import warnings
from email.header import Header
from http.client import responses
from urllib.parse import urlparse
from urllib.parse import urlsplit

from asgiref.sync import async_to_sync, sync_to_async

Expand Down Expand Up @@ -616,7 +616,7 @@ class HttpResponseRedirectBase(HttpResponse):
def __init__(self, redirect_to, *args, **kwargs):
super().__init__(*args, **kwargs)
self["Location"] = iri_to_uri(redirect_to)
parsed = urlparse(str(redirect_to))
parsed = urlsplit(str(redirect_to))
if parsed.scheme and parsed.scheme not in self.allowed_schemes:
raise DisallowedRedirect(
"Unsafe redirect to URL with protocol '%s'" % parsed.scheme
Expand Down
4 changes: 2 additions & 2 deletions django/middleware/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from urllib.parse import urlparse
from urllib.parse import urlsplit

from django.conf import settings
from django.core.exceptions import PermissionDenied
Expand Down Expand Up @@ -171,7 +171,7 @@ def is_ignorable_request(self, request, uri, domain, referer):

# The referer is equal to the current URL, ignoring the scheme (assumed
# to be a poorly implemented bot).
parsed_referer = urlparse(referer)
parsed_referer = urlsplit(referer)
if parsed_referer.netloc in ["", domain] and parsed_referer.path == uri:
return True

Expand Down
10 changes: 5 additions & 5 deletions django/middleware/csrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import logging
import string
from collections import defaultdict
from urllib.parse import urlparse
from urllib.parse import urlsplit

from django.conf import settings
from django.core.exceptions import DisallowedHost, ImproperlyConfigured
Expand Down Expand Up @@ -174,7 +174,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
@cached_property
def csrf_trusted_origins_hosts(self):
return [
urlparse(origin).netloc.lstrip("*")
urlsplit(origin).netloc.lstrip("*")
for origin in settings.CSRF_TRUSTED_ORIGINS
]

Expand All @@ -190,7 +190,7 @@ def allowed_origin_subdomains(self):
"""
allowed_origin_subdomains = defaultdict(list)
for parsed in (
urlparse(origin)
urlsplit(origin)
for origin in settings.CSRF_TRUSTED_ORIGINS
if "*" in origin
):
Expand Down Expand Up @@ -284,7 +284,7 @@ def _origin_verified(self, request):
if request_origin in self.allowed_origins_exact:
return True
try:
parsed_origin = urlparse(request_origin)
parsed_origin = urlsplit(request_origin)
except ValueError:
return False
request_scheme = parsed_origin.scheme
Expand All @@ -300,7 +300,7 @@ def _check_referer(self, request):
raise RejectRequest(REASON_NO_REFERER)

try:
referer = urlparse(referer)
referer = urlsplit(referer)
except ValueError:
raise RejectRequest(REASON_MALFORMED_REFERER)

Expand Down
17 changes: 6 additions & 11 deletions django/test/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from http import HTTPStatus
from importlib import import_module
from io import BytesIO, IOBase
from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
from urllib.parse import unquote_to_bytes, urljoin, urlsplit

from asgiref.sync import sync_to_async

Expand Down Expand Up @@ -458,11 +458,7 @@ def _encode_json(self, data, content_type):
return json.dumps(data, cls=self.json_encoder) if should_encode else data

def _get_path(self, parsed):
path = parsed.path
# If there are parameters, add them
if parsed.params:
path += ";" + parsed.params
path = unquote_to_bytes(path)
path = unquote_to_bytes(parsed.path)
# Replace the behavior where non-ASCII values in the WSGI environ are
# arbitrarily decoded with ISO-8859-1.
# Refs comment in `get_bytes_from_wsgi()`.
Expand Down Expand Up @@ -647,7 +643,7 @@ def generic(
**extra,
):
"""Construct an arbitrary HTTP request."""
parsed = urlparse(str(path)) # path can be lazy
parsed = urlsplit(str(path)) # path can be lazy
data = force_bytes(data, settings.DEFAULT_CHARSET)
r = {
"PATH_INFO": self._get_path(parsed),
Expand All @@ -671,8 +667,7 @@ def generic(
# If QUERY_STRING is absent or empty, we want to extract it from the URL.
if not r.get("QUERY_STRING"):
# WSGI requires latin-1 encoded strings. See get_path_info().
query_string = parsed[4].encode().decode("iso-8859-1")
r["QUERY_STRING"] = query_string
r["QUERY_STRING"] = parsed.query.encode().decode("iso-8859-1")
return self.request(**r)


Expand Down Expand Up @@ -748,7 +743,7 @@ def generic(
**extra,
):
"""Construct an arbitrary HTTP request."""
parsed = urlparse(str(path)) # path can be lazy.
parsed = urlsplit(str(path)) # path can be lazy.
data = force_bytes(data, settings.DEFAULT_CHARSET)
s = {
"method": method,
Expand All @@ -772,7 +767,7 @@ def generic(
else:
# If QUERY_STRING is absent or empty, we want to extract it from
# the URL.
s["query_string"] = parsed[4]
s["query_string"] = parsed.query
if headers:
extra.update(HttpHeaders.to_asgi_names(headers))
s["headers"] += [
Expand Down
12 changes: 5 additions & 7 deletions django/test/testcases.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
urljoin,
urlparse,
urlsplit,
urlunparse,
urlunsplit,
)
from urllib.request import url2pathname

Expand Down Expand Up @@ -541,11 +541,9 @@ def assertURLEqual(self, url1, url2, msg_prefix=""):
def normalize(url):
"""Sort the URL's query string parameters."""
url = str(url) # Coerce reverse_lazy() URLs.
scheme, netloc, path, params, query, fragment = urlparse(url)
scheme, netloc, path, query, fragment = urlsplit(url)
query_parts = sorted(parse_qsl(query))
return urlunparse(
(scheme, netloc, path, params, urlencode(query_parts), fragment)
)
return urlunsplit((scheme, netloc, path, urlencode(query_parts), fragment))

if msg_prefix:
msg_prefix += ": "
Expand Down Expand Up @@ -1637,11 +1635,11 @@ def _should_handle(self, path):
* the host is provided as part of the base_url
* the request's path isn't under the media path (or equal)
"""
return path.startswith(self.base_url[2]) and not self.base_url[1]
return path.startswith(self.base_url.path) and not self.base_url.netloc

def file_path(self, url):
"""Return the relative path to the file on disk for the given URL."""
relative_url = url.removeprefix(self.base_url[2])
relative_url = url.removeprefix(self.base_url.path)
return url2pathname(relative_url)

def get_response(self, request):
Expand Down
6 changes: 3 additions & 3 deletions django/utils/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from email.utils import formatdate
from urllib.parse import quote, unquote
from urllib.parse import urlencode as original_urlencode
from urllib.parse import urlparse
from urllib.parse import urlsplit

from django.utils.datastructures import MultiValueDict
from django.utils.regex_helper import _lazy_re_compile
Expand Down Expand Up @@ -271,11 +271,11 @@ def url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):

def _url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):
# Chrome considers any URL with more than two slashes to be absolute, but
# urlparse is not so flexible. Treat any url with three slashes as unsafe.
# urlsplit is not so flexible. Treat any url with three slashes as unsafe.
if url.startswith("///"):
return False
try:
url_info = urlparse(url)
url_info = urlsplit(url)
except ValueError: # e.g. invalid IPv6 addresses
return False
# Forbid URLs like http:///example.com - with a scheme, but without a hostname.
Expand Down
4 changes: 2 additions & 2 deletions docs/ref/urlresolvers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ A :class:`ResolverMatch` object can also be assigned to a triple::
One possible use of :func:`~django.urls.resolve` would be to test whether a
view would raise a ``Http404`` error before redirecting to it::

from urllib.parse import urlparse
from urllib.parse import urlsplit
from django.urls import resolve
from django.http import Http404, HttpResponseRedirect

Expand All @@ -215,7 +215,7 @@ view would raise a ``Http404`` error before redirecting to it::
# modify the request and response as required, e.g. change locale
# and set corresponding locale cookie

view, args, kwargs = resolve(urlparse(next)[2])
view, args, kwargs = resolve(urlsplit(next).path)
kwargs["request"] = request
try:
view(*args, **kwargs)
Expand Down
Loading

0 comments on commit ff308a0

Please sign in to comment.