Fixed 35467 -- Replaced urlparse with urlsplit where appropriate.

This work should not generate any change of functionality, and `urlsplit` is approximately 6x faster. Most use cases of `urlparse` didn't touch the path, so they can be converted to `urlsplit` without any issue. Most of those which do use `.path`, simply parse the URL, mutate the querystring, then put them back together, which is also fine (so long as urlunsplit is used).
felixxm · May 29, 2024 · ff308a0 · ff308a0
1 parent 02dab94
commit ff308a0
Show file tree

Hide file tree

Showing 17 changed files with 61 additions and 72 deletions.
diff --git a/django/contrib/admin/options.py b/django/contrib/admin/options.py
@@ -6,7 +6,7 @@
 from functools import partial, update_wrapper
 from urllib.parse import parse_qsl
 from urllib.parse import quote as urlquote
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from django import forms
 from django.conf import settings
@@ -1384,7 +1384,7 @@ def render_change_form(
         )
 
     def _get_preserved_qsl(self, request, preserved_filters):
-        query_string = urlparse(request.build_absolute_uri()).query
+        query_string = urlsplit(request.build_absolute_uri()).query
         return parse_qsl(query_string.replace(preserved_filters, ""))
 
     def response_add(self, request, obj, post_url_continue=None):

diff --git a/django/contrib/admin/templatetags/admin_urls.py b/django/contrib/admin/templatetags/admin_urls.py
@@ -1,4 +1,4 @@
-from urllib.parse import parse_qsl, unquote, urlparse, urlunparse
+from urllib.parse import parse_qsl, unquote, urlsplit, urlunsplit
 
 from django import template
 from django.contrib.admin.utils import quote
@@ -24,8 +24,8 @@ def add_preserved_filters(context, url, popup=False, to_field=None):
     preserved_filters = context.get("preserved_filters")
     preserved_qsl = context.get("preserved_qsl")
 
-    parsed_url = list(urlparse(url))
-    parsed_qs = dict(parse_qsl(parsed_url[4]))
+    parsed_url = list(urlsplit(url))
+    parsed_qs = dict(parse_qsl(parsed_url[3]))
     merged_qs = {}
 
     if preserved_qsl:
@@ -66,5 +66,5 @@ def add_preserved_filters(context, url, popup=False, to_field=None):
 
     merged_qs.update(parsed_qs)
 
-    parsed_url[4] = urlencode(merged_qs)
-    return urlunparse(parsed_url)
+    parsed_url[3] = urlencode(merged_qs)
+    return urlunsplit(parsed_url)
diff --git a/django/contrib/auth/decorators.py b/django/contrib/auth/decorators.py
@@ -1,6 +1,6 @@
 import asyncio
 from functools import wraps
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from asgiref.sync import async_to_sync, sync_to_async
 
@@ -25,8 +25,8 @@ def _redirect_to_login(request):
             resolved_login_url = resolve_url(login_url or settings.LOGIN_URL)
             # If the login url is the same scheme and net location then just
             # use the path as the "next" url.
-            login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
-            current_scheme, current_netloc = urlparse(path)[:2]
+            login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
+            current_scheme, current_netloc = urlsplit(path)[:2]
             if (not login_scheme or login_scheme == current_scheme) and (
                 not login_netloc or login_netloc == current_netloc
             ):

diff --git a/django/contrib/auth/middleware.py b/django/contrib/auth/middleware.py
@@ -1,5 +1,5 @@
 from functools import partial
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from django.conf import settings
 from django.contrib import auth
@@ -74,8 +74,8 @@ def handle_no_permission(self, request, view_func):
         resolved_login_url = resolve_url(self.get_login_url(view_func))
         # If the login url is the same scheme and net location then use the
         # path as the "next" url.
-        login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
-        current_scheme, current_netloc = urlparse(path)[:2]
+        login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
+        current_scheme, current_netloc = urlsplit(path)[:2]
         if (not login_scheme or login_scheme == current_scheme) and (
             not login_netloc or login_netloc == current_netloc
         ):

diff --git a/django/contrib/auth/mixins.py b/django/contrib/auth/mixins.py
@@ -1,4 +1,4 @@
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from django.conf import settings
 from django.contrib.auth import REDIRECT_FIELD_NAME
@@ -51,8 +51,8 @@ def handle_no_permission(self):
         resolved_login_url = resolve_url(self.get_login_url())
         # If the login url is the same scheme and net location then use the
         # path as the "next" url.
-        login_scheme, login_netloc = urlparse(resolved_login_url)[:2]
-        current_scheme, current_netloc = urlparse(path)[:2]
+        login_scheme, login_netloc = urlsplit(resolved_login_url)[:2]
+        current_scheme, current_netloc = urlsplit(path)[:2]
         if (not login_scheme or login_scheme == current_scheme) and (
             not login_netloc or login_netloc == current_netloc
         ):

diff --git a/django/contrib/auth/views.py b/django/contrib/auth/views.py
@@ -1,4 +1,4 @@
-from urllib.parse import urlparse, urlunparse
+from urllib.parse import urlsplit, urlunsplit
 
 from django.conf import settings
 
@@ -183,13 +183,13 @@ def redirect_to_login(next, login_url=None, redirect_field_name=REDIRECT_FIELD_N
     """
     resolved_url = resolve_url(login_url or settings.LOGIN_URL)
 
-    login_url_parts = list(urlparse(resolved_url))
+    login_url_parts = list(urlsplit(resolved_url))
     if redirect_field_name:
-        querystring = QueryDict(login_url_parts[4], mutable=True)
+        querystring = QueryDict(login_url_parts[3], mutable=True)
         querystring[redirect_field_name] = next
-        login_url_parts[4] = querystring.urlencode(safe="/")
+        login_url_parts[3] = querystring.urlencode(safe="/")
 
-    return HttpResponseRedirect(urlunparse(login_url_parts))
+    return HttpResponseRedirect(urlunsplit(login_url_parts))
 
 
 # Class-based password reset views

diff --git a/django/contrib/staticfiles/handlers.py b/django/contrib/staticfiles/handlers.py
@@ -36,13 +36,13 @@ def _should_handle(self, path):
         * the host is provided as part of the base_url
         * the request's path isn't under the media path (or equal)
         """
-        return path.startswith(self.base_url[2]) and not self.base_url[1]
+        return path.startswith(self.base_url.path) and not self.base_url.netloc
 
     def file_path(self, url):
         """
         Return the relative path to the media file on disk for the given URL.
         """
-        relative_url = url.removeprefix(self.base_url[2])
+        relative_url = url.removeprefix(self.base_url.path)
         return url2pathname(relative_url)
 
     def serve(self, request):

diff --git a/django/forms/fields.py b/django/forms/fields.py
@@ -792,13 +792,13 @@ def __init__(self, *, assume_scheme=None, **kwargs):
     def to_python(self, value):
         def split_url(url):
             """
-            Return a list of url parts via urlparse.urlsplit(), or raise
+            Return a list of url parts via urlsplit(), or raise
             ValidationError for some malformed URLs.
             """
             try:
                 return list(urlsplit(url))
             except ValueError:
-                # urlparse.urlsplit can raise a ValueError with some
+                # urlsplit can raise a ValueError with some
                 # misformatted URLs.
                 raise ValidationError(self.error_messages["invalid"], code="invalid")
 

diff --git a/django/http/response.py b/django/http/response.py
@@ -9,7 +9,7 @@
 import warnings
 from email.header import Header
 from http.client import responses
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from asgiref.sync import async_to_sync, sync_to_async
 
@@ -616,7 +616,7 @@ class HttpResponseRedirectBase(HttpResponse):
     def __init__(self, redirect_to, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self["Location"] = iri_to_uri(redirect_to)
-        parsed = urlparse(str(redirect_to))
+        parsed = urlsplit(str(redirect_to))
         if parsed.scheme and parsed.scheme not in self.allowed_schemes:
             raise DisallowedRedirect(
                 "Unsafe redirect to URL with protocol '%s'" % parsed.scheme

diff --git a/django/middleware/common.py b/django/middleware/common.py
@@ -1,5 +1,5 @@
 import re
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from django.conf import settings
 from django.core.exceptions import PermissionDenied
@@ -171,7 +171,7 @@ def is_ignorable_request(self, request, uri, domain, referer):
 
         # The referer is equal to the current URL, ignoring the scheme (assumed
         # to be a poorly implemented bot).
-        parsed_referer = urlparse(referer)
+        parsed_referer = urlsplit(referer)
         if parsed_referer.netloc in ["", domain] and parsed_referer.path == uri:
             return True
 

diff --git a/django/middleware/csrf.py b/django/middleware/csrf.py
@@ -8,7 +8,7 @@
 import logging
 import string
 from collections import defaultdict
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from django.conf import settings
 from django.core.exceptions import DisallowedHost, ImproperlyConfigured
@@ -174,7 +174,7 @@ class CsrfViewMiddleware(MiddlewareMixin):
     @cached_property
     def csrf_trusted_origins_hosts(self):
         return [
-            urlparse(origin).netloc.lstrip("*")
+            urlsplit(origin).netloc.lstrip("*")
             for origin in settings.CSRF_TRUSTED_ORIGINS
         ]
 
@@ -190,7 +190,7 @@ def allowed_origin_subdomains(self):
         """
         allowed_origin_subdomains = defaultdict(list)
         for parsed in (
-            urlparse(origin)
+            urlsplit(origin)
             for origin in settings.CSRF_TRUSTED_ORIGINS
             if "*" in origin
         ):
@@ -284,7 +284,7 @@ def _origin_verified(self, request):
         if request_origin in self.allowed_origins_exact:
             return True
         try:
-            parsed_origin = urlparse(request_origin)
+            parsed_origin = urlsplit(request_origin)
         except ValueError:
             return False
         request_scheme = parsed_origin.scheme
@@ -300,7 +300,7 @@ def _check_referer(self, request):
             raise RejectRequest(REASON_NO_REFERER)
 
         try:
-            referer = urlparse(referer)
+            referer = urlsplit(referer)
         except ValueError:
             raise RejectRequest(REASON_MALFORMED_REFERER)
 

diff --git a/django/test/client.py b/django/test/client.py
@@ -8,7 +8,7 @@
 from http import HTTPStatus
 from importlib import import_module
 from io import BytesIO, IOBase
-from urllib.parse import unquote_to_bytes, urljoin, urlparse, urlsplit
+from urllib.parse import unquote_to_bytes, urljoin, urlsplit
 
 from asgiref.sync import sync_to_async
 
@@ -458,11 +458,7 @@ def _encode_json(self, data, content_type):
         return json.dumps(data, cls=self.json_encoder) if should_encode else data
 
     def _get_path(self, parsed):
-        path = parsed.path
-        # If there are parameters, add them
-        if parsed.params:
-            path += ";" + parsed.params
-        path = unquote_to_bytes(path)
+        path = unquote_to_bytes(parsed.path)
         # Replace the behavior where non-ASCII values in the WSGI environ are
         # arbitrarily decoded with ISO-8859-1.
         # Refs comment in `get_bytes_from_wsgi()`.
@@ -647,7 +643,7 @@ def generic(
         **extra,
     ):
         """Construct an arbitrary HTTP request."""
-        parsed = urlparse(str(path))  # path can be lazy
+        parsed = urlsplit(str(path))  # path can be lazy
         data = force_bytes(data, settings.DEFAULT_CHARSET)
         r = {
             "PATH_INFO": self._get_path(parsed),
@@ -671,8 +667,7 @@ def generic(
         # If QUERY_STRING is absent or empty, we want to extract it from the URL.
         if not r.get("QUERY_STRING"):
             # WSGI requires latin-1 encoded strings. See get_path_info().
-            query_string = parsed[4].encode().decode("iso-8859-1")
-            r["QUERY_STRING"] = query_string
+            r["QUERY_STRING"] = parsed.query.encode().decode("iso-8859-1")
         return self.request(**r)
 
 
@@ -748,7 +743,7 @@ def generic(
         **extra,
     ):
         """Construct an arbitrary HTTP request."""
-        parsed = urlparse(str(path))  # path can be lazy.
+        parsed = urlsplit(str(path))  # path can be lazy.
         data = force_bytes(data, settings.DEFAULT_CHARSET)
         s = {
             "method": method,
@@ -772,7 +767,7 @@ def generic(
         else:
             # If QUERY_STRING is absent or empty, we want to extract it from
             # the URL.
-            s["query_string"] = parsed[4]
+            s["query_string"] = parsed.query
         if headers:
             extra.update(HttpHeaders.to_asgi_names(headers))
         s["headers"] += [

diff --git a/django/test/testcases.py b/django/test/testcases.py
@@ -21,7 +21,7 @@
     urljoin,
     urlparse,
     urlsplit,
-    urlunparse,
+    urlunsplit,
 )
 from urllib.request import url2pathname
 
@@ -541,11 +541,9 @@ def assertURLEqual(self, url1, url2, msg_prefix=""):
         def normalize(url):
             """Sort the URL's query string parameters."""
             url = str(url)  # Coerce reverse_lazy() URLs.
-            scheme, netloc, path, params, query, fragment = urlparse(url)
+            scheme, netloc, path, query, fragment = urlsplit(url)
             query_parts = sorted(parse_qsl(query))
-            return urlunparse(
-                (scheme, netloc, path, params, urlencode(query_parts), fragment)
-            )
+            return urlunsplit((scheme, netloc, path, urlencode(query_parts), fragment))
 
         if msg_prefix:
             msg_prefix += ": "
@@ -1637,11 +1635,11 @@ def _should_handle(self, path):
         * the host is provided as part of the base_url
         * the request's path isn't under the media path (or equal)
         """
-        return path.startswith(self.base_url[2]) and not self.base_url[1]
+        return path.startswith(self.base_url.path) and not self.base_url.netloc
 
     def file_path(self, url):
         """Return the relative path to the file on disk for the given URL."""
-        relative_url = url.removeprefix(self.base_url[2])
+        relative_url = url.removeprefix(self.base_url.path)
         return url2pathname(relative_url)
 
     def get_response(self, request):

diff --git a/django/utils/http.py b/django/utils/http.py
@@ -6,7 +6,7 @@
 from email.utils import formatdate
 from urllib.parse import quote, unquote
 from urllib.parse import urlencode as original_urlencode
-from urllib.parse import urlparse
+from urllib.parse import urlsplit
 
 from django.utils.datastructures import MultiValueDict
 from django.utils.regex_helper import _lazy_re_compile
@@ -271,11 +271,11 @@ def url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):
 
 def _url_has_allowed_host_and_scheme(url, allowed_hosts, require_https=False):
     # Chrome considers any URL with more than two slashes to be absolute, but
-    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
+    # urlsplit is not so flexible. Treat any url with three slashes as unsafe.
     if url.startswith("///"):
         return False
     try:
-        url_info = urlparse(url)
+        url_info = urlsplit(url)
     except ValueError:  # e.g. invalid IPv6 addresses
         return False
     # Forbid URLs like http:///example.com - with a scheme, but without a hostname.

diff --git a/docs/ref/urlresolvers.txt b/docs/ref/urlresolvers.txt
@@ -203,7 +203,7 @@ A :class:`ResolverMatch` object can also be assigned to a triple::
 One possible use of :func:`~django.urls.resolve` would be to test whether a
 view would raise a ``Http404`` error before redirecting to it::
 
-    from urllib.parse import urlparse
+    from urllib.parse import urlsplit
     from django.urls import resolve
     from django.http import Http404, HttpResponseRedirect
 
@@ -215,7 +215,7 @@ view would raise a ``Http404`` error before redirecting to it::
         # modify the request and response as required, e.g. change locale
         # and set corresponding locale cookie
 
-        view, args, kwargs = resolve(urlparse(next)[2])
+        view, args, kwargs = resolve(urlsplit(next).path)
         kwargs["request"] = request
         try:
             view(*args, **kwargs)