# Reconstructed from patch c55d17c8 ("Smarter (and looser) link equivalency
# logic") against src/pip/_internal/models/link.py. The same patch extends the
# module's ``typing`` import with ``Dict``, ``List`` and ``NamedTuple``;
# ``urllib.parse`` is already imported by that module.


class _CleanResult(NamedTuple):
    """Normalized form of a link, used for equivalency checks.

    This is used in the resolver to check whether two URL-specified requirements
    likely point to the same distribution and can be considered equivalent. This
    equivalency logic avoids comparing URLs literally, which can be too strict
    (e.g. "a=1&b=2" vs "b=2&a=1") and produce conflicts unexpected to users.

    Currently this does three things:

    1. Drop the basic auth part. This is technically wrong since a server can
       serve different content based on auth, but if it does that, it is
       impossible to guarantee that even two URLs without auth are equivalent,
       since the user can input different auth information when prompted. So
       the practical solution is to assume the auth doesn't affect the response.
    2. Parse the query to avoid the ordering issue. Blank and value-less
       components are kept, so "?12345" and "?67890" remain different.
    3. Parse the fragment, and explicitly drop the "egg=" part since it is
       commonly provided as the project name for compatibility. This is wrong
       in the strictest sense, but too many people are doing it.

    Note that query value ordering under the same key in query and fragment are
    NOT cleaned; i.e. "a=1&a=2" and "a=2&a=1" are still considered different.
    """

    # Split URL with auth stripped from the netloc and query/fragment blanked.
    parsed: urllib.parse.SplitResult
    # Query string parsed into key -> list of values (order of keys ignored).
    query: Dict[str, List[str]]
    # Fragment parsed the same way, with any "egg" key removed.
    fragment: Dict[str, List[str]]

    @classmethod
    def from_link(cls, link: "Link") -> "_CleanResult":
        """Build the comparison key for *link* from its ``_parsed_url``."""
        parsed = link._parsed_url
        # Everything before the last "@" in the netloc is user info; drop it.
        netloc = parsed.netloc.rsplit("@", 1)[-1]
        # The fragment does not necessarily use the query string format
        # (it's a pip-specific syntax), so we set keep_blank_values to keep
        # a fragment that's not a key-value pair (e.g. "#title_1").
        frag_qs = urllib.parse.parse_qs(parsed.fragment, keep_blank_values=True)
        frag_qs.pop("egg", None)
        # Keep blank values in the query as well: without it, components that
        # carry no "=value" part (e.g. "?12345") are discarded by parse_qs and
        # URLs differing only in such components would wrongly compare equal.
        query_qs = urllib.parse.parse_qs(parsed.query, keep_blank_values=True)
        return cls(
            parsed=parsed._replace(netloc=netloc, query="", fragment=""),
            query=query_qs,
            fragment=frag_qs,
        )


def links_equivalent(link1, link2):
    # type: (Link, Link) -> bool
    """Return whether two links compare equal after ``_CleanResult`` cleaning.

    The comparison ignores basic auth in the netloc, the order of keys in the
    query and fragment, and any "egg" entry in the fragment.
    """
    return _CleanResult.from_link(link1) == _CleanResult.from_link(link2)