cache Link parsing and interpreter compatibility checking
- also compress the link parsing
cosmicexplorer committed Jan 17, 2024
1 parent cdf2fa5 commit 7a17fd3
Showing 5 changed files with 254 additions and 59 deletions.
26 changes: 25 additions & 1 deletion src/pip/_internal/cache.py
@@ -8,7 +8,7 @@
import os
import re
from pathlib import Path
-from typing import Dict, Iterator, List, Optional, Tuple
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Type

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
@@ -146,6 +146,17 @@ def get_path_for_link(self, link: Link) -> str:
return os.path.join(self.cache_dir, "link-metadata", *parts)


class SerializableEntry(abc.ABC):
@classmethod
@abc.abstractmethod
def suffix(cls) -> str:
...

@abc.abstractmethod
def serialize(self) -> Dict[str, Any]:
...


class FetchResolveCache(Cache):
def get_path_for_link(self, link: Link) -> str:
# We are reading index links to extract other links from, not executing any
@@ -154,6 +165,19 @@ def get_path_for_link(self, link: Link) -> str:
assert self.cache_dir
return os.path.join(self.cache_dir, "fetch-resolve", *parts)

def hashed_entry_path(self, link: Link, entry: SerializableEntry) -> Path:
hashed = _hash_dict(entry.serialize())
return self.cache_path(link) / f"{hashed}{entry.suffix()}"

def clear_hashed_entries(
self, link: Link, entry_type: Type[SerializableEntry]
) -> None:
for hashed_entry in self.cache_path(link).glob(f"*{entry_type.suffix()}"):
logger.debug(
"unlinking invalidated hashed link eval cache entry %s", hashed_entry
)
hashed_entry.unlink()


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""
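
For orientation, here is a minimal sketch of how the pieces added to cache.py above fit together: a SerializableEntry subclass declares a file suffix plus a dict of the parameters its cached result depends on, and FetchResolveCache turns that dict into a stable hashed file name under the per-link cache directory (pip's own _hash_dict helper does the hashing in the real code). The DemoEntry class and hash_serialized function below are hypothetical stand-ins so the snippet runs without pip installed.

    import hashlib
    import json
    from typing import Any, Dict


    class DemoEntry:
        """Hypothetical stand-in for a SerializableEntry subclass."""

        @classmethod
        def suffix(cls) -> str:
            return ".evaluation"

        def serialize(self) -> Dict[str, Any]:
            # Everything the cached result depends on, sorted for determinism.
            return {"project_name": "demo", "formats": ["binary", "source"]}


    def hash_serialized(entry: DemoEntry) -> str:
        # Stable digest of the serialized entry, analogous to cache._hash_dict.
        payload = json.dumps(entry.serialize(), sort_keys=True).encode("utf-8")
        return hashlib.sha224(payload).hexdigest()


    entry = DemoEntry()
    # FetchResolveCache.hashed_entry_path(link, entry) produces a name like this
    # inside cache_path(link); clear_hashed_entries() globs "*.evaluation" to
    # drop all such entries when the index page changes.
    print(hash_serialized(entry) + entry.suffix())
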
165 changes: 157 additions & 8 deletions src/pip/_internal/index/package_finder.py
@@ -1,10 +1,12 @@
"""Routines related to PyPI, indexes"""

import binascii
import bz2
import datetime
import enum
import functools
import itertools
import json
import logging
import os
import re
@@ -13,6 +15,8 @@
from pathlib import Path
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
FrozenSet,
Iterable,
@@ -29,7 +33,7 @@
from pip._vendor.packaging.version import _BaseVersion
from pip._vendor.packaging.version import parse as parse_version

-from pip._internal.cache import FetchResolveCache
+from pip._internal.cache import FetchResolveCache, SerializableEntry
from pip._internal.exceptions import (
BestVersionAlreadyInstalled,
DistributionNotFound,
@@ -39,7 +43,7 @@
from pip._internal.index.collector import IndexContent, LinkCollector, parse_links
from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.format_control import FormatControl
-from pip._internal.models.link import Link
+from pip._internal.models.link import Link, PersistentLinkCacheArgs
from pip._internal.models.search_scope import SearchScope
from pip._internal.models.selection_prefs import SelectionPreferences
from pip._internal.models.target_python import TargetPython
@@ -122,14 +126,29 @@ class LinkType(enum.Enum):
requires_python_mismatch = enum.auto()


-class LinkEvaluator:
+class LinkEvaluator(SerializableEntry):

"""
Responsible for evaluating links for a particular project.
"""

@classmethod
def suffix(cls) -> str:
return ".evaluation"

_py_version_re = re.compile(r"-py([123]\.?[0-9]?)$")

def serialize(self) -> Dict[str, Any]:
return {
"project_name": self.project_name,
"canonical_name": self._canonical_name,
# Sort these for determinism.
"formats": sorted(self._formats),
"target_python": self._target_python.format_given(),
"allow_yanked": self._allow_yanked,
"ignore_requires_python": self._ignore_requires_python,
}

# Don't include an allow_yanked default value to make sure each call
# site considers whether yanked releases are allowed. This also causes
# that decision to be made explicit in the calling code, which helps
@@ -604,6 +623,19 @@ def compute_best_candidate(
)


_FindCandidates = Callable[["PackageFinder", str], List[InstallationCandidate]]


def _canonicalize_arg(func: _FindCandidates) -> _FindCandidates:
@functools.wraps(func)
def wrapper(
self: "PackageFinder", project_name: str
) -> List[InstallationCandidate]:
return func(self, canonicalize_name(project_name))

return wrapper
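
The point of this decorator wrapping the memoized function, rather than the other way around, is that the project name is canonicalized before the lru_cache key is computed, so differently spelled requests for the same project share one cached result. A toy illustration of that decorator ordering, using a simplified canonicalizer instead of packaging.utils.canonicalize_name; the find() function below is made up for the example.

    import functools
    import re
    from typing import List


    def canonicalize(name: str) -> str:
        # Simplified version of packaging.utils.canonicalize_name.
        return re.sub(r"[-_.]+", "-", name).lower()


    def canonicalize_arg(func):
        @functools.wraps(func)
        def wrapper(project_name: str) -> List[str]:
            return func(canonicalize(project_name))

        return wrapper


    @canonicalize_arg
    @functools.lru_cache(maxsize=None)
    def find(project_name: str) -> List[str]:
        print("computing candidates for", project_name)
        return [project_name]


    find("Typing-Extensions")
    find("typing_extensions")  # same canonical name -> answered from lru_cache
    print(find.__wrapped__.cache_info())  # hits=1, misses=1
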


class PackageFinder:
"""This finds packages.
@@ -961,6 +993,91 @@ def _write_http_cache_info(

return (new_etag, new_date, new_checksum, page_unmodified)
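
For context on the page_unmodified value returned here: the etag and modified-since-date cache files exist so that the next fetch can send a conditional request, and a 304 Not Modified response (or an unchanged body checksum) is what later allows the persistent link and candidate caches to be trusted. The helper below is a generic illustration of building those conditional headers, not pip's own code; the header values are placeholders.

    from typing import Dict, Optional


    def conditional_headers(
        prev_etag: Optional[str], prev_date: Optional[str]
    ) -> Dict[str, str]:
        # Ask the index server to reply 304 Not Modified if nothing changed.
        headers: Dict[str, str] = {}
        if prev_etag is not None:
            headers["If-None-Match"] = prev_etag
        if prev_date is not None:
            headers["If-Modified-Since"] = prev_date
        return headers


    print(conditional_headers('"abc123"', "Wed, 17 Jan 2024 00:00:00 GMT"))
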

@staticmethod
def _try_load_parsed_links_cache(parsed_links_path: Path) -> Optional[List[Link]]:
page_links: Optional[List[Link]] = None
try:
with bz2.open(parsed_links_path, mode="rt", encoding="utf-8") as f:
logger.debug("reading page links from cache %s", parsed_links_path)
cached_links = json.load(f)
page_links = []
for cache_info in cached_links:
link = Link.from_cache_args(
PersistentLinkCacheArgs.from_json(cache_info)
)
assert link is not None
page_links.append(link)
except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
logger.debug(
"could not read page links from cache file %s %s(%s)",
parsed_links_path,
e.__class__.__name__,
str(e),
)
return page_links

@staticmethod
def _write_parsed_links_cache(
parsed_links_path: Path, links: Iterable[Link]
) -> List[Link]:
cacheable_links: List[Dict[str, Any]] = []
page_links: List[Link] = []
for link in links:
cache_info = link.cache_args()
assert cache_info is not None
cacheable_links.append(cache_info.to_json())
page_links.append(link)

logger.debug("writing page links to %s", parsed_links_path)
with bz2.open(parsed_links_path, mode="wt", encoding="utf-8") as f:
json.dump(cacheable_links, f)

return page_links
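
The "also compress the link parsing" note in the commit message refers to these two helpers: the parsed-links file is plain JSON, but it is written and read through bz2.open in text mode. A self-contained round trip of that pattern, with a made-up payload and a temporary path standing in for the real per-link cache location:

    import bz2
    import json
    import tempfile
    from pathlib import Path

    payload = [{"url": "https://example.org/demo-1.0-py3-none-any.whl", "hashes": None}]

    with tempfile.TemporaryDirectory() as tmp:
        parsed_links_path = Path(tmp) / "parsed-links"

        # Write side: bz2-compressed UTF-8 JSON, as in _write_parsed_links_cache.
        with bz2.open(parsed_links_path, mode="wt", encoding="utf-8") as f:
            json.dump(payload, f)

        # Read side mirrors _try_load_parsed_links_cache, where any failure is
        # treated as a cache miss and the index page is parsed again.
        try:
            with bz2.open(parsed_links_path, mode="rt", encoding="utf-8") as f:
                assert json.load(f) == payload
        except (OSError, json.decoder.JSONDecodeError, KeyError):
            pass
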

@staticmethod
def _try_load_installation_candidate_cache(
cached_candidates_path: Path,
) -> Optional[List[InstallationCandidate]]:
try:
with bz2.open(cached_candidates_path, mode="rt", encoding="utf-8") as f:
serialized_candidates = json.load(f)
logger.debug("read serialized candidates from %s", cached_candidates_path)
package_links: List[InstallationCandidate] = []
for cand in serialized_candidates:
link_cache_args = PersistentLinkCacheArgs.from_json(cand["link"])
link = Link.from_cache_args(link_cache_args)
package_links.append(
InstallationCandidate(cand["name"], cand["version"], link)
)
return package_links
except (OSError, json.decoder.JSONDecodeError, KeyError) as e:
logger.debug(
"could not read cached candidates at %s %s(%s)",
cached_candidates_path,
e.__class__.__name__,
str(e),
)
return None

@staticmethod
def _write_installation_candidate_cache(
cached_candidates_path: Path,
candidates: Iterable[InstallationCandidate],
) -> List[InstallationCandidate]:
candidates = list(candidates)
serialized_candidates = [
{
"name": candidate.name,
"version": str(candidate.version),
"link": candidate.link.cache_args().to_json(),
}
for candidate in candidates
]
with bz2.open(cached_candidates_path, mode="wt", encoding="utf-8") as f:
logger.debug("writing serialized candidates to %s", cached_candidates_path)
json.dump(serialized_candidates, f)
return candidates

def _process_project_url_uncached(
self, project_url: Link, link_evaluator: LinkEvaluator
) -> List[InstallationCandidate]:
@@ -979,7 +1096,6 @@ def _process_project_url_uncached(
package_links = self.evaluate_links(link_evaluator, links=page_links)
return package_links

-@functools.lru_cache(maxsize=None)
def process_project_url(
self, project_url: Link, link_evaluator: LinkEvaluator
) -> List[InstallationCandidate]:
@@ -992,6 +1108,10 @@ def process_project_url(
etag_path = cached_path / "etag"
date_path = cached_path / "modified-since-date"
checksum_path = cached_path / "checksum"
parsed_links_path = cached_path / "parsed-links"
cached_candidates_path = self._fetch_resolve_cache.hashed_entry_path(
project_url, link_evaluator
)

headers: Dict[str, str] = {}
# NB: mutates headers!
@@ -1028,16 +1148,45 @@ def process_project_url(
prev_checksum=prev_checksum,
)

-page_links = parse_links(index_response)
+page_links: Optional[List[Link]] = None
# Only try our persistent link parsing and evaluation caches if we know the page
# was unmodified via checksum.
if page_unmodified:
cached_candidates = self._try_load_installation_candidate_cache(
cached_candidates_path
)
if cached_candidates is not None:
return cached_candidates

page_links = self._try_load_parsed_links_cache(parsed_links_path)
else:
try:
parsed_links_path.unlink()
except OSError:
pass
self._fetch_resolve_cache.clear_hashed_entries(project_url, LinkEvaluator)

if page_links is None:
logger.debug(
"extracting new parsed links from index response %s", index_response
)
page_links = self._write_parsed_links_cache(
parsed_links_path,
parse_links(index_response),
)

with indent_log():
-package_links = self.evaluate_links(
-    link_evaluator,
-    links=page_links,
+package_links = self._write_installation_candidate_cache(
+    cached_candidates_path,
+    self.evaluate_links(
+        link_evaluator,
+        links=page_links,
+    ),
)

return package_links
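
Taken together, process_project_url now maintains a small directory per index page under the fetch-resolve cache: the HTTP revalidation data, the compressed parsed links, and one hashed .evaluation file per distinct LinkEvaluator configuration. Roughly (the hash is shortened and illustrative; the file names are the ones used above):

    <cache_dir>/fetch-resolve/<per-link parts>/
        etag                   # cached ETag header value
        modified-since-date    # cached Last-Modified / Date value
        checksum               # digest of the last response body
        parsed-links           # bz2-compressed JSON list of serialized Links
        1f3b...9c.evaluation   # InstallationCandidates keyed by the LinkEvaluator hash

A stale checksum invalidates both parsed-links and every *.evaluation file; an unchanged checksum lets either cache short-circuit link parsing and evaluation entirely.
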

@_canonicalize_arg
@functools.lru_cache(maxsize=None)
def find_all_candidates(self, project_name: str) -> List[InstallationCandidate]:
"""Find all available InstallationCandidate for project_name
58 changes: 58 additions & 0 deletions src/pip/_internal/models/link.py
@@ -179,6 +179,43 @@ def _ensure_quoted_url(url: str) -> str:
return urllib.parse.urlunparse(result._replace(path=path))


@dataclass(frozen=True)
class PersistentLinkCacheArgs:
url: str
comes_from: Optional[str] = None
requires_python: Optional[str] = None
yanked_reason: Optional[str] = None
metadata_file_data: Optional[MetadataFile] = None
hashes: Optional[Mapping[str, str]] = None

def to_json(self) -> Dict[str, Any]:
return {
"url": self.url,
"comes_from": self.comes_from,
"requires_python": self.requires_python,
"yanked_reason": self.yanked_reason,
"metadata_file_data": (
self.metadata_file_data.hashes if self.metadata_file_data else None
),
"hashes": self.hashes,
}

@classmethod
def from_json(cls, cache_info: Dict[str, Any]) -> "PersistentLinkCacheArgs":
return cls(
url=cache_info["url"],
comes_from=cache_info["comes_from"],
requires_python=cache_info["requires_python"],
yanked_reason=cache_info["yanked_reason"],
metadata_file_data=(
MetadataFile(hashes=cache_info["metadata_file_data"])
if cache_info["metadata_file_data"]
else None
),
hashes=cache_info["hashes"],
)


class Link(KeyBasedCompareMixin):
"""Represents a parsed link from a Package Index's simple URL"""

@@ -305,6 +342,27 @@ def from_json(
metadata_file_data=metadata_file_data,
)

def cache_args(self) -> PersistentLinkCacheArgs:
return PersistentLinkCacheArgs(
url=self.url,
comes_from=(str(self.comes_from) if self.comes_from else None),
requires_python=self.requires_python,
yanked_reason=self.yanked_reason,
metadata_file_data=self.metadata_file_data,
hashes=self._hashes,
)

@classmethod
def from_cache_args(cls, args: PersistentLinkCacheArgs) -> "Link":
return cls(
args.url,
comes_from=args.comes_from,
requires_python=args.requires_python,
yanked_reason=args.yanked_reason,
metadata_file_data=args.metadata_file_data,
hashes=args.hashes,
)
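
A short usage sketch of the round trip these two methods add, assuming this commit is applied to a pip checkout so that pip._internal is importable (the URL is a made-up example): a Link collapses to PersistentLinkCacheArgs, goes through JSON for the parsed-links file, and comes back as an equivalent Link.

    import json

    from pip._internal.models.link import Link, PersistentLinkCacheArgs

    link = Link(
        "https://example.org/packages/demo-1.0-py3-none-any.whl",
        requires_python=">=3.8",
    )

    # Link -> dict, as stored by _write_parsed_links_cache ...
    text = json.dumps(link.cache_args().to_json())

    # ... and dict -> Link, as rebuilt by _try_load_parsed_links_cache.
    restored = Link.from_cache_args(PersistentLinkCacheArgs.from_json(json.loads(text)))
    assert restored.url == link.url
    assert restored.requires_python == link.requires_python
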

@classmethod
def from_element(
cls,