Skip to content

Commit

Permalink
Optimize _FlatDirectorySource to only scan each path once
Browse files Browse the repository at this point in the history
  • Loading branch information
notatallshaw committed Oct 25, 2023
1 parent d1f0981 commit 3fb2637
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 11 deletions.
2 changes: 2 additions & 0 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,7 @@ def collect_sources(
page_validator=self.session.is_secure_origin,
expand_dir=False,
cache_link_parsing=False,
project_name=project_name,
)
for loc in self.search_scope.get_index_urls_locations(project_name)
).values()
Expand All @@ -483,6 +484,7 @@ def collect_sources(
page_validator=self.session.is_secure_origin,
expand_dir=True,
cache_link_parsing=True,
project_name=project_name,
)
for loc in self.find_links
).values()
Expand Down
84 changes: 73 additions & 11 deletions src/pip/_internal/index/sources.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import logging
import mimetypes
import os
import pathlib
from typing import Callable, Iterable, Optional, Tuple
from collections import defaultdict
from typing import Callable, Dict, Iterable, List, Optional, Tuple

from pip._vendor.packaging.utils import (
InvalidSdistFilename,
InvalidVersion,
InvalidWheelFilename,
canonicalize_name,
parse_sdist_filename,
parse_wheel_filename,
)

from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.link import Link
Expand Down Expand Up @@ -36,6 +45,53 @@ def _is_html_file(file_url: str) -> bool:
return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"


class _FlatDirectoryToUrls:
"""Scans directory and caches results"""

def __init__(self, path: str) -> None:
self._path = path
self._page_candidates: List[str] = []
self._project_name_to_urls: Dict[str, List[str]] = defaultdict(list)
self._scanned_directory = False

def _scan_directory(self) -> None:
"""Scans directory once and populates both page_candidates
and project_name_to_urls at the same time
"""
for entry in os.scandir(self._path):
url = path_to_url(entry.path)
if _is_html_file(url):
self._page_candidates.append(url)
continue

# File must have a valid wheel or sdist name,
# otherwise not worth considering as a package
try:
project_filename = parse_wheel_filename(entry.name)[0]
except (InvalidWheelFilename, InvalidVersion):
try:
project_filename = parse_sdist_filename(entry.name)[0]
except (InvalidSdistFilename, InvalidVersion):
continue

self._project_name_to_urls[project_filename].append(url)
self._scanned_directory = True

@property
def page_candidates(self) -> List[str]:
if not self._scanned_directory:
self._scan_directory()

return self._page_candidates

@property
def project_name_to_urls(self) -> Dict[str, List[str]]:
if not self._scanned_directory:
self._scan_directory()

return self._project_name_to_urls


class _FlatDirectorySource(LinkSource):
"""Link source specified by ``--find-links=<path-to-dir>``.
Expand All @@ -45,30 +101,34 @@ class _FlatDirectorySource(LinkSource):
* ``file_candidates``: Archives in the directory.
"""

_paths_to_urls: Dict[str, _FlatDirectoryToUrls] = {}

def __init__(
self,
candidates_from_page: CandidatesFromPage,
path: str,
project_name: str,
) -> None:
self._candidates_from_page = candidates_from_page
self._path = pathlib.Path(os.path.realpath(path))
self._project_name = canonicalize_name(project_name)

# Get existing instance of _FlatDirectoryToUrls if it exists
if path in self._paths_to_urls:
self._path_to_urls = self._paths_to_urls[path]
else:
self._path_to_urls = _FlatDirectoryToUrls(path=path)
self._paths_to_urls[path] = self._path_to_urls

@property
def link(self) -> Optional[Link]:
return None

def page_candidates(self) -> FoundCandidates:
for path in self._path.iterdir():
url = path_to_url(str(path))
if not _is_html_file(url):
continue
for url in self._path_to_urls.page_candidates:
yield from self._candidates_from_page(Link(url))

def file_links(self) -> FoundLinks:
for path in self._path.iterdir():
url = path_to_url(str(path))
if _is_html_file(url):
continue
for url in self._path_to_urls.project_name_to_urls[self._project_name]:
yield Link(url)


Expand Down Expand Up @@ -170,6 +230,7 @@ def build_source(
page_validator: PageValidator,
expand_dir: bool,
cache_link_parsing: bool,
project_name: str,
) -> Tuple[Optional[str], Optional[LinkSource]]:
path: Optional[str] = None
url: Optional[str] = None
Expand Down Expand Up @@ -203,6 +264,7 @@ def build_source(
source = _FlatDirectorySource(
candidates_from_page=candidates_from_page,
path=path,
project_name=project_name,
)
else:
source = _IndexDirectorySource(
Expand Down

0 comments on commit 3fb2637

Please sign in to comment.