From a8f9fa2b9266f65cdf2efa33ca8c62903370765f Mon Sep 17 00:00:00 2001
From: William Woodruff
Date: Sat, 4 Dec 2021 22:10:26 -0500
Subject: [PATCH] Don't explode sdists before installing them (#166)

* pip_audit: don't unpack sdists before installing
* pypi_provider: simplify sdist placement
* pip_audit: logging, keep sdists out of candidacy
* pypi_provider: remove old code
* _virtual_env: fix f-strings
* _virtual_env: re-enable cache
* _virtual_env: remove log
* CHANGELOG: record changes
* _virtual_env: Update comment

Co-authored-by: Alex Cameron
---
 CHANGELOG.md                        |  4 +
 .../resolvelib/pypi_provider.py     | 82 +++++++++++--------
 pip_audit/_util.py                  |  2 +-
 pip_audit/_virtual_env.py           | 32 ++++++--
 4 files changed, 76 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6bafbfb9..9035293c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,6 +32,10 @@ All versions prior to 0.0.9 are untracked.
   an version string that couldn't be parsed as a PEP-440 version was fixed
   ([#175](https://github.com/trailofbits/pip-audit/pull/175))
 
+* Dependency sources: a crash caused by incorrect assumptions about
+  the structure of source distributions was fixed
+  ([#166](https://github.com/trailofbits/pip-audit/pull/166))
+
 ### Removed
 
 ## [1.0.1] - 2021-12-02
diff --git a/pip_audit/_dependency_source/resolvelib/pypi_provider.py b/pip_audit/_dependency_source/resolvelib/pypi_provider.py
index 3228ce39..61a673b5 100644
--- a/pip_audit/_dependency_source/resolvelib/pypi_provider.py
+++ b/pip_audit/_dependency_source/resolvelib/pypi_provider.py
@@ -5,14 +5,14 @@
 authors under the ISC license.
 """
 
-import os
+import itertools
 from email.message import EmailMessage
 from email.parser import BytesParser
 from io import BytesIO
 from operator import attrgetter
-from tarfile import TarFile
+from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import BinaryIO, List, Optional, Set, cast
+from typing import BinaryIO, Iterator, List, Optional, Set, cast
 from urllib.parse import urlparse
 from zipfile import ZipFile
 
@@ -42,6 +42,7 @@ class Candidate:
     def __init__(
         self,
         name: str,
+        filename: Path,
        version: Version,
         url: str,
         extras: Set[str],
@@ -54,6 +55,7 @@ def __init__(
         """
 
         self.name = canonicalize_name(name)
+        self.filename = filename
         self.version = version
         self.url = url
         self.extras = extras
@@ -69,8 +71,8 @@ def __repr__(self):  # pragma: no cover
         A string representation for `Candidate`.
         """
         if not self.extras:
-            return f"<{self.name}=={self.version}>"
-        return f"<{self.name}[{','.join(self.extras)}]=={self.version}>"
+            return f"<{self.name}=={self.version} wheel={self.is_wheel}>"
+        return f"<{self.name}[{','.join(self.extras)}]=={self.version} wheel={self.is_wheel}>"
     @property
     def metadata(self) -> EmailMessage:
         """
@@ -139,23 +141,15 @@ def _get_metadata_for_sdist(self):
         """
         Extracts the metadata for this candidate, if it's a source distribution.
         """
+
         response: requests.Response = requests.get(self.url, timeout=self.timeout)
         response.raise_for_status()
-        data = response.content
+        sdist_data = response.content
         metadata = EmailMessage()
 
         with TemporaryDirectory() as pkg_dir:
-            if self.state is not None:
-                self.state.update_state(
-                    f"Extracting source distribution for {self.name} ({self.version})"
-                )  # pragma: no cover
-
-            # Extract archive onto the disk
-            with TarFile.open(fileobj=BytesIO(data), mode="r:gz") as t:
-                # The directory is the first member in a tarball
-                names = t.getnames()
-                pkg_name = names[0]
-                t.extractall(pkg_dir)
+            sdist = Path(pkg_dir) / self.filename.name
+            sdist.write_bytes(sdist_data)
 
             if self.state is not None:
                 self.state.update_state(
@@ -163,11 +157,8 @@ def _get_metadata_for_sdist(self):
                     f"({self.version})"
                 )  # pragma: no cover
 
-            # Put together a full path of where the source distribution is
-            pkg_path = os.path.join(pkg_dir, pkg_name)
-
             with TemporaryDirectory() as ve_dir:
-                ve = VirtualEnv(["-e", pkg_path], self.state)
+                ve = VirtualEnv([str(sdist)], self.state)
                 ve.create(ve_dir)
 
                 if self.state is not None:
@@ -182,7 +173,9 @@ def _get_metadata_for_sdist(self):
         return metadata
 
 
-def get_project_from_pypi(project, extras, timeout: Optional[int], state: Optional[AuditState]):
+def get_project_from_pypi(
+    project, extras, timeout: Optional[int], state: Optional[AuditState]
+) -> Iterator[Candidate]:
     """Return candidates created from the project name and extras."""
     url = "https://pypi.org/simple/{}".format(project)
     response: requests.Response = requests.get(url, timeout=timeout)
@@ -214,15 +207,21 @@ def get_project_from_pypi(project, extras, timeout: Optional[int], state: Option
             # which we'll then skip via the exception handler.
             (name, version) = parse_sdist_filename(filename)
             is_wheel = False
+
+            # TODO: Handle compatibility tags?
+            yield Candidate(
+                name,
+                Path(filename),
+                version,
+                url=url,
+                extras=extras,
+                is_wheel=is_wheel,
+                timeout=timeout,
+                state=state,
+            )
         except Exception:
             continue
 
-        # TODO: Handle compatibility tags?
-
-        yield Candidate(
-            name, version, url=url, extras=extras, is_wheel=is_wheel, timeout=timeout, state=state
-        )
-
 
 class PyPIProvider(AbstractProvider):
     """
@@ -272,14 +271,27 @@ def find_matches(self, identifier, requirements, incompatibilities):
         # Need to pass the extras to the search, so they
         # are added to the candidate at creation - we
        # treat candidates as immutable once created.
-        candidates = (
-            candidate
-            for candidate in get_project_from_pypi(identifier, extras, self.timeout, self.state)
-            if candidate.version not in bad_versions
-            and all(candidate.version in r.specifier for r in requirements)
+        candidates = sorted(
+            [
+                candidate
+                for candidate in get_project_from_pypi(identifier, extras, self.timeout, self.state)
+                if candidate.version not in bad_versions
+                and all(candidate.version in r.specifier for r in requirements)
+            ],
+            key=attrgetter("version", "is_wheel"),
+            reverse=True,
         )
-        # We want to prefer more recent versions and prioritize wheels
-        return sorted(candidates, key=attrgetter("version", "is_wheel"), reverse=True)
+
+        # If we have multiple candidates for a single version and some are wheels,
+        # yield only the wheels. This keeps us from wasting a large amount of
+        # dependency search time when comparing wheels against source distributions.
+        for _, candidates in itertools.groupby(candidates, key=attrgetter("version")):
+            candidate = next(candidates)
+            yield candidate
+            if candidate.is_wheel:
+                yield from (c for c in candidates if c.is_wheel)
+            else:
+                yield from candidates
 
     def is_satisfied_by(self, requirement, candidate):
         """
diff --git a/pip_audit/_util.py b/pip_audit/_util.py
index e06fe9c8..dc8a16be 100644
--- a/pip_audit/_util.py
+++ b/pip_audit/_util.py
@@ -1,5 +1,5 @@
 """
-Utilities functions for `pip-audit`.
+Utility functions for `pip-audit`.
 """
 
 import sys
diff --git a/pip_audit/_virtual_env.py b/pip_audit/_virtual_env.py
index e058b9e9..62fe2edc 100644
--- a/pip_audit/_virtual_env.py
+++ b/pip_audit/_virtual_env.py
@@ -3,6 +3,7 @@
 """
 
 import json
+import logging
 import subprocess
 import venv
 from typing import Iterator, List, Optional, Tuple
@@ -11,6 +12,8 @@
 
 from ._state import AuditState
 
+logger = logging.getLogger(__name__)
+
 
 class VirtualEnv(venv.EnvBuilder):
     """
@@ -61,6 +64,9 @@ def post_setup(self, context):
         We do a few things in our custom post-setup:
         - Upgrade the `pip` version. We'll be using `pip list` with the `--format json`
           option which requires a non-ancient version for `pip`.
+        - Install `wheel`. When our packages install their own dependencies, they might be able
+          to do so through wheels, which are much faster and don't require us to run
+          setup scripts.
         - Execute the custom install command.
         - Call `pip list`, and parse the output into a list of packages to be returned from
           when the `installed_packages` property is queried.
@@ -71,7 +77,16 @@ def post_setup(self, context):
             )  # pragma: no cover
 
         # Firstly, upgrade our `pip` versions since `ensurepip` can leave us with an old version
-        pip_upgrade_cmd = [context.env_exe, "-m", "pip", "install", "--upgrade", "pip"]
+        # and install `wheel` in case our package dependencies are offered as wheels
+        pip_upgrade_cmd = [
+            context.env_exe,
+            "-m",
+            "pip",
+            "install",
+            "--upgrade",
+            "pip",
+            "wheel",
+        ]
         try:
             subprocess.run(
                 pip_upgrade_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
@@ -85,14 +100,15 @@ def post_setup(self, context):
             )  # pragma: no cover
 
         # Install our packages
-        package_install_cmd = [context.env_exe, "-m", "pip", "install", *self._install_args]
+        package_install_cmd = [
+            context.env_exe,
+            "-m",
+            "pip",
+            "install",
+            *self._install_args,
+        ]
         try:
-            subprocess.run(
-                package_install_cmd,
-                check=True,
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-            )
+            subprocess.run(package_install_cmd, check=True)
         except subprocess.CalledProcessError as cpe:
             raise VirtualEnvError(f"Failed to install packages: {package_install_cmd}") from cpe
 
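Note (reviewer sketch, not part of the patch): the new `find_matches` body sorts candidates newest-first with wheels ahead of sdists, groups them by version, and skips a version's source distributions whenever a wheel is available, so metadata can come from the wheel instead of building the sdist in a virtual environment. Below is a minimal standalone illustration of that ordering using only the standard library; the `FakeCandidate` class and the sample data are hypothetical stand-ins for `pypi_provider.Candidate`.

import itertools
from dataclasses import dataclass
from operator import attrgetter


@dataclass
class FakeCandidate:
    # Hypothetical stand-in for pypi_provider.Candidate; the real class carries a
    # packaging.version.Version, a URL, extras, and so on.
    name: str
    version: tuple
    is_wheel: bool


def prefer_wheels(candidates):
    # Newest versions first; within a version, wheels sort ahead of sdists
    # because True > False.
    ordered = sorted(candidates, key=attrgetter("version", "is_wheel"), reverse=True)
    # Group consecutive candidates that share a version. Yield the first one;
    # if it is a wheel, drop the remaining sdists for that version.
    for _, group in itertools.groupby(ordered, key=attrgetter("version")):
        first = next(group)
        yield first
        if first.is_wheel:
            yield from (c for c in group if c.is_wheel)
        else:
            yield from group


candidates = [
    FakeCandidate("example", (1, 0), False),
    FakeCandidate("example", (1, 1), False),
    FakeCandidate("example", (1, 1), True),
]
print([(c.version, c.is_wheel) for c in prefer_wheels(candidates)])
# [((1, 1), True), ((1, 0), False)] -- the 1.1 sdist is never considered

As in the patch, the effect is that a source distribution is only downloaded and installed for metadata when no wheel exists for that version.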