[fast-deps] Add a hook to download all "skipped" wheels #8638

Closed · wants to merge 6 commits

Changes from all commits
84 changes: 57 additions & 27 deletions src/pip/_internal/network/download.py
@@ -8,7 +8,7 @@
 from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
 
 from pip._internal.cli.progress_bars import DownloadProgressProvider
-from pip._internal.exceptions import NetworkConnectionError
+from pip._internal.exceptions import HashMismatch, NetworkConnectionError
 from pip._internal.models.index import PyPI
 from pip._internal.network.cache import is_from_cache
 from pip._internal.network.utils import (
@@ -21,15 +21,19 @@
     redact_auth_from_url,
     splitext,
 )
+from pip._internal.utils.temp_dir import TempDirectory
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
 
 if MYPY_CHECK_RUNNING:
-    from typing import Iterable, Optional
+    from typing import Iterable, Optional, Tuple
 
     from pip._vendor.requests.models import Response
 
     from pip._internal.models.link import Link
     from pip._internal.network.session import PipSession
+    from pip._internal.utils.hashes import Hashes
+
+    File = Tuple[str, Optional[str]]
 
 logger = logging.getLogger(__name__)

@@ -133,25 +137,31 @@ def _get_http_response_filename(resp, link):
     return filename
 
 
-def _http_get_download(session, link):
-    # type: (PipSession, Link) -> Response
-    target_url = link.url.split('#', 1)[0]
-    resp = session.get(target_url, headers=HEADERS, stream=True)
-    raise_for_status(resp)
-    return resp
-
-
-class Download(object):
-    def __init__(
-        self,
-        response,  # type: Response
-        filename,  # type: str
-        chunks,  # type: Iterable[bytes]
-    ):
-        # type: (...) -> None
-        self.response = response
-        self.filename = filename
-        self.chunks = chunks
+def check_download_dir(link, location, hashes):
+    # type: (Link, str, Optional[Hashes]) -> Optional[str]
+    """Check location for previously downloaded file with correct hash.
+
+    If a correct file is found return its path else None.
+    """
+    download_path = os.path.join(location, link.filename)
+
+    if not os.path.exists(download_path):
+        return None
+
+    # If already downloaded, does its hash match?
+    logger.info('File was already downloaded %s', download_path)
+    if hashes:
+        try:
+            hashes.check_against_path(download_path)
+        except HashMismatch:
+            logger.warning(
+                'Previously-downloaded file %s has bad hash. '
+                'Re-downloading.',
+                download_path
+            )
+            os.unlink(download_path)
+            return None
+    return download_path
 
 
 class Downloader(object):
@@ -163,20 +173,40 @@ def __init__(
         # type: (...) -> None
         self._session = session
         self._progress_bar = progress_bar
+        self._tmpdir = TempDirectory(kind='unpack', globally_managed=True)
 
-    def __call__(self, link):
-        # type: (Link) -> Download
+    def _download(self, link, location):
+        # type: (Link, str) -> File
+        url, sep, checksum = link.url.partition('#')
+        response = self._session.get(url, headers=HEADERS, stream=True)
         try:
-            resp = _http_get_download(self._session, link)
+            raise_for_status(response)
         except NetworkConnectionError as e:
             assert e.response is not None
             logger.critical(
                 "HTTP error %s while getting %s", e.response.status_code, link
             )
             raise
 
-        return Download(
-            resp,
-            _get_http_response_filename(resp, link),
-            _prepare_download(resp, link, self._progress_bar),
-        )
+        chunks = _prepare_download(response, link, self._progress_bar)
+        filename = _get_http_response_filename(response, link)
+        file_path = os.path.join(location, filename)
+
+        with open(file_path, 'wb') as content_file:
+            for chunk in chunks:
+                content_file.write(chunk)
+        return file_path, response.headers.get('content-type', '')
+
+    def __call__(self, link, location=None, hashes=None):
Member commented:

Previously this class and its function call did essentially one thing: download a file with progress. That was a good thing, because it could be understood as a single small chunk, passed around, and used without much effort. When we add more responsibilities it gets harder to understand and maintain.

Instead of pulling functionality from RequirementPreparer into this, and then invoking this from the new resolver, what if we:

  1. move the functionality from RequirementPreparer somewhere else and invoke it from RequirementPreparer and the new resolver, or
  2. (preferred) move downloading out of RequirementPreparer altogether, and do it either:
    1. internally in AbstractDistribution/Candidate when required to satisfy some part of processing (transparently to the caller), or
    2. in a common place after resolving has happened, if still needed (see the sketch after this list)
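As a rough sketch of option 2.ii, a common post-resolve pass could fully download anything that was only lazily fetched during resolution. This is hypothetical code, not part of this PR: it assumes requirements prepared via fast-deps carry a `needs_more_preparation` marker (quoted later in this thread), and the function name is made up.

```python
# Hypothetical post-resolve hook (option 2.ii); names are illustrative.
def download_skipped_wheels(requirement_set, downloader):
    for req in requirement_set.requirements.values():
        if req.needs_more_preparation:
            # Resolution only fetched this wheel's metadata via
            # fast-deps; fetch the whole file now.
            downloader(req.link)
```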

Member commented:

> (preferred) move downloading out of RequirementPreparer altogether, and do it either:

I haven't looked at this PR yet, but IIUC, this was the conclusion from what @McSinyx and I discussed. I do think I agree that Downloader might not be the best place to put this though. :)

@McSinyx (Contributor, Author) commented on Aug 2, 2020:

On current master, network.download.Downloader only initiates the download and passes the lazy response to the preparation code, which then performs the actual download (to a file). What 744896f aims to do is make sure that Downloader.__call__ performs the download itself, in addition to hash checking, so that the interface stays consistent when caching is involved. The way I see it, Downloader should be able to act analogously to wget, which is given a URL/Link and an optional output target (-O). I assume "more responsibilities" refers to the hash checking.

The reason I placed the call to Downloader directly in resolve is that it is easier to ensure that such a call is thread-safe than a higher-level wrapper like a method in RequirementPreparer. It would also be simpler to handle the progress bar for a batch of downloads directly within the Downloader than to extract it from the preparer.

If I understand correctly, RequirementPreparer seems to be the preparer for candidates of the new resolver, and downloading something happens to be part of the work for some candidates, i.e. RequirementPreparer sometimes needs to download to be able to construct an (Abstract)Distribution to give to the candidate, so I'm not sure I understand the model that (2.i) is referring to.

Regarding (2.ii), I completely agree that it would be really nice if we could have this call outside of the resolver code. However, in a discussion with @pradyunsg, we came to the conclusion that InstallRequirement is currently too tangled with the preparation code (i.e. it logically requires the underlying file to exist), so we will move the call out once we have reworked RequirementSet. @chrahunt, I'm not sure if I've correctly understood and addressed your points since my head is buzzing as I try to visualize this, but I hope I've been able to deliver some reasoning. Does it change your mind about 744896f, or do you have any further suggestions?
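To illustrate the wget analogy, usage with 744896f would look roughly like this. A sketch only: the session/link setup is abbreviated, the URL is hypothetical, but the `(link, location=None, hashes=None)` signature is the one in this diff.

```python
from pip._internal.models.link import Link
from pip._internal.network.download import Downloader
from pip._internal.network.session import PipSession

downloader = Downloader(PipSession(), progress_bar='on')
link = Link('https://example.com/packages/spam-1.0-py3-none-any.whl')

# Like plain `wget URL`: download into the Downloader's own temp dir.
path, content_type = downloader(link)

# Like `wget -O`: download to an explicit location; a previously
# downloaded file there is reused if its hash checks out.
path, content_type = downloader(link, location='/tmp/wheels', hashes=None)
```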

Member commented:

> @pradyunsg, we came to the conclusion that InstallRequirement is currently too tangled with the preparation code (i.e. it logically requires the underlying file to exist), so we will move the call out once we have reworked RequirementSet.

The issue is that the code that runs after Resolver.resolve returns is closely tied to the assumption that the underlying file exists. We don't need a reworking of RequirementSet; rather, we need to cleanly handle the "I need to be fully downloaded" state without needing to touch the old resolver code.

Member commented:

@McSinyx I think some of my concern was an extension of my original comments in #8532, which I did not follow up on in #8588 (sorry). I played with it a bit and came up with #8685, which I think we can use as a pretty clean base for your download work. Please take a look!

Regarding the downloader, I think we can probably achieve the desired effect by creating a new function or class that uses it as-is instead of putting more logic into it. Since our fast-deps feature isn't enabled when doing hash checking, the wrapper won't need any of that stuff.
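Such a wrapper might look roughly like the following sketch. It is not code from this PR: it assumes the master-branch Downloader, which returns a lazy Download object with .filename and .chunks, and the name BatchDownloader is made up here.

```python
import os

class BatchDownloader(object):
    """Sketch: batch-download wheels using the existing Downloader as-is."""

    def __init__(self, downloader):
        self._downloader = downloader

    def __call__(self, links, location):
        # No hash handling needed: fast-deps is disabled when
        # hash-checking mode is in use.
        for link in links:
            download = self._downloader(link)
            file_path = os.path.join(location, download.filename)
            with open(file_path, 'wb') as content_file:
                for chunk in download.chunks:
                    content_file.write(chunk)
            yield link, file_path
```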

Contributor (Author) commented:

> We don't need a reworking of RequirementSet; rather, we need to cleanly handle the "I need to be fully downloaded" state without needing to touch the old resolver code.

Thank you for the correction.

> I played with it a bit and came up with #8685, which I think we can use as a pretty clean base for your download work. Please take a look!

Thanks! I haven't read it carefully yet, but the resulting hook seems really nice!

> Regarding the downloader, I think we can probably achieve the desired effect by creating a new function or class that uses it as-is instead of putting more logic into it. Since our fast-deps feature isn't enabled when doing hash checking, the wrapper won't need any of that stuff.

This sounds good to me too. @pradyunsg said that you two had a discussion on this and I assume this is the consensus that came out of it.

Contributor (Author) commented:

I was thinking about the hashing part, and IMHO we should still check hashes, just to make sure that a wheel is not corrupted during the download process.

@pradyunsg (Member) commented on Aug 4, 2020:

> I was thinking about the hashing part, and IMHO we should still check hashes, just to make sure that a wheel is not corrupted during the download process.

If we're in "require-hashes" mode, doing partial downloads does not make sense, since we're only permitted to trust artifacts/files with that specific hash. And the only way to compute that hash is to download the entire file. Since we're going to download the entire file (and we know that if you have hashes, you also have performed dependency resolution already; so we're definitely using that downloaded file), we should not use partial downloads/fast-deps when hash-checking mode is enabled.
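In other words, the decision reduces to a guard like this. A sketch of the reasoning only; the function and flag names are illustrative, not pip's actual API.

```python
def should_use_lazy_wheel(fast_deps_enabled, require_hashes):
    # type: (bool, bool) -> bool
    if require_hashes:
        # The required hash can only be computed from the full file,
        # and that fully downloaded file will be used afterwards anyway,
        # so a partial (lazy) download buys nothing here.
        return False
    return fast_deps_enabled
```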

Contributor (Author) commented:

I think I didn't communicate this right 😛 Let me provide more context (or at least the context I'm assuming):

  • We want a routine that takes multiple URLs and downloads them in parallel, with the UI taken into account.
  • Before that's done, let's have a map/for loop that calls something to download each of them, and that something should be thread-safe.
  • It'll be difficult to keep all of the preparation thread-safe, even if this part is at the moment (I think):

    ```python
    def prepare_linked_requirement_more(self, req, parallel_builds=False):
        # type: (InstallRequirement, bool) -> None
        """Prepare a linked requirement more, if needed."""
        if not req.needs_more_preparation:
            return
        self._prepare_linked_requirement(req, parallel_builds)
    ```

  • Rather, let the work to be parallelized be solely downloading the file to (usually) a temp dir, and let the preparation code use it as a cache later. Both steps (after the download, and when verifying the cache) need hash checking because the file might be corrupted for some reason.
  • I'm not sure if it's the effect of --no-cache-dir, but some wheels just don't get cached within a session. This doesn't seem to be sufficient:

    ```python
    if is_from_cache(resp):
        logger.info("Using cached %s", logged_url)
    else:
        logger.info("Downloading %s", logged_url)
    ```

    but this is:

    ```python
    def _check_download_dir(link, download_dir, hashes):
        # type: (Link, str, Optional[Hashes]) -> Optional[str]
        """ Check download_dir for previously downloaded file with correct hash
        If a correct file is found return its path else None
        """
        download_path = os.path.join(download_dir, link.filename)
        if not os.path.exists(download_path):
            return None
        # If already downloaded, does its hash match?
    ```

  • If we use a common temp dir for all downloads, this can be done automatically, and this common state suits Downloader really well. Furthermore, the similarity between Downloader.download_one (or something renamed from the current .__call__) and Downloader.download_batch would be really nice (see the sketch below).

I am truly sorry for not clarifying this earlier as a discussion point, and for ending up causing a decent amount of confusion 😅
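For concreteness, the parallel routine from the first two bullets could be sketched like this, assuming the thread-safe `(link, location, hashes)` interface from this PR; the function name, worker count, and executor choice are illustrative, and the batch progress UI is left out.

```python
from concurrent.futures import ThreadPoolExecutor

def download_batch(downloader, links, location):
    """Download every link into the common directory ``location``."""
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Each call checks the download dir first, downloads if needed,
        # and verifies hashes on completion (per __call__ above).
        return list(executor.map(
            lambda link: downloader(link, location), links))
```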

+        # type: (Link, Optional[str], Optional[Hashes]) -> File
+        if location is None:
+            location = self._tmpdir.path
+        file_path = check_download_dir(link, location, hashes)
+        if file_path is not None:
+            content_type = mimetypes.guess_type(file_path)[0]
+            return file_path, content_type
+
+        file_path, content_type = self._download(link, location)
+        if hashes:
+            hashes.check_against_path(file_path)
+        return file_path, content_type
92 changes: 16 additions & 76 deletions src/pip/_internal/operations/prepare.py
@@ -17,13 +17,13 @@
 from pip._internal.distributions.installed import InstalledDistribution
 from pip._internal.exceptions import (
     DirectoryUrlHashUnsupported,
-    HashMismatch,
     HashUnpinned,
     InstallationError,
     NetworkConnectionError,
     PreviousBuildDirError,
     VcsHashUnsupported,
 )
+from pip._internal.network.download import check_download_dir
 from pip._internal.utils.filesystem import copy2_fixed
 from pip._internal.utils.hashes import MissingHashes
 from pip._internal.utils.logging import indent_log
@@ -33,15 +33,12 @@
     path_to_display,
     rmtree,
 )
-from pip._internal.utils.temp_dir import TempDirectory
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
 from pip._internal.utils.unpacking import unpack_file
 from pip._internal.vcs import vcs
 
 if MYPY_CHECK_RUNNING:
-    from typing import (
-        Callable, List, Optional, Tuple,
-    )
+    from typing import Callable, List, Optional
 
     from mypy_extensions import TypedDict

@@ -101,7 +98,7 @@ def unpack_vcs_link(link, location):
 
 class File(object):
     def __init__(self, path, content_type):
-        # type: (str, str) -> None
+        # type: (str, Optional[str]) -> None
         self.path = path
         self.content_type = content_type

@@ -113,11 +110,10 @@ def get_http_url(
     hashes=None,  # type: Optional[Hashes]
 ):
     # type: (...) -> File
-    temp_dir = TempDirectory(kind="unpack", globally_managed=True)
     # If a download dir is specified, is the file already downloaded there?
     already_downloaded_path = None
     if download_dir:
-        already_downloaded_path = _check_download_dir(
+        already_downloaded_path = check_download_dir(
             link, download_dir, hashes
         )

@@ -126,9 +122,7 @@
         content_type = mimetypes.guess_type(from_path)[0]
     else:
         # let's download to a tmp dir
-        from_path, content_type = _download_http_url(
-            link, downloader, temp_dir.path, hashes
-        )
+        from_path, content_type = downloader(link, hashes=hashes)
 
     return File(from_path, content_type)

@@ -197,7 +191,7 @@ def get_file_url(
     # If a download dir is specified, is the file already there and valid?
     already_downloaded_path = None
     if download_dir:
-        already_downloaded_path = _check_download_dir(
+        already_downloaded_path = check_download_dir(
             link, download_dir, hashes
         )

@@ -229,91 +223,37 @@ def unpack_url(
     # type: (...) -> Optional[File]
     """Unpack link into location, downloading if required.
 
-    :param hashes: A Hashes object, one of whose embedded hashes must match,
-        or HashMismatch will be raised. If the Hashes is empty, no matches are
-        required, and unhashable types of requirements (like VCS ones, which
-        would ordinarily raise HashUnsupported) are allowed.
+    One of embedded hashes in the given Hashes object must match,
+    or HashMismatch will be raised. If the Hashes is empty, no matches
+    are required, and unhashable types of requirements (like VCS ones,
+    which would ordinarily raise HashUnsupported) are allowed.
     """
-    # non-editable vcs urls
+    # Non-editable VCS URL
     if link.is_vcs:
         unpack_vcs_link(link, location)
         return None
 
-    # If it's a url to a local directory
+    # URL to a local directory
     if link.is_existing_dir():
         if os.path.isdir(location):
             rmtree(location)
         _copy_source_tree(link.file_path, location)
         return None
 
-    # file urls
     if link.is_file:
         file = get_file_url(link, download_dir, hashes=hashes)
-
-    # http urls
     else:
-        file = get_http_url(
-            link,
-            downloader,
-            download_dir,
-            hashes=hashes,
-        )
+        file = get_http_url(link, downloader, download_dir, hashes=hashes)
 
-    # unpack the archive to the build dir location. even when only downloading
-    # archives, they have to be unpacked to parse dependencies, except wheels
+    # Unpack the archive to the build directory unless it is a wheel.
+    # Even if the command is download, archives still have to be
+    # unpacked to parse dependencies.
     if not link.is_wheel:
         unpack_file(file.path, location, file.content_type)
 
     return file
 
 
-def _download_http_url(
-    link,  # type: Link
-    downloader,  # type: Downloader
-    temp_dir,  # type: str
-    hashes,  # type: Optional[Hashes]
-):
-    # type: (...) -> Tuple[str, str]
-    """Download link url into temp_dir using provided session"""
-    download = downloader(link)
-
-    file_path = os.path.join(temp_dir, download.filename)
-    with open(file_path, 'wb') as content_file:
-        for chunk in download.chunks:
-            content_file.write(chunk)
-
-    if hashes:
-        hashes.check_against_path(file_path)
-
-    return file_path, download.response.headers.get('content-type', '')
-
-
-def _check_download_dir(link, download_dir, hashes):
-    # type: (Link, str, Optional[Hashes]) -> Optional[str]
-    """ Check download_dir for previously downloaded file with correct hash
-    If a correct file is found return its path else None
-    """
-    download_path = os.path.join(download_dir, link.filename)
-
-    if not os.path.exists(download_path):
-        return None
-
-    # If already downloaded, does its hash match?
-    logger.info('File was already downloaded %s', download_path)
-    if hashes:
-        try:
-            hashes.check_against_path(download_path)
-        except HashMismatch:
-            logger.warning(
-                'Previously-downloaded file %s has bad hash. '
-                'Re-downloading.',
-                download_path
-            )
-            os.unlink(download_path)
-            return None
-    return download_path
-
-
 class RequirementPreparer(object):
     """Prepares a Requirement
     """