pypa · yichi-yang · Jul 17, 2022 · Jul 17, 2022
diff --git a/news/11180.feature.rst b/news/11180.feature.rst
@@ -0,0 +1 @@
+Add support for resuming incomplete downloads. The behavior can be controlled using the ``--incomplete-downloads`` and ``--incomplete-download-retries`` flags.
diff --git a/src/pip/_internal/cli/cmdoptions.py b/src/pip/_internal/cli/cmdoptions.py
@@ -1017,6 +1017,25 @@ def check_list_path_option(options: Values) -> None:
     help=("Enable deprecated functionality, that will be removed in the future."),
 )
 
+incomplete_downloads: Callable[..., Option] = partial(
+    Option,
+    "--incomplete-downloads",
+    metavar="policy",
+    default="discard",
+    choices=["resume", "discard"],
+    dest="resume_incomplete",  # stores the policy string, not a bool
+    help="How to handle an incomplete download: resume, discard (default to %default).",
+)
+
+incomplete_download_retries: Callable[..., Option] = partial(
+    Option,
+    "--incomplete-download-retries",
+    type="int",
+    default=5,
+    dest="resume_attempts",
+    help="Maximum number of resumption retries for incomplete "
+    "download (default %default times).",
+)
 
 ##########
 # groups #
@@ -1048,6 +1067,8 @@ def check_list_path_option(options: Values) -> None:
         no_python_version_warning,
         use_new_feature,
         use_deprecated_feature,
+        incomplete_downloads,
+        incomplete_download_retries,
     ],
 }
 

diff --git a/src/pip/_internal/cli/progress_bars.py b/src/pip/_internal/cli/progress_bars.py
@@ -24,6 +24,7 @@ def _rich_progress_bar(
     *,
     bar_type: str,
     size: int,
+    initial_progress: Optional[int] = None,
 ) -> Generator[bytes, None, None]:
     assert bar_type == "on", "This should only be used in the default mode."
 
@@ -49,20 +50,27 @@ def _rich_progress_bar(
 
     progress = Progress(*columns, refresh_per_second=30)
     task_id = progress.add_task(" " * (get_indentation() + 2), total=total)
+    if initial_progress is not None:
+        progress.update(task_id, advance=initial_progress)
     with progress:
         for chunk in iterable:
             yield chunk
             progress.update(task_id, advance=len(chunk))
 
 
 def get_download_progress_renderer(
-    *, bar_type: str, size: Optional[int] = None
+    *, bar_type: str, size: Optional[int] = None, initial_progress: Optional[int] = None
 ) -> DownloadProgressRenderer:
     """Get an object that can be used to render the download progress.
 
     Returns a callable, that takes an iterable to "wrap".
     """
     if bar_type == "on":
-        return functools.partial(_rich_progress_bar, bar_type=bar_type, size=size)
+        return functools.partial(
+            _rich_progress_bar,
+            bar_type=bar_type,
+            size=size,
+            initial_progress=initial_progress,
+        )
     else:
         return iter  # no-op, when passed an iterator
diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py
@@ -305,6 +305,8 @@ def make_requirement_preparer(
                     "fast-deps has no effect when used with the legacy resolver."
                 )
 
+        resume_incomplete = options.resume_incomplete == "resume"
+
         return RequirementPreparer(
             build_dir=temp_build_dir_path,
             src_dir=options.src_dir,
@@ -319,6 +321,8 @@ def make_requirement_preparer(
             use_user_site=use_user_site,
             lazy_wheel=lazy_wheel,
             verbosity=verbosity,
+            resume_incomplete=resume_incomplete,
+            resume_attempts=options.resume_attempts,
         )
 
     @classmethod

diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py
@@ -656,3 +656,37 @@ def __str__(self) -> str:
             assert self.error is not None
             message_part = f".\n{self.error}\n"
         return f"Configuration file {self.reason}{message_part}"
+
+
+class IncompleteDownloadError(DiagnosticPipError):
+    """Signals a download that delivered fewer bytes than its Content-Length
+    header promised; message and hint depend on whether resuming was enabled."""
+
+    reference = "incomplete-download-error"
+
+    def __init__(
+        self, link: str, resume_incomplete: bool, resume_attempts: int
+    ) -> None:
+        if not resume_incomplete:
+            message = (
+                "Download failed because not enough bytes are received."
+                " The incomplete file has been cleaned up."
+            )
+            hint = (
+                "Use --incomplete-downloads=resume to make pip retry failed download."
+            )
+        else:
+            message = (
+                f"Download failed after {resume_attempts} attempts because not"
+                " enough bytes are received. The incomplete file has been cleaned up."
+            )
+            hint = "Use --incomplete-download-retries to configure resume retry limit."
+
+        details = f"File: {link}\nResume failed download: {resume_incomplete}\n"
+        details += f"Resume retry limit: {resume_attempts}"
+        super().__init__(
+            message=message,
+            context=details,
+            hint_stmt=hint,
+            note_stmt="This is an issue with network connectivity, not pip.",
+        )
diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py
@@ -4,12 +4,13 @@
 import logging
 import mimetypes
 import os
+from http import HTTPStatus
 from typing import Iterable, Optional, Tuple
 
 from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response
 
 from pip._internal.cli.progress_bars import get_download_progress_renderer
-from pip._internal.exceptions import NetworkConnectionError
+from pip._internal.exceptions import IncompleteDownloadError, NetworkConnectionError
 from pip._internal.models.index import PyPI
 from pip._internal.models.link import Link
 from pip._internal.network.cache import is_from_cache
@@ -27,13 +28,21 @@ def _get_http_response_size(resp: Response) -> Optional[int]:
         return None
 
 
+def _get_http_response_etag_or_date(resp: Response) -> Optional[str]:
+    """
+    Return the ETag header, else Last-Modified (a date), else None, for use as an
+    If-Range validator. Date is not used: servers compare against Last-Modified.
+    """
+    return resp.headers.get("etag", resp.headers.get("last-modified"))
+
+
 def _prepare_download(
     resp: Response,
     link: Link,
     progress_bar: str,
+    total_length: Optional[int],
+    range_start: Optional[int] = None,
 ) -> Iterable[bytes]:
-    total_length = _get_http_response_size(resp)
-
     if link.netloc == PyPI.file_storage_domain:
         url = link.show_url
     else:
@@ -42,10 +51,17 @@ def _prepare_download(
     logged_url = redact_auth_from_url(url)
 
     if total_length:
-        logged_url = "{} ({})".format(logged_url, format_size(total_length))
+        if range_start is not None:
+            logged_url = "{} ({}/{})".format(
+                logged_url, format_size(range_start), format_size(total_length)
+            )
+        else:
+            logged_url = "{} ({})".format(logged_url, format_size(total_length))
 
     if is_from_cache(resp):
         logger.info("Using cached %s", logged_url)
+    elif range_start is not None:
+        logger.info("Resume download %s", logged_url)
     else:
         logger.info("Downloading %s", logged_url)
 
@@ -65,7 +81,9 @@ def _prepare_download(
     if not show_progress:
         return chunks
 
-    renderer = get_download_progress_renderer(bar_type=progress_bar, size=total_length)
+    renderer = get_download_progress_renderer(
+        bar_type=progress_bar, size=total_length, initial_progress=range_start
+    )
     return renderer(chunks)
 
 
@@ -112,10 +130,27 @@ def _get_http_response_filename(resp: Response, link: Link) -> str:
     return filename
 
 
-def _http_get_download(session: PipSession, link: Link) -> Response:
+def _http_get_download(
+    session: PipSession,
+    link: Link,
+    range_start: Optional[int] = None,
+    if_range: Optional[str] = None,
+) -> Response:
     target_url = link.url.split("#", 1)[0]
-    resp = session.get(target_url, headers=HEADERS, stream=True)
-    raise_for_status(resp)
+    headers = {**HEADERS}
+    # request a partial download
+    if range_start is not None:
+        headers["Range"] = "bytes={}-".format(range_start)
+    # make sure the file hasn't changed
+    if if_range is not None:
+        headers["If-Range"] = if_range
+    try:
+        resp = session.get(target_url, headers=headers, stream=True)
+        raise_for_status(resp)
+    except NetworkConnectionError as e:
+        assert e.response is not None
+        logger.critical("HTTP error %s while getting %s", e.response.status_code, link)
+        raise
     return resp
 
 
@@ -124,28 +159,80 @@ def __init__(
         self,
         session: PipSession,
         progress_bar: str,
+        resume_incomplete: bool,
+        resume_attempts: int,
     ) -> None:
         self._session = session
         self._progress_bar = progress_bar
+        self._resume_incomplete = resume_incomplete
+        assert (
+            resume_attempts > 0
+        ), "Number of max incomplete download retries must be positive"
+        self._resume_attempts = resume_attempts
 
     def __call__(self, link: Link, location: str) -> Tuple[str, str]:
         """Download the file given by link into location."""
-        try:
-            resp = _http_get_download(self._session, link)
-        except NetworkConnectionError as e:
-            assert e.response is not None
-            logger.critical(
-                "HTTP error %s while getting %s", e.response.status_code, link
-            )
-            raise
+        resp = _http_get_download(self._session, link)
+        total_length = _get_http_response_size(resp)
+        etag_or_date = _get_http_response_etag_or_date(resp)
 
         filename = _get_http_response_filename(resp, link)
         filepath = os.path.join(location, filename)
 
-        chunks = _prepare_download(resp, link, self._progress_bar)
+        chunks = _prepare_download(resp, link, self._progress_bar, total_length)
+        bytes_received = 0
+
         with open(filepath, "wb") as content_file:
+
+            # Process the initial response
             for chunk in chunks:
+                bytes_received += len(chunk)
                 content_file.write(chunk)
+
+            if self._resume_incomplete:
+                attempts_left = self._resume_attempts
+
+                while total_length is not None and bytes_received < total_length:
+                    if attempts_left <= 0:
+                        break
+                    attempts_left -= 1
+
+                    # Attempt to resume download
+                    resume_resp = _http_get_download(
+                        self._session,
+                        link,
+                        range_start=bytes_received,
+                        if_range=etag_or_date,
+                    )
+
+                    restart = resume_resp.status_code != HTTPStatus.PARTIAL_CONTENT
+                    # If the server responded with 200 (e.g. when the file has been
+                    # modified on the server or the server doesn't support range
+                    # requests), reset the download to start from the beginning.
+                    if restart:
+                        content_file.seek(0)
+                        content_file.truncate()
+                        bytes_received = 0
+                        total_length = _get_http_response_size(resume_resp)
+                        etag_or_date = _get_http_response_etag_or_date(resume_resp)
+
+                    chunks = _prepare_download(
+                        resume_resp,
+                        link,
+                        self._progress_bar,
+                        total_length,
+                        range_start=bytes_received,
+                    )
+                    for chunk in chunks:
+                        bytes_received += len(chunk)
+                        content_file.write(chunk)
+
+        if total_length is not None and bytes_received < total_length:
+            os.remove(filepath)
+            raise IncompleteDownloadError(
+                str(link), self._resume_incomplete, self._resume_attempts
+            )
+
         content_type = resp.headers.get("Content-Type", "")
         return filepath, content_type
 
@@ -155,32 +242,17 @@ def __init__(
         self,
         session: PipSession,
         progress_bar: str,
+        resume_incomplete: bool,
+        resume_attempts: int,
     ) -> None:
-        self._session = session
-        self._progress_bar = progress_bar
+        self._downloader = Downloader(
+            session, progress_bar, resume_incomplete, resume_attempts
+        )
 
     def __call__(
         self, links: Iterable[Link], location: str
     ) -> Iterable[Tuple[Link, Tuple[str, str]]]:
         """Download the files given by links into location."""
         for link in links:
-            try:
-                resp = _http_get_download(self._session, link)
-            except NetworkConnectionError as e:
-                assert e.response is not None
-                logger.critical(
-                    "HTTP error %s while getting %s",
-                    e.response.status_code,
-                    link,
-                )
-                raise
-
-            filename = _get_http_response_filename(resp, link)
-            filepath = os.path.join(location, filename)
-
-            chunks = _prepare_download(resp, link, self._progress_bar)
-            with open(filepath, "wb") as content_file:
-                for chunk in chunks:
-                    content_file.write(chunk)
-            content_type = resp.headers.get("Content-Type", "")
+            filepath, content_type = self._downloader(link, location)
             yield link, (filepath, content_type)
diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py
@@ -221,15 +221,21 @@ def __init__(
         use_user_site: bool,
         lazy_wheel: bool,
         verbosity: int,
+        resume_incomplete: bool,
+        resume_attempts: int,
     ) -> None:
         super().__init__()
 
         self.src_dir = src_dir
         self.build_dir = build_dir
         self.build_tracker = build_tracker
         self._session = session
-        self._download = Downloader(session, progress_bar)
-        self._batch_download = BatchDownloader(session, progress_bar)
+        self._download = Downloader(
+            session, progress_bar, resume_incomplete, resume_attempts
+        )
+        self._batch_download = BatchDownloader(
+            session, progress_bar, resume_incomplete, resume_attempts
+        )
         self.finder = finder
 
         # Where still-packed archives should be written to. If None, they are