avoid caching parse_links() when the url is an index url
cosmicexplorer committed Feb 29, 2020
1 parent d2bc147 commit beac52a
Showing 3 changed files with 40 additions and 22 deletions.
src/pip/_internal/index/collector.py: 26 changes (17 additions, 9 deletions)
@@ -18,7 +18,7 @@
 
 from pip._internal.models.link import Link
 from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
-from pip._internal.utils.misc import redact_auth_from_url
+from pip._internal.utils.misc import pairwise, redact_auth_from_url
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
 from pip._internal.utils.urls import path_to_url, url_to_path
 from pip._internal.vcs import is_url, vcs
@@ -341,6 +341,9 @@ def wrapper(cacheable_page):
 
     def wrapper_wrapper(page):
         # type: (HTMLPage) -> List[Link]
+        if page.is_index_url:
+            # Avoid caching when requesting pypi indices.
+            return list(fn(page))
         return wrapper(CacheablePageContent(page))
 
     return wrapper_wrapper
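
This hunk shows only the inner wrapper_wrapper; for orientation, here is a minimal sketch of the whole decorator pattern it sits in. The names come from the diff, but the body of CacheablePageContent and the cache wiring are reconstructions for illustration, not pip's exact code.

import functools


class CacheablePageContent(object):
    # Hashable stand-in keyed by page url, so identical urls share one
    # cache entry (pip's real class of this name lives in collector.py).
    def __init__(self, page):
        self.page = page

    def __eq__(self, other):
        return isinstance(other, type(self)) and self.page.url == other.page.url

    def __hash__(self):
        return hash(self.page.url)


def with_cached_html_pages(fn):
    # Memoize fn(page) per url, except for index pages, which always
    # re-parse (the change in the hunk above).
    @functools.lru_cache(maxsize=None)
    def wrapper(cacheable_page):
        return list(fn(cacheable_page.page))

    @functools.wraps(fn)
    def wrapper_wrapper(page):
        if page.is_index_url:
            return list(fn(page))
        return wrapper(CacheablePageContent(page))

    return wrapper_wrapper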
@@ -376,9 +379,10 @@ class HTMLPage(object):
 
     def __init__(
         self,
-        content,  # type: bytes
-        encoding,  # type: Optional[str]
-        url,  # type: str
+        content,             # type: bytes
+        encoding,            # type: Optional[str]
+        url,                 # type: str
+        is_index_url=False,  # type: bool
     ):
         # type: (...) -> None
         """
@@ -388,6 +392,7 @@ def __init__(
         self.content = content
         self.encoding = encoding
         self.url = url
+        self.is_index_url = is_index_url
 
     def __str__(self):
         # type: () -> str
@@ -405,10 +410,13 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_html_page(response):
-    # type: (Response) -> HTMLPage
+def _make_html_page(response, is_index_url=False):
+    # type: (Response, bool) -> HTMLPage
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(response.content, encoding=encoding, url=response.url)
+    return HTMLPage(
+        response.content,
+        encoding=encoding,
+        url=response.url, is_index_url=is_index_url)
 
 
 def _get_html_page(link, session=None):
@@ -461,7 +469,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out")
     else:
-        return _make_html_page(resp)
+        return _make_html_page(resp, is_index_url=link.is_index_url)
     return None
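
Together with the HTMLPage change above, these two helpers are how the flag crosses the fetch boundary: the Link being fetched knows it is an index url, and _get_html_page copies that onto the page built from the response. A condensed, runnable trace of the hand-off, with the response and encoding sniffing stubbed out (only the is_index_url plumbing mirrors the diff):

from typing import NamedTuple


class FakeResponse(NamedTuple):
    # Stub for requests.Response with just the fields read below.
    content: bytes
    url: str


class HTMLPage(object):
    def __init__(self, content, encoding, url, is_index_url=False):
        self.content = content
        self.encoding = encoding
        self.url = url
        self.is_index_url = is_index_url


def _make_html_page(response, is_index_url=False):
    encoding = None  # the real helper sniffs this from response.headers
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url, is_index_url=is_index_url)


resp = FakeResponse(b'<html></html>', 'https://pypi.org/simple/pip/')
page = _make_html_page(resp, is_index_url=True)  # as _get_html_page now does
assert page.is_index_url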


@@ -624,7 +632,7 @@ def collect_links(self, project_name):
         # We want to filter out anything that does not have a secure origin.
         url_locations = [
             link for link in itertools.chain(
-                (Link(url) for url in index_url_loc),
+                (Link(url, is_index_url=True) for url in index_url_loc),
                 (Link(url) for url in fl_url_loc),
             )
             if self.session.is_secure_origin(link)
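
Note the asymmetry in collect_links: only urls derived from the configured package indexes are tagged, while --find-links urls keep the default. A tiny runnable illustration of that split, with Link stubbed as a NamedTuple and placeholder urls:

import itertools
from typing import NamedTuple


class Link(NamedTuple):
    # Stub carrying just the fields this example needs.
    url: str
    is_index_url: bool = False


index_url_loc = ['https://pypi.org/simple/pip/']
fl_url_loc = ['https://example.com/wheels/']

url_locations = list(itertools.chain(
    (Link(url, is_index_url=True) for url in index_url_loc),
    (Link(url) for url in fl_url_loc),
))
assert url_locations[0].is_index_url
assert not url_locations[1].is_index_url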
src/pip/_internal/models/link.py: 3 changes (3 additions, 0 deletions)
@@ -30,6 +30,7 @@ def __init__(
         comes_from=None,       # type: Optional[Union[str, HTMLPage]]
         requires_python=None,  # type: Optional[str]
         yanked_reason=None,    # type: Optional[Text]
+        is_index_url=False,    # type: bool
     ):
         # type: (...) -> None
         """
@@ -63,6 +64,8 @@ def __init__(
 
         super(Link, self).__init__(key=url, defining_class=Link)
 
+        self.is_index_url = is_index_url
+
     def __str__(self):
         # type: () -> str
         if self.requires_python:
tests/unit/test_collector.py: 33 changes (20 additions, 13 deletions)
@@ -1,5 +1,6 @@
 import logging
 import os.path
+import re
 import uuid
 from textwrap import dedent
 
@@ -387,21 +388,27 @@ def test_parse_links_caches_same_page_by_url():
         encoding=None,
         url='https://example.com/simple/',
     )
+    # Make a third page which represents an index url, which should not be
+    # cached, even for the same url. We modify the page content slightly to
+    # ensure that the result is not cached.
+    page_3 = HTMLPage(
+        re.sub(b'pkg1', b'pkg2', html_bytes),
+        encoding=None,
+        url='https://example.com/simple/',
+        is_index_url=True,
+    )
 
-    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
-    with mock_parse as mock_parse:
-        mock_parse.return_value = html5lib.parse(
-            page_1.content,
-            transport_encoding=page_1.encoding,
-            namespaceHTMLElements=False,
-        )
-        parsed_links_1 = list(parse_links(page_1))
-        mock_parse.assert_called()
+    parsed_links_1 = list(parse_links(page_1))
+    assert len(parsed_links_1) == 1
+    assert 'pkg1' in parsed_links_1[0].url
 
-    with mock_parse as mock_parse:
-        parsed_links_2 = list(parse_links(page_2))
-        assert parsed_links_2 == parsed_links_1
-        mock_parse.assert_not_called()
+    parsed_links_2 = list(parse_links(page_2))
+    assert parsed_links_2 == parsed_links_1
+
+    parsed_links_3 = list(parse_links(page_3))
+    assert len(parsed_links_3) == 1
+    assert parsed_links_3 != parsed_links_1
+    assert 'pkg2' in parsed_links_3[0].url
 
 
 def test_request_http_error(caplog):
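
As reconstructed above, the test now verifies caching through page content rather than by mocking html5lib: page_2 carries empty content at a previously seen url, so only a cache hit can explain getting pkg1's link back, while page_3's slightly altered content shows that the index-url path really re-parses. The same idea as a standalone sketch with a plain dict cache (everything here is local to the example, not pip's API):

import re

HTML = b'<a href="/pkg1-1.0.tar.gz"></a>'
_cache = {}  # type: dict


def _extract(content):
    # Stand-in for the html5lib-based link extraction.
    return re.findall(rb'href="([^"]+)"', content)


def parse_links(content, url, is_index_url=False):
    if is_index_url:
        return _extract(content)      # index pages: always re-parse
    if url not in _cache:
        _cache[url] = _extract(content)
    return _cache[url]                # otherwise keyed by url


links_1 = parse_links(HTML, 'https://example.com/simple/')
# Same url, empty content: the cache answers, so pkg1 still comes back.
assert parse_links(b'', 'https://example.com/simple/') == links_1
# Index url with modified content: cache bypassed, pkg2 shows through.
links_3 = parse_links(re.sub(b'pkg1', b'pkg2', HTML),
                      'https://example.com/simple/', is_index_url=True)
assert links_3 != links_1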