avoid caching parse_links() when the url is an index url
cosmicexplorer committed Feb 29, 2020
1 parent d2bc147 commit beac52a
Showing 3 changed files with 40 additions and 22 deletions.
src/pip/_internal/index/collector.py: 26 changes (17 additions, 9 deletions)
@@ -18,7 +18,7 @@
 
 from pip._internal.models.link import Link
 from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
-from pip._internal.utils.misc import redact_auth_from_url
+from pip._internal.utils.misc import pairwise, redact_auth_from_url
 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
 from pip._internal.utils.urls import path_to_url, url_to_path
 from pip._internal.vcs import is_url, vcs
@@ -341,6 +341,9 @@ def wrapper(cacheable_page):
 
     def wrapper_wrapper(page):
         # type: (HTMLPage) -> List[Link]
+        if page.is_index_url:
+            # Avoid caching when requesting pypi indices.
+            return list(fn(page))
         return wrapper(CacheablePageContent(page))
 
     return wrapper_wrapper
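
This hunk shows only the inner wrapper_wrapper; for orientation, here is a minimal sketch of the whole decorator pattern it sits in. The names come from the diff, but the body of CacheablePageContent and the cache wiring are reconstructions for illustration, not pip's exact code.

import functools


class CacheablePageContent(object):
    # Hashable stand-in keyed by page url, so identical urls share one
    # cache entry (pip's real class of this name lives in collector.py).
    def __init__(self, page):
        self.page = page

    def __eq__(self, other):
        return isinstance(other, type(self)) and self.page.url == other.page.url

    def __hash__(self):
        return hash(self.page.url)


def with_cached_html_pages(fn):
    # Memoize fn(page) per url, except for index pages, which always
    # re-parse (the change in the hunk above).
    @functools.lru_cache(maxsize=None)
    def wrapper(cacheable_page):
        return list(fn(cacheable_page.page))

    @functools.wraps(fn)
    def wrapper_wrapper(page):
        if page.is_index_url:
            return list(fn(page))
        return wrapper(CacheablePageContent(page))

    return wrapper_wrapper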
@@ -376,9 +379,10 @@ class HTMLPage(object):
 
     def __init__(
         self,
-        content,  # type: bytes
-        encoding,  # type: Optional[str]
-        url,  # type: str
+        content,             # type: bytes
+        encoding,            # type: Optional[str]
+        url,                 # type: str
+        is_index_url=False,  # type: bool
     ):
         # type: (...) -> None
         """
@@ -388,6 +392,7 @@ def __init__(
         self.content = content
         self.encoding = encoding
         self.url = url
+        self.is_index_url = is_index_url
 
     def __str__(self):
         # type: () -> str
@@ -405,10 +410,13 @@ def _handle_get_page_fail(
     meth("Could not fetch URL %s: %s - skipping", link, reason)
 
 
-def _make_html_page(response):
-    # type: (Response) -> HTMLPage
+def _make_html_page(response, is_index_url=False):
+    # type: (Response, bool) -> HTMLPage
     encoding = _get_encoding_from_headers(response.headers)
-    return HTMLPage(response.content, encoding=encoding, url=response.url)
+    return HTMLPage(
+        response.content,
+        encoding=encoding,
+        url=response.url, is_index_url=is_index_url)
 
 
 def _get_html_page(link, session=None):
@@ -461,7 +469,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out")
     else:
-        return _make_html_page(resp)
+        return _make_html_page(resp, is_index_url=link.is_index_url)
     return None
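
Together with the HTMLPage change above, these two helpers are how the flag crosses the fetch boundary: the Link being fetched knows it is an index url, and _get_html_page copies that onto the page built from the response. A condensed, runnable trace of the hand-off, with the response and encoding sniffing stubbed out (only the is_index_url plumbing mirrors the diff):

from typing import NamedTuple


class FakeResponse(NamedTuple):
    # Stub for requests.Response with just the fields read below.
    content: bytes
    url: str


class HTMLPage(object):
    def __init__(self, content, encoding, url, is_index_url=False):
        self.content = content
        self.encoding = encoding
        self.url = url
        self.is_index_url = is_index_url


def _make_html_page(response, is_index_url=False):
    encoding = None  # the real helper sniffs this from response.headers
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url, is_index_url=is_index_url)


resp = FakeResponse(b'<html></html>', 'https://pypi.org/simple/pip/')
page = _make_html_page(resp, is_index_url=True)  # as _get_html_page now does
assert page.is_index_url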


@@ -624,7 +632,7 @@ def collect_links(self, project_name):
         # We want to filter out anything that does not have a secure origin.
         url_locations = [
             link for link in itertools.chain(
-                (Link(url) for url in index_url_loc),
+                (Link(url, is_index_url=True) for url in index_url_loc),
                 (Link(url) for url in fl_url_loc),
             )
             if self.session.is_secure_origin(link)
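
Note the asymmetry in collect_links: only urls derived from the configured package indexes are tagged, while --find-links urls keep the default. A tiny runnable illustration of that split, with Link stubbed as a NamedTuple and placeholder urls:

import itertools
from typing import NamedTuple


class Link(NamedTuple):
    # Stub carrying just the fields this example needs.
    url: str
    is_index_url: bool = False


index_url_loc = ['https://pypi.org/simple/pip/']
fl_url_loc = ['https://example.com/wheels/']

url_locations = list(itertools.chain(
    (Link(url, is_index_url=True) for url in index_url_loc),
    (Link(url) for url in fl_url_loc),
))
assert url_locations[0].is_index_url
assert not url_locations[1].is_index_url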
src/pip/_internal/models/link.py: 3 changes (3 additions, 0 deletions)
@@ -30,6 +30,7 @@ def __init__(
         comes_from=None,       # type: Optional[Union[str, HTMLPage]]
         requires_python=None,  # type: Optional[str]
         yanked_reason=None,    # type: Optional[Text]
+        is_index_url=False,    # type: bool
     ):
         # type: (...) -> None
         """
@@ -63,6 +64,8 @@ def __init__(
 
         super(Link, self).__init__(key=url, defining_class=Link)
 
+        self.is_index_url = is_index_url
+
     def __str__(self):
         # type: () -> str
         if self.requires_python:
tests/unit/test_collector.py: 33 changes (20 additions, 13 deletions)
@@ -1,5 +1,6 @@
 import logging
 import os.path
+import re
 import uuid
 from textwrap import dedent
 
@@ -387,21 +388,27 @@ def test_parse_links_caches_same_page_by_url():
         encoding=None,
         url='https://example.com/simple/',
     )
+    # Make a third page which represents an index url, which should not be
+    # cached, even for the same url. We modify the page content slightly to
+    # ensure that the result is not cached.
+    page_3 = HTMLPage(
+        re.sub(b'pkg1', b'pkg2', html_bytes),
+        encoding=None,
+        url='https://example.com/simple/',
+        is_index_url=True,
+    )
 
-    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
-    with mock_parse as mock_parse:
-        mock_parse.return_value = html5lib.parse(
-            page_1.content,
-            transport_encoding=page_1.encoding,
-            namespaceHTMLElements=False,
-        )
-        parsed_links_1 = list(parse_links(page_1))
-        mock_parse.assert_called()
+    parsed_links_1 = list(parse_links(page_1))
+    assert len(parsed_links_1) == 1
+    assert 'pkg1' in parsed_links_1[0].url
 
-    with mock_parse as mock_parse:
-        parsed_links_2 = list(parse_links(page_2))
-        assert parsed_links_2 == parsed_links_1
-        mock_parse.assert_not_called()
+    parsed_links_2 = list(parse_links(page_2))
+    assert parsed_links_2 == parsed_links_1
+
+    parsed_links_3 = list(parse_links(page_3))
+    assert len(parsed_links_3) == 1
+    assert parsed_links_3 != parsed_links_1
+    assert 'pkg2' in parsed_links_3[0].url
 
 
 def test_request_http_error(caplog):
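
As reconstructed above, the test now verifies caching through page content rather than by mocking html5lib: page_2 carries empty content at a previously seen url, so only a cache hit can explain getting pkg1's link back, while page_3's slightly altered content shows that the index-url path really re-parses. The same idea as a standalone sketch with a plain dict cache (everything here is local to the example, not pip's API):

import re

HTML = b'<a href="/pkg1-1.0.tar.gz"></a>'
_cache = {}  # type: dict


def _extract(content):
    # Stand-in for the html5lib-based link extraction.
    return re.findall(rb'href="([^"]+)"', content)


def parse_links(content, url, is_index_url=False):
    if is_index_url:
        return _extract(content)      # index pages: always re-parse
    if url not in _cache:
        _cache[url] = _extract(content)
    return _cache[url]                # otherwise keyed by url


links_1 = parse_links(HTML, 'https://example.com/simple/')
# Same url, empty content: the cache answers, so pkg1 still comes back.
assert parse_links(b'', 'https://example.com/simple/') == links_1
# Index url with modified content: cache bypassed, pkg2 shows through.
links_3 = parse_links(re.sub(b'pkg1', b'pkg2', HTML),
                      'https://example.com/simple/', is_index_url=True)
assert links_3 != links_1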