diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 2f957e69e90..ff722cfeb85 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -41,7 +41,6 @@ from pip._internal.models.search_scope import SearchScope from pip._internal.network.session import PipSession from pip._internal.network.utils import raise_for_status -from pip._internal.utils.deprecation import deprecated from pip._internal.utils.filetypes import is_archive_file from pip._internal.utils.misc import pairwise, redact_auth_from_url from pip._internal.vcs import vcs @@ -346,34 +345,13 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin """ Parse an HTML document, and yield its anchor elements as Link objects. """ - encoding = page.encoding or "utf-8" - - # Check if the page starts with a valid doctype, to decide whether to use - # http.parser or (deprecated) html5lib for parsing -- unless explicitly - # requested to use html5lib. - if not use_deprecated_html5lib: - expected_doctype = "".encode(encoding) - actual_start = page.content[: len(expected_doctype)] - if actual_start.decode(encoding).lower() != "": - deprecated( - reason=( - f"The HTML index page being used ({page.url}) is not a proper " - "HTML 5 document. This is in violation of PEP 503 which requires " - "these pages to be well-formed HTML 5 documents. Please reach out " - "to the owners of this index page, and ask them to update this " - "index page to a valid HTML 5 document." - ), - replacement=None, - gone_in="22.2", - issue=10825, - ) - use_deprecated_html5lib = True if use_deprecated_html5lib: yield from _parse_links_html5lib(page) return - parser = HTMLLinkParser() + parser = HTMLLinkParser(page.url) + encoding = page.encoding or "utf-8" parser.feed(page.content.decode(encoding)) url = page.url diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 403da9c4005..219e0c0cc4c 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -551,21 +551,42 @@ def test_parse_link_handles_deprecated_usage_properly() -> None: assert "pkg1-2.0" in parsed_links[1].url -@mock.patch("pip._internal.index.collector.deprecated") -def test_parse_links_presents_deprecation_warning_on_non_html5_page( - mock_deprecated: mock.Mock, +def test_parse_links_presents_warning_on_missing_doctype( + caplog: pytest.LogCaptureFixture, ) -> None: html = b'' url = "https://example.com/simple/" page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False) - parsed_links = list(parse_links(page, use_deprecated_html5lib=False)) + with caplog.at_level(logging.WARN): + parsed_links = list(parse_links(page, use_deprecated_html5lib=False)) assert len(parsed_links) == 2, parsed_links assert "pkg1-1.0" in parsed_links[0].url assert "pkg1-2.0" in parsed_links[1].url - mock_deprecated.assert_called_once() + assert len(caplog.records) == 1 + + +def test_parse_links_presents_warning_on_html4_doctype( + caplog: pytest.LogCaptureFixture, +) -> None: + html = ( + b'' + b'' + ) + url = "https://example.com/simple/" + page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False) + + with caplog.at_level(logging.WARN): + parsed_links = list(parse_links(page, use_deprecated_html5lib=False)) + + assert len(parsed_links) == 2, parsed_links + assert "pkg1-1.0" in parsed_links[0].url + assert "pkg1-2.0" in parsed_links[1].url + + assert len(caplog.records) == 1 @mock.patch("pip._internal.index.collector.raise_for_status")