Handle req file decode failures on locale encoding

For the case where: * a requirements file is encoded as UTF-8, and * some bytes in the file are incompatible with the system locale In this case, fallback to decoding as UTF-8 as a last resort (rather than crashing on the `UnicodeDecodeError`). This behaviour was added when parsing the request file, rather than in `auto_decode` as it didn't seem to belong in a generic util (though that util looks to only be ever called when parsing requirements files anyway). Perhaps we should just go straight to UTF-8 without querying the system locale (unless there is a PEP-263 style comment), per the docs[1]: > Requirements files are utf-8 encoding by default But to avoid a breaking change just warn if decoding with this locale fails then fallback to UTF-8 [1] https://pip.pypa.io/en/stable/reference/requirements-file-format/#encoding
pypa · Jun 25, 2024 · a3f1cac · a3f1cac
1 parent 300ed75
commit a3f1cac
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 1 deletion.
diff --git a/src/pip/_internal/req/req_file.py b/src/pip/_internal/req/req_file.py
@@ -8,6 +8,7 @@
 import re
 import shlex
 import urllib.parse
+import warnings
 from optparse import Values
 from typing import (
     TYPE_CHECKING,
@@ -545,7 +546,23 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
     # Assume this is a bare path.
     try:
         with open(url, "rb") as f:
-            content = auto_decode(f.read())
+            raw_content = f.read()
     except OSError as exc:
         raise InstallationError(f"Could not open requirements file: {exc}")
+
+    try:
+        content = auto_decode(raw_content)
+    except UnicodeDecodeError as exc:
+        fallback_encoding = "utf-8"
+        # don't try an decode again if we know it will fail
+        if exc.encoding == fallback_encoding:
+            raise
+
+        warnings.warn(
+            f"unable to decode data with {exc.encoding}, falling back to {fallback_encoding}",  #  noqa: E501
+            UnicodeWarning,
+            stacklevel=2,
+        )
+        content = raw_content.decode(fallback_encoding)
+
     return url, content
diff --git a/tests/unit/test_req_file.py b/tests/unit/test_req_file.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import textwrap
+import warnings
 from optparse import Values
 from pathlib import Path
 from typing import Any, Iterator, List, Optional, Protocol, Tuple, Union
@@ -883,3 +884,43 @@ def test_install_requirements_with_options(
             )
 
         assert req.global_options == [global_option]
+
+    def test_warns_on_decode_fail_in_locale(
+        self, tmpdir: Path, session: PipSession
+    ) -> None:
+        # \xe3\x80\x82 encodes to 'IDEOGRAPHIC FULL STOP' in UTF-8
+        # the lone \x82 byte is invalid in the gbk encoding
+        data = b"pip<=24.0 # some comment\xe3\x80\x82\n"
+        locale_encoding = "gbk"
+        req_file = tmpdir / "requirements.txt"
+        req_file.write_bytes(data)
+
+        # it's hard to rely on a locale definitely existing for testing
+        # so patch things out for simplicity
+        with pytest.warns(UnicodeWarning) as records, mock.patch(
+            "locale.getpreferredencoding", return_value=locale_encoding
+        ):
+            reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
+
+        assert len(records) == 1
+        assert (
+            str(records[0].message)
+            == "unable to decode data with gbk, falling back to utf-8"
+        )
+        assert len(reqs) == 1
+        assert reqs[0].name == "pip"
+        assert str(reqs[0].specifier) == "<=24.0"
+
+    @pytest.mark.parametrize("encoding", ("utf-8", "gbk"))
+    def test_erorrs_on_non_decodable_data(
+        self, encoding: str, tmpdir: Path, session: PipSession
+    ) -> None:
+        data = b"\xff"
+        req_file = tmpdir / "requirements.txt"
+        req_file.write_bytes(data)
+
+        with warnings.catch_warnings(), pytest.raises(UnicodeDecodeError), mock.patch(
+            "locale.getpreferredencoding", return_value=encoding
+        ):
+            warnings.simplefilter("ignore")  # suppress warning not under test here
+            next(parse_reqfile(req_file.resolve(), session=session))