Skip to content

Commit

Permalink
Handle req file decode failures on locale encoding
Browse files Browse the repository at this point in the history
For the case where:

* a requirements file is encoded as UTF-8, and
* some bytes in the file are incompatible with the system locale

In this case, fallback to decoding as UTF-8 as a last resort (rather
than crashing on the `UnicodeDecodeError`). This behaviour was added
when parsing the request file, rather than in `auto_decode` as it didn't
seem to belong in a generic util (though that util looks to only be ever
called when parsing requirements files anyway).

Perhaps we should just go straight to UTF-8 without querying the system
locale (unless there is a PEP-263 style comment), per the docs[1]:

> Requirements files are utf-8 encoding by default

But to avoid a breaking change just warn if decoding with this locale
fails then fallback to UTF-8

[1] https://pip.pypa.io/en/stable/reference/requirements-file-format/#encoding
  • Loading branch information
matthewhughes934 committed Jun 25, 2024
1 parent 300ed75 commit a3f1cac
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 1 deletion.
19 changes: 18 additions & 1 deletion src/pip/_internal/req/req_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import re
import shlex
import urllib.parse
import warnings
from optparse import Values
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -545,7 +546,23 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
# Assume this is a bare path.
try:
with open(url, "rb") as f:
content = auto_decode(f.read())
raw_content = f.read()
except OSError as exc:
raise InstallationError(f"Could not open requirements file: {exc}")

try:
content = auto_decode(raw_content)
except UnicodeDecodeError as exc:
fallback_encoding = "utf-8"
# don't try an decode again if we know it will fail
if exc.encoding == fallback_encoding:
raise

warnings.warn(
f"unable to decode data with {exc.encoding}, falling back to {fallback_encoding}", # noqa: E501
UnicodeWarning,
stacklevel=2,
)
content = raw_content.decode(fallback_encoding)

return url, content
41 changes: 41 additions & 0 deletions tests/unit/test_req_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import textwrap
import warnings
from optparse import Values
from pathlib import Path
from typing import Any, Iterator, List, Optional, Protocol, Tuple, Union
Expand Down Expand Up @@ -883,3 +884,43 @@ def test_install_requirements_with_options(
)

assert req.global_options == [global_option]

def test_warns_on_decode_fail_in_locale(
self, tmpdir: Path, session: PipSession
) -> None:
# \xe3\x80\x82 encodes to 'IDEOGRAPHIC FULL STOP' in UTF-8
# the lone \x82 byte is invalid in the gbk encoding
data = b"pip<=24.0 # some comment\xe3\x80\x82\n"
locale_encoding = "gbk"
req_file = tmpdir / "requirements.txt"
req_file.write_bytes(data)

# it's hard to rely on a locale definitely existing for testing
# so patch things out for simplicity
with pytest.warns(UnicodeWarning) as records, mock.patch(
"locale.getpreferredencoding", return_value=locale_encoding
):
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))

assert len(records) == 1
assert (
str(records[0].message)
== "unable to decode data with gbk, falling back to utf-8"
)
assert len(reqs) == 1
assert reqs[0].name == "pip"
assert str(reqs[0].specifier) == "<=24.0"

@pytest.mark.parametrize("encoding", ("utf-8", "gbk"))
def test_erorrs_on_non_decodable_data(
self, encoding: str, tmpdir: Path, session: PipSession
) -> None:
data = b"\xff"
req_file = tmpdir / "requirements.txt"
req_file.write_bytes(data)

with warnings.catch_warnings(), pytest.raises(UnicodeDecodeError), mock.patch(
"locale.getpreferredencoding", return_value=encoding
):
warnings.simplefilter("ignore") # suppress warning not under test here
next(parse_reqfile(req_file.resolve(), session=session))

0 comments on commit a3f1cac

Please sign in to comment.