Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop chardet #1269

Merged
merged 12 commits into from
Sep 15, 2020
9 changes: 5 additions & 4 deletions httpx/_decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,13 @@ def __init__(self, encoding: typing.Optional[str] = None):

def decode(self, data: bytes) -> str:
if self.decoder is None:
self.decoder = codecs.getincrementaldecoder("utf-8")(errors="strict")
attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
try:
return self.decoder.decode(data)
attempt_utf_8.decode(data)
except UnicodeDecodeError:
self.decoder = codecs.getincrementaldecoder("cp1251")(errors="ignore")
return self.decoder.decode(data)
self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
else:
self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
Copy link
Contributor

@StephenBrown2 StephenBrown2 Sep 9, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why ("utf-8")(errors="replace") here if it passed with ("utf-8")(errors="strict") to get here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need strict to raise an error if it doesn't appear to decode as UTF-8, but once we've made the decision we use errors="replace" for the most robust behaviour possible. So eg. if we've got a streaming response that initially appears to be UTF-8, but later has some non-UTF-8 bytes, then we're not raising a hard error on accessing .text.

(We'd like it to have a failure mode that is as graceful as possible.)


return self.decoder.decode(data)

Expand Down
22 changes: 7 additions & 15 deletions httpx/_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,8 +748,7 @@ def text(self) -> str:
if not content:
self._text = ""
else:
decoder = TextDecoder(encoding=self.encoding)
self._text = "".join([decoder.decode(content), decoder.flush()])
self._text = "".join([chunk for chunk in self.iter_text()])
return self._text

@property
Expand All @@ -775,18 +774,11 @@ def charset_encoding(self) -> typing.Optional[str]:
if content_type is None:
return None

parsed = cgi.parse_header(content_type)
media_type, params = parsed[0], parsed[-1]
if "charset" in params:
return params["charset"].strip("'\"")

# RFC 2616 specifies that 'iso-8859-1' should be used as the default
# for 'text/*' media types, if no charset is provided.
# See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
if media_type.startswith("text/"):
return "iso-8859-1"
_, params = cgi.parse_header(content_type)
if "charset" not in params:
return None

return None
return params["charset"].strip("'\"")

@property
def apparent_encoding(self) -> typing.Optional[str]:
Expand Down Expand Up @@ -924,7 +916,7 @@ def iter_text(self) -> typing.Iterator[str]:
that handles both gzip, deflate, etc but also detects the content's
string encoding.
"""
decoder = TextDecoder(encoding=self.charset_encoding)
decoder = TextDecoder(encoding=self.encoding)
with self._wrap_decoder_errors():
for chunk in self.iter_bytes():
yield decoder.decode(chunk)
Expand Down Expand Up @@ -1005,7 +997,7 @@ async def aiter_text(self) -> typing.AsyncIterator[str]:
that handles both gzip, deflate, etc but also detects the content's
string encoding.
"""
decoder = TextDecoder(encoding=self.charset_encoding)
decoder = TextDecoder(encoding=self.encoding)
with self._wrap_decoder_errors():
async for chunk in self.aiter_bytes():
yield decoder.decode(chunk)
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ def get_packages(package):
install_requires=[
"certifi",
"sniffio",
"chardet==3.*",
"rfc3986[idna2008]>=1.3,<2",
"httpcore==0.10.*",
],
Expand Down
2 changes: 1 addition & 1 deletion tests/client/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_get(server):
assert response.content == b"Hello, world!"
assert response.text == "Hello, world!"
assert response.http_version == "HTTP/1.1"
assert response.encoding == "iso-8859-1"
assert response.encoding is None
assert response.request.url == url
assert response.headers
assert response.is_redirect is False
Expand Down
70 changes: 53 additions & 17 deletions tests/models/test_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,36 +82,36 @@ def test_response_content_type_encoding():

def test_response_autodetect_encoding():
"""
Autodetect encoding if there is no charset info in a Content-Type header.
Autodetect encoding if there is no Content-Type header.
"""
content = "おはようございます。".encode("EUC-JP")
content = "おはようございます。".encode("utf-8")
response = httpx.Response(
200,
content=content,
)
assert response.text == "おはようございます。"
assert response.encoding == "EUC-JP"
assert response.encoding is None


def test_response_fallback_to_autodetect():
"""
Fallback to autodetection if we get an invalid charset in the Content-Type header.
"""
headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
content = "おはようございます。".encode("EUC-JP")
content = "おはようございます。".encode("utf-8")
response = httpx.Response(
200,
content=content,
headers=headers,
)
assert response.text == "おはようございます。"
assert response.encoding == "EUC-JP"
assert response.encoding is None


def test_response_default_text_encoding():
def test_response_no_charset_with_ascii_content():
"""
A media type of 'text/*' with no charset should default to ISO-8859-1.
See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
A response with ascii encoded content should decode correctly,
even with no charset specified.
"""
content = b"Hello, world!"
headers = {"Content-Type": "text/plain"}
Expand All @@ -121,20 +121,56 @@ def test_response_default_text_encoding():
headers=headers,
)
assert response.status_code == 200
assert response.encoding == "iso-8859-1"
assert response.encoding is None
assert response.text == "Hello, world!"


def test_response_default_encoding():
def test_response_no_charset_with_utf8_content():
"""
Default to utf-8 if all else fails.
A response with UTF-8 encoded content should decode correctly,
even with no charset specified.
"""
content = "Unicode Snowman: ☃".encode("utf-8")
headers = {"Content-Type": "text/plain"}
response = httpx.Response(
200,
content=b"",
content=content,
headers=headers,
)
assert response.text == ""
assert response.encoding == "utf-8"
assert response.text == "Unicode Snowman: ☃"
assert response.encoding is None


def test_response_no_charset_with_iso_8859_1_content():
"""
A response with ISO 8859-1 encoded content should decode correctly,
even with no charset specified.
"""
content = "Accented: Österreich".encode("iso-8859-1")
headers = {"Content-Type": "text/plain"}
response = httpx.Response(
200,
content=content,
headers=headers,
)
assert response.text == "Accented: Österreich"
assert response.encoding is None


def test_response_no_charset_with_cp_1252_content():
"""
A response with Windows 1252 encoded content should decode correctly,
even with no charset specified.
"""
content = "Euro Currency: €".encode("cp1252")
headers = {"Content-Type": "text/plain"}
response = httpx.Response(
200,
content=content,
headers=headers,
)
assert response.text == "Euro Currency: €"
assert response.encoding is None


def test_response_non_text_encoding():
Expand All @@ -148,7 +184,7 @@ def test_response_non_text_encoding():
headers=headers,
)
assert response.text == "xyz"
assert response.encoding == "ascii"
assert response.encoding is None


def test_response_set_explicit_encoding():
Expand Down Expand Up @@ -185,7 +221,7 @@ def test_read():

assert response.status_code == 200
assert response.text == "Hello, world!"
assert response.encoding == "ascii"
assert response.encoding is None
assert response.is_closed

content = response.read()
Expand All @@ -204,7 +240,7 @@ async def test_aread():

assert response.status_code == 200
assert response.text == "Hello, world!"
assert response.encoding == "ascii"
assert response.encoding is None
assert response.is_closed

content = await response.aread()
Expand Down
2 changes: 1 addition & 1 deletion tests/test_decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def test_decoding_errors(header_value):
[
((b"Hello,", b" world!"), "ascii"),
((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
((b"Euro character: \x88!", b""), "cp1251"),
((b"Euro character: \x88!", b""), "cp1252"),
# ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
# ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
# (
Expand Down