Drop chardet (#1269)
* Internal refactoring to swap auth/redirects ordering

* Drop chardet for charset detection

* Drop chardet in favour of simpler charset autodetection

* Revert unintentionally included changes

* Update test case

* Refactor to prefer different decoding style

* Update text decoding docs/docstrings

* Resolve typo

* Update docs/quickstart.md

Co-authored-by: Florimond Manca <[email protected]>
tomchristie and florimondmanca authored Sep 15, 2020
1 parent 2d6c30d commit d0fe113
Showing 7 changed files with 128 additions and 112 deletions.
17 changes: 15 additions & 2 deletions docs/quickstart.md
@@ -65,14 +65,27 @@ HTTPX will automatically handle decoding the response content into Unicode text.
'<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
```

-You can inspect what encoding has been used to decode the response.
+You can inspect what encoding will be used to decode the response.

```pycon
>>> r.encoding
'UTF-8'
```

-If you need to override the standard behavior and explicitly set the encoding to
+In some cases the response may not contain an explicit encoding, in which case HTTPX
+will attempt to automatically determine an encoding to use. This defaults to
+UTF-8, but also includes robust fallback behaviour for handling ascii,
+iso-8859-1 and windows 1252 encodings.
+
+```pycon
+>>> r.encoding
+None
+>>> r.text
+'<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
+```
+
+If you need to override the standard behaviour and explicitly set the encoding to
use, then you can do that too.

```pycon
>>> r.encoding = 'ISO-8859-1'
```
94 changes: 41 additions & 53 deletions httpx/_decoders.py
@@ -7,8 +7,6 @@
import typing
import zlib

-import chardet
-
try:
    import brotli
except ImportError:  # pragma: nocover
@@ -163,62 +161,52 @@ class TextDecoder:
"""

def __init__(self, encoding: typing.Optional[str] = None):
self.decoder: typing.Optional[codecs.IncrementalDecoder] = (
None if encoding is None else codecs.getincrementaldecoder(encoding)()
)
self.detector = chardet.universaldetector.UniversalDetector()

# This buffer is only needed if 'decoder' is 'None'
# we want to trigger errors if data is getting added to
# our internal buffer for some silly reason while
# a decoder is discovered.
self.buffer: typing.Optional[bytearray] = None if self.decoder else bytearray()
self.decoder: typing.Optional[codecs.IncrementalDecoder] = None
if encoding is not None:
self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")

def decode(self, data: bytes) -> str:
try:
if self.decoder is not None:
text = self.decoder.decode(data)
else:
assert self.buffer is not None
text = ""
self.detector.feed(data)
self.buffer += data

# Should be more than enough data to process, we don't
# want to buffer too long as chardet will wait until
# detector.close() is used to give back common
# encodings like 'utf-8'.
if len(self.buffer) >= 4096:
self.decoder = codecs.getincrementaldecoder(
self._detector_result()
)()
text = self.decoder.decode(bytes(self.buffer), False)
self.buffer = None

return text
except UnicodeDecodeError as exc: # pragma: nocover
raise ValueError(str(exc))
"""
If an encoding is explicitly specified, then we use that.
Otherwise our strategy is to attempt UTF-8, and fallback to Windows 1252.
def flush(self) -> str:
try:
if self.decoder is None:
# Empty string case as chardet is guaranteed to not have a guess.
assert self.buffer is not None
if len(self.buffer) == 0:
return ""
return bytes(self.buffer).decode(self._detector_result())

return self.decoder.decode(b"", True)
except UnicodeDecodeError as exc: # pragma: nocover
raise ValueError(str(exc))
Note that UTF-8 is a strict superset of ascii, and Windows 1252 is a
superset of the non-control characters in iso-8859-1, so we essentially
end up supporting any of ascii, utf-8, iso-8859-1, cp1252.
def _detector_result(self) -> str:
self.detector.close()
result = self.detector.result["encoding"]
if not result: # pragma: nocover
raise ValueError("Unable to determine encoding of content")
Given that UTF-8 is now by *far* the most widely used encoding, this
should be a pretty robust strategy for cases where a charset has
not been explicitly included.
return result
Useful stats on the prevalence of different charsets in the wild...
* https://w3techs.com/technologies/overview/character_encoding
* https://w3techs.com/technologies/history_overview/character_encoding
The HTML5 spec also has some useful guidelines, suggesting defaults of
either UTF-8 or Windows 1252 in most cases...
* https://dev.w3.org/html5/spec-LC/Overview.html
"""
if self.decoder is None:
# If this is the first decode pass then we need to determine which
# encoding to use by attempting UTF-8 and raising any decode errors.
attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
try:
attempt_utf_8.decode(data)
except UnicodeDecodeError:
# Could not decode as UTF-8. Use Windows 1252.
self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
else:
# Can decode as UTF-8. Use UTF-8 with lenient error settings.
self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")

return self.decoder.decode(data)

def flush(self) -> str:
if self.decoder is None:
return ""
return self.decoder.decode(b"", True)


class LineDecoder:
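For readers skimming the diff, here is a minimal, non-incremental sketch of the detection strategy described in the docstring above. The `decode_with_fallback` name is illustrative only; the real `TextDecoder` works chunk by chunk and switches to `errors="replace"` once the codec is chosen:

```python
def decode_with_fallback(data: bytes) -> str:
    # Attempt strict UTF-8 first. Plain ascii content also succeeds here,
    # since UTF-8 is a strict superset of ascii.
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to Windows 1252, which also covers
        # the printable range of iso-8859-1.
        return data.decode("cp1252", errors="replace")


print(decode_with_fallback("Straße".encode("utf-8")))       # -> 'Straße'
print(decode_with_fallback("Straße".encode("iso-8859-1")))  # -> 'Straße'
```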
44 changes: 16 additions & 28 deletions httpx/_models.py
@@ -10,7 +10,6 @@
from http.cookiejar import Cookie, CookieJar
from urllib.parse import parse_qsl, quote, unquote, urlencode

-import chardet
import rfc3986
import rfc3986.exceptions

@@ -755,19 +754,22 @@ def text(self) -> str:
        if not content:
            self._text = ""
        else:
-            encoding = self.encoding
-            self._text = content.decode(encoding, errors="replace")
+            decoder = TextDecoder(encoding=self.encoding)
+            self._text = "".join([decoder.decode(self.content), decoder.flush()])
        return self._text

    @property
-    def encoding(self) -> str:
+    def encoding(self) -> typing.Optional[str]:
        """
        Return the encoding, which may have been set explicitly, or may have
        been specified by the Content-Type header.
        """
        if not hasattr(self, "_encoding"):
            encoding = self.charset_encoding
            if encoding is None or not is_known_encoding(encoding):
-                encoding = self.apparent_encoding
-            if encoding is None or not is_known_encoding(encoding):
-                encoding = "utf-8"
-            self._encoding = encoding
+                self._encoding = None
+            else:
+                self._encoding = encoding
        return self._encoding

@encoding.setter
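To illustrate the new behaviour (it mirrors the updated tests further down): a response without a usable charset now reports `encoding` as `None`, while `.text` still decodes via the UTF-8/Windows 1252 fallback.

```pycon
>>> import httpx
>>> response = httpx.Response(200, content=b"Hello, world!", headers={"Content-Type": "text/plain"})
>>> response.encoding is None
True
>>> response.text
'Hello, world!'
```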
@@ -783,25 +785,11 @@ def charset_encoding(self) -> typing.Optional[str]:
        if content_type is None:
            return None

-        parsed = cgi.parse_header(content_type)
-        media_type, params = parsed[0], parsed[-1]
-        if "charset" in params:
-            return params["charset"].strip("'\"")
-
-        # RFC 2616 specifies that 'iso-8859-1' should be used as the default
-        # for 'text/*' media types, if no charset is provided.
-        # See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        if media_type.startswith("text/"):
-            return "iso-8859-1"
-
-        return None
+        _, params = cgi.parse_header(content_type)
+        if "charset" not in params:
+            return None
+
+        return params["charset"].strip("'\"")

-    @property
-    def apparent_encoding(self) -> typing.Optional[str]:
-        """
-        Return the encoding, as it appears to autodetection.
-        """
-        return chardet.detect(self.content)["encoding"]

    def _get_content_decoder(self) -> ContentDecoder:
        """
@@ -936,7 +924,7 @@ def iter_text(self) -> typing.Iterator[str]:
        that handles both gzip, deflate, etc but also detects the content's
        string encoding.
        """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
        with self._wrap_decoder_errors():
            for chunk in self.iter_bytes():
                yield decoder.decode(chunk)
@@ -1020,7 +1008,7 @@ async def aiter_text(self) -> typing.AsyncIterator[str]:
        that handles both gzip, deflate, etc but also detects the content's
        string encoding.
        """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
        with self._wrap_decoder_errors():
            async for chunk in self.aiter_bytes():
                yield decoder.decode(chunk)
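Switching these two call sites from `charset_encoding` to `self.encoding` means an explicitly set `response.encoding` is now honoured when streaming as well. A sketch of typical streaming usage, assuming the top-level `httpx.stream` helper available in this era of httpx (the URL is a placeholder):

```python
import httpx

# Stream a response and decode text incrementally. If the server sends no
# charset, TextDecoder applies the UTF-8 / Windows 1252 fallback on the fly.
with httpx.stream("GET", "https://www.example.org/") as response:
    for chunk in response.iter_text():
        print(chunk, end="")
```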
1 change: 0 additions & 1 deletion setup.py
@@ -57,7 +57,6 @@ def get_packages(package):
    install_requires=[
        "certifi",
        "sniffio",
-        "chardet==3.*",
        "rfc3986[idna2008]>=1.3,<2",
        "httpcore==0.10.*",
    ],
2 changes: 1 addition & 1 deletion tests/client/test_client.py
@@ -15,7 +15,7 @@ def test_get(server):
    assert response.content == b"Hello, world!"
    assert response.text == "Hello, world!"
    assert response.http_version == "HTTP/1.1"
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
    assert response.request.url == url
    assert response.headers
    assert response.is_redirect is False
70 changes: 53 additions & 17 deletions tests/models/test_responses.py
@@ -81,36 +81,36 @@ def test_response_content_type_encoding():

def test_response_autodetect_encoding():
    """
-    Autodetect encoding if there is no charset info in a Content-Type header.
+    Autodetect encoding if there is no Content-Type header.
    """
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
    response = httpx.Response(
        200,
        content=content,
    )
    assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None


def test_response_fallback_to_autodetect():
    """
    Fallback to autodetection if we get an invalid charset in the Content-Type header.
    """
    headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
    response = httpx.Response(
        200,
        content=content,
        headers=headers,
    )
    assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None


-def test_response_default_text_encoding():
+def test_response_no_charset_with_ascii_content():
    """
-    A media type of 'text/*' with no charset should default to ISO-8859-1.
-    See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+    A response with ascii encoded content should decode correctly,
+    even with no charset specified.
    """
    content = b"Hello, world!"
    headers = {"Content-Type": "text/plain"}
@@ -120,20 +120,56 @@ def test_response_default_text_encoding():
        headers=headers,
    )
    assert response.status_code == 200
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
    assert response.text == "Hello, world!"


-def test_response_default_encoding():
+def test_response_no_charset_with_utf8_content():
    """
-    Default to utf-8 if all else fails.
+    A response with UTF-8 encoded content should decode correctly,
+    even with no charset specified.
    """
+    content = "Unicode Snowman: ☃".encode("utf-8")
+    headers = {"Content-Type": "text/plain"}
    response = httpx.Response(
        200,
-        content=b"",
+        content=content,
+        headers=headers,
    )
-    assert response.text == ""
-    assert response.encoding == "utf-8"
+    assert response.text == "Unicode Snowman: ☃"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_iso_8859_1_content():
+    """
+    A response with ISO 8859-1 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Accented: Österreich".encode("iso-8859-1")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Accented: Österreich"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_cp_1252_content():
+    """
+    A response with Windows 1252 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Euro Currency: €".encode("cp1252")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Euro Currency: €"
+    assert response.encoding is None

def test_response_non_text_encoding():
Expand All @@ -147,7 +183,7 @@ def test_response_non_text_encoding():
headers=headers,
)
assert response.text == "xyz"
assert response.encoding == "ascii"
assert response.encoding is None


def test_response_set_explicit_encoding():
@@ -184,7 +220,7 @@ def test_read():

    assert response.status_code == 200
    assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
    assert response.is_closed

    content = response.read()
@@ -203,7 +239,7 @@ async def test_aread():

    assert response.status_code == 200
    assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
    assert response.is_closed

    content = await response.aread()
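One practical consequence of these test changes: charsets such as EUC-JP or Shift-JIS are no longer autodetected, so a response in one of those encodings needs `response.encoding` set explicitly before reading `.text`. A hedged sketch of that workflow, not taken from the test suite:

```python
import httpx

content = "おはようございます。".encode("EUC-JP")
response = httpx.Response(200, content=content, headers={"Content-Type": "text/plain"})

# chardet would previously have guessed EUC-JP; now it must be explicit.
response.encoding = "EUC-JP"
assert response.text == "おはようございます。"
```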
12 changes: 2 additions & 10 deletions tests/test_decoders.py
@@ -177,16 +177,8 @@ def test_decoding_errors(header_value):
    [
        ((b"Hello,", b" world!"), "ascii"),
        ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
-        (
-            (b"\xcb\xee\xf0\xe5\xec \xe8\xef\xf1\xf3\xec \xe4\xee\xeb\xee\xf0",) * 64,
-            "MacCyrillic",
-        ),
-        (
-            (b"\xa5\xa6\xa5\xa7\xa5\xd6\xa4\xce\xb9\xf1\xba\xdd\xb2\xbd",) * 512,
-            "euc-jp",
-        ),
+        ((b"Euro character: \x88!", b""), "cp1252"),
+        ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
    ],
)
@pytest.mark.asyncio
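The remaining utf-8 case deliberately splits multi-byte characters across chunk boundaries; Python's incremental decoders buffer partial sequences until they are complete, which is what makes streamed decoding safe. A quick standalone demonstration of that behaviour:

```python
import codecs

decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")

# "トラベル" delivered in chunks that split characters mid-sequence.
chunks = [b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"]
text = "".join(decoder.decode(chunk) for chunk in chunks)
text += decoder.decode(b"", True)  # final=True flushes any pending state
print(text)  # トラベル
```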
