encode · tomchristie · Sep 15, 2020 · Sep 7, 2020 · Sep 8, 2020 · Sep 9, 2020
diff --git a/httpx/_decoders.py b/httpx/_decoders.py
@@ -167,12 +167,13 @@ def __init__(self, encoding: typing.Optional[str] = None):
 
     def decode(self, data: bytes) -> str:
         if self.decoder is None:
-            self.decoder = codecs.getincrementaldecoder("utf-8")(errors="strict")
+            attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
             try:
-                return self.decoder.decode(data)
+                attempt_utf_8.decode(data)
             except UnicodeDecodeError:
-                self.decoder = codecs.getincrementaldecoder("cp1251")(errors="ignore")
-                return self.decoder.decode(data)
+                self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
+            else:
+                self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
 
         return self.decoder.decode(data)
 

diff --git a/httpx/_models.py b/httpx/_models.py
@@ -748,8 +748,7 @@ def text(self) -> str:
             if not content:
                 self._text = ""
             else:
-                decoder = TextDecoder(encoding=self.encoding)
-                self._text = "".join([decoder.decode(content), decoder.flush()])
+                self._text = "".join([chunk for chunk in self.iter_text()])
         return self._text
 
     @property
@@ -775,18 +774,11 @@ def charset_encoding(self) -> typing.Optional[str]:
         if content_type is None:
             return None
 
-        parsed = cgi.parse_header(content_type)
-        media_type, params = parsed[0], parsed[-1]
-        if "charset" in params:
-            return params["charset"].strip("'\"")
-
-        # RFC 2616 specifies that 'iso-8859-1' should be used as the default
-        # for 'text/*' media types, if no charset is provided.
-        # See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        if media_type.startswith("text/"):
-            return "iso-8859-1"
+        _, params = cgi.parse_header(content_type)
+        if "charset" not in params:
+            return None
 
-        return None
+        return params["charset"].strip("'\"")
 
     @property
     def apparent_encoding(self) -> typing.Optional[str]:
@@ -924,7 +916,7 @@ def iter_text(self) -> typing.Iterator[str]:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             for chunk in self.iter_bytes():
                 yield decoder.decode(chunk)
@@ -1005,7 +997,7 @@ async def aiter_text(self) -> typing.AsyncIterator[str]:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             async for chunk in self.aiter_bytes():
                 yield decoder.decode(chunk)

diff --git a/setup.py b/setup.py
@@ -57,7 +57,6 @@ def get_packages(package):
     install_requires=[
         "certifi",
         "sniffio",
-        "chardet==3.*",
         "rfc3986[idna2008]>=1.3,<2",
         "httpcore==0.10.*",
     ],

diff --git a/tests/client/test_client.py b/tests/client/test_client.py
@@ -15,7 +15,7 @@ def test_get(server):
     assert response.content == b"Hello, world!"
     assert response.text == "Hello, world!"
     assert response.http_version == "HTTP/1.1"
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.request.url == url
     assert response.headers
     assert response.is_redirect is False

diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py
@@ -82,36 +82,36 @@ def test_response_content_type_encoding():
 
 def test_response_autodetect_encoding():
     """
-    Autodetect encoding if there is no charset info in a Content-Type header.
+    Autodetect encoding if there is no Content-Type header.
     """
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None
 
 
 def test_response_fallback_to_autodetect():
     """
     Fallback to autodetection if we get an invalid charset in the Content-Type header.
     """
     headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
         headers=headers,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None
 
 
-def test_response_default_text_encoding():
+def test_response_no_charset_with_ascii_content():
     """
-    A media type of 'text/*' with no charset should default to ISO-8859-1.
-    See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+    A response with ascii encoded content should decode correctly,
+    even with no charset specified.
     """
     content = b"Hello, world!"
     headers = {"Content-Type": "text/plain"}
@@ -121,20 +121,56 @@ def test_response_default_text_encoding():
         headers=headers,
     )
     assert response.status_code == 200
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.text == "Hello, world!"
 
 
-def test_response_default_encoding():
+def test_response_no_charset_with_utf8_content():
     """
-    Default to utf-8 if all else fails.
+    A response with UTF-8 encoded content should decode correctly,
+    even with no charset specified.
     """
+    content = "Unicode Snowman: ☃".encode("utf-8")
+    headers = {"Content-Type": "text/plain"}
     response = httpx.Response(
         200,
-        content=b"",
+        content=content,
+        headers=headers,
     )
-    assert response.text == ""
-    assert response.encoding == "utf-8"
+    assert response.text == "Unicode Snowman: ☃"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_iso_8859_1_content():
+    """
+    A response with ISO 8859-1 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Accented: Österreich".encode("iso-8859-1")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Accented: Österreich"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_cp_1252_content():
+    """
+    A response with Windows 1252 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Euro Currency: €".encode("cp1252")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Euro Currency: €"
+    assert response.encoding is None
 
 
 def test_response_non_text_encoding():
@@ -148,7 +184,7 @@ def test_response_non_text_encoding():
         headers=headers,
     )
     assert response.text == "xyz"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
 
 
 def test_response_set_explicit_encoding():
@@ -185,7 +221,7 @@ def test_read():
 
     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed
 
     content = response.read()
@@ -204,7 +240,7 @@ async def test_aread():
 
     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed
 
     content = await response.aread()

diff --git a/tests/test_decoders.py b/tests/test_decoders.py
@@ -179,7 +179,7 @@ def test_decoding_errors(header_value):
     [
         ((b"Hello,", b" world!"), "ascii"),
         ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
-        ((b"Euro character: \x88!", b""), "cp1251"),
+        ((b"Euro character: \x88!", b""), "cp1252"),
         # ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
         # ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
         # (