git.ipfire.org Git - thirdparty/httpx.git/commitdiff
Drop chardet (#1269)
author: Tom Christie <tom@tomchristie.com>
Tue, 15 Sep 2020 10:20:19 +0000 (11:20 +0100)
committer: GitHub <noreply@github.com>
Tue, 15 Sep 2020 10:20:19 +0000 (11:20 +0100)
* Internal refactoring to swap auth/redirects ordering

* Drop chardet for charset detection

* Drop chardet in favour of simpler charset autodetection

* Revert unintentionally included changes

* Update test case

* Refactor to prefer different decoding style

* Update text decoding docs/docstrings

* Resolve typo

* Update docs/quickstart.md

Co-authored-by: Florimond Manca <florimond.manca@gmail.com>
Co-authored-by: Florimond Manca <florimond.manca@gmail.com>
docs/quickstart.md
httpx/_decoders.py
httpx/_models.py
setup.py
tests/client/test_client.py
tests/models/test_responses.py
tests/test_decoders.py

index f11f7a45dacdc90340de8027000b8c32b8320b78..edc48cc4c3a5f2bb36d46e03b716718bb63be7c2 100644 (file)
@@ -65,14 +65,27 @@ HTTPX will automatically handle decoding the response content into Unicode text.
 '<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
 ```
 
-You can inspect what encoding has been used to decode the response.
+You can inspect what encoding will be used to decode the response.
 
 ```pycon
 >>> r.encoding
 'UTF-8'
 ```
 
-If you need to override the standard behavior and explicitly set the encoding to
+In some cases the response may not contain an explicit encoding, in which case HTTPX
+will attempt to automatically determine an encoding to use. This defaults to
+UTF-8, but also includes robust fallback behaviour for handling ascii,
+iso-8859-1 and windows 1252 encodings.
+
+```pycon
+>>> r.encoding
+None
+>>> r.text
+'<!doctype html>\n<html>\n<head>\n<title>Example Domain</title>...'
+```
+
+
+If you need to override the standard behaviour and explicitly set the encoding to
 use, then you can do that too.
 
 ```pycon
index 40c6da9fc0f8782b3bc3dfcc1f6961b5a9e534f8..bac5f9c86f11ddd39eb17b1f8d00b05e4bf23583 100644 (file)
@@ -7,8 +7,6 @@ import codecs
 import typing
 import zlib
 
-import chardet
-
 try:
     import brotli
 except ImportError:  # pragma: nocover
@@ -163,62 +161,52 @@ class TextDecoder:
     """
 
     def __init__(self, encoding: typing.Optional[str] = None):
-        self.decoder: typing.Optional[codecs.IncrementalDecoder] = (
-            None if encoding is None else codecs.getincrementaldecoder(encoding)()
-        )
-        self.detector = chardet.universaldetector.UniversalDetector()
-
-        # This buffer is only needed if 'decoder' is 'None'
-        # we want to trigger errors if data is getting added to
-        # our internal buffer for some silly reason while
-        # a decoder is discovered.
-        self.buffer: typing.Optional[bytearray] = None if self.decoder else bytearray()
+        self.decoder: typing.Optional[codecs.IncrementalDecoder] = None
+        if encoding is not None:
+            self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")
 
     def decode(self, data: bytes) -> str:
-        try:
-            if self.decoder is not None:
-                text = self.decoder.decode(data)
-            else:
-                assert self.buffer is not None
-                text = ""
-                self.detector.feed(data)
-                self.buffer += data
-
-                # Should be more than enough data to process, we don't
-                # want to buffer too long as chardet will wait until
-                # detector.close() is used to give back common
-                # encodings like 'utf-8'.
-                if len(self.buffer) >= 4096:
-                    self.decoder = codecs.getincrementaldecoder(
-                        self._detector_result()
-                    )()
-                    text = self.decoder.decode(bytes(self.buffer), False)
-                    self.buffer = None
-
-            return text
-        except UnicodeDecodeError as exc:  # pragma: nocover
-            raise ValueError(str(exc))
+        """
+        If an encoding is explicitly specified, then we use that.
+        Otherwise our strategy is to attempt UTF-8, and fallback to Windows 1252.
 
-    def flush(self) -> str:
-        try:
-            if self.decoder is None:
-                # Empty string case as chardet is guaranteed to not have a guess.
-                assert self.buffer is not None
-                if len(self.buffer) == 0:
-                    return ""
-                return bytes(self.buffer).decode(self._detector_result())
-
-            return self.decoder.decode(b"", True)
-        except UnicodeDecodeError as exc:  # pragma: nocover
-            raise ValueError(str(exc))
+        Note that UTF-8 is a strict superset of ascii, and Windows 1252 is a
+        superset of the non-control characters in iso-8859-1, so we essentially
+        end up supporting any of ascii, utf-8, iso-8859-1, cp1252.
 
-    def _detector_result(self) -> str:
-        self.detector.close()
-        result = self.detector.result["encoding"]
-        if not result:  # pragma: nocover
-            raise ValueError("Unable to determine encoding of content")
+        Given that UTF-8 is now by *far* the most widely used encoding, this
+        should be a pretty robust strategy for cases where a charset has
+        not been explicitly included.
 
-        return result
+        Useful stats on the prevalence of different charsets in the wild...
+
+        * https://w3techs.com/technologies/overview/character_encoding
+        * https://w3techs.com/technologies/history_overview/character_encoding
+
+        The HTML5 spec also has some useful guidelines, suggesting defaults of
+        either UTF-8 or Windows 1252 in most cases...
+
+        * https://dev.w3.org/html5/spec-LC/Overview.html
+        """
+        if self.decoder is None:
+            # If this is the first decode pass then we need to determine which
+            # encoding to use by attempting UTF-8 and raising any decode errors.
+            attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
+            try:
+                attempt_utf_8.decode(data)
+            except UnicodeDecodeError:
+                # Could not decode as UTF-8. Use Windows 1252.
+                self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
+            else:
+                # Can decode as UTF-8. Use UTF-8 with lenient error settings.
+                self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
+
+        return self.decoder.decode(data)
+
+    def flush(self) -> str:
+        if self.decoder is None:
+            return ""
+        return self.decoder.decode(b"", True)
 
 
 class LineDecoder:
index 9d46752372fd622a01bc27616f9b9483053ff6d6..90f685f7b816b7086b6a5703203bbb142efa77ac 100644 (file)
@@ -10,7 +10,6 @@ from collections.abc import MutableMapping
 from http.cookiejar import Cookie, CookieJar
 from urllib.parse import parse_qsl, quote, unquote, urlencode
 
-import chardet
 import rfc3986
 import rfc3986.exceptions
 
@@ -755,19 +754,22 @@ class Response:
             if not content:
                 self._text = ""
             else:
-                encoding = self.encoding
-                self._text = content.decode(encoding, errors="replace")
+                decoder = TextDecoder(encoding=self.encoding)
+                self._text = "".join([decoder.decode(self.content), decoder.flush()])
         return self._text
 
     @property
-    def encoding(self) -> str:
+    def encoding(self) -> typing.Optional[str]:
+        """
+        Return the encoding, which may have been set explicitly, or may have
+        been specified by the Content-Type header.
+        """
         if not hasattr(self, "_encoding"):
             encoding = self.charset_encoding
             if encoding is None or not is_known_encoding(encoding):
-                encoding = self.apparent_encoding
-                if encoding is None or not is_known_encoding(encoding):
-                    encoding = "utf-8"
-            self._encoding = encoding
+                self._encoding = None
+            else:
+                self._encoding = encoding
         return self._encoding
 
     @encoding.setter
@@ -783,25 +785,11 @@ class Response:
         if content_type is None:
             return None
 
-        parsed = cgi.parse_header(content_type)
-        media_type, params = parsed[0], parsed[-1]
-        if "charset" in params:
-            return params["charset"].strip("'\"")
-
-        # RFC 2616 specifies that 'iso-8859-1' should be used as the default
-        # for 'text/*' media types, if no charset is provided.
-        # See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
-        if media_type.startswith("text/"):
-            return "iso-8859-1"
-
-        return None
+        _, params = cgi.parse_header(content_type)
+        if "charset" not in params:
+            return None
 
-    @property
-    def apparent_encoding(self) -> typing.Optional[str]:
-        """
-        Return the encoding, as it appears to autodetection.
-        """
-        return chardet.detect(self.content)["encoding"]
+        return params["charset"].strip("'\"")
 
     def _get_content_decoder(self) -> ContentDecoder:
         """
@@ -936,7 +924,7 @@ class Response:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             for chunk in self.iter_bytes():
                 yield decoder.decode(chunk)
@@ -1020,7 +1008,7 @@ class Response:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.charset_encoding)
+        decoder = TextDecoder(encoding=self.encoding)
         with self._wrap_decoder_errors():
             async for chunk in self.aiter_bytes():
                 yield decoder.decode(chunk)
index e811d2a8e723e919c0398a40db712e61d4454959..d7006dd805027c0030817bd2f569e4a734fe2430 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,6 @@ setup(
     install_requires=[
         "certifi",
         "sniffio",
-        "chardet==3.*",
         "rfc3986[idna2008]>=1.3,<2",
         "httpcore==0.10.*",
     ],
index d5a8c0e0041e93c25bff93a42ff1218f9bb1d4e3..f56e493f2f5e815ef91dd807bea1090ce1becac3 100644 (file)
@@ -15,7 +15,7 @@ def test_get(server):
     assert response.content == b"Hello, world!"
     assert response.text == "Hello, world!"
     assert response.http_version == "HTTP/1.1"
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.request.url == url
     assert response.headers
     assert response.is_redirect is False
index b52e4846f3675814f46455000d1d78f8e3d5901f..1e033deba1758bcfe8575bdfde0b71b0dcd27a41 100644 (file)
@@ -81,15 +81,15 @@ def test_response_content_type_encoding():
 
 def test_response_autodetect_encoding():
     """
-    Autodetect encoding if there is no charset info in a Content-Type header.
+    Autodetect encoding if there is no Content-Type header.
     """
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None
 
 
 def test_response_fallback_to_autodetect():
@@ -97,20 +97,20 @@ def test_response_fallback_to_autodetect():
     Fallback to autodetection if we get an invalid charset in the Content-Type header.
     """
     headers = {"Content-Type": "text-plain; charset=invalid-codec-name"}
-    content = "おはようございます。".encode("EUC-JP")
+    content = "おはようございます。".encode("utf-8")
     response = httpx.Response(
         200,
         content=content,
         headers=headers,
     )
     assert response.text == "おはようございます。"
-    assert response.encoding == "EUC-JP"
+    assert response.encoding is None
 
 
-def test_response_default_text_encoding():
+def test_response_no_charset_with_ascii_content():
     """
-    A media type of 'text/*' with no charset should default to ISO-8859-1.
-    See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.7.1
+    A response with ascii encoded content should decode correctly,
+    even with no charset specified.
     """
     content = b"Hello, world!"
     headers = {"Content-Type": "text/plain"}
@@ -120,20 +120,56 @@ def test_response_default_text_encoding():
         headers=headers,
     )
     assert response.status_code == 200
-    assert response.encoding == "iso-8859-1"
+    assert response.encoding is None
     assert response.text == "Hello, world!"
 
 
-def test_response_default_encoding():
+def test_response_no_charset_with_utf8_content():
     """
-    Default to utf-8 if all else fails.
+    A response with UTF-8 encoded content should decode correctly,
+    even with no charset specified.
     """
+    content = "Unicode Snowman: ☃".encode("utf-8")
+    headers = {"Content-Type": "text/plain"}
     response = httpx.Response(
         200,
-        content=b"",
+        content=content,
+        headers=headers,
     )
-    assert response.text == ""
-    assert response.encoding == "utf-8"
+    assert response.text == "Unicode Snowman: ☃"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_iso_8859_1_content():
+    """
+    A response with ISO 8859-1 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Accented: Österreich".encode("iso-8859-1")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Accented: Österreich"
+    assert response.encoding is None
+
+
+def test_response_no_charset_with_cp_1252_content():
+    """
+    A response with Windows 1252 encoded content should decode correctly,
+    even with no charset specified.
+    """
+    content = "Euro Currency: €".encode("cp1252")
+    headers = {"Content-Type": "text/plain"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.text == "Euro Currency: €"
+    assert response.encoding is None
 
 
 def test_response_non_text_encoding():
@@ -147,7 +183,7 @@ def test_response_non_text_encoding():
         headers=headers,
     )
     assert response.text == "xyz"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
 
 
 def test_response_set_explicit_encoding():
@@ -184,7 +220,7 @@ def test_read():
 
     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed
 
     content = response.read()
@@ -203,7 +239,7 @@ async def test_aread():
 
     assert response.status_code == 200
     assert response.text == "Hello, world!"
-    assert response.encoding == "ascii"
+    assert response.encoding is None
     assert response.is_closed
 
     content = await response.aread()
index 7dfca9ef50e5151b40b0be0a2e15c61b138db625..351fce0520485b13c8b4c202729aff4bf631f2e8 100644 (file)
@@ -177,16 +177,8 @@ def test_decoding_errors(header_value):
     [
         ((b"Hello,", b" world!"), "ascii"),
         ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 64, "shift-jis"),
-        ((b"\x83g\x83\x89\x83x\x83\x8b",) * 600, "shift-jis"),
-        (
-            (b"\xcb\xee\xf0\xe5\xec \xe8\xef\xf1\xf3\xec \xe4\xee\xeb\xee\xf0",) * 64,
-            "MacCyrillic",
-        ),
-        (
-            (b"\xa5\xa6\xa5\xa7\xa5\xd6\xa4\xce\xb9\xf1\xba\xdd\xb2\xbd",) * 512,
-            "euc-jp",
-        ),
+        ((b"Euro character: \x88!", b""), "cp1252"),
+        ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
     ],
 )
 @pytest.mark.asyncio