Add `charset_normalizer` detection. (#1791)
author     Tom Christie <tom@tomchristie.com>
           Fri, 13 Aug 2021 10:38:53 +0000 (11:38 +0100)
committer  GitHub <noreply@github.com>
           Fri, 13 Aug 2021 10:38:53 +0000 (11:38 +0100)
* Add charset_normalizer detection

* Tweak JSON tests for slightly different charset decoding behaviour

* Add charset-normalizer to docs

README.md
docs/index.md
httpx/_decoders.py
httpx/_models.py
setup.py
tests/models/test_responses.py
tests/test_decoders.py

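In caller terms, the change looks roughly like this (an illustrative sketch, not part of the commit; the exact detected encoding can vary between charset_normalizer versions):

    import httpx

    # No charset in the Content-Type header, and a body that is not valid
    # utf-8: the encoding is now detected from the bytes by charset_normalizer
    # rather than left as None.
    response = httpx.Response(
        200,
        content="Accented: Österreich abcdefghijklmnopqrstuvwxyz".encode("iso-8859-1"),
        headers={"Content-Type": "text/plain"},
    )
    assert response.charset_encoding is None       # nothing declared
    assert response.apparent_encoding is not None  # detector made a guess
    assert "Österreich" in response.text
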
index 2183ef08aacaa22da1a36434512c9e8d190f9035..54c496faee69137c8cd6a001201b18c61426cc9c 100644
--- a/README.md
+++ b/README.md
@@ -119,6 +119,7 @@ The HTTPX project relies on these excellent libraries:
   * `h11` - HTTP/1.1 support.
   * `h2` - HTTP/2 support. *(Optional)*
 * `certifi` - SSL certificates.
+* `charset_normalizer` - Charset auto-detection.
 * `rfc3986` - URL parsing & normalization.
   * `idna` - Internationalized domain name support.
 * `sniffio` - Async library autodetection.
index a550f7859df47f41faf8612ab8e266e00d57301c..3a00ca8cbd08fe6288ced3a5c1131530875889bb 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -111,6 +111,7 @@ The HTTPX project relies on these excellent libraries:
   * `h11` - HTTP/1.1 support.
   * `h2` - HTTP/2 support. *(Optional)*
 * `certifi` - SSL certificates.
+* `charset_normalizer` - Charset auto-detection.
 * `rfc3986` - URL parsing & normalization.
   * `idna` - Internationalized domain name support.
 * `sniffio` - Async library autodetection.
index 2230b77a9f1ff90d1e8a974e9412f56920275ad0..50e6798e9e13f3a44ef8b3a381006a00f3b798b4 100644
--- a/httpx/_decoders.py
+++ b/httpx/_decoders.py
@@ -241,52 +241,13 @@ class TextDecoder:
     Handles incrementally decoding bytes into text
     """
 
-    def __init__(self, encoding: typing.Optional[str] = None):
-        self.decoder: typing.Optional[codecs.IncrementalDecoder] = None
-        if encoding is not None:
-            self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")
+    def __init__(self, encoding: str = "utf-8"):
+        self.decoder = codecs.getincrementaldecoder(encoding)(errors="replace")
 
     def decode(self, data: bytes) -> str:
-        """
-        If an encoding is explicitly specified, then we use that.
-        Otherwise our strategy is to attempt UTF-8, and fallback to Windows 1252.
-
-        Note that UTF-8 is a strict superset of ascii, and Windows 1252 is a
-        superset of the non-control characters in iso-8859-1, so we essentially
-        end up supporting any of ascii, utf-8, iso-8859-1, cp1252.
-
-        Given that UTF-8 is now by *far* the most widely used encoding, this
-        should be a pretty robust strategy for cases where a charset has
-        not been explicitly included.
-
-        Useful stats on the prevalence of different charsets in the wild...
-
-        * https://w3techs.com/technologies/overview/character_encoding
-        * https://w3techs.com/technologies/history_overview/character_encoding
-
-        The HTML5 spec also has some useful guidelines, suggesting defaults of
-        either UTF-8 or Windows 1252 in most cases...
-
-        * https://dev.w3.org/html5/spec-LC/Overview.html
-        """
-        if self.decoder is None:
-            # If this is the first decode pass then we need to determine which
-            # encoding to use by attempting UTF-8 and raising any decode errors.
-            attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
-            try:
-                attempt_utf_8.decode(data)
-            except UnicodeDecodeError:
-                # Could not decode as UTF-8. Use Windows 1252.
-                self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
-            else:
-                # Can decode as UTF-8. Use UTF-8 with lenient error settings.
-                self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
-
         return self.decoder.decode(data)
 
     def flush(self) -> str:
-        if self.decoder is None:
-            return ""
         return self.decoder.decode(b"", True)
 
 
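The simplified `TextDecoder` above is now a thin wrapper over a `codecs` incremental decoder with lenient error handling. In isolation, the underlying stdlib behaviour looks like this:

    import codecs

    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")

    # Multi-byte sequences split across chunks are buffered until complete...
    assert decoder.decode(b"\xe3\x83") == ""
    assert decoder.decode(b"\x88") == "\u30c8"  # KATAKANA LETTER TO
    # ...and invalid bytes become U+FFFD instead of raising.
    assert decoder.decode(b"\xff") == "\ufffd"
    # Passing final=True flushes, matching TextDecoder.flush().
    assert decoder.decode(b"", True) == ""
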
index e2e577b3f53df73e68657b40c6944c29b2057c10..c86d37d93d0ab24802302fb5593b99d319e4f6e5 100644
--- a/httpx/_models.py
+++ b/httpx/_models.py
@@ -8,6 +8,7 @@ from collections.abc import MutableMapping
 from http.cookiejar import Cookie, CookieJar
 from urllib.parse import parse_qs, quote, unquote, urlencode
 
+import charset_normalizer
 import idna
 import rfc3986
 import rfc3986.exceptions
@@ -1314,22 +1315,26 @@ class Response:
             if not content:
                 self._text = ""
             else:
-                decoder = TextDecoder(encoding=self.encoding)
+                decoder = TextDecoder(encoding=self.encoding or "utf-8")
                 self._text = "".join([decoder.decode(self.content), decoder.flush()])
         return self._text
 
     @property
     def encoding(self) -> typing.Optional[str]:
         """
-        Return the encoding, which may have been set explicitly, or may have
-        been specified by the Content-Type header.
+        Return an encoding to use for decoding the byte content into text.
+        The priority for determining this is given by...
+
+        * `.encoding = <>` has been set explicitly.
+        * The encoding as specified by the charset parameter in the Content-Type header.
+        * The encoding as determined by `charset_normalizer`.
+        * UTF-8.
         """
         if not hasattr(self, "_encoding"):
             encoding = self.charset_encoding
             if encoding is None or not is_known_encoding(encoding):
-                self._encoding = None
-            else:
-                self._encoding = encoding
+                encoding = self.apparent_encoding
+            self._encoding = encoding
         return self._encoding
 
     @encoding.setter
@@ -1351,6 +1356,19 @@ class Response:
 
         return params["charset"].strip("'\"")
 
+    @property
+    def apparent_encoding(self) -> typing.Optional[str]:
+        """
+        Return the encoding, as determined by `charset_normalizer`.
+        """
+        content = getattr(self, "_content", b"")
+        if len(content) < 32:
+            # charset_normalizer will issue warnings if we run it with
+            # fewer bytes than this cutoff.
+            return None
+        match = charset_normalizer.from_bytes(self.content).best()
+        return None if match is None else match.encoding
+
     def _get_content_decoder(self) -> ContentDecoder:
         """
         Returns a decoder instance which can be used to decode the raw byte
@@ -1411,10 +1429,7 @@ class Response:
         if self.charset_encoding is None and self.content and len(self.content) > 3:
             encoding = guess_json_utf(self.content)
             if encoding is not None:
-                try:
-                    return jsonlib.loads(self.content.decode(encoding), **kwargs)
-                except UnicodeDecodeError:
-                    pass
+                return jsonlib.loads(self.content.decode(encoding), **kwargs)
         return jsonlib.loads(self.text, **kwargs)
 
     @property
@@ -1495,7 +1510,7 @@ class Response:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.encoding)
+        decoder = TextDecoder(encoding=self.encoding or "utf-8")
         chunker = TextChunker(chunk_size=chunk_size)
         with request_context(request=self._request):
             for byte_content in self.iter_bytes():
@@ -1593,7 +1608,7 @@ class Response:
         that handles both gzip, deflate, etc but also detects the content's
         string encoding.
         """
-        decoder = TextDecoder(encoding=self.encoding)
+        decoder = TextDecoder(encoding=self.encoding or "utf-8")
         chunker = TextChunker(chunk_size=chunk_size)
         with request_context(request=self._request):
             async for byte_content in self.aiter_bytes():
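
Taken together, the `_models.py` changes resolve `response.encoding` in this order: explicit assignment, the Content-Type charset, the `charset_normalizer` guess, then utf-8 at decode time. A brief sketch (the short second body is a deliberate assumption, chosen to fall under the 32-byte detection cutoff):

    import httpx

    # A declared, known charset always wins.
    r = httpx.Response(
        200,
        content=b"Hello, world!",
        headers={"Content-Type": "text/plain; charset=latin-1"},
    )
    assert r.encoding == "latin-1"

    # Bodies under 32 bytes skip detection, so there is no apparent
    # encoding and decoding falls back to utf-8.
    r = httpx.Response(200, content=b"tiny", headers={"Content-Type": "text/plain"})
    assert r.apparent_encoding is None
    assert r.text == "tiny"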
index 8854039e21b935dbafb1203a2fe0e7ef4a71f758..aeee7c6c18d599faedf7b0568f3b7e2635bfd6b4 100644
--- a/setup.py
+++ b/setup.py
@@ -57,6 +57,7 @@ setup(
     zip_safe=False,
     install_requires=[
         "certifi",
+        "charset_normalizer",
         "sniffio",
         "rfc3986[idna2008]>=1.3,<2",
         "httpcore>=0.13.3,<0.14.0",
index b7c2d57cd3d8c341ef69366ef696b4f146faf3b3..b9cc077320f3aa2c9dd3b10dd8b9687aa526e047 100644
--- a/tests/models/test_responses.py
+++ b/tests/models/test_responses.py
@@ -1,6 +1,5 @@
 import json
 import pickle
-from unittest import mock
 
 import brotlicffi
 import pytest
@@ -197,15 +196,16 @@ def test_response_no_charset_with_iso_8859_1_content():
     A response with ISO 8859-1 encoded content should decode correctly,
     even with no charset specified.
     """
-    content = "Accented: Österreich".encode("iso-8859-1")
+    content = "Accented: Österreich abcdefghijklmnopqrstuzwxyz".encode("iso-8859-1")
     headers = {"Content-Type": "text/plain"}
     response = httpx.Response(
         200,
         content=content,
         headers=headers,
     )
-    assert response.text == "Accented: Österreich"
-    assert response.encoding is None
+    assert response.text == "Accented: Österreich abcdefghijklmnopqrstuzwxyz"
+    assert response.charset_encoding is None
+    assert response.apparent_encoding is not None
 
 
 def test_response_no_charset_with_cp_1252_content():
@@ -213,15 +213,16 @@ def test_response_no_charset_with_cp_1252_content():
     A response with Windows 1252 encoded content should decode correctly,
     even with no charset specified.
     """
-    content = "Euro Currency: €".encode("cp1252")
+    content = "Euro Currency: € abcdefghijklmnopqrstuzwxyz".encode("cp1252")
     headers = {"Content-Type": "text/plain"}
     response = httpx.Response(
         200,
         content=content,
         headers=headers,
     )
-    assert response.text == "Euro Currency: €"
-    assert response.encoding is None
+    assert response.text == "Euro Currency: € abcdefghijklmnopqrstuzwxyz"
+    assert response.charset_encoding is None
+    assert response.apparent_encoding is not None
 
 
 def test_response_non_text_encoding():
@@ -718,9 +719,22 @@ def test_json_with_options():
     assert response.json(parse_int=str)["amount"] == "1"
 
 
-def test_json_without_specified_encoding():
+@pytest.mark.parametrize(
+    "encoding",
+    [
+        "utf-8",
+        "utf-8-sig",
+        "utf-16",
+        "utf-16-be",
+        "utf-16-le",
+        "utf-32",
+        "utf-32-be",
+        "utf-32-le",
+    ],
+)
+def test_json_without_specified_charset(encoding):
     data = {"greeting": "hello", "recipient": "world"}
-    content = json.dumps(data).encode("utf-32-be")
+    content = json.dumps(data).encode(encoding)
     headers = {"Content-Type": "application/json"}
     response = httpx.Response(
         200,
@@ -730,30 +744,29 @@ def test_json_without_specified_encoding():
     assert response.json() == data
 
 
-def test_json_without_specified_encoding_decode_error():
-    data = {"greeting": "hello", "recipient": "world"}
-    content = json.dumps(data).encode("utf-32-be")
-    headers = {"Content-Type": "application/json"}
-    # force incorrect guess from `guess_json_utf` to trigger error
-    with mock.patch("httpx._models.guess_json_utf", return_value="utf-32-le"):
-        response = httpx.Response(
-            200,
-            content=content,
-            headers=headers,
-        )
-        with pytest.raises(json.decoder.JSONDecodeError):
-            response.json()
-
-
-def test_json_without_specified_encoding_value_error():
+@pytest.mark.parametrize(
+    "encoding",
+    [
+        "utf-8",
+        "utf-8-sig",
+        "utf-16",
+        "utf-16-be",
+        "utf-16-le",
+        "utf-32",
+        "utf-32-be",
+        "utf-32-le",
+    ],
+)
+def test_json_with_specified_charset(encoding):
     data = {"greeting": "hello", "recipient": "world"}
-    content = json.dumps(data).encode("utf-32-be")
-    headers = {"Content-Type": "application/json"}
-    # force incorrect guess from `guess_json_utf` to trigger error
-    with mock.patch("httpx._models.guess_json_utf", return_value="utf-32-le"):
-        response = httpx.Response(200, content=content, headers=headers)
-        with pytest.raises(json.decoder.JSONDecodeError):
-            response.json()
+    content = json.dumps(data).encode(encoding)
+    headers = {"Content-Type": f"application/json; charset={encoding}"}
+    response = httpx.Response(
+        200,
+        content=content,
+        headers=headers,
+    )
+    assert response.json() == data
 
 
 @pytest.mark.parametrize(
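
For reference, the `guess_json_utf` helper these tests exercise needs no statistical detection at all: JSON must begin with an ASCII character, so the null-byte pattern of the first four bytes identifies the UTF flavour. A rough sketch of the idea (not httpx's exact implementation):

    import codecs

    def sketch_guess_json_utf(data: bytes):
        sample = data[:4]
        if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            return "utf-32"     # BOM present
        if sample.startswith(codecs.BOM_UTF8):
            return "utf-8-sig"  # BOM present
        if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
            return "utf-16"     # BOM present
        null_count = sample.count(b"\x00")
        if null_count == 0:
            return "utf-8"
        if null_count == 2:
            # XX 00 XX 00 is utf-16-le; 00 XX 00 XX is utf-16-be.
            return "utf-16-le" if sample[1::2] == b"\x00\x00" else "utf-16-be"
        if null_count == 3:
            # 00 00 00 XX is utf-32-be; XX 00 00 00 is utf-32-le.
            return "utf-32-be" if sample.startswith(b"\x00") else "utf-32-le"
        return None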
index f681a57a7ffb67ed814713af5dff347d582423ee..4f182efdcc2697ecff2b50b6e2a217737e1b52ba 100644
--- a/tests/test_decoders.py
+++ b/tests/test_decoders.py
@@ -179,8 +179,8 @@ def test_decoding_errors(header_value):
     [
         ((b"Hello,", b" world!"), "ascii"),
         ((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
-        ((b"Euro character: \x88!", b""), "cp1252"),
-        ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
+        ((b"Euro character: \x88! abcdefghijklmnopqrstuvwxyz", b""), "cp1252"),
+        ((b"Accented: \xd6sterreich abcdefghijklmnopqrstuvwxyz", b""), "iso-8859-1"),
     ],
 )
 @pytest.mark.asyncio
@@ -199,10 +199,9 @@ async def test_text_decoder(data, encoding):
     assert response.text == (b"".join(data)).decode(encoding)
 
     # Streaming `.aiter_text` iteratively.
-    response = httpx.Response(
-        200,
-        content=iterator(),
-    )
+    # Note that if we streamed the text *without* having read it first, then
+    # we won't get a `charset_normalizer` guess, and will instead always rely
+    # on utf-8 if no charset is specified.
     text = "".join([part async for part in response.aiter_text()])
     assert text == (b"".join(data)).decode(encoding)
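
To make that note concrete: a response consumed only as a stream never exposes full content for charset_normalizer to inspect, so with no declared charset the text falls back to utf-8 with replacement characters. A small sync illustration (the cp1252 byte is a hypothetical example, not from the commit):

    import httpx

    def chunks():
        yield "Euro: \u20ac".encode("cp1252")  # b"Euro: \x80", not valid utf-8

    response = httpx.Response(200, content=chunks())
    # No prior .read() and no charset header: streamed text is decoded
    # as utf-8, so the cp1252 euro byte comes out as U+FFFD.
    assert "".join(response.iter_text()) == "Euro: \ufffd"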