Encoding detection in Response.json (#116)

author Yeray Diaz Diaz <yeraydiazdiaz@gmail.com>

Thu, 11 Jul 2019 16:14:58 +0000 (17:14 +0100)

committer Tom Christie <tom@tomchristie.com>

Thu, 11 Jul 2019 16:14:58 +0000 (17:14 +0100)
author Yeray Diaz Diaz <yeraydiazdiaz@gmail.com>
Thu, 11 Jul 2019 16:14:58 +0000 (17:14 +0100)
committer Tom Christie <tom@tomchristie.com>
Thu, 11 Jul 2019 16:14:58 +0000 (17:14 +0100)
diff --git a/http3/models.py b/http3/models.py

index 7981270eb97e37a09747c6e0978364e95e8b397b..710c0303df4eb8d55b23a2553ecd4db34862261e 100644 (file)
--- a/http3/models.py
+++ b/http3/models.py
@@ -28,7 +28,12 @@ from .exceptions import (
  )
  from .multipart import multipart_encode
  from .status_codes import StatusCode
-from .utils import is_known_encoding, normalize_header_key, normalize_header_value
+from .utils import (
+    guess_json_utf,
+    is_known_encoding,
+    normalize_header_key,
+    normalize_header_value,
+)
  
  URLTypes = typing.Union["URL", str]
  
@@ -808,8 +813,15 @@ class BaseResponse:
          if message:
              raise HttpError(message)
  
-    def json(self) -> typing.Any:
-        return jsonlib.loads(self.content.decode("utf-8"))
+    def json(self, **kwargs: typing.Any) -> typing.Union[dict, list]:
+        if self.charset_encoding is None and self.content and len(self.content) > 3:
+            encoding = guess_json_utf(self.content)
+            if encoding is not None:
+                try:
+                    return jsonlib.loads(self.content.decode(encoding), **kwargs)
+                except UnicodeDecodeError:
+                    pass
+        return jsonlib.loads(self.text, **kwargs)
  
      @property
      def cookies(self) -> "Cookies":
diff --git a/http3/utils.py b/http3/utils.py

index 7d61aaf615881763647677a9209c6b1933569e48..3d0d6607d6645b50a37b03ca47c6c575e56426b0 100644 (file)
--- a/http3/utils.py
+++ b/http3/utils.py
@@ -29,3 +29,38 @@ def is_known_encoding(encoding: str) -> bool:
      except LookupError:
          return False
      return True
+
+
+# Null bytes; no need to recreate these on each call to guess_json_utf
+_null = "\x00".encode("ascii")  # encoding to ASCII for Python 3
+_null2 = _null * 2
+_null3 = _null * 3
+
+
+def guess_json_utf(data: bytes) -> typing.Optional[str]:
+    # JSON always starts with two ASCII characters, so detection is as
+    # easy as counting the nulls and from their location and count
+    # determine the encoding. Also detect a BOM, if present.
+    sample = data[:4]
+    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
+        return "utf-32"  # BOM included
+    if sample[:3] == codecs.BOM_UTF8:
+        return "utf-8-sig"  # BOM included, MS style (discouraged)
+    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+        return "utf-16"  # BOM included
+    nullcount = sample.count(_null)
+    if nullcount == 0:
+        return "utf-8"
+    if nullcount == 2:
+        if sample[::2] == _null2:  # 1st and 3rd are null
+            return "utf-16-be"
+        if sample[1::2] == _null2:  # 2nd and 4th are null
+            return "utf-16-le"
+        # Did not detect 2 valid UTF-16 ascii-range characters
+    if nullcount == 3:
+        if sample[:3] == _null3:
+            return "utf-32-be"
+        if sample[1:] == _null3:
+            return "utf-32-le"
+        # Did not detect a valid UTF-32 ascii-range character
+    return None
diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py

index 31aa2310a0907ad7e5ff36a266921ea0e7cac71e..dbdcacab4bbfb7c75c2825b35891ceeae71b548c 100644 (file)
--- a/tests/models/test_responses.py
+++ b/tests/models/test_responses.py
@@ -1,3 +1,6 @@
+import json
+from unittest import mock
+
  import pytest
  
  import http3
@@ -250,3 +253,38 @@ def test_unknown_status_code():
      assert response.status_code == 600
      assert response.reason_phrase == ""
      assert response.text == ""
+
+
+def test_json_with_specified_encoding():
+    data = dict(greeting="hello", recipient="world")
+    content = json.dumps(data).encode("utf-16")
+    headers = {"Content-Type": "application/json, charset=utf-16"}
+    response = http3.Response(200, content=content, headers=headers)
+    assert response.json() == data
+
+
+def test_json_with_options():
+    data = dict(greeting="hello", recipient="world", amount=1)
+    content = json.dumps(data).encode("utf-16")
+    headers = {"Content-Type": "application/json, charset=utf-16"}
+    response = http3.Response(200, content=content, headers=headers)
+    assert response.json(parse_int=str)["amount"] == "1"
+
+
+def test_json_without_specified_encoding():
+    data = dict(greeting="hello", recipient="world")
+    content = json.dumps(data).encode("utf-32-be")
+    headers = {"Content-Type": "application/json"}
+    response = http3.Response(200, content=content, headers=headers)
+    assert response.json() == data
+
+
+def test_json_without_specified_encoding_decode_error():
+    data = dict(greeting="hello", recipient="world")
+    content = json.dumps(data).encode("utf-32-be")
+    headers = {"Content-Type": "application/json"}
+    # force incorrect guess from `guess_json_utf` to trigger error
+    with mock.patch("http3.models.guess_json_utf", return_value="utf-32"):
+        response = http3.Response(200, content=content, headers=headers)
+        with pytest.raises(json.JSONDecodeError):
+            response.json()
diff --git a/tests/test_utils.py b/tests/test_utils.py

new file mode 100644 (file)

index 0000000..a27b906
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,39 @@
+import pytest
+
+from http3.utils import guess_json_utf
+
+
+@pytest.mark.parametrize(
+    "encoding",
+    (
+        "utf-32",
+        "utf-8-sig",
+        "utf-16",
+        "utf-8",
+        "utf-16-be",
+        "utf-16-le",
+        "utf-32-be",
+        "utf-32-le",
+    ),
+)
+def test_encoded(encoding):
+    data = "{}".encode(encoding)
+    assert guess_json_utf(data) == encoding
+
+
+def test_bad_utf_like_encoding():
+    assert guess_json_utf(b"\x00\x00\x00\x00") is None
+
+
+@pytest.mark.parametrize(
+    ("encoding", "expected"),
+    (
+        ("utf-16-be", "utf-16"),
+        ("utf-16-le", "utf-16"),
+        ("utf-32-be", "utf-32"),
+        ("utf-32-le", "utf-32"),
+    ),
+)
+def test_guess_by_bom(encoding, expected):
+    data = u"\ufeff{}".encode(encoding)
+    assert guess_json_utf(data) == expected
author	Yeray Diaz Diaz <yeraydiazdiaz@gmail.com>
	Thu, 11 Jul 2019 16:14:58 +0000 (17:14 +0100)
committer	Tom Christie <tom@tomchristie.com>
	Thu, 11 Jul 2019 16:14:58 +0000 (17:14 +0100)
http3/models.py		patch | blob | blame | history
http3/utils.py		patch | blob | blame | history
tests/models/test_responses.py		patch | blob | blame | history
tests/test_utils.py	[new file with mode: 0644]	patch | blob