git.ipfire.org Git - thirdparty/httpx.git/commitdiff
Encoding detection in Response.json (#116)
author Yeray Diaz Diaz <yeraydiazdiaz@gmail.com>
Thu, 11 Jul 2019 16:14:58 +0000 (17:14 +0100)
committer Tom Christie <tom@tomchristie.com>
Thu, 11 Jul 2019 16:14:58 +0000 (17:14 +0100)
* Use response text on `json`

Pass kwargs to the loads call

* Add failing test demonstrating corner case

* Copy `guess_json_utf` function from requests

* "Fix" type hinting and lint

* Actually add tests/test_utils.py

http3/models.py
http3/utils.py
tests/models/test_responses.py
tests/test_utils.py [new file with mode: 0644]

diff --git a/http3/models.py b/http3/models.py
index 7981270eb97e37a09747c6e0978364e95e8b397b..710c0303df4eb8d55b23a2553ecd4db34862261e 100644 (file)
--- a/http3/models.py
+++ b/http3/models.py
@@ -28,7 +28,12 @@ from .exceptions import (
 )
 from .multipart import multipart_encode
 from .status_codes import StatusCode
-from .utils import is_known_encoding, normalize_header_key, normalize_header_value
+from .utils import (
+    guess_json_utf,
+    is_known_encoding,
+    normalize_header_key,
+    normalize_header_value,
+)
 
 URLTypes = typing.Union["URL", str]
 
@@ -808,8 +813,15 @@ class BaseResponse:
         if message:
             raise HttpError(message)
 
-    def json(self) -> typing.Any:
-        return jsonlib.loads(self.content.decode("utf-8"))
+    def json(self, **kwargs: typing.Any) -> typing.Union[dict, list]:
+        if self.charset_encoding is None and self.content and len(self.content) > 3:
+            encoding = guess_json_utf(self.content)
+            if encoding is not None:
+                try:
+                    return jsonlib.loads(self.content.decode(encoding), **kwargs)
+                except UnicodeDecodeError:
+                    pass
+        return jsonlib.loads(self.text, **kwargs)
 
     @property
     def cookies(self) -> "Cookies":
diff --git a/http3/utils.py b/http3/utils.py
index 7d61aaf615881763647677a9209c6b1933569e48..3d0d6607d6645b50a37b03ca47c6c575e56426b0 100644 (file)
--- a/http3/utils.py
+++ b/http3/utils.py
@@ -29,3 +29,38 @@ def is_known_encoding(encoding: str) -> bool:
     except LookupError:
         return False
     return True
+
+
+# Null bytes; no need to recreate these on each call to guess_json_utf
+_null = "\x00".encode("ascii")  # encoding to ASCII for Python 3
+_null2 = _null * 2
+_null3 = _null * 3
+
+
+def guess_json_utf(data: bytes) -> typing.Optional[str]:
+    # JSON always starts with two ASCII characters, so detection is as
+    # easy as counting the nulls and from their location and count
+    # determine the encoding. Also detect a BOM, if present.
+    sample = data[:4]
+    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
+        return "utf-32"  # BOM included
+    if sample[:3] == codecs.BOM_UTF8:
+        return "utf-8-sig"  # BOM included, MS style (discouraged)
+    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+        return "utf-16"  # BOM included
+    nullcount = sample.count(_null)
+    if nullcount == 0:
+        return "utf-8"
+    if nullcount == 2:
+        if sample[::2] == _null2:  # 1st and 3rd are null
+            return "utf-16-be"
+        if sample[1::2] == _null2:  # 2nd and 4th are null
+            return "utf-16-le"
+        # Did not detect 2 valid UTF-16 ascii-range characters
+    if nullcount == 3:
+        if sample[:3] == _null3:
+            return "utf-32-be"
+        if sample[1:] == _null3:
+            return "utf-32-le"
+        # Did not detect a valid UTF-32 ascii-range character
+    return None
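diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py
index 31aa2310a0907ad7e5ff36a266921ea0e7cac71e..dbdcacab4bbfb7c75c2825b35891ceeae71b548c 100644 (file)
--- a/tests/models/test_responses.py
+++ b/tests/models/test_responses.py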
@@ -1,3 +1,6 @@
+import json
+from unittest import mock
+
 import pytest
 
 import http3
@@ -250,3 +253,38 @@ def test_unknown_status_code():
     assert response.status_code == 600
     assert response.reason_phrase == ""
     assert response.text == ""
+
+
+def test_json_with_specified_encoding():
+    data = dict(greeting="hello", recipient="world")
+    content = json.dumps(data).encode("utf-16")
+    headers = {"Content-Type": "application/json, charset=utf-16"}
+    response = http3.Response(200, content=content, headers=headers)
+    assert response.json() == data
+
+
+def test_json_with_options():
+    data = dict(greeting="hello", recipient="world", amount=1)
+    content = json.dumps(data).encode("utf-16")
+    headers = {"Content-Type": "application/json, charset=utf-16"}
+    response = http3.Response(200, content=content, headers=headers)
+    assert response.json(parse_int=str)["amount"] == "1"
+
+
+def test_json_without_specified_encoding():
+    data = dict(greeting="hello", recipient="world")
+    content = json.dumps(data).encode("utf-32-be")
+    headers = {"Content-Type": "application/json"}
+    response = http3.Response(200, content=content, headers=headers)
+    assert response.json() == data
+
+
+def test_json_without_specified_encoding_decode_error():
+    data = dict(greeting="hello", recipient="world")
+    content = json.dumps(data).encode("utf-32-be")
+    headers = {"Content-Type": "application/json"}
+    # force incorrect guess from `guess_json_utf` to trigger error
+    with mock.patch("http3.models.guess_json_utf", return_value="utf-32"):
+        response = http3.Response(200, content=content, headers=headers)
+        with pytest.raises(json.JSONDecodeError):
+            response.json()
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644 (file)
index 0000000..a27b906
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,39 @@
+import pytest
+
+from http3.utils import guess_json_utf
+
+
+@pytest.mark.parametrize(
+    "encoding",
+    (
+        "utf-32",
+        "utf-8-sig",
+        "utf-16",
+        "utf-8",
+        "utf-16-be",
+        "utf-16-le",
+        "utf-32-be",
+        "utf-32-le",
+    ),
+)
+def test_encoded(encoding):
+    data = "{}".encode(encoding)
+    assert guess_json_utf(data) == encoding
+
+
+def test_bad_utf_like_encoding():
+    assert guess_json_utf(b"\x00\x00\x00\x00") is None
+
+
+@pytest.mark.parametrize(
+    ("encoding", "expected"),
+    (
+        ("utf-16-be", "utf-16"),
+        ("utf-16-le", "utf-16"),
+        ("utf-32-be", "utf-32"),
+        ("utf-32-le", "utf-32"),
+    ),
+)
+def test_guess_by_bom(encoding, expected):
+    data = u"\ufeff{}".encode(encoding)
+    assert guess_json_utf(data) == expected