From: Yeray Diaz Diaz Date: Thu, 11 Jul 2019 16:14:58 +0000 (+0100) Subject: Encoding detection in Response.json (#116) X-Git-Tag: 0.6.8~18 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5442006a41f94a3e41186910d7a6e8546adf0f89;p=thirdparty%2Fhttpx.git Encoding detection in Response.json (#116) * Use response text on `json` Pass kwargs to the loads call * Add failing test demonstrating corner case * Copy `guess_json_utf` function from requests * "Fix" type hinting and lint * Actually add tests_utils.py --- diff --git a/http3/models.py b/http3/models.py index 7981270e..710c0303 100644 --- a/http3/models.py +++ b/http3/models.py @@ -28,7 +28,12 @@ from .exceptions import ( ) from .multipart import multipart_encode from .status_codes import StatusCode -from .utils import is_known_encoding, normalize_header_key, normalize_header_value +from .utils import ( + guess_json_utf, + is_known_encoding, + normalize_header_key, + normalize_header_value, +) URLTypes = typing.Union["URL", str] @@ -808,8 +813,15 @@ class BaseResponse: if message: raise HttpError(message) - def json(self) -> typing.Any: - return jsonlib.loads(self.content.decode("utf-8")) + def json(self, **kwargs: typing.Any) -> typing.Union[dict, list]: + if self.charset_encoding is None and self.content and len(self.content) > 3: + encoding = guess_json_utf(self.content) + if encoding is not None: + try: + return jsonlib.loads(self.content.decode(encoding), **kwargs) + except UnicodeDecodeError: + pass + return jsonlib.loads(self.text, **kwargs) @property def cookies(self) -> "Cookies": diff --git a/http3/utils.py b/http3/utils.py index 7d61aaf6..3d0d6607 100644 --- a/http3/utils.py +++ b/http3/utils.py @@ -29,3 +29,38 @@ def is_known_encoding(encoding: str) -> bool: except LookupError: return False return True + + +# Null bytes; no need to recreate these on each call to guess_json_utf +_null = "\x00".encode("ascii") # encoding to ASCII for Python 3 +_null2 = _null * 2 +_null3 = _null * 3 + + +def guess_json_utf(data: bytes) -> typing.Optional[str]: + # JSON always starts with two ASCII characters, so detection is as + # easy as counting the nulls and from their location and count + # determine the encoding. Also detect a BOM, if present. + sample = data[:4] + if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE): + return "utf-32" # BOM included + if sample[:3] == codecs.BOM_UTF8: + return "utf-8-sig" # BOM included, MS style (discouraged) + if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): + return "utf-16" # BOM included + nullcount = sample.count(_null) + if nullcount == 0: + return "utf-8" + if nullcount == 2: + if sample[::2] == _null2: # 1st and 3rd are null + return "utf-16-be" + if sample[1::2] == _null2: # 2nd and 4th are null + return "utf-16-le" + # Did not detect 2 valid UTF-16 ascii-range characters + if nullcount == 3: + if sample[:3] == _null3: + return "utf-32-be" + if sample[1:] == _null3: + return "utf-32-le" + # Did not detect a valid UTF-32 ascii-range character + return None diff --git a/tests/models/test_responses.py b/tests/models/test_responses.py index 31aa2310..dbdcacab 100644 --- a/tests/models/test_responses.py +++ b/tests/models/test_responses.py @@ -1,3 +1,6 @@ +import json +from unittest import mock + import pytest import http3 @@ -250,3 +253,38 @@ def test_unknown_status_code(): assert response.status_code == 600 assert response.reason_phrase == "" assert response.text == "" + + +def test_json_with_specified_encoding(): + data = dict(greeting="hello", recipient="world") + content = json.dumps(data).encode("utf-16") + headers = {"Content-Type": "application/json, charset=utf-16"} + response = http3.Response(200, content=content, headers=headers) + assert response.json() == data + + +def test_json_with_options(): + data = dict(greeting="hello", recipient="world", amount=1) + content = json.dumps(data).encode("utf-16") + headers = {"Content-Type": "application/json, charset=utf-16"} + response = http3.Response(200, content=content, headers=headers) + assert response.json(parse_int=str)["amount"] == "1" + + +def test_json_without_specified_encoding(): + data = dict(greeting="hello", recipient="world") + content = json.dumps(data).encode("utf-32-be") + headers = {"Content-Type": "application/json"} + response = http3.Response(200, content=content, headers=headers) + assert response.json() == data + + +def test_json_without_specified_encoding_decode_error(): + data = dict(greeting="hello", recipient="world") + content = json.dumps(data).encode("utf-32-be") + headers = {"Content-Type": "application/json"} + # force incorrect guess from `guess_json_utf` to trigger error + with mock.patch("http3.models.guess_json_utf", return_value="utf-32"): + response = http3.Response(200, content=content, headers=headers) + with pytest.raises(json.JSONDecodeError): + response.json() diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..a27b9067 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,39 @@ +import pytest + +from http3.utils import guess_json_utf + + +@pytest.mark.parametrize( + "encoding", + ( + "utf-32", + "utf-8-sig", + "utf-16", + "utf-8", + "utf-16-be", + "utf-16-le", + "utf-32-be", + "utf-32-le", + ), +) +def test_encoded(encoding): + data = "{}".encode(encoding) + assert guess_json_utf(data) == encoding + + +def test_bad_utf_like_encoding(): + assert guess_json_utf(b"\x00\x00\x00\x00") is None + + +@pytest.mark.parametrize( + ("encoding", "expected"), + ( + ("utf-16-be", "utf-16"), + ("utf-16-le", "utf-16"), + ("utf-32-be", "utf-32"), + ("utf-32-le", "utf-32"), + ), +) +def test_guess_by_bom(encoding, expected): + data = u"\ufeff{}".encode(encoding) + assert guess_json_utf(data) == expected