* `h11` - HTTP/1.1 support.
* `h2` - HTTP/2 support. *(Optional)*
* `certifi` - SSL certificates.
+* `charset_normalizer` - Charset auto-detection.
* `rfc3986` - URL parsing & normalization.
* `idna` - Internationalized domain name support.
* `sniffio` - Async library autodetection.
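
For context on the new dependency, `charset_normalizer` exposes a small detection API; a minimal standalone sketch of the `from_bytes(...).best()` call relied on throughout this diff (not part of the change itself):

```python
import charset_normalizer

# Detect the charset of a byte payload; `best()` returns None if nothing matched.
data = "Accented: Österreich abcdefghijklmnopqrstuvwxyz".encode("iso-8859-1")
match = charset_normalizer.from_bytes(data).best()
if match is not None:
    print(match.encoding)  # e.g. 'cp1252' (a superset of this iso-8859-1 text)
    print(str(match))      # the decoded text
```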
Handles incrementally decoding bytes into text.
"""
- def __init__(self, encoding: typing.Optional[str] = None):
- self.decoder: typing.Optional[codecs.IncrementalDecoder] = None
- if encoding is not None:
- self.decoder = codecs.getincrementaldecoder(encoding)(errors="strict")
+ def __init__(self, encoding: str = "utf-8"):
+ self.decoder = codecs.getincrementaldecoder(encoding)(errors="replace")
def decode(self, data: bytes) -> str:
- """
- If an encoding is explicitly specified, then we use that.
- Otherwise our strategy is to attempt UTF-8, and fallback to Windows 1252.
-
- Note that UTF-8 is a strict superset of ascii, and Windows 1252 is a
- superset of the non-control characters in iso-8859-1, so we essentially
- end up supporting any of ascii, utf-8, iso-8859-1, cp1252.
-
- Given that UTF-8 is now by *far* the most widely used encoding, this
- should be a pretty robust strategy for cases where a charset has
- not been explicitly included.
-
- Useful stats on the prevalence of different charsets in the wild...
-
- * https://w3techs.com/technologies/overview/character_encoding
- * https://w3techs.com/technologies/history_overview/character_encoding
-
- The HTML5 spec also has some useful guidelines, suggesting defaults of
- either UTF-8 or Windows 1252 in most cases...
-
- * https://dev.w3.org/html5/spec-LC/Overview.html
- """
- if self.decoder is None:
- # If this is the first decode pass then we need to determine which
- # encoding to use by attempting UTF-8 and raising any decode errors.
- attempt_utf_8 = codecs.getincrementaldecoder("utf-8")(errors="strict")
- try:
- attempt_utf_8.decode(data)
- except UnicodeDecodeError:
- # Could not decode as UTF-8. Use Windows 1252.
- self.decoder = codecs.getincrementaldecoder("cp1252")(errors="replace")
- else:
- # Can decode as UTF-8. Use UTF-8 with lenient error settings.
- self.decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
-
return self.decoder.decode(data)
def flush(self) -> str:
- if self.decoder is None:
- return ""
return self.decoder.decode(b"", True)
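
The effect of the change above is that the decoder no longer defers charset detection: it is constructed up front with a known encoding and decodes leniently. A minimal sketch of the resulting behaviour, using only the standard library:

```python
import codecs

# Incremental decoding handles multi-byte sequences split across chunks.
decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
parts = [
    decoder.decode(b"\xe3\x83"),      # incomplete sequence, buffered -> ''
    decoder.decode(b"\x88"),          # completes the sequence -> 'ト'
    decoder.decode(b"", final=True),  # flush any trailing partial bytes
]
print("".join(parts))  # 'ト'
```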
from http.cookiejar import Cookie, CookieJar
from urllib.parse import parse_qs, quote, unquote, urlencode
+import charset_normalizer
import idna
import rfc3986
import rfc3986.exceptions
if not content:
self._text = ""
else:
- decoder = TextDecoder(encoding=self.encoding)
+ decoder = TextDecoder(encoding=self.encoding or "utf-8")
self._text = "".join([decoder.decode(self.content), decoder.flush()])
return self._text
@property
def encoding(self) -> typing.Optional[str]:
"""
- Return the encoding, which may have been set explicitly, or may have
- been specified by the Content-Type header.
+ Return an encoding to use for decoding the byte content into text.
+ The priority for determining this is given by...
+
+ * `.encoding = <>` has been set explicitly.
+ * The encoding as specified by the charset parameter in the Content-Type header.
+ * The encoding as determined by `charset_normalizer`.
+ * UTF-8.
"""
if not hasattr(self, "_encoding"):
encoding = self.charset_encoding
if encoding is None or not is_known_encoding(encoding):
- self._encoding = None
- else:
- self._encoding = encoding
+ encoding = self.apparent_encoding
+ self._encoding = encoding
return self._encoding
@encoding.setter
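
A usage sketch of the priority order described in the docstring above, assuming the behaviour in this diff (outputs are illustrative):

```python
import httpx

content = "Accented: Österreich abcdefghijklmnopqrstuvwxyz".encode("iso-8859-1")

# No charset in the Content-Type header: fall through to charset_normalizer.
response = httpx.Response(200, content=content, headers={"Content-Type": "text/plain"})
print(response.encoding)  # whatever charset_normalizer detected, e.g. 'cp1252'

# An explicit assignment takes priority over both the header and detection.
response = httpx.Response(200, content=content, headers={"Content-Type": "text/plain"})
response.encoding = "iso-8859-1"
print(response.text)  # 'Accented: Österreich abcdefghijklmnopqrstuvwxyz'
```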
return params["charset"].strip("'\"")
+ @property
+ def apparent_encoding(self) -> typing.Optional[str]:
+ """
+ Return the encoding, as determined by `charset_normalizer`.
+ """
+ content = getattr(self, "_content", b"")
+ if len(content) < 32:
+ # charset_normalizer will issue warnings if we run it with
+ # fewer bytes than this cutoff.
+ return None
+ match = charset_normalizer.from_bytes(content).best()
+ return None if match is None else match.encoding
+
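
The 32-byte cutoff means very short bodies never get a detection result; a hypothetical illustration:

```python
import httpx

# Below the cutoff: no detection is attempted.
short = httpx.Response(200, content=b"tiny")
print(short.apparent_encoding)  # None

# Enough bytes to sniff: some encoding name is returned.
longer = httpx.Response(200, content=b"plain ascii padding abcdefghijklmnopqrstuvwxyz")
print(longer.apparent_encoding)  # e.g. 'ascii' or 'utf-8'
```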
def _get_content_decoder(self) -> ContentDecoder:
"""
Returns a decoder instance which can be used to decode the raw byte
if self.charset_encoding is None and self.content and len(self.content) > 3:
encoding = guess_json_utf(self.content)
if encoding is not None:
- try:
- return jsonlib.loads(self.content.decode(encoding), **kwargs)
- except UnicodeDecodeError:
- pass
+ return jsonlib.loads(self.content.decode(encoding), **kwargs)
return jsonlib.loads(self.text, **kwargs)
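
Since JSON is restricted to UTF-8, UTF-16, or UTF-32, `guess_json_utf` can identify the variant from the first few bytes alone, which is why the lenient `UnicodeDecodeError` fallback is dropped above. For context, a hedged sketch of BOM/NUL-pattern detection in that spirit (the real `guess_json_utf` helper may differ):

```python
import codecs
import typing

def sketch_guess_json_utf(data: bytes) -> typing.Optional[str]:
    # JSON text starts with ASCII characters, so the NUL-byte pattern of
    # the first four bytes identifies the UTF variant when no BOM is present.
    sample = data[:4]
    if sample.startswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)):
        return "utf-32"  # the generic codec consumes the BOM
    if sample.startswith(codecs.BOM_UTF8):
        return "utf-8-sig"
    if sample.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
        return "utf-16"
    nulls = sample.count(b"\x00")
    if nulls == 0:
        return "utf-8"
    if nulls == 2:
        return "utf-16-be" if sample[0] == 0 else "utf-16-le"
    if nulls == 3:
        return "utf-32-be" if sample[3] != 0 else "utf-32-le"
    return None
```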
@property
that handles both gzip, deflate, etc but also detects the content's
string encoding.
"""
- decoder = TextDecoder(encoding=self.encoding)
+ decoder = TextDecoder(encoding=self.encoding or "utf-8")
chunker = TextChunker(chunk_size=chunk_size)
with request_context(request=self._request):
for byte_content in self.iter_bytes():
that handles both gzip, deflate, etc but also detects the content's
string encoding.
"""
- decoder = TextDecoder(encoding=self.encoding)
+ decoder = TextDecoder(encoding=self.encoding or "utf-8")
chunker = TextChunker(chunk_size=chunk_size)
with request_context(request=self._request):
async for byte_content in self.aiter_bytes():
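
Both streaming paths now share the same utf-8 fallback. A rough usage sketch of the synchronous variant, assuming a response with in-memory content:

```python
import httpx

response = httpx.Response(200, content="stream me, please".encode("utf-8"))
# Decoded text arrives re-chunked by the TextChunker.
for chunk in response.iter_text(chunk_size=8):
    print(repr(chunk))
```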
zip_safe=False,
install_requires=[
"certifi",
+ "charset_normalizer",
"sniffio",
"rfc3986[idna2008]>=1.3,<2",
"httpcore>=0.13.3,<0.14.0",
import json
import pickle
-from unittest import mock
import brotlicffi
import pytest
A response with ISO 8859-1 encoded content should decode correctly,
even with no charset specified.
"""
- content = "Accented: Österreich".encode("iso-8859-1")
+ content = "Accented: Österreich abcdefghijklmnopqrstuzwxyz".encode("iso-8859-1")
headers = {"Content-Type": "text/plain"}
response = httpx.Response(
200,
content=content,
headers=headers,
)
- assert response.text == "Accented: Österreich"
- assert response.encoding is None
+ assert response.text == "Accented: Österreich abcdefghijklmnopqrstuzwxyz"
+ assert response.charset_encoding is None
+ assert response.apparent_encoding is not None
def test_response_no_charset_with_cp_1252_content():
A response with Windows 1252 encoded content should decode correctly,
even with no charset specified.
"""
- content = "Euro Currency: €".encode("cp1252")
+ content = "Euro Currency: € abcdefghijklmnopqrstuzwxyz".encode("cp1252")
headers = {"Content-Type": "text/plain"}
response = httpx.Response(
200,
content=content,
headers=headers,
)
- assert response.text == "Euro Currency: €"
- assert response.encoding is None
+ assert response.text == "Euro Currency: € abcdefghijklmnopqrstuzwxyz"
+ assert response.charset_encoding is None
+ assert response.apparent_encoding is not None
def test_response_non_text_encoding():
assert response.json(parse_int=str)["amount"] == "1"
-def test_json_without_specified_encoding():
+@pytest.mark.parametrize(
+ "encoding",
+ [
+ "utf-8",
+ "utf-8-sig",
+ "utf-16",
+ "utf-16-be",
+ "utf-16-le",
+ "utf-32",
+ "utf-32-be",
+ "utf-32-le",
+ ],
+)
+def test_json_without_specified_charset(encoding):
data = {"greeting": "hello", "recipient": "world"}
- content = json.dumps(data).encode("utf-32-be")
+ content = json.dumps(data).encode(encoding)
headers = {"Content-Type": "application/json"}
response = httpx.Response(
200,
assert response.json() == data
-def test_json_without_specified_encoding_decode_error():
- data = {"greeting": "hello", "recipient": "world"}
- content = json.dumps(data).encode("utf-32-be")
- headers = {"Content-Type": "application/json"}
- # force incorrect guess from `guess_json_utf` to trigger error
- with mock.patch("httpx._models.guess_json_utf", return_value="utf-32-le"):
- response = httpx.Response(
- 200,
- content=content,
- headers=headers,
- )
- with pytest.raises(json.decoder.JSONDecodeError):
- response.json()
-
-
-def test_json_without_specified_encoding_value_error():
+@pytest.mark.parametrize(
+ "encoding",
+ [
+ "utf-8",
+ "utf-8-sig",
+ "utf-16",
+ "utf-16-be",
+ "utf-16-le",
+ "utf-32",
+ "utf-32-be",
+ "utf-32-le",
+ ],
+)
+def test_json_with_specified_charset(encoding):
data = {"greeting": "hello", "recipient": "world"}
- content = json.dumps(data).encode("utf-32-be")
- headers = {"Content-Type": "application/json"}
- # force incorrect guess from `guess_json_utf` to trigger error
- with mock.patch("httpx._models.guess_json_utf", return_value="utf-32-le"):
- response = httpx.Response(200, content=content, headers=headers)
- with pytest.raises(json.decoder.JSONDecodeError):
- response.json()
+ content = json.dumps(data).encode(encoding)
+ headers = {"Content-Type": f"application/json; charset={encoding}"}
+ response = httpx.Response(
+ 200,
+ content=content,
+ headers=headers,
+ )
+ assert response.json() == data
@pytest.mark.parametrize(
[
((b"Hello,", b" world!"), "ascii"),
((b"\xe3\x83", b"\x88\xe3\x83\xa9", b"\xe3", b"\x83\x99\xe3\x83\xab"), "utf-8"),
- ((b"Euro character: \x88!", b""), "cp1252"),
- ((b"Accented: \xd6sterreich", b""), "iso-8859-1"),
+ ((b"Euro character: \x88! abcdefghijklmnopqrstuvwxyz", b""), "cp1252"),
+ ((b"Accented: \xd6sterreich abcdefghijklmnopqrstuvwxyz", b""), "iso-8859-1"),
],
)
@pytest.mark.asyncio
assert response.text == (b"".join(data)).decode(encoding)
# Streaming `.aiter_text` iteratively.
- response = httpx.Response(
- 200,
- content=iterator(),
- )
+ # Note that if we stream the text *without* having read it first, we
+ # won't get a `charset_normalizer` guess, and will instead always fall
+ # back to utf-8 if no charset is specified.
text = "".join([part async for part in response.aiter_text()])
assert text == (b"".join(data)).decode(encoding)
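
An illustration of the caveat in the comment above: with no charset header and no prior read of the body, there is nothing for `charset_normalizer` to sniff, so streaming decodes as utf-8 with replacement characters (hypothetical, not part of the test suite):

```python
import httpx

content = "Accented: Österreich".encode("iso-8859-1")
response = httpx.Response(
    200,
    content=iter([content]),  # streaming: body bytes not yet read
    headers={"Content-Type": "text/plain"},
)
streamed = "".join(response.iter_text())
assert "\ufffd" in streamed  # 0xd6 ('Ö') is invalid utf-8, so it is replaced
```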