From: Tom Christie Date: Thu, 19 Oct 2023 11:23:39 +0000 (+0100) Subject: Supported text codecs should handle available aliases X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cff58c91dbee17701e37a175c6be6cdde8a573ec;p=thirdparty%2Fhttpx.git Supported text codecs should handle available aliases --- diff --git a/httpx/_utils.py b/httpx/_utils.py index 875dd29d..eb1708a0 100644 --- a/httpx/_utils.py +++ b/httpx/_utils.py @@ -25,46 +25,49 @@ _HTML5_FORM_ENCODING_RE = re.compile( r"|".join([re.escape(c) for c in _HTML5_FORM_ENCODING_REPLACEMENTS.keys()]) ) -# Text codecs as supported by Chromium, Oct. 2023. +# For our supported text codecs, we start with the text codecs as supported by Chromium, Oct. 2023. # https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36 +# +# Then limit those to any which are documented as included by cpython, +# which drops "windows-874", "iso-8859-8-i". +# +# Then make sure we're referencing them with the canonical name as used by the Python codecs. 
SUPPORTED_CODECS = { - "utf-8", - "utf-16le", - "iso-8859-1", - "windows-1252", - "gbk", - "gb18030", - "big5", - "big5-hkscs", - "euc-kr", - "shift-jis", - "euc-jp", - "iso-2022-jp", - "windows-874", - "iso-8859-15", - "macintosh", - "iso-8859-2", - "windows-1250", - "iso-8859-5", - "windows-1251", - "koi8-r", - "koi8-u", - "iso-8859-7", - "windows-1253", - "windows-1254", - "windows-1256", - "iso-8859-6", - "windows-1255", - "iso-8859-8-i", - "iso-8859-8", - "windows-1258", - "iso-8859-4", - "iso-8859-13", - "windows-1257", - "iso-8859-3", - "iso-8859-10", - "iso-8859-14", - "iso-8859-16", + "big5", # big5 + "big5hkscs", # big5-hkscs + "cp1250", # windows-1250 + "cp1251", # windows-1251 + "cp1252", # windows-1252 + "cp1253", # windows-1253 + "cp1254", # windows-1254 + "cp1255", # windows-1255 + "cp1256", # windows-1256 + "cp1257", # windows-1257 + "cp1258", # windows-1258 + "euc_jp", # euc-jp + "euc_kr", # euc-kr + "gb18030", # gb18030 + "gbk", # gbk + "iso2022_jp", # iso-2022-jp + "iso8859-1", # iso-8859-1 + "iso8859-2", # iso-8859-2 + "iso8859-3", # iso-8859-3 + "iso8859-4", # iso-8859-4 + "iso8859-5", # iso-8859-5 + "iso8859-6", # iso-8859-6 + "iso8859-7", # iso-8859-7 + "iso8859-8", # iso-8859-8 + "iso8859-10", # iso-8859-10 + "iso8859-13", # iso-8859-13 + "iso8859-14", # iso-8859-14 + "iso8859-15", # iso-8859-15 + "iso8859-16", # iso-8859-16 + "koi8-r", # koi8-r + "koi8-u", # koi8-u + "mac-roman", # macintosh + "shift_jis", # shift-jis + "utf-8", # utf-8 + "utf-16-le", # utf-16le } @@ -112,22 +115,14 @@ def primitive_value_to_str(value: "PrimitiveData") -> str: def is_known_encoding(encoding: str) -> bool: """ - Return `True` if `encoding` is a known codec. + Return `True` if `encoding` is a supported text codec. """ - # Only allow text codecs within our supported range. - if encoding.lower().replace("_", "-") not in SUPPORTED_CODECS: - return False - - # Also ensure that the codec is actually available. 
- # At the point of writing this was true for all the SUPPORTED_CODECS - # except "windows-874", "iso-8859-8-i", when using cpython. - # But there *could* feasibly be a different set of codecs available - # under some installations. try: - codecs.lookup(encoding) + codec = codecs.lookup(encoding) except LookupError: return False - return True + + return codec.name in SUPPORTED_CODECS def format_form_param(name: str, value: str) -> bytes: