From: Tom Christie <tom@tomchristie.com>
Date: Thu, 19 Oct 2023 11:23:39 +0000 (+0100)
Subject: Supported text codecs should handle available aliases
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cff58c91dbee17701e37a175c6be6cdde8a573ec;p=thirdparty%2Fhttpx.git

Supported text codecs should handle available aliases
---

diff --git a/httpx/_utils.py b/httpx/_utils.py
index 875dd29d..eb1708a0 100644
--- a/httpx/_utils.py
+++ b/httpx/_utils.py
@@ -25,46 +25,49 @@ _HTML5_FORM_ENCODING_RE = re.compile(
     r"|".join([re.escape(c) for c in _HTML5_FORM_ENCODING_REPLACEMENTS.keys()])
 )
 
-# Text codecs as supported by Chromium, Oct. 2023.
+# For our supported text codecs, we start with the text codecs as supported by Chromium, Oct. 2023.
 # https://chromium.googlesource.com/chromium/chromium/+/refs/heads/trunk/chrome/browser/character_encoding.cc#36
+#
+# Then limit those to any which are documented as included by CPython,
+# which drops "windows-874", "iso-8859-8-i".
+#
+# Then make sure we're referencing them with the canonical name as used by the Python codecs.
 SUPPORTED_CODECS = {
-    "utf-8",
-    "utf-16le",
-    "iso-8859-1",
-    "windows-1252",
-    "gbk",
-    "gb18030",
-    "big5",
-    "big5-hkscs",
-    "euc-kr",
-    "shift-jis",
-    "euc-jp",
-    "iso-2022-jp",
-    "windows-874",
-    "iso-8859-15",
-    "macintosh",
-    "iso-8859-2",
-    "windows-1250",
-    "iso-8859-5",
-    "windows-1251",
-    "koi8-r",
-    "koi8-u",
-    "iso-8859-7",
-    "windows-1253",
-    "windows-1254",
-    "windows-1256",
-    "iso-8859-6",
-    "windows-1255",
-    "iso-8859-8-i",
-    "iso-8859-8",
-    "windows-1258",
-    "iso-8859-4",
-    "iso-8859-13",
-    "windows-1257",
-    "iso-8859-3",
-    "iso-8859-10",
-    "iso-8859-14",
-    "iso-8859-16",
+    "big5",  # big5
+    "big5hkscs",  # big5-hkscs
+    "cp1250",  # windows-1250
+    "cp1251",  # windows-1251
+    "cp1252",  # windows-1252
+    "cp1253",  # windows-1253
+    "cp1254",  # windows-1254
+    "cp1255",  # windows-1255
+    "cp1256",  # windows-1256
+    "cp1257",  # windows-1257
+    "cp1258",  # windows-1258
+    "euc_jp",  # euc-jp
+    "euc_kr",  # euc-kr
+    "gb18030",  # gb18030
+    "gbk",  # gbk
+    "iso2022_jp",  # iso-2022-jp
+    "iso8859-1",  # iso-8859-1
+    "iso8859-2",  # iso-8859-2
+    "iso8859-3",  # iso-8859-3
+    "iso8859-4",  # iso-8859-4
+    "iso8859-5",  # iso-8859-5
+    "iso8859-6",  # iso-8859-6
+    "iso8859-7",  # iso-8859-7
+    "iso8859-8",  # iso-8859-8
+    "iso8859-10",  # iso-8859-10
+    "iso8859-13",  # iso-8859-13
+    "iso8859-14",  # iso-8859-14
+    "iso8859-15",  # iso-8859-15
+    "iso8859-16",  # iso-8859-16
+    "koi8-r",  # koi8-r
+    "koi8-u",  # koi8-u
+    "mac-roman",  # macintosh
+    "shift_jis",  # shift-jis
+    "utf-8",  # utf-8
+    "utf-16-le",  # utf-16le
 }
 
 
@@ -112,22 +115,14 @@ def primitive_value_to_str(value: "PrimitiveData") -> str:
 
 def is_known_encoding(encoding: str) -> bool:
     """
-    Return `True` if `encoding` is a known codec.
+    Return `True` if `encoding` is a supported text codec.
     """
-    # Only allow text codecs within our supported range.
-    if encoding.lower().replace("_", "-") not in SUPPORTED_CODECS:
-        return False
-
-    # Also ensure that the codec is actually available.
-    # At the point of writing this was true for all the SUPPORTED_CODECS
-    # except "windows-874", "iso-8859-8-i", when using cpython.
-    # But there *could* feasibly be a different set of codecs available
-    # under some installations.
     try:
-        codecs.lookup(encoding)
+        codec = codecs.lookup(encoding)
     except LookupError:
         return False
-    return True
+
+    return codec.name in SUPPORTED_CODECS
 
 
 def format_form_param(name: str, value: str) -> bytes: