gh-88886: Remove excessive encoding name normalization (GH-137167)

author Serhiy Storchaka <storchaka@gmail.com>

Tue, 9 Sep 2025 18:07:21 +0000 (21:07 +0300)

committer GitHub <noreply@github.com>

Tue, 9 Sep 2025 18:07:21 +0000 (21:07 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Tue, 9 Sep 2025 18:07:21 +0000 (21:07 +0300)
committer GitHub <noreply@github.com>
Tue, 9 Sep 2025 18:07:21 +0000 (21:07 +0300)
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst

index 5932012c535b56dac7efdb06dfbc4b7f3e8798a4..90a695ef937f75b6c3f8ea2b29f229371fc19c7e 100644 (file)
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -68,11 +68,21 @@ The full details for each codec can also be looked up directly:
     Looks up the codec info in the Python codec registry and returns a
     :class:`CodecInfo` object as defined below.
  
-   Encodings are first looked up in the registry's cache. If not found, the list of
+   This function first normalizes the *encoding*: all ASCII letters are
+   converted to lower case, spaces are replaced with hyphens.
+   Then the encoding is looked up in the registry's cache. If not found, the list of
     registered search functions is scanned. If no :class:`CodecInfo` object is
     found, a :exc:`LookupError` is raised. Otherwise, the :class:`CodecInfo` object
     is stored in the cache and returned to the caller.
  
+   .. versionchanged:: 3.9
+      Any characters except ASCII letters and digits and a dot are converted to underscore.
+
+   .. versionchanged:: next
+      No characters are converted to underscore anymore.
+      Spaces are converted to hyphens.
+
+
  .. class:: CodecInfo(encode, decode, streamreader=None, streamwriter=None, incrementalencoder=None, incrementaldecoder=None, name=None)
  
     Codec details when looking up the codec registry. The constructor
@@ -167,14 +177,11 @@ function:
  .. function:: register(search_function, /)
  
     Register a codec search function. Search functions are expected to take one
-   argument, being the encoding name in all lower case letters with hyphens
-   and spaces converted to underscores, and return a :class:`CodecInfo` object.
+   argument, being the encoding name in all lower case letters with spaces
+   converted to hyphens, and return a :class:`CodecInfo` object.
     In case a search function cannot find a given encoding, it should return
     ``None``.
  
-   .. versionchanged:: 3.9
-      Hyphens and spaces are converted to underscore.
-
  
  .. function:: unregister(search_function, /)
  
@@ -1065,7 +1072,7 @@ or with dictionaries as mapping tables. The following table lists the codecs by
  name, together with a few common aliases, and the languages for which the
  encoding is likely used. Neither the list of aliases nor the list of languages
  is meant to be exhaustive. Notice that spelling alternatives that only differ in
-case or use a hyphen instead of an underscore are also valid aliases
+case or use a space or a hyphen instead of an underscore are also valid aliases
  because they are equivalent when normalized by
  :func:`~encodings.normalize_encoding`. For example, ``'utf-8'`` is a valid
  alias for the ``'utf_8'`` codec.
diff --git a/Lib/test/test_capi/test_codecs.py b/Lib/test/test_capi/test_codecs.py

index a0355c7a388c57e6e794324fbacc8ddedefc18a6..1a3f476ed0f30d6174a06822e05526a718f4c653 100644 (file)
--- a/Lib/test/test_capi/test_codecs.py
+++ b/Lib/test/test_capi/test_codecs.py
@@ -630,7 +630,6 @@ class CAPICodecs(unittest.TestCase):
          for name in [
              encoding_name,
              encoding_name.upper(),
-            encoding_name.replace('_', '-'),
          ]:
              with self.subTest(name):
                  self.assertTrue(_testcapi.codec_known_encoding(name))
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index fd7769e8c275d3d6ed700846e0d2ce6ecf8b04d6..c35a4508943506ef68731aec6c4ab27ef53c2b5c 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3873,26 +3873,22 @@ class Rot13UtilTest(unittest.TestCase):
  class CodecNameNormalizationTest(unittest.TestCase):
      """Test codec name normalization"""
      def test_codecs_lookup(self):
-        FOUND = (1, 2, 3, 4)
-        NOT_FOUND = (None, None, None, None)
          def search_function(encoding):
-            if encoding == "aaa_8":
-                return FOUND
+            if encoding.startswith("test."):
+                return (encoding, 2, 3, 4)
              else:
-                return NOT_FOUND
+                return None
  
          codecs.register(search_function)
          self.addCleanup(codecs.unregister, search_function)
-        self.assertEqual(FOUND, codecs.lookup('aaa_8'))
-        self.assertEqual(FOUND, codecs.lookup('AAA-8'))
-        self.assertEqual(FOUND, codecs.lookup('AAA---8'))
-        self.assertEqual(FOUND, codecs.lookup('AAA   8'))
-        self.assertEqual(FOUND, codecs.lookup('aaa\xe9\u20ac-8'))
-        self.assertEqual(NOT_FOUND, codecs.lookup('AAA.8'))
-        self.assertEqual(NOT_FOUND, codecs.lookup('AAA...8'))
-        self.assertEqual(NOT_FOUND, codecs.lookup('BBB-8'))
-        self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8'))
-        self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8'))
+        self.assertEqual(codecs.lookup('test.aaa_8'), ('test.aaa_8', 2, 3, 4))
+        self.assertEqual(codecs.lookup('TEST.AAA-8'), ('test.aaa-8', 2, 3, 4))
+        self.assertEqual(codecs.lookup('TEST.AAA 8'), ('test.aaa-8', 2, 3, 4))
+        self.assertEqual(codecs.lookup('TEST.AAA---8'), ('test.aaa---8', 2, 3, 4))
+        self.assertEqual(codecs.lookup('TEST.AAA   8'), ('test.aaa---8', 2, 3, 4))
+        self.assertEqual(codecs.lookup('TEST.AAA\xe9\u20ac-8'), ('test.aaa\xe9\u20ac-8', 2, 3, 4))
+        self.assertEqual(codecs.lookup('TEST.AAA.8'), ('test.aaa.8', 2, 3, 4))
+        self.assertEqual(codecs.lookup('TEST.AAA...8'), ('test.aaa...8', 2, 3, 4))
  
      def test_encodings_normalize_encoding(self):
          # encodings.normalize_encoding() ignores non-ASCII characters.
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst

new file mode 100644 (file)

index 0000000..0d119ef
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst
@@ -0,0 +1,4 @@
+The codecs lookup function now again performs only minimal normalization of
+the encoding name before passing it to the search functions: all ASCII
+letters are converted to lower case, spaces are replaced with hyphens.
+This restores the pre-Python 3.9 behavior.
diff --git a/Python/codecs.c b/Python/codecs.c

index 8eb9f2db41359e1a47ca3265409fc911e6212f9f..364e07990ba42aaf6a0f9ad6917fe840c8e34285 100644 (file)
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -85,14 +85,15 @@ PyCodec_Unregister(PyObject *search_function)
  
  extern int _Py_normalize_encoding(const char *, char *, size_t);
  
-/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
-   converted to lower case, spaces and hyphens are replaced with underscores. */
+/* Convert a string to a normalized Python string: all ASCII letters are
+   converted to lower case, spaces are replaced with hyphens. */
  
-static
-PyObject *normalizestring(const char *string)
+static PyObject*
+normalizestring(const char *string)
  {
+    size_t i;
      size_t len = strlen(string);
-    char *encoding;
+    char *p;
      PyObject *v;
  
      if (len > PY_SSIZE_T_MAX) {
@@ -100,28 +101,30 @@ PyObject *normalizestring(const char *string)
          return NULL;
      }
  
-    encoding = PyMem_Malloc(len + 1);
-    if (encoding == NULL)
+    p = PyMem_Malloc(len + 1);
+    if (p == NULL)
          return PyErr_NoMemory();
-
-    if (!_Py_normalize_encoding(string, encoding, len + 1))
-    {
-        PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
-        PyMem_Free(encoding);
-        return NULL;
-    }
-
-    v = PyUnicode_FromString(encoding);
-    PyMem_Free(encoding);
+    for (i = 0; i < len; i++) {
+        char ch = string[i];
+        if (ch == ' ')
+            ch = '-';
+        else
+            ch = Py_TOLOWER(Py_CHARMASK(ch));
+        p[i] = ch;
+    }
+    p[i] = '\0';
+    v = PyUnicode_FromString(p);
+    PyMem_Free(p);
      return v;
  }
  
  /* Lookup the given encoding and return a tuple providing the codec
     facilities.
  
-   The encoding string is looked up converted to all lower-case
-   characters. This makes encodings looked up through this mechanism
-   effectively case-insensitive.
+   The encoding string is looked up with all ASCII letters converted to
+   lower case. This makes encodings looked up through this mechanism
+   effectively case-insensitive. Spaces are replaced with hyphens for
+   names like "US ASCII" and "ISO 8859-1".
  
     If no codec is found, a LookupError is set and NULL returned.
  
@@ -142,8 +145,8 @@ PyObject *_PyCodec_Lookup(const char *encoding)
      assert(interp->codecs.initialized);
  
      /* Convert the encoding to a normalized Python string: all
-       characters are converted to lower case, spaces and hyphens are
-       replaced with underscores. */
+       ASCII letters are converted to lower case, spaces are
+       replaced with hyphens. */
      PyObject *v = normalizestring(encoding);
      if (v == NULL) {
          return NULL;
author	Serhiy Storchaka <storchaka@gmail.com>
	Tue, 9 Sep 2025 18:07:21 +0000 (21:07 +0300)
committer	GitHub <noreply@github.com>
	Tue, 9 Sep 2025 18:07:21 +0000 (21:07 +0300)
Doc/library/codecs.rst		patch \| blob \| blame \| history
Lib/test/test_capi/test_codecs.py		patch \| blob \| blame \| history
Lib/test/test_codecs.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core_and_Builtins/2025-07-28-17-01-05.gh-issue-88886.g4XFPb.rst	[new file with mode: 0644]	patch \| blob
Python/codecs.c		patch \| blob \| blame \| history