gh-55531: Implement `normalize_encoding` in C (#136643)

author Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>

Thu, 30 Oct 2025 14:31:47 +0000 (14:31 +0000)

committer GitHub <noreply@github.com>

Thu, 30 Oct 2025 14:31:47 +0000 (15:31 +0100)
author Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>
Thu, 30 Oct 2025 14:31:47 +0000 (14:31 +0000)
committer GitHub <noreply@github.com>
Thu, 30 Oct 2025 14:31:47 +0000 (15:31 +0100)
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py

index 298177eb8003a7af233fc286f2dfd954cb0cfd2d..e7e4ca3358e0f92ba89e7604cd3f018d2a8a5141 100644 (file)
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -30,6 +30,7 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
  
  import codecs
  import sys
+from _codecs import _normalize_encoding
  from . import aliases
  
  _cache = {}
@@ -55,18 +56,7 @@ def normalize_encoding(encoding):
      if isinstance(encoding, bytes):
          encoding = str(encoding, "ascii")
  
-    chars = []
-    punct = False
-    for c in encoding:
-        if c.isalnum() or c == '.':
-            if punct and chars:
-                chars.append('_')
-            if c.isascii():
-                chars.append(c)
-            punct = False
-        else:
-            punct = True
-    return ''.join(chars)
+    return _normalize_encoding(encoding)
  
  def search_function(encoding):
  
diff --git a/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst

new file mode 100644 (file)

index 0000000..70e39a4
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst
@@ -0,0 +1,4 @@
+:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
+by implementing the function in C using the private
+``_Py_normalize_encoding``, which has been modified to make lowercase
+conversion optional.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c

index bdffeced7da5a98315fd1dbdd6d7cd3bfdf07b49..2f2edbb05ab5c5716ebc8726751b78a34e880776 100644 (file)
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -1018,6 +1018,47 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
      return PyCodec_LookupError(name);
  }
  
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
+
+/*[clinic input]
+_codecs._normalize_encoding
+    encoding: unicode
+
+Normalize an encoding name *encoding*.
+
+Used for encodings.normalize_encoding. Does not convert to lower case.
+[clinic start generated code]*/
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
+/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
+{
+    /* Return a new str holding *encoding* normalized, case preserved. */
+    Py_ssize_t len;
+    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
+    if (cstr == NULL) {
+        return NULL;
+    }
+    /* len + 1 is computed below; make sure it cannot overflow. */
+    if (len >= PY_SSIZE_T_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
+        return NULL;
+    }
+    char *normalized = PyMem_Malloc(len + 1);
+    if (normalized == NULL) {
+        return PyErr_NoMemory();
+    }
+    /* 0 means the output did not fit; raise rather than return bare NULL. */
+    if (!_Py_normalize_encoding(cstr, normalized, len + 1, 0)) {
+        PyMem_Free(normalized);
+        PyErr_SetString(PyExc_RuntimeError, "failed to normalize encoding");
+        return NULL;
+    }
+    PyObject *result = PyUnicode_FromString(normalized);
+    PyMem_Free(normalized);
+    return result;
+}
+
  /* --- Module API --------------------------------------------------------- */
  
  static PyMethodDef _codecs_functions[] = {
@@ -1067,6 +1108,7 @@ static PyMethodDef _codecs_functions[] = {
      _CODECS_REGISTER_ERROR_METHODDEF
      _CODECS__UNREGISTER_ERROR_METHODDEF
      _CODECS_LOOKUP_ERROR_METHODDEF
+    _CODECS__NORMALIZE_ENCODING_METHODDEF
      {NULL, NULL}                /* sentinel */
  };
  
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h

index b031032575932670e60db9e479021ff001d9f21d..9e2a7950ebde64ddbd6e15f9b6c408a1adbde616 100644 (file)
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -2779,6 +2779,70 @@ exit:
      return return_value;
  }
  
+PyDoc_STRVAR(_codecs__normalize_encoding__doc__,
+"_normalize_encoding($module, /, encoding)\n"
+"--\n"
+"\n"
+"Normalize an encoding name *encoding*.\n"
+"\n"
+"Used for encodings.normalize_encoding. Does not convert to lower case.");
+
+#define _CODECS__NORMALIZE_ENCODING_METHODDEF    \
+    {"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__},
+
+static PyObject *
+_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding);
+
+static PyObject *
+_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 1
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(encoding), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"encoding", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "_normalize_encoding",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[1];
+    PyObject *encoding;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(args[0])) {
+        _PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]);
+        goto exit;
+    }
+    encoding = args[0];
+    return_value = _codecs__normalize_encoding_impl(module, encoding);
+
+exit:
+    return return_value;
+}
+
  #ifndef _CODECS_MBCS_DECODE_METHODDEF
      #define _CODECS_MBCS_DECODE_METHODDEF
  #endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
@@ -2802,4 +2866,4 @@ exit:
  #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
      #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
  #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 1c443e88e0502916b2b948c8ad7daf1f79833cd3..4e8c132327b7d07f90a2c921edd46c959327648b 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3449,13 +3449,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
      return v;
  }
  
-/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
-   also convert to lowercase. Return 1 on success, or 0 on error (encoding is
-   longer than lower_len-1). */
+/* Normalize an encoding name like encodings.normalize_encoding(),
+   additionally converting it to lowercase if *to_lower* is true.
+   Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
  int
  _Py_normalize_encoding(const char *encoding,
                         char *lower,
-                       size_t lower_len)
+                       size_t lower_len,
+                       int to_lower)
  {
      const char *e;
      char *l;
@@ -3486,7 +3487,7 @@ _Py_normalize_encoding(const char *encoding,
              if (l == l_end) {
                  return 0;
              }
-            *l++ = Py_TOLOWER(c);
+            *l++ = to_lower ? Py_TOLOWER(c) : c;
          }
          else {
              punct = 1;
@@ -3521,7 +3522,7 @@ PyUnicode_Decode(const char *s,
      }
  
      /* Shortcuts for common default encodings */
-    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
          char *lower = buflower;
  
          /* Fast paths */
@@ -3778,7 +3779,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
      }
  
      /* Shortcuts for common default encodings */
-    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
+    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
          char *lower = buflower;
  
          /* Fast paths */
diff --git a/Python/fileutils.c b/Python/fileutils.c

index b808229716fd9ca6a0fb201a579f104a6fed89ff..93abd70a34d420c91c71d7a7548a198c48bcc0da 100644 (file)
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -178,7 +178,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
  
  #define USE_FORCE_ASCII
  
-extern int _Py_normalize_encoding(const char *, char *, size_t);
+extern int _Py_normalize_encoding(const char *, char *, size_t, int);
  
  /* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
     and POSIX locale. nl_langinfo(CODESET) announces an alias of the
@@ -229,7 +229,7 @@ check_force_ascii(void)
      }
  
      char encoding[20];   /* longest name: "iso_646.irv_1991\0" */
-    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
+    if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
          goto error;
      }
author	Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>
	Thu, 30 Oct 2025 14:31:47 +0000 (14:31 +0000)
committer	GitHub <noreply@github.com>
	Thu, 30 Oct 2025 14:31:47 +0000 (15:31 +0100)
Lib/encodings/__init__.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2025-07-14-09-33-17.gh-issue-55531.Gt2e12.rst	[new file with mode: 0644]	patch \| blob
Modules/_codecsmodule.c		patch \| blob \| blame \| history
Modules/clinic/_codecsmodule.c.h		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history
Python/fileutils.c		patch \| blob \| blame \| history