gh-149079: Fix O(n^2) canonical ordering in unicodedata.normalize() (GH-149080)

author Seth Larson <seth@python.org>

Tue, 2 Jun 2026 09:39:50 +0000 (02:39 -0700)

committer GitHub <noreply@github.com>

Tue, 2 Jun 2026 09:39:50 +0000 (11:39 +0200)
author Seth Larson <seth@python.org>
Tue, 2 Jun 2026 09:39:50 +0000 (02:39 -0700)
committer GitHub <noreply@github.com>
Tue, 2 Jun 2026 09:39:50 +0000 (11:39 +0200)
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py

index 060d81415aa1f1b61c0877f28afbb5df3fd09cc8..ad25be3da8cb34725e3b14a150a7d4cc9f97b575 100644 (file)
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -616,6 +616,34 @@ class BaseUnicodeFunctionsTest:
          b = 'C\u0338' * 20  + '\xC7'
          self.assertEqual(self.db.normalize('NFC', a), b)
  
+    def test_long_combining_mark_run(self):
+        # gh-149079: a long out-of-order mark run must be sorted correctly.
+        payload = "a" + ("\u0300\u0327" * 32)  # 64 marks, alternating order
+        nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)  # U+0327 sorts first
+        nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)
+
+        self.assertEqual(self.db.normalize("NFD", payload), nfd)
+        self.assertEqual(self.db.normalize("NFKD", payload), nfd)
+        self.assertEqual(self.db.normalize("NFC", payload), nfc)
+        self.assertEqual(self.db.normalize("NFKC", payload), nfc)
+
+    def test_combining_mark_run_fast_paths(self):
+        # gh-149079: cover the insertion-sort and no-sort-needed fast paths.
+        short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"  # 19 marks
+        short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
+        short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
+        long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30)  # pre-sorted
+        long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29)
+
+        self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd)
+        self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd)
+        self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc)
+        self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc)
+        self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted)
+        self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted)
+        self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc)
+        self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc)
+
      def test_issue29456(self):
          # Fix #29456
          u1176_str_a = '\u1100\u1176\u11a8'
diff --git a/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst b/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst

new file mode 100644 (file)

index 0000000..4ed22b5
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst
@@ -0,0 +1,5 @@
+Fix a potential denial of service in :func:`unicodedata.normalize`. The
+canonical ordering step of Unicode normalization used a quadratic-time insertion
+sort for reordering combining characters, which could be exploited with
+crafted input containing many combining characters in non-canonical order.
+Long runs are now sorted with a linear-time counting sort instead.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c

index 6bb25fc0b63781c08d036bf93078505ae08e8b85..60df68216938134b5475f4925873c7cb96ddbc90 100644 (file)
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -556,19 +556,80 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
      (*index)++;
  }
  
+/* Runs shorter than this are insertion-sorted (cheaper for small runs). */
+#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20
+
+static void
+canonical_ordering_sort_insertion(int kind, void *data,
+                                  Py_ssize_t start, Py_ssize_t end)
+{  /* Insertion-sort data[start, end) in place by combining class. */
+    for (Py_ssize_t i = start + 1; i < end; i++) {
+        Py_UCS4 code = PyUnicode_READ(kind, data, i);
+        unsigned char combining = _getrecord_ex(code)->combining;
+        Py_ssize_t j = i;
+
+        while (j > start) {  /* shift higher-class characters right */
+            Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1);
+            if (_getrecord_ex(previous)->combining <= combining) {
+                break;  /* <= keeps equal classes in input order */
+            }
+            PyUnicode_WRITE(kind, data, j, previous);
+            j--;
+        }
+        if (j != i) {  /* skip the write when nothing moved */
+            PyUnicode_WRITE(kind, data, j, code);
+        }
+    }
+}
+
+static void
+canonical_ordering_sort_counting(int kind, void *data,
+                                 Py_ssize_t start, Py_ssize_t end,
+                                 Py_UCS4 *sortbuf)
+{  /* Counting-sort data[start, end) by combining class via sortbuf. */
+    Py_ssize_t counts[256] = {0};  /* one slot per unsigned char value */
+    Py_ssize_t run_length = end - start;  /* sortbuf must hold this many */
+    Py_ssize_t total = 0;
+
+    for (Py_ssize_t i = start; i < end; i++) {  /* histogram of classes */
+        Py_UCS4 code = PyUnicode_READ(kind, data, i);
+        unsigned char combining = _getrecord_ex(code)->combining;
+        counts[combining]++;
+    }
+
+    for (size_t i = 0; i < Py_ARRAY_LENGTH(counts); i++) {
+        Py_ssize_t count = counts[i];
+        counts[i] = total;  /* start offset of class i in the output */
+        total += count;
+    }
+
+    /* Reuse counts[] as the next output slot for each CCC. */
+    for (Py_ssize_t i = start; i < end; i++) {  /* input order => stable */
+        Py_UCS4 code = PyUnicode_READ(kind, data, i);
+        unsigned char combining = _getrecord_ex(code)->combining;
+        sortbuf[counts[combining]++] = code;
+    }
+    for (Py_ssize_t i = 0; i < run_length; i++) {  /* copy back in place */
+        PyUnicode_WRITE(kind, data, start + i, sortbuf[i]);
+    }
+}
+
  static PyObject*
  nfd_nfkd(PyObject *self, PyObject *input, int k)
  {
      PyObject *result;
      Py_UCS4 *output;
      Py_ssize_t i, o, osize;
-    int kind;
-    const void *data;
+    int input_kind, result_kind;
+    const void *input_data;
+    void *result_data;
      /* Longest decomposition in Unicode 3.2: U+FDFA */
      Py_UCS4 stack[20];
      Py_ssize_t space, isize;
      int index, prefix, count, stackptr;
      unsigned char prev, cur;
+    Py_UCS4 *sortbuf = NULL;
+    Py_ssize_t sortbuflen = 0;
  
      stackptr = 0;
      isize = PyUnicode_GET_LENGTH(input);
@@ -588,11 +649,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
          return NULL;
      }
      i = o = 0;
-    kind = PyUnicode_KIND(input);
-    data = PyUnicode_DATA(input);
+    input_kind = PyUnicode_KIND(input);
+    input_data = PyUnicode_DATA(input);
  
      while (i < isize) {
-        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
+        stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++);
          while(stackptr) {
              Py_UCS4 code = stack[--stackptr];
              /* Hangul Decomposition adds three characters in
@@ -660,34 +721,64 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
      if (!result)
          return NULL;
  
-    kind = PyUnicode_KIND(result);
-    data = PyUnicode_DATA(result);
+    result_kind = PyUnicode_KIND(result);
+    result_data = PyUnicode_DATA(result);
  
-    /* Sort canonically. */
+    /* Sort each consecutive combining-character run canonically. */
      i = 0;
-    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
-    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
-        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
-        if (prev == 0 || cur == 0 || prev <= cur) {
-            prev = cur;
+    while (i < o) {
+        Py_ssize_t run_length, run_start;
+        int needs_sort = 0;
+
+        Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
+        prev = _getrecord_ex(ch)->combining;
+        if (prev == 0) {
+            i++;
              continue;
          }
-        /* Non-canonical order. Need to switch *i with previous. */
-        o = i - 1;
-        while (1) {
-            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
-            PyUnicode_WRITE(kind, data, o+1,
-                            PyUnicode_READ(kind, data, o));
-            PyUnicode_WRITE(kind, data, o, tmp);
-            o--;
-            if (o < 0)
-                break;
-            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
-            if (prev == 0 || prev <= cur)
+
+        run_start = i++;
+        while (i < o) {
+            Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
+            cur = _getrecord_ex(ch)->combining;
+            if (cur == 0) {
                  break;
+            }
+            if (prev > cur) {
+                needs_sort = 1;
+            }
+            prev = cur;
+            i++;
+        }
+        if (!needs_sort) {
+            continue;
+        }
+
+        run_length = i - run_start;
+        if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) {
+            canonical_ordering_sort_insertion(result_kind, result_data,
+                                              run_start, i);
+            continue;
          }
-        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
+
+        if (run_length > sortbuflen) {
+            Py_UCS4 *new_sortbuf = PyMem_Resize(sortbuf,
+                                                Py_UCS4,
+                                                run_length);
+            if (new_sortbuf == NULL) {
+                PyErr_NoMemory();
+                PyMem_Free(sortbuf);
+                Py_DECREF(result);
+                return NULL;
+            }
+            sortbuf = new_sortbuf;
+            sortbuflen = run_length;
+        }
+
+        canonical_ordering_sort_counting(result_kind, result_data,
+                                         run_start, i, sortbuf);
      }
+    PyMem_Free(sortbuf);
      return result;
  }
author	Seth Larson <seth@python.org>
	Tue, 2 Jun 2026 09:39:50 +0000 (02:39 -0700)
committer	GitHub <noreply@github.com>
	Tue, 2 Jun 2026 09:39:50 +0000 (11:39 +0200)
Lib/test/test_unicodedata.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Security/2026-04-27-16-36-11.gh-issue-149079.vKl-LM.rst	[new file with mode: 0644]	patch \| blob
Modules/unicodedata.c		patch \| blob \| blame \| history