[3.14] gh-88091: Fix unicodedata.decomposition() for Hangul Syllables (GH-144993...

author Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>

Tue, 24 Feb 2026 22:27:09 +0000 (22:27 +0000)

committer GitHub <noreply@github.com>

Tue, 24 Feb 2026 22:27:09 +0000 (00:27 +0200)
author Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>
Tue, 24 Feb 2026 22:27:09 +0000 (22:27 +0000)
committer GitHub <noreply@github.com>
Tue, 24 Feb 2026 22:27:09 +0000 (00:27 +0200)
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py

index abbcffbe3fcee9f6743096e59f3ce5e9993a93b8..93d573996407590982168d2a8a2e7d58ace99fa5 100644 (file)
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -89,9 +89,9 @@ class UnicodeFunctionsTest(unittest.TestCase):
  
      # Update this if the database changes. Make sure to do a full rebuild
      # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = ('35e842600fa7ae2db93739db08ef201b726a2374'
+    expectedchecksum = ('1ba453ec456896f1190d849b6e9b7c2e1a4128e0'
                          if quicktest else
-                        '23ab09ed4abdf93db23b97359108ed630dd8311d')
+                        '46ca89d9fe34881d0be3a4a4b29f5aa8c019640c')
  
      def test_function_checksum(self):
          db = self.db
@@ -346,6 +346,12 @@ class UnicodeFunctionsTest(unittest.TestCase):
          # New in 16.0.0
          self.assertEqual(self.db.decomposition('\U0001CCD6'), '' if self.old else '<font> 0041')
  
+        # Hangul syllables (decomposed algorithmically into jamo)
+        self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161')
+        self.assertEqual(self.db.decomposition('\uD4DB'), '1111 1171 11B6')
+        self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161')
+        self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2')
+
          self.assertRaises(TypeError, self.db.decomposition)
          self.assertRaises(TypeError, self.db.decomposition, 'xx')
  
@@ -649,9 +655,9 @@ class UnicodeFunctionsTest(unittest.TestCase):
  class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
      db = unicodedata.ucd_3_2_0
      old = True
-    expectedchecksum = ('4154d8d1232837e255edf3cdcbb5ab184d71f4a4'
+    expectedchecksum = ('883824cb6c0ccf994e4451ebf281e2d6d479af47'
                          if quicktest else
-                        'b0a8df4ce8cf910def4e75f2d03c93defcc9bb09')
+                        'caf1a7f2f380f927461837f1901ef20683f98683')
  
  
  class UnicodeMiscTest(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst b/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst

new file mode 100644 (file)

index 0000000..15cf250
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst
@@ -0,0 +1 @@
+Fix :func:`unicodedata.decomposition` for precomposed Hangul syllables.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c

index 0547046394485451e5fcee8ca95840d8d5a0eae0..83de1be56a7fafc9ee83171c13cd3ee3c2f929df 100644 (file)
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -388,6 +388,17 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
      return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
  }
  
+// Constants for algorithmic Hangul syllable decomposition
+#define SBase   0xAC00
+#define LBase   0x1100
+#define VBase   0x1161
+#define TBase   0x11A7
+#define LCount  19
+#define VCount  21
+#define TCount  28
+#define NCount  (VCount*TCount)
+#define SCount  (LCount*NCount)
+
  /*[clinic input]
  unicodedata.UCD.decomposition
  
@@ -418,6 +429,25 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
              return Py_GetConstant(Py_CONSTANT_EMPTY_STR); /* unassigned */
      }
  
+    // Hangul Decomposition.
+    // See section 3.12.2, "Hangul Syllable Decomposition"
+    // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
+    if (SBase <= code && code < (SBase + SCount)) {
+        int SIndex = code - SBase;
+        int L = LBase + SIndex / NCount;
+        int V = VBase + (SIndex % NCount) / TCount;
+        int T = TBase + SIndex % TCount;
+        if (T != TBase) {
+            PyOS_snprintf(decomp, sizeof(decomp),
+                          "%04X %04X %04X", L, V, T);
+        }
+        else {
+            PyOS_snprintf(decomp, sizeof(decomp),
+                          "%04X %04X", L, V);
+        }
+        return PyUnicode_FromString(decomp);
+    }
+
      if (code < 0 || code >= 0x110000)
          index = 0;
      else {
@@ -480,16 +510,6 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
      (*index)++;
  }
  
-#define SBase   0xAC00
-#define LBase   0x1100
-#define VBase   0x1161
-#define TBase   0x11A7
-#define LCount  19
-#define VCount  21
-#define TCount  28
-#define NCount  (VCount*TCount)
-#define SCount  (LCount*NCount)
-
  static PyObject*
  nfd_nfkd(PyObject *self, PyObject *input, int k)
  {
@@ -543,7 +563,9 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
                  }
                  output = new_output;
              }
-            /* Hangul Decomposition. */
+            // Hangul Decomposition.
+            // See section 3.12.2, "Hangul Syllable Decomposition"
+            // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
              if (SBase <= code && code < (SBase+SCount)) {
                  int SIndex = code - SBase;
                  int L = LBase + SIndex / NCount;
author	Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com>
	Tue, 24 Feb 2026 22:27:09 +0000 (22:27 +0000)
committer	GitHub <noreply@github.com>
	Tue, 24 Feb 2026 22:27:09 +0000 (00:27 +0200)
Lib/test/test_unicodedata.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst	[new file with mode: 0644]	patch \| blob
Modules/unicodedata.c		patch \| blob \| blame \| history