gh-129569: The function unicodedata.normalize() always returns built-in str (#129570)

author Hizuru <106918920+Hizuru3@users.noreply.github.com>

Fri, 21 Feb 2025 13:51:13 +0000 (22:51 +0900)

committer GitHub <noreply@github.com>

Fri, 21 Feb 2025 13:51:13 +0000 (14:51 +0100)
author Hizuru <106918920+Hizuru3@users.noreply.github.com>
Fri, 21 Feb 2025 13:51:13 +0000 (22:51 +0900)
committer GitHub <noreply@github.com>
Fri, 21 Feb 2025 13:51:13 +0000 (14:51 +0100)
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py

index 0285f0d51f236593ca66bb8828235ef097153ee1..8e3fef6b6fe4a06dda9872ab02da5f8c8028f2ca 100644 (file)
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -467,6 +467,29 @@ class NormalizationTest(unittest.TestCase):
          # Check for bug 834676
          unicodedata.normalize('NFC', '\ud55c\uae00')
  
+    def test_normalize_return_type(self):
+        # gh-129569: normalize() return type must always be str
+        normalize = unicodedata.normalize
+
+        class MyStr(str):
+            pass
+
+        normalization_forms = ("NFC", "NFKC", "NFD", "NFKD")
+        input_strings = (
+            # normalized strings
+            "",
+            "ascii",
+            # unnormalized strings
+            "\u1e0b\u0323",
+            "\u0071\u0307\u0323",
+        )
+
+        for form in normalization_forms:
+            for input_str in input_strings:
+                with self.subTest(form=form, input_str=input_str):
+                    self.assertIs(type(normalize(form, input_str)), str)
+                    self.assertIs(type(normalize(form, MyStr(input_str))), str)
+
  
  if __name__ == "__main__":
      unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst

new file mode 100644 (file)

index 0000000..c4b8965
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst
@@ -0,0 +1 @@
+Fix :func:`unicodedata.normalize` to always return a built-in :class:`str` object when given an input of a :class:`str` subclass, regardless of whether the string is already normalized.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c

index 60bde755d24574b50e5843f478d4f9135c0035c4..79be7674fc8ab5ddc2cedbdd077b9da0df033819 100644 (file)
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -933,34 +933,34 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
      if (PyUnicode_GET_LENGTH(input) == 0) {
          /* Special case empty input strings, since resizing
             them  later would cause internal errors. */
-        return Py_NewRef(input);
+        return PyUnicode_FromObject(input);
      }
  
      if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
          if (is_normalized_quickcheck(self, input,
                                       true,  false, true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
          }
          return nfc_nfkc(self, input, 0);
      }
      if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
          if (is_normalized_quickcheck(self, input,
                                       true,  true,  true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
          }
          return nfc_nfkc(self, input, 1);
      }
      if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
          if (is_normalized_quickcheck(self, input,
                                       false, false, true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
          }
          return nfd_nfkd(self, input, 0);
      }
      if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
          if (is_normalized_quickcheck(self, input,
                                       false, true,  true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
          }
          return nfd_nfkd(self, input, 1);
      }
author	Hizuru <106918920+Hizuru3@users.noreply.github.com>
	Fri, 21 Feb 2025 13:51:13 +0000 (22:51 +0900)
committer	GitHub <noreply@github.com>
	Fri, 21 Feb 2025 13:51:13 +0000 (14:51 +0100)
Lib/test/test_unicodedata.py		patch | blob | blame | history
Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst	[new file with mode: 0644]	patch | blob
Modules/unicodedata.c		patch | blob | blame | history