]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-129569: The function unicodedata.normalize() always returns built-in str (#129570)
authorHizuru <106918920+Hizuru3@users.noreply.github.com>
Fri, 21 Feb 2025 13:51:13 +0000 (22:51 +0900)
committerGitHub <noreply@github.com>
Fri, 21 Feb 2025 13:51:13 +0000 (14:51 +0100)
Co-authored-by: Victor Stinner <vstinner@python.org>
Lib/test/test_unicodedata.py
Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst [new file with mode: 0644]
Modules/unicodedata.c

index 0285f0d51f236593ca66bb8828235ef097153ee1..8e3fef6b6fe4a06dda9872ab02da5f8c8028f2ca 100644 (file)
@@ -467,6 +467,29 @@ class NormalizationTest(unittest.TestCase):
         # Check for bug 834676
         unicodedata.normalize('NFC', '\ud55c\uae00')
 
+    def test_normalize_return_type(self):
+        # gh-129569: normalize() return type must always be str
+        normalize = unicodedata.normalize
+
+        class MyStr(str):
+            pass
+
+        normalization_forms = ("NFC", "NFKC", "NFD", "NFKD")
+        input_strings = (
+            # normalized strings
+            "",
+            "ascii",
+            # unnormalized strings
+            "\u1e0b\u0323",
+            "\u0071\u0307\u0323",
+        )
+
+        for form in normalization_forms:
+            for input_str in input_strings:
+                with self.subTest(form=form, input_str=input_str):
+                    self.assertIs(type(normalize(form, input_str)), str)
+                    self.assertIs(type(normalize(form, MyStr(input_str))), str)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst b/Misc/NEWS.d/next/Library/2025-02-02-16-30-27.gh-issue-129569.i0kPOG.rst
new file mode 100644 (file)
index 0000000..c4b8965
--- /dev/null
@@ -0,0 +1 @@
+Fix :func:`unicodedata.normalize` to always return a built-in :class:`str` object when given an input of a :class:`str` subclass, regardless of whether the string is already normalized.
index 60bde755d24574b50e5843f478d4f9135c0035c4..79be7674fc8ab5ddc2cedbdd077b9da0df033819 100644 (file)
@@ -933,34 +933,34 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
     if (PyUnicode_GET_LENGTH(input) == 0) {
         /* Special case empty input strings, since resizing
            them  later would cause internal errors. */
-        return Py_NewRef(input);
+        return PyUnicode_FromObject(input);
     }
 
     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
         if (is_normalized_quickcheck(self, input,
                                      true,  false, true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
         }
         return nfc_nfkc(self, input, 0);
     }
     if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
         if (is_normalized_quickcheck(self, input,
                                      true,  true,  true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
         }
         return nfc_nfkc(self, input, 1);
     }
     if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
         if (is_normalized_quickcheck(self, input,
                                      false, false, true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
         }
         return nfd_nfkd(self, input, 0);
     }
     if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
         if (is_normalized_quickcheck(self, input,
                                      false, true,  true) == YES) {
-            return Py_NewRef(input);
+            return PyUnicode_FromObject(input);
         }
         return nfd_nfkd(self, input, 1);
     }