[3.13] gh-60462: Fix locale.strxfrm() on Solaris (GH-138242) (GH-138449)

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Wed, 3 Sep 2025 13:31:23 +0000 (15:31 +0200)

committer GitHub <noreply@github.com>

Wed, 3 Sep 2025 13:31:23 +0000 (13:31 +0000)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Wed, 3 Sep 2025 13:31:23 +0000 (15:31 +0200)
committer GitHub <noreply@github.com>
Wed, 3 Sep 2025 13:31:23 +0000 (13:31 +0000)
diff --git a/Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-60462.yh_vDc.rst b/Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-60462.yh_vDc.rst

new file mode 100644 (file)

index 0000000..1365b1b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-60462.yh_vDc.rst
@@ -0,0 +1 @@
+Fix :func:`locale.strxfrm` on Solaris (and possibly other platforms).
diff --git a/Modules/_localemodule.c b/Modules/_localemodule.c

index 60156700bc7cec5eda1409333da4fc4441c648d3..85d9062c0627f57ee159a960655b2d320c1f78dc 100644 (file)
--- a/Modules/_localemodule.c
+++ b/Modules/_localemodule.c
@@ -472,7 +472,54 @@ _locale_strxfrm_impl(PyObject *module, PyObject *str)
              goto exit;
          }
      }
-    result = PyUnicode_FromWideChar(buf, n2);
+    /* The result is just a sequence of integers; they are not necessarily
+       Unicode code points, so PyUnicode_FromWideChar() cannot be used
+       here. For example, 0xD83D 0xDC0D should not be larger than 0xFF41.
+     */
+#if SIZEOF_WCHAR_T == 4
+    {
+        /* Some codes can exceed the range of Unicode code points
+           (0 - 0x10FFFF), so they cannot be directly used in
+           PyUnicode_FromKindAndData(). They should be first encoded in
+           a way that preserves the lexicographical order.
+
+           Codes in the range 0-0x10000 represent themselves.
+           Codes larger than 0x10000 are encoded as a pair:
+           * 0x1xxxx -- the highest 16 bits
+           * 0x0xxxx -- the lowest 16 bits
+         */
+        size_t n3 = 0;
+        for (size_t i = 0; i < n2; i++) {
+            if ((Py_UCS4)buf[i] > 0x10000u) {
+                n3++;
+            }
+        }
+        if (n3) {
+            n3 += n2; // no integer overflow
+            Py_UCS4 *buf2 = PyMem_New(Py_UCS4, n3);
+            if (buf2 == NULL) {
+                PyErr_NoMemory();
+                goto exit;
+            }
+            size_t j = 0;
+            for (size_t i = 0; i < n2; i++) {
+                Py_UCS4 c = (Py_UCS4)buf[i];
+                if (c > 0x10000u) {
+                    buf2[j++] = (c >> 16) | 0x10000u;
+                    buf2[j++] = c & 0xFFFFu;
+                }
+                else {
+                    buf2[j++] = c;
+                }
+            }
+            assert(j == n3);
+            result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf2, n3);
+            PyMem_Free(buf2);
+            goto exit;
+        }
+    }
+#endif
+    result = PyUnicode_FromKindAndData(sizeof(wchar_t), buf, n2);
  exit:
      PyMem_Free(buf);
      PyMem_Free(s);
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Wed, 3 Sep 2025 13:31:23 +0000 (15:31 +0200)
committer	GitHub <noreply@github.com>
	Wed, 3 Sep 2025 13:31:23 +0000 (13:31 +0000)
Misc/NEWS.d/next/Library/2025-08-30-10-04-28.gh-issue-60462.yh_vDc.rst	[new file with mode: 0644]	patch \| blob
Modules/_localemodule.c		patch \| blob \| blame \| history