goto exit;
}
}
- result = PyUnicode_FromWideChar(buf, n2);
+ /* The result is just a sequence of integers, they are not necessary
+ Unicode code points, so PyUnicode_FromWideChar() cannot be used
+ here. For example, 0xD83D 0xDC0D should not be larger than 0xFF41.
+ */
+#if SIZEOF_WCHAR_T == 4
+ {
+ /* Some codes can exceed the range of Unicode code points
+ (0 - 0x10FFFF), so they cannot be directly used in
+ PyUnicode_FromKindAndData(). They should be first encoded in
+ a way that preserves the lexicographical order.
+
+ Codes in the range 0-0xFFFF represent themself.
+ Codes larger than 0xFFFF are encoded as a pair:
+ * 0x1xxxx -- the highest 16 bits
+ * 0x0xxxx -- the lowest 16 bits
+ */
+ size_t n3 = 0;
+ for (size_t i = 0; i < n2; i++) {
+ if ((Py_UCS4)buf[i] > 0x10000u) {
+ n3++;
+ }
+ }
+ if (n3) {
+ n3 += n2; // no integer overflow
+ Py_UCS4 *buf2 = PyMem_New(Py_UCS4, n3);
+ if (buf2 == NULL) {
+ PyErr_NoMemory();
+ goto exit;
+ }
+ size_t j = 0;
+ for (size_t i = 0; i < n2; i++) {
+ Py_UCS4 c = (Py_UCS4)buf[i];
+ if (c > 0x10000u) {
+ buf2[j++] = (c >> 16) | 0x10000u;
+ buf2[j++] = c & 0xFFFFu;
+ }
+ else {
+ buf2[j++] = c;
+ }
+ }
+ assert(j == n3);
+ result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf2, n3);
+ PyMem_Free(buf2);
+ goto exit;
+ }
+ }
+#endif
+ result = PyUnicode_FromKindAndData(sizeof(wchar_t), buf, n2);
exit:
PyMem_Free(buf);
PyMem_Free(s);