Implement sys.maxunicode.
Explicitly wrap around upper/lower computations for wide Py_UNICODE.
When decoding large characters with UTF-8, represent expected test
results using the \U notation.
PyObject *unicode /* Unicode object */
);
+/* Get the maximum ordinal for a Unicode character in this build
+   (0x10FFFF when USE_UCS4_STORAGE is defined, 0xFFFF otherwise). */
+extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
+
/* Resize an already allocated Unicode object to the new size length.
*unicode is modified to point to the new (resized) object and 0
''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
# UTF-8 specific decoding tests
verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
- 'utf-8') == u'\ud84d\udc56' )
+ 'utf-8') == u'\U00023456' )
verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
- 'utf-8') == u'\ud800\udc02' )
+ 'utf-8') == u'\U00010002' )
verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
'utf-8') == u'\u20ac' )
/* Returns the titlecase Unicode characters corresponding to ch or just
ch if no titlecase mapping is known. */
-Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->title)
- return ch + ctype->title;
-
- return ch + ctype->upper;
+        ch += ctype->title;
+    else
+        /* No titlecase offset recorded: use the uppercase offset. */
+        ch += ctype->upper;
+
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000,
+       so a sum that reaches 0x10000 has to be reduced explicitly
+       (0x10000 itself wraps to 0, hence >= rather than >).
+       NOTE(review): a ch that was already >= 0x10000 on entry is
+       reduced here as well -- confirm that only BMP values reach
+       this function. */
+    if (ch >= 0x10000)
+        ch -= 0x10000;
+#endif
+    return ch;
}
/* Returns 1 for Unicode characters having the category 'Lt', 0
/* Returns the uppercase Unicode characters corresponding to ch or just
ch if no uppercase mapping is known. */
-Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- return ch + ctype->upper;
+    ch += ctype->upper;
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000,
+       so a sum that reaches 0x10000 has to be reduced explicitly
+       (0x10000 itself wraps to 0, hence >= rather than >).
+       NOTE(review): a ch that was already >= 0x10000 on entry is
+       reduced here as well -- confirm that only BMP values reach
+       this function. */
+    if (ch >= 0x10000)
+        ch -= 0x10000;
+#endif
+    return ch;
}
/* Returns the lowercase Unicode characters corresponding to ch or just
ch if no lowercase mapping is known. */
-Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
- return ch + ctype->lower;
+    ch += ctype->lower;
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000,
+       so a sum that reaches 0x10000 has to be reduced explicitly
+       (0x10000 itself wraps to 0, hence >= rather than >).
+       NOTE(review): a ch that was already >= 0x10000 on entry is
+       reduced here as well -- confirm that only BMP values reach
+       this function. */
+    if (ch >= 0x10000)
+        ch -= 0x10000;
+#endif
+    return ch;
}
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
*/
static char unicode_default_encoding[100];
+/* Return the maximum ordinal for a Unicode character in this build:
+   0x10FFFF when USE_UCS4_STORAGE is defined, 0xFFFF otherwise. */
+Py_UNICODE
+PyUnicode_GetMax(void)
+{
+#ifdef USE_UCS4_STORAGE
+    return 0x10FFFF;
+#else
+    /* This is actually an illegal character, so it should
+       not be passed to unichr. */
+    return 0xFFFF;
+#endif
+}
+
/* --- Unicode Object ----------------------------------------------------- */
static
cbWritten += 2;
}
else if (ch < 0x10000) {
-#if Py_UNICODE_SIZE == 4
- *p++ = 0xe0 | (ch>>12);
- *p++ = 0x80 | ((ch>>6) & 0x3f);
- *p++ = 0x80 | (ch & 0x3f);
- cbWritten += 3;
-#else
/* Check for high surrogate */
if (0xD800 <= ch && ch <= 0xDBFF) {
if (i != size) {
}
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f));
-#endif
} else {
*p++ = 0xf0 | (ch>>18);
*p++ = 0x80 | ((ch>>12) & 0x3f);
Static objects:\n\
\n\
maxint -- the largest supported integer (the smallest is -maxint-1)\n\
+maxunicode -- the largest supported character\n\
builtin_module_names -- tuple of module names built into this intepreter\n\
version -- the version of this interpreter as a string\n\
version_info -- version information as a tuple\n\
PyDict_SetItemString(sysdict, "maxint",
v = PyInt_FromLong(PyInt_GetMax()));
Py_XDECREF(v);
+ PyDict_SetItemString(sysdict, "maxunicode",
+ v = PyInt_FromLong(PyUnicode_GetMax()));
+ Py_XDECREF(v);
PyDict_SetItemString(sysdict, "builtin_module_names",
v = list_builtin_module_names());
Py_XDECREF(v);