Encode surrogates in UTF-8 even for a wide Py_UNICODE.

author Martin v. Löwis <martin@v.loewis.de>

Wed, 27 Jun 2001 06:28:56 +0000 (06:28 +0000)

committer Martin v. Löwis <martin@v.loewis.de>

Wed, 27 Jun 2001 06:28:56 +0000 (06:28 +0000)
author Martin v. Löwis <martin@v.loewis.de>
Wed, 27 Jun 2001 06:28:56 +0000 (06:28 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Wed, 27 Jun 2001 06:28:56 +0000 (06:28 +0000)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 87e01af1736fb4940f73f42781fb704cfda27a5e..d89537fc91edbba59156c4939f2d2b30cdd19803 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -274,6 +274,9 @@ extern DL_IMPORT(int) PyUnicode_GetSize(
      PyObject *unicode          /* Unicode object */
      );
  
+/* Get the maximum ordinal for a Unicode character. */
+extern DL_IMPORT(Py_UNICODE) PyUnicode_GetMax(void);
+
  /* Resize an already allocated Unicode object to the new size length.
  
     *unicode is modified to point to the new (resized) object and 0
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index c82ac6959e06bc6c3c2dcba392fb8c6f1a2f8c8f..c9732d66d86bbbed0ae90401f1260aea9d37065a 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -386,9 +386,9 @@ verify(u'\ud84d\udc56'.encode('utf-8') == \
         ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) )
  # UTF-8 specific decoding tests
  verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))),
-               'utf-8') == u'\ud84d\udc56' )
+               'utf-8') == u'\U00023456' )
  verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))),
-               'utf-8') == u'\ud800\udc02' )
+               'utf-8') == u'\U00010002' )
  verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))),
                 'utf-8') == u'\u20ac' )
  
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c

index 3bc19b2d447f3470e4811c2d1fa3c3638c0956ff..13fc6128c146bb32517f8b754c294dad258e1dc0 100644 (file)
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -59,14 +59,21 @@ int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
  /* Returns the titlecase Unicode characters corresponding to ch or just
     ch if no titlecase mapping is known. */
  
-Py_UNICODE _PyUnicode_ToTitlecase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
  {
      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  
      if (ctype->title)
-        return ch + ctype->title;
-
-    return ch + ctype->upper;
+        ch += ctype->title;
+    else
+       ch += ctype->upper;
+
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000. */
+    if (ch > 0x10000)
+       ch -= 0x10000;
+#endif
+    return ch;
  }
  
  /* Returns 1 for Unicode characters having the category 'Lt', 0
@@ -348,21 +355,33 @@ int _PyUnicode_IsUppercase(register const Py_UNICODE ch)
  /* Returns the uppercase Unicode characters corresponding to ch or just
     ch if no uppercase mapping is known. */
  
-Py_UNICODE _PyUnicode_ToUppercase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToUppercase(register Py_UNICODE ch)
  {
      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  
-    return ch + ctype->upper;
+    ch += ctype->upper;
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000. */
+    if (ch > 0x10000)
+       ch -= 0x10000;
+#endif
+    return ch;
  }
  
  /* Returns the lowercase Unicode characters corresponding to ch or just
     ch if no lowercase mapping is known. */
  
-Py_UNICODE _PyUnicode_ToLowercase(register const Py_UNICODE ch)
+Py_UNICODE _PyUnicode_ToLowercase(register Py_UNICODE ch)
  {
      const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  
-    return ch + ctype->lower;
+    ch += ctype->lower;
+#ifdef USE_UCS4_STORAGE
+    /* The database assumes that the values wrap around at 0x10000. */
+    if (ch > 0x10000)
+       ch -= 0x10000;
+#endif
+    return ch;
  }
  
  /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index ffac3710df4062bfa6314516d9699008c22ec689..2f66c3cf93ea6d1fcb50e0870d5bed6b89bfb6f8 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -103,6 +103,18 @@ static PyUnicodeObject *unicode_latin1[256];
  */
  static char unicode_default_encoding[100];
  
+Py_UNICODE
+PyUnicode_GetMax()
+{
+#ifdef USE_UCS4_STORAGE
+       return 0x10FFFF;
+#else
+       /* This is actually an illegal character, so it should
+          not be passed to unichr. */
+       return 0xFFFF;
+#endif
+}
+
  /* --- Unicode Object ----------------------------------------------------- */
  
  static
@@ -884,12 +896,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
              cbWritten += 2;
          }
          else if (ch < 0x10000) {
-#if Py_UNICODE_SIZE == 4
-           *p++ = 0xe0 | (ch>>12);
-            *p++ = 0x80 | ((ch>>6) & 0x3f);
-            *p++ = 0x80 | (ch & 0x3f);
-            cbWritten += 3;
-#else
              /* Check for high surrogate */
              if (0xD800 <= ch && ch <= 0xDBFF) {
                  if (i != size) {
@@ -920,7 +926,6 @@ PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
              }
              *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
              *p++ = (char)(0x80 | (ch & 0x3f));
-#endif
          } else {
              *p++ = 0xf0 | (ch>>18);
              *p++ = 0x80 | ((ch>>12) & 0x3f);
diff --git a/Python/sysmodule.c b/Python/sysmodule.c

index 62e08414d8c9d93f3ce2977032ab89d87d27ffbb..fe880d5443ca4b04dc9d83252f7a20d6f4894039 100644 (file)
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -533,6 +533,7 @@ exc_traceback -- traceback of exception currently being handled\n\
  Static objects:\n\
  \n\
  maxint -- the largest supported integer (the smallest is -maxint-1)\n\
+maxunicode -- the largest supported character\n\
  builtin_module_names -- tuple of module names built into this intepreter\n\
  version -- the version of this interpreter as a string\n\
  version_info -- version information as a tuple\n\
@@ -643,6 +644,9 @@ _PySys_Init(void)
         PyDict_SetItemString(sysdict, "maxint",
                              v = PyInt_FromLong(PyInt_GetMax()));
         Py_XDECREF(v);
+       PyDict_SetItemString(sysdict, "maxunicode",
+                            v = PyInt_FromLong(PyUnicode_GetMax()));
+       Py_XDECREF(v);
         PyDict_SetItemString(sysdict, "builtin_module_names",
                    v = list_builtin_module_names());
         Py_XDECREF(v);
author	Martin v. Löwis <martin@v.loewis.de>
	Wed, 27 Jun 2001 06:28:56 +0000 (06:28 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Wed, 27 Jun 2001 06:28:56 +0000 (06:28 +0000)
Include/unicodeobject.h		patch | blob | blame | history
Lib/test/test_unicode.py		patch | blob | blame | history
Objects/unicodectype.c		patch | blob | blame | history
Objects/unicodeobject.c		patch | blob | blame | history
Python/sysmodule.c		patch | blob | blame | history