Merged revisions 67937-67938 via svnmerge from

author Georg Brandl <georg@python.org>
Sat, 3 Jan 2009 23:34:15 +0000 (23:34 +0000)
committer Georg Brandl <georg@python.org>
Sat, 3 Jan 2009 23:34:15 +0000 (23:34 +0000)
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py

index 44f84774a5a8b23ec9c1ca53486ea19780640e78..f65093ae41f640a950b53a6c3c0d3c5ff9040464 100644 (file)
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -484,14 +484,21 @@ class AbstractPickleTests(unittest.TestCase):
              self.assertRaises(ValueError, self.loads, buf)
  
      def test_unicode(self):
-        endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',
-                    '<\\>', '<\\\U00012345>']
+        endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',  '<\\>',
+                    '<\\\U00012345>']
          for proto in protocols:
              for u in endcases:
                  p = self.dumps(u, proto)
                  u2 = self.loads(p)
                  self.assertEqual(u2, u)
  
+    def test_unicode_high_plane(self):
+        t = '\U00012345'
+        for proto in protocols:
+            p = self.dumps(t, proto)
+            t2 = self.loads(p)
+            self.assertEqual(t2, t)
+
      def test_bytes(self):
          for proto in protocols:
              for u in b'', b'xyz', b'xyz'*100:
diff --git a/Modules/_pickle.c b/Modules/_pickle.c

index a0810b99a9f9086315c7e9ebd457403e7d7b9509..6cc90b3f1e8b00dd7eb57fcb93f2434f930d78a3 100644 (file)
--- a/Modules/_pickle.c
+++ b/Modules/_pickle.c
@@ -1109,16 +1109,21 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
      static const char *hexdigits = "0123456789abcdef";
  
  #ifdef Py_UNICODE_WIDE
-    repr = PyBytes_FromStringAndSize(NULL, 10 * size);
+    const Py_ssize_t expandsize = 10;
  #else
-    repr = PyBytes_FromStringAndSize(NULL, 6 * size);
+    const Py_ssize_t expandsize = 6;
  #endif
+    
+    if (size > PY_SSIZE_T_MAX / expandsize)
+        return PyErr_NoMemory();
+    
+    repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
      if (repr == NULL)
          return NULL;
      if (size == 0)
          goto done;
  
-    p = q = PyBytes_AS_STRING(repr);
+    p = q = PyByteArray_AS_STRING(repr);
      while (size-- > 0) {
          Py_UNICODE ch = *s++;
  #ifdef Py_UNICODE_WIDE
@@ -1136,6 +1141,32 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
              *p++ = hexdigits[ch & 15];
          }
          else
+#else
+            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
+            if (ch >= 0xD800 && ch < 0xDC00) {
+                Py_UNICODE ch2;
+                Py_UCS4 ucs;
+
+                ch2 = *s++;
+                size--;
+                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+                    *p++ = '\\';
+                    *p++ = 'U';
+                    *p++ = hexdigits[(ucs >> 28) & 0xf];
+                    *p++ = hexdigits[(ucs >> 24) & 0xf];
+                    *p++ = hexdigits[(ucs >> 20) & 0xf];
+                    *p++ = hexdigits[(ucs >> 16) & 0xf];
+                    *p++ = hexdigits[(ucs >> 12) & 0xf];
+                    *p++ = hexdigits[(ucs >> 8) & 0xf];
+                    *p++ = hexdigits[(ucs >> 4) & 0xf];
+                    *p++ = hexdigits[ucs & 0xf];
+                    continue;
+                }
+                /* Fall through: isolated surrogates are copied as-is */
+                s--;
+                size++;
+            }
  #endif
          /* Map 16-bit characters to '\uxxxx' */
          if (ch >= 256 || ch == '\\' || ch == '\n') {
@@ -1146,14 +1177,14 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
              *p++ = hexdigits[(ch >> 4) & 0xf];
              *p++ = hexdigits[ch & 15];
          }
-       /* Copy everything else as-is */
+        /* Copy everything else as-is */
          else
              *p++ = (char) ch;
      }
      size = p - q;
  
    done:
-    result = PyBytes_FromStringAndSize(PyBytes_AS_STRING(repr), size);
+    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
      Py_DECREF(repr);
      return result;
  }
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index f854100b4a95ee3659fd7939bc0bcfa1b69cc421..d22128f7ff3b0644170ea4fab2f817b80236447a 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3257,20 +3257,14 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
  
  PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
  {
-    PyObject *s, *result;
+    PyObject *s;
      if (!PyUnicode_Check(unicode)) {
          PyErr_BadArgument();
          return NULL;
      }
      s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                        PyUnicode_GET_SIZE(unicode));
-
-    if (!s)
-        return NULL;
-    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
-                                        PyByteArray_GET_SIZE(s));
-    Py_DECREF(s);
-    return result;
+    return s;
  }
  
  /* --- Raw Unicode Escape Codec ------------------------------------------- */
@@ -3482,7 +3476,7 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
  
  PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
  {
-    PyObject *s, *result;
+    PyObject *s;
      if (!PyUnicode_Check(unicode)) {
          PyErr_BadArgument();
          return NULL;
@@ -3490,12 +3484,7 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
      s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                           PyUnicode_GET_SIZE(unicode));
  
-    if (!s)
-        return NULL;
-    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s),
-                                        PyByteArray_GET_SIZE(s));
-    Py_DECREF(s);
-    return result;
+    return s;
  }
  
  /* --- Unicode Internal Codec ------------------------------------------- */
author	Georg Brandl <georg@python.org>
	Sat, 3 Jan 2009 23:34:15 +0000 (23:34 +0000)
committer	Georg Brandl <georg@python.org>
	Sat, 3 Jan 2009 23:34:15 +0000 (23:34 +0000)
Lib/test/pickletester.py		patch | blob | blame | history
Modules/_pickle.c		patch | blob | blame | history
Objects/unicodeobject.c		patch | blob | blame | history