From: Georg Brandl Date: Sat, 3 Jan 2009 23:34:15 +0000 (+0000) Subject: Merged revisions 67937-67938 via svnmerge from X-Git-Tag: v3.0.1~190 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a12d9d46456c789968bde7cfbae8e96485d01ddf;p=thirdparty%2FPython%2Fcpython.git Merged revisions 67937-67938 via svnmerge from svn+ssh://svn.python.org/python/branches/py3k ........ r67937 | alexandre.vassalotti | 2008-12-27 08:32:41 +0100 (Sat, 27 Dec 2008) | 3 lines Update copy of PyUnicode_EncodeRawUnicodeEscape in _pickle. Add astral character test case. ........ r67938 | alexandre.vassalotti | 2008-12-27 10:09:15 +0100 (Sat, 27 Dec 2008) | 3 lines Fix wrong bytes type conversion in PyUnicode_AsUnicodeEscapeString. Fix wrong bytes type conversion in PyUnicode_AsUnicodeDecodeString. ........ --- diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py index 44f84774a5a8..f65093ae41f6 100644 --- a/Lib/test/pickletester.py +++ b/Lib/test/pickletester.py @@ -484,14 +484,21 @@ class AbstractPickleTests(unittest.TestCase): self.assertRaises(ValueError, self.loads, buf) def test_unicode(self): - endcases = ['', '<\\u>', '<\\\u1234>', '<\n>', - '<\\>', '<\\\U00012345>'] + endcases = ['', '<\\u>', '<\\\u1234>', '<\n>', '<\\>', + '<\\\U00012345>'] for proto in protocols: for u in endcases: p = self.dumps(u, proto) u2 = self.loads(p) self.assertEqual(u2, u) + def test_unicode_high_plane(self): + t = '\U00012345' + for proto in protocols: + p = self.dumps(t, proto) + t2 = self.loads(p) + self.assertEqual(t2, t) + def test_bytes(self): for proto in protocols: for u in b'', b'xyz', b'xyz'*100: diff --git a/Modules/_pickle.c b/Modules/_pickle.c index a0810b99a9f9..6cc90b3f1e8b 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1109,16 +1109,21 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) static const char *hexdigits = "0123456789abcdef"; #ifdef Py_UNICODE_WIDE - repr = PyBytes_FromStringAndSize(NULL, 10 * size); + const Py_ssize_t expandsize = 10; #else - repr = PyBytes_FromStringAndSize(NULL, 6 * size); + const Py_ssize_t expandsize = 6; #endif + + if (size > PY_SSIZE_T_MAX / expandsize) + return PyErr_NoMemory(); + + repr = PyByteArray_FromStringAndSize(NULL, expandsize * size); if (repr == NULL) return NULL; if (size == 0) goto done; - p = q = PyBytes_AS_STRING(repr); + p = q = PyByteArray_AS_STRING(repr); while (size-- > 0) { Py_UNICODE ch = *s++; #ifdef Py_UNICODE_WIDE @@ -1136,6 +1141,32 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) *p++ = hexdigits[ch & 15]; } else +#else + /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */ + if (ch >= 0xD800 && ch < 0xDC00) { + Py_UNICODE ch2; + Py_UCS4 ucs; + + ch2 = *s++; + size--; + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; + *p++ = '\\'; + *p++ = 'U'; + *p++ = hexdigits[(ucs >> 28) & 0xf]; + *p++ = hexdigits[(ucs >> 24) & 0xf]; + *p++ = hexdigits[(ucs >> 20) & 0xf]; + *p++ = hexdigits[(ucs >> 16) & 0xf]; + *p++ = hexdigits[(ucs >> 12) & 0xf]; + *p++ = hexdigits[(ucs >> 8) & 0xf]; + *p++ = hexdigits[(ucs >> 4) & 0xf]; + *p++ = hexdigits[ucs & 0xf]; + continue; + } + /* Fall through: isolated surrogates are copied as-is */ + s--; + size++; + } #endif /* Map 16-bit characters to '\uxxxx' */ if (ch >= 256 || ch == '\\' || ch == '\n') { @@ -1146,14 +1177,14 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) *p++ = hexdigits[(ch >> 4) & 0xf]; *p++ = hexdigits[ch & 15]; } - /* Copy everything else as-is */ + /* Copy everything else as-is */ else *p++ = (char) ch; } size = p - q; done: - result = PyBytes_FromStringAndSize(PyBytes_AS_STRING(repr), size); + result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size); Py_DECREF(repr); return result; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f854100b4a95..d22128f7ff3b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3257,20 +3257,14 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) { - PyObject *s, *result; + PyObject *s; if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; } s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode)); - - if (!s) - return NULL; - result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), - PyByteArray_GET_SIZE(s)); - Py_DECREF(s); - return result; + return s; } /* --- Raw Unicode Escape Codec ------------------------------------------- */ @@ -3482,7 +3476,7 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) { - PyObject *s, *result; + PyObject *s; if (!PyUnicode_Check(unicode)) { PyErr_BadArgument(); return NULL; @@ -3490,12 +3484,7 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode)); - if (!s) - return NULL; - result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(s), - PyByteArray_GET_SIZE(s)); - Py_DECREF(s); - return result; + return s; } /* --- Unicode Internal Codec ------------------------------------------- */