From d1bb75505fbb5b47f4f3d420e00964b38c24b203 Mon Sep 17 00:00:00 2001 From: "Michael W. Hudson" Date: Mon, 7 Oct 2002 12:32:57 +0000 Subject: [PATCH] Backport: 2002/08/11 12:23:04 lemburg Python/bltinmodule.c 2.262 2002/08/11 12:23:04 lemburg Objects/unicodeobject.c 2.162 2002/08/11 12:23:03 lemburg Misc/NEWS 1.461 2002/08/11 12:23:03 lemburg Lib/test/test_unicode.py 1.65 2002/08/11 12:23:03 lemburg Include/unicodeobject.h 2.39 Add C API PyUnicode_FromOrdinal() which exposes unichr() at C level. u'%c' will now raise a ValueError in case the argument is an integer outside the valid range of Unicode code point ordinals. Closes SF bug #593581. --- Include/unicodeobject.h | 12 +++++++++ Lib/test/test_unicode.py | 16 ++++++++++++ Misc/NEWS | 6 +++++ Objects/unicodeobject.c | 56 +++++++++++++++++++++++++++++++++++++++- Python/bltinmodule.c | 35 +------------------------ 5 files changed, 90 insertions(+), 35 deletions(-) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 103649deb350..523f3be3736e 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -517,6 +517,18 @@ extern DL_IMPORT(int) PyUnicode_AsWideChar( #endif +/* --- Unicode ordinals --------------------------------------------------- */ + +/* Create a Unicode Object from the given Unicode code point ordinal. + + The ordinal must be in range(0x10000) on narrow Python builds + (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is + raised in case it is not. + +*/ + +extern DL_IMPORT(PyObject*) PyUnicode_FromOrdinal(int ordinal); + /* === Builtin Codecs ===================================================== Many of these APIs take two arguments encoding and errors. These diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 6125c92efc97..d4209249b668 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -442,6 +442,14 @@ except KeyError: else: verify(value == u'abc, def') +for ordinal in (-100, 0x200000): + try: + u"%c" % ordinal + except ValueError: + pass + else: + print '*** formatting u"%%c" %% %i should give a ValueError' % ordinal + # formatting jobs delegated from the string implementation: verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...') verify('...%(foo)s...' % {'foo':"abc"} == '...abc...') @@ -737,6 +745,14 @@ for encoding in ( except ValueError,why: print '*** codec for "%s" failed: %s' % (encoding, why) +# UTF-8 must be roundtrip safe for all UCS-2 code points +# This excludes surrogates: in the full range, there would be +# a surrogate pair (\udbff\udc00), which gets converted back +# to a non-BMP character (\U0010fc00) +u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000))) +for encoding in ('utf-8',): + verify(unicode(u.encode(encoding),encoding) == u) + print 'done.' print 'Testing Unicode string concatenation...', diff --git a/Misc/NEWS b/Misc/NEWS index 05728f2d4d6a..b6b0dd0e7a12 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -4,6 +4,9 @@ Release date: dd-mmm-2002 Core and builtins +- u'%c' will now raise a ValueError in case the argument is an + integer outside the valid range of Unicode code point ordinals. + - When x is an object whose class implements __mul__ and __rmul__, 1.0*x would correctly invoke __rmul__, but 1*x would erroneously invoke __mul__. This was due to the sequence-repeat code in the int @@ -87,6 +90,9 @@ Build C API +- New C API PyUnicode_FromOrdinal() which exposes unichr() at C + level. + Windows - SF bug 595919: popenN return only text mode pipes diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 642b0f643a0f..edee91e27e0b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -390,6 +390,45 @@ int PyUnicode_AsWideChar(PyUnicodeObject *unicode, #endif +PyObject *PyUnicode_FromOrdinal(int ordinal) +{ + Py_UNICODE s[2]; + +#ifdef Py_UNICODE_WIDE + if (ordinal < 0 || ordinal > 0x10ffff) { + PyErr_SetString(PyExc_ValueError, + "unichr() arg not in range(0x110000) " + "(wide Python build)"); + return NULL; + } +#else + if (ordinal < 0 || ordinal > 0xffff) { + PyErr_SetString(PyExc_ValueError, + "unichr() arg not in range(0x10000) " + "(narrow Python build)"); + return NULL; + } +#endif + + if (ordinal <= 0xffff) { + /* UCS-2 character */ + s[0] = (Py_UNICODE) ordinal; + return PyUnicode_FromUnicode(s, 1); + } + else { +#ifndef Py_UNICODE_WIDE + /* UCS-4 character. store as two surrogate characters */ + ordinal -= 0x10000L; + s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10); + s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF); + return PyUnicode_FromUnicode(s, 2); +#else + s[0] = (Py_UNICODE)ordinal; + return PyUnicode_FromUnicode(s, 1); +#endif + } +} + PyObject *PyUnicode_FromObject(register PyObject *obj) { /* XXX Perhaps we should make this API an alias of @@ -5322,7 +5361,22 @@ formatchar(Py_UNICODE *buf, x = PyInt_AsLong(v); if (x == -1 && PyErr_Occurred()) goto onError; - buf[0] = (char) x; +#ifdef Py_UNICODE_WIDE + if (x < 0 || x > 0x10ffff) { + PyErr_SetString(PyExc_ValueError, + "%c arg not in range(0x110000) " + "(wide Python build)"); + return -1; + } +#else + if (x < 0 || x > 0xffff) { + PyErr_SetString(PyExc_ValueError, + "%c arg not in range(0x10000) " + "(narrow Python build)"); + return -1; + } +#endif + buf[0] = (Py_UNICODE) x; } buf[1] = '\0'; return 1; diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c index 4601c49c486c..b85f6858ecc4 100644 --- a/Python/bltinmodule.c +++ b/Python/bltinmodule.c @@ -301,44 +301,11 @@ static PyObject * builtin_unichr(PyObject *self, PyObject *args) { long x; - Py_UNICODE s[2]; if (!PyArg_ParseTuple(args, "l:unichr", &x)) return NULL; -#ifdef Py_UNICODE_WIDE - if (x < 0 || x > 0x10ffff) { - PyErr_SetString(PyExc_ValueError, - "unichr() arg not in range(0x110000) " - "(wide Python build)"); - return NULL; - } -#else - if (x < 0 || x > 0xffff) { - PyErr_SetString(PyExc_ValueError, - "unichr() arg not in range(0x10000) " - "(narrow Python build)"); - return NULL; - } -#endif - - if (x <= 0xffff) { - /* UCS-2 character */ - s[0] = (Py_UNICODE) x; - return PyUnicode_FromUnicode(s, 1); - } - else { -#ifndef Py_UNICODE_WIDE - /* UCS-4 character. store as two surrogate characters */ - x -= 0x10000L; - s[0] = 0xD800 + (Py_UNICODE) (x >> 10); - s[1] = 0xDC00 + (Py_UNICODE) (x & 0x03FF); - return PyUnicode_FromUnicode(s, 2); -#else - s[0] = (Py_UNICODE)x; - return PyUnicode_FromUnicode(s, 1); -#endif - } + return PyUnicode_FromOrdinal(x); } static char unichr_doc[] = -- 2.47.3