From d1bb75505fbb5b47f4f3d420e00964b38c24b203 Mon Sep 17 00:00:00 2001
From: "Michael W. Hudson" <mwh@python.net>
Date: Mon, 7 Oct 2002 12:32:57 +0000
Subject: [PATCH] Backport:

2002/08/11 12:23:04 lemburg Python/bltinmodule.c 2.262
2002/08/11 12:23:04 lemburg Objects/unicodeobject.c 2.162
2002/08/11 12:23:03 lemburg Misc/NEWS 1.461
2002/08/11 12:23:03 lemburg Lib/test/test_unicode.py 1.65
2002/08/11 12:23:03 lemburg Include/unicodeobject.h 2.39
Add C API PyUnicode_FromOrdinal() which exposes unichr() at C level.

u'%c' will now raise a ValueError in case the argument is an
integer outside the valid range of Unicode code point ordinals.

Closes SF bug #593581.
---
 Include/unicodeobject.h  | 12 +++++++++
 Lib/test/test_unicode.py | 16 ++++++++++++
 Misc/NEWS                |  6 +++++
 Objects/unicodeobject.c  | 56 +++++++++++++++++++++++++++++++++++++++-
 Python/bltinmodule.c     | 35 +------------------------
 5 files changed, 90 insertions(+), 35 deletions(-)

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 103649deb350..523f3be3736e 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -517,6 +517,18 @@ extern DL_IMPORT(int) PyUnicode_AsWideChar(
 
 #endif
 
+/* --- Unicode ordinals --------------------------------------------------- */
+
+/* Create a Unicode object from the given Unicode code point ordinal.
+
+   The ordinal must be in range(0x10000) on narrow Python builds
+   (UCS2), and range(0x110000) on wide builds (UCS4). If it is not,
+   a ValueError is set and NULL is returned.
+
+*/
+
+extern DL_IMPORT(PyObject*) PyUnicode_FromOrdinal(int ordinal);
+
 /* === Builtin Codecs ===================================================== 
 
    Many of these APIs take two arguments encoding and errors. These
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 6125c92efc97..d4209249b668 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -442,6 +442,14 @@ except KeyError:
 else:
     verify(value == u'abc, def')
 
+for ordinal in (-100, 0x200000):
+    try:
+        u"%c" % ordinal
+    except ValueError:
+        pass
+    else:
+        print '*** formatting u"%%c" %% %i should give a ValueError' % ordinal
+
 # formatting jobs delegated from the string implementation:
 verify('...%(foo)s...' % {'foo':u"abc"} == u'...abc...')
 verify('...%(foo)s...' % {'foo':"abc"} == '...abc...')
@@ -737,6 +745,14 @@ for encoding in (
     except ValueError,why:
         print '*** codec for "%s" failed: %s' % (encoding, why)
 
+# UTF-8 must be roundtrip safe for all UCS-2 code points
+# This excludes surrogates: in the full range, there would be
+# a surrogate pair (\udbff\udc00), which gets converted back
+# to a non-BMP character (\U0010fc00)
+u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
+for encoding in ('utf-8',):
+    verify(unicode(u.encode(encoding),encoding) == u)
+
 print 'done.'
 
 print 'Testing Unicode string concatenation...',
diff --git a/Misc/NEWS b/Misc/NEWS
index 05728f2d4d6a..b6b0dd0e7a12 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -4,6 +4,9 @@ Release date: dd-mmm-2002
 
 Core and builtins
 
+- u'%c' will now raise a ValueError in case the argument is an
+  integer outside the valid range of Unicode code point ordinals.
+
 - When x is an object whose class implements __mul__ and __rmul__,
   1.0*x would correctly invoke __rmul__, but 1*x would erroneously
   invoke __mul__.  This was due to the sequence-repeat code in the int
@@ -87,6 +90,9 @@ Build
 
 C API
 
+- New C API PyUnicode_FromOrdinal() which exposes unichr() at C
+  level.
+
 Windows
 
 - SF bug 595919:  popenN return only text mode pipes
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 642b0f643a0f..edee91e27e0b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -390,6 +390,45 @@ int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 
 #endif
 
+/* Create a Unicode object for the code point `ordinal`; this is the
+   C-level counterpart of the unichr() builtin.
+
+   The ordinal must be in range(0x110000) on wide (Py_UNICODE_WIDE)
+   builds and in range(0x10000) on narrow builds.  Otherwise a
+   ValueError is set and NULL is returned.
+
+   On success the result of PyUnicode_FromUnicode() for the single
+   character is returned. */
+
+PyObject *PyUnicode_FromOrdinal(int ordinal)
+{
+    Py_UNICODE s[1];
+
+#ifdef Py_UNICODE_WIDE
+    if (ordinal < 0 || ordinal > 0x10ffff) {
+	PyErr_SetString(PyExc_ValueError,
+			"unichr() arg not in range(0x110000) "
+			"(wide Python build)");
+	return NULL;
+    }
+#else
+    if (ordinal < 0 || ordinal > 0xffff) {
+	PyErr_SetString(PyExc_ValueError,
+			"unichr() arg not in range(0x10000) "
+			"(narrow Python build)");
+	return NULL;
+    }
+#endif
+
+    /* The range checks above guarantee that the ordinal fits into a
+       single Py_UNICODE on either build: narrow builds reject
+       everything above 0xffff, so a surrogate pair can never be
+       needed here, and wide builds store any valid code point
+       directly. */
+    s[0] = (Py_UNICODE) ordinal;
+    return PyUnicode_FromUnicode(s, 1);
+}
+
 PyObject *PyUnicode_FromObject(register PyObject *obj)
 {
     /* XXX Perhaps we should make this API an alias of
@@ -5322,7 +5361,22 @@ formatchar(Py_UNICODE *buf,
 	x = PyInt_AsLong(v);
 	if (x == -1 && PyErr_Occurred())
 	    goto onError;
-	buf[0] = (char) x;
+#ifdef Py_UNICODE_WIDE
+	if (x < 0 || x > 0x10ffff) {
+	    PyErr_SetString(PyExc_ValueError,
+			    "%c arg not in range(0x110000) "
+			    "(wide Python build)");
+	    return -1;
+	}
+#else
+	if (x < 0 || x > 0xffff) {
+	    PyErr_SetString(PyExc_ValueError,
+			    "%c arg not in range(0x10000) "
+			    "(narrow Python build)");
+	    return -1;
+	}
+#endif
+	buf[0] = (Py_UNICODE) x;
     }
     buf[1] = '\0';
     return 1;
diff --git a/Python/bltinmodule.c b/Python/bltinmodule.c
index 4601c49c486c..b85f6858ecc4 100644
--- a/Python/bltinmodule.c
+++ b/Python/bltinmodule.c
@@ -301,44 +301,15 @@ static PyObject *
 builtin_unichr(PyObject *self, PyObject *args)
 {
 	long x;
-	Py_UNICODE s[2];
 
 	if (!PyArg_ParseTuple(args, "l:unichr", &x))
 		return NULL;
 
-#ifdef Py_UNICODE_WIDE
-	if (x < 0 || x > 0x10ffff) {
-		PyErr_SetString(PyExc_ValueError,
-				"unichr() arg not in range(0x110000) "
-				"(wide Python build)");
-		return NULL;
-	}
-#else
-	if (x < 0 || x > 0xffff) {
-		PyErr_SetString(PyExc_ValueError,
-				"unichr() arg not in range(0x10000) "
-				"(narrow Python build)");
-		return NULL;
-	}
-#endif
-
-	if (x <= 0xffff) {
-		/* UCS-2 character */
-		s[0] = (Py_UNICODE) x;
-		return PyUnicode_FromUnicode(s, 1);
-	}
-	else {
-#ifndef Py_UNICODE_WIDE
-		/* UCS-4 character.  store as two surrogate characters */
-		x -= 0x10000L;
-		s[0] = 0xD800 + (Py_UNICODE) (x >> 10);
-		s[1] = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
-		return PyUnicode_FromUnicode(s, 2);
-#else
-		s[0] = (Py_UNICODE)x;
-		return PyUnicode_FromUnicode(s, 1);
-#endif
-	}
+	/* PyUnicode_FromOrdinal() takes an int.  Map every long outside
+	   the Unicode range to -1 so that it is rejected with ValueError
+	   instead of being silently truncated to a valid ordinal on
+	   platforms where long is wider than int. */
+	return PyUnicode_FromOrdinal((x < 0 || x > 0x10ffff) ? -1 : (int) x);
 }
 
 static char unichr_doc[] =
-- 
2.47.3