From: Marc-André Lemburg Date: Tue, 24 Sep 2002 14:06:55 +0000 (+0000) Subject: Backport the UTF-8 codec from 2.3 and add a work-around to let the X-Git-Tag: v2.2.2b1~130 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1e616dcafbf75906da2a1fc2d6415b7071bdda93;p=thirdparty%2FPython%2Fcpython.git Backport the UTF-8 codec from 2.3 and add a work-around to let the UTF-8 decoder accept broken UTF-8 sequences which encode lone high surrogates (the pre-2.2.2 versions forgot to generate the UTF-8 prefix \xed for these). Fixes SF bug #610783: Lone surrogates cause bad .pyc files. --- diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index cb3f9f3ee674..6125c92efc97 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -553,19 +553,40 @@ else: verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd') # UTF-8 specific encoding tests: -verify(u'\u20ac'.encode('utf-8') == \ - ''.join((chr(0xe2), chr(0x82), chr(0xac))) ) -verify(u'\ud800\udc02'.encode('utf-8') == \ - ''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))) ) -verify(u'\ud84d\udc56'.encode('utf-8') == \ - ''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))) ) +verify(u''.encode('utf-8') == '') +verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac') +verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82') +verify(u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96') +verify(u'\ud800'.encode('utf-8') == '\xed\xa0\x80') +verify(u'\udc00'.encode('utf-8') == '\xed\xb0\x80') +verify((u'\ud800\udc02'*1000).encode('utf-8') == + '\xf0\x90\x80\x82'*1000) +verify(u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' + u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' + u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c' + u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067' + u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das' + u' Nunstuck git und'.encode('utf-8') == + '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81' + 
'\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3' + '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe' + '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83' + '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8' + '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81' + '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81' + '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3' + '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf' + '\xe3\x80\x8cWenn ist das Nunstuck git und') + # UTF-8 specific decoding tests -verify(unicode(''.join((chr(0xf0), chr(0xa3), chr(0x91), chr(0x96))), - 'utf-8') == u'\U00023456' ) -verify(unicode(''.join((chr(0xf0), chr(0x90), chr(0x80), chr(0x82))), - 'utf-8') == u'\U00010002' ) -verify(unicode(''.join((chr(0xe2), chr(0x82), chr(0xac))), - 'utf-8') == u'\u20ac' ) +verify(unicode('\xf0\xa3\x91\x96', 'utf-8') == u'\U00023456' ) +verify(unicode('\xf0\x90\x80\x82', 'utf-8') == u'\U00010002' ) +verify(unicode('\xe2\x82\xac', 'utf-8') == u'\u20ac' ) +# test UTF-8 2.2.1 bug work-around +verify(unicode('\xa0\x80', 'utf-8') == u'\ud800' ) +verify(unicode('\xaf\xbf', 'utf-8') == u'\udbff' ) +verify(unicode('\xed\xb0\x80', 'utf-8') == u'\udc00' ) +verify(unicode('\xed\xbf\xbf', 'utf-8') == u'\udfff' ) # Other possible utf-8 test cases: # * strict decoding testing for all of the diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index a0b10f02e167..58ea03373428 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1034,6 +1034,37 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, switch (n) { case 0: + /* Work-around for bug in Python 2.2.0 and 2.2.1: the + UTF-8 encoder "forgot" to add the correct \xed prefix + for the lone high surrogates 0xd800 - 0xdbff.
*/ + if (((unsigned char)s[0] >= 0xa0) && + ((unsigned char)s[0] <= 0xaf)) { + n = 2; + if (s + n > e) { + errmsg = "unexpected end of data"; + goto utf8Error; + } + if ((s[0] & 0xc0) != 0x80 || + (s[1] & 0xc0) != 0x80) { + errmsg = "invalid data"; + goto utf8Error; + } + ch = 0xd000 + ((s[0] & 0x3f) << 6) + (s[1] & 0x3f); + if (ch < 0x0800) { + /* Note: UTF-8 encodings of surrogates are considered + legal UTF-8 sequences; + + XXX For wide builds (UCS-4) we should probably try + to recombine the surrogates into a single code + unit. + */ + errmsg = "illegal encoding"; + goto utf8Error; + } + else + *p++ = (Py_UNICODE)ch; + break; + } errmsg = "unexpected code byte"; goto utf8Error; @@ -1062,12 +1093,19 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, goto utf8Error; } ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); - if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { + if (ch < 0x0800) { + /* Note: UTF-8 encodings of surrogates are considered + legal UTF-8 sequences; + + XXX For wide builds (UCS-4) we should probably try + to recombine the surrogates into a single code + unit. + */ errmsg = "illegal encoding"; goto utf8Error; } else - *p++ = (Py_UNICODE)ch; + *p++ = (Py_UNICODE)ch; break; case 4: @@ -1081,9 +1119,9 @@ PyObject *PyUnicode_DecodeUTF8(const char *s, ((s[2] & 0x3f) << 6) + (s[3] & 0x3f); /* validate and convert to UTF-16 */ if ((ch < 0x10000) /* minimum value allowed for 4 - byte encoding */ + byte encoding */ || (ch > 0x10ffff)) /* maximum value allowed for - UTF-16 */ + UTF-16 */ { errmsg = "illegal encoding"; goto utf8Error; @@ -1128,125 +1166,104 @@ onError: return NULL; } -/* Not used anymore, now that the encoder supports UTF-16 - surrogates. */ -#if 0 -static -int utf8_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) +/* Allocation strategy: if the string is short, convert into a stack buffer + and allocate exactly as much space needed at the end. 
Else allocate the + maximum possible needed (4 result bytes per Unicode character), and return + the excess memory at the end. +*/ +PyObject * +PyUnicode_EncodeUTF8(const Py_UNICODE *s, + int size, + const char *errors) { - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-8 encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; +#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ + + int i; /* index into s of next input byte */ + PyObject *v; /* result string object */ + char *p; /* next free byte in output buffer */ + int nallocated; /* number of result bytes allocated */ + int nneeded; /* number of result bytes needed */ + char stackbuf[MAX_SHORT_UNICHARS * 4]; + + assert(s != NULL); + assert(size >= 0); + + if (size <= MAX_SHORT_UNICHARS) { + /* Write into the stack buffer; nallocated can't overflow. + * At the end, we'll allocate exactly as much heap space as it + * turns out we need. + */ + nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); + v = NULL; /* will allocate after we're done */ + p = stackbuf; } else { - PyErr_Format(PyExc_ValueError, - "UTF-8 encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; + /* Overallocate on the heap, and give the excess back at the end. */ + nallocated = size * 4; + if (nallocated / 4 != size) /* overflow! 
*/ + return PyErr_NoMemory(); + v = PyString_FromStringAndSize(NULL, nallocated); + if (v == NULL) + return NULL; + p = PyString_AS_STRING(v); } -} -#endif -PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s, - int size, - const char *errors) -{ - PyObject *v; - char *p; - char *q; - Py_UCS4 ch2; - unsigned int cbAllocated = 3 * size; - int i = 0; - - v = PyString_FromStringAndSize(NULL, cbAllocated); - if (v == NULL) - return NULL; - if (size == 0) - return v; - - p = q = PyString_AS_STRING(v); - while (i < size) { + for (i = 0; i < size;) { Py_UCS4 ch = s[i++]; + if (ch < 0x80) + /* Encode ASCII */ *p++ = (char) ch; else if (ch < 0x0800) { - *p++ = 0xc0 | (ch >> 6); - *p++ = 0x80 | (ch & 0x3f); + /* Encode Latin-1 */ + *p++ = (char)(0xc0 | (ch >> 6)); + *p++ = (char)(0x80 | (ch & 0x3f)); } - - else if (ch < 0x10000) { - /* Check for high surrogate */ - if (0xD800 <= ch && ch <= 0xDBFF) { - if (i != size) { - ch2 = s[i]; + else { + /* Encode UCS2 Unicode ordinals */ + if (ch < 0x10000) { + /* Special case: check for high surrogate */ + if (0xD800 <= ch && ch <= 0xDBFF && i != size) { + Py_UCS4 ch2 = s[i]; + /* Check for low surrogate and combine the two to + form a UCS4 value */ if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { - - if ((Py_uintptr_t)(p - q) >= (cbAllocated - 4)) { - /* Provide enough room for some more - surrogates */ - cbAllocated += 4*10; - if (_PyString_Resize(&v, cbAllocated)) - goto onError; - p = PyString_AS_STRING(v) + (p - q); - q = PyString_AS_STRING(v); - } - - /* combine the two values */ - ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000; - - *p++ = (char)((ch >> 18) | 0xf0); - *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); + ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; i++; + goto encodeUCS4; } + /* Fall through: handles isolated high surrogates */ } - } - else *p++ = (char)(0xe0 | (ch >> 12)); - + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + continue; + } +encodeUCS4: + /* Encode UCS4 Unicode ordinals */ 
+ *p++ = (char)(0xf0 | (ch >> 18)); + *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f)); - - } else { - if ((Py_uintptr_t)(p - q) >= (cbAllocated - 4)) { - /* Provide enough room for some more - surrogates */ - cbAllocated += 4*10; - if (_PyString_Resize(&v, cbAllocated)) - goto onError; - p = PyString_AS_STRING(v) + (p - q); - q = PyString_AS_STRING(v); - } - - *p++ = 0xf0 | (ch>>18); - *p++ = 0x80 | ((ch>>12) & 0x3f); - *p++ = 0x80 | ((ch>>6) & 0x3f); - *p++ = 0x80 | (ch & 0x3f); } } - *p = '\0'; - if (_PyString_Resize(&v, p - q)) - goto onError; + + if (v == NULL) { + /* This was stack allocated. */ + nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int); + assert(nneeded <= nallocated); + v = PyString_FromStringAndSize(stackbuf, nneeded); + } + else { + /* Cut back to size actually needed. */ + nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int); + assert(nneeded <= nallocated); + _PyString_Resize(&v, nneeded); + } return v; - onError: - Py_XDECREF(v); - return NULL; +#undef MAX_SHORT_UNICHARS } PyObject *PyUnicode_AsUTF8String(PyObject *unicode)