From: Serhiy Storchaka Date: Sat, 26 Jan 2013 10:18:17 +0000 (+0200) Subject: Issue #10156: In the interpreter's initialization phase, unicode globals X-Git-Tag: v3.4.0a1~1521^2~4 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=ed3c4128c061aef01a19bdfa4ac8b87e43f9d768;p=thirdparty%2FPython%2Fcpython.git Issue #10156: In the interpreter's initialization phase, unicode globals are now initialized dynamically as needed. --- ed3c4128c061aef01a19bdfa4ac8b87e43f9d768 diff --cc Objects/unicodeobject.c index 5030e8d6349e,c96a91c39732..b4f4185caacf --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@@ -47,10 -47,19 +47,11 @@@ OF OR IN CONNECTION WITH THE USE OR PER #include #endif -/* Endianness switches; defaults to little endian */ - -#ifdef WORDS_BIGENDIAN -# define BYTEORDER_IS_BIG_ENDIAN -#else -# define BYTEORDER_IS_LITTLE_ENDIAN -#endif - /* --- Globals ------------------------------------------------------------ - The globals are initialized by the _PyUnicode_Init() API and should - not be used before calling that API. + NOTE: In the interpreter's initialization phase, some globals are currently + initialized dynamically as needed. In the process Unicode objects may + be created before the Unicode type is ready. */ @@@ -404,11 -432,12 +424,10 @@@ unicode_result_wchar(PyObject *unicode #ifndef Py_DEBUG Py_ssize_t len; - assert(Py_REFCNT(unicode) == 1); - len = _PyUnicode_WSTR_LENGTH(unicode); if (len == 0) { - Py_INCREF(unicode_empty); Py_DECREF(unicode); - return unicode_empty; + _Py_RETURN_UNICODE_EMPTY(); } if (len == 1) { @@@ -4201,16 -4330,14 +4207,15 @@@ PyUnicode_DecodeUTF7Stateful(const cha if (size == 0) { if (consumed) *consumed = 0; - Py_INCREF(unicode_empty); - return unicode_empty; - return unicode; ++ _Py_RETURN_UNICODE_EMPTY(); } - shiftOutStart = outpos = 0; + /* Start off assuming it's all ASCII. Widen later as necessary. */ + _PyUnicodeWriter_Init(&writer, 0); + if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) + goto onError; + + shiftOutStart = 0; e = s + size; while (s < e) { @@@ -4851,92 -4980,84 +4855,91 @@@ PyUnicode_DecodeUTF32Stateful(const cha byte order setting accordingly. In native mode, the leading BOM mark is skipped, in all other modes, it is copied to the output stream as-is (giving a ZWNBSP character). */ - if (bo == 0) { - if (size >= 4) { - const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | - (q[iorder[1]] << 8) | q[iorder[0]]; -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (bom == 0x0000FEFF) { - q += 4; - bo = -1; - } - else if (bom == 0xFFFE0000) { - q += 4; - bo = 1; - } -#else - if (bom == 0x0000FEFF) { - q += 4; - bo = 1; - } - else if (bom == 0xFFFE0000) { - q += 4; - bo = -1; - } -#endif + if (bo == 0 && size >= 4) { + Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; + if (bom == 0x0000FEFF) { + bo = -1; + q += 4; + } + else if (bom == 0xFFFE0000) { + bo = 1; + q += 4; } + if (byteorder) + *byteorder = bo; } - if (bo == -1) { - /* force LE */ - iorder[0] = 0; - iorder[1] = 1; - iorder[2] = 2; - iorder[3] = 3; - } - else if (bo == 1) { - /* force BE */ - iorder[0] = 3; - iorder[1] = 2; - iorder[2] = 1; - iorder[3] = 0; + if (q == e) { + if (consumed) + *consumed = size; - Py_INCREF(unicode_empty); - return unicode_empty; ++ _Py_RETURN_UNICODE_EMPTY(); } - /* This might be one to much, because of a BOM */ - unicode = PyUnicode_New((size+3)/4, 127); - if (!unicode) - return NULL; - if (size == 0) - return unicode; - outpos = 0; +#ifdef WORDS_BIGENDIAN + le = bo < 0; +#else + le = bo <= 0; +#endif - while (q < e) { - Py_UCS4 ch; - /* remaining bytes at the end? (size should be divisible by 4) */ - if (e-q<4) { - if (consumed) + _PyUnicodeWriter_Init(&writer, 0); + if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1) + goto onError; + + while (1) { + Py_UCS4 ch = 0; + Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer); + + if (e - q >= 4) { + enum PyUnicode_Kind kind = writer.kind; + void *data = writer.data; + const unsigned char *last = e - 4; + Py_ssize_t pos = writer.pos; + if (le) { + do { + ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0]; + if (ch > maxch) + break; + PyUnicode_WRITE(kind, data, pos++, ch); + q += 4; + } while (q <= last); + } + else { + do { + ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3]; + if (ch > maxch) + break; + PyUnicode_WRITE(kind, data, pos++, ch); + q += 4; + } while (q <= last); + } + writer.pos = pos; + } + + if (ch <= maxch) { + if (q == e || consumed) break; + /* remaining bytes at the end? (size should be divisible by 4) */ errmsg = "truncated data"; - startinpos = ((const char *)q)-starts; - endinpos = ((const char *)e)-starts; - goto utf32Error; - /* The remaining input chars are ignored if the callback - chooses to skip the input */ + startinpos = ((const char *)q) - starts; + endinpos = ((const char *)e) - starts; } - ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | - (q[iorder[1]] << 8) | q[iorder[0]]; - - if (ch >= 0x110000) - { + else { + if (ch < 0x110000) { + if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) + goto onError; + PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); + writer.pos++; + q += 4; + continue; + } errmsg = "codepoint not in range(0x110000)"; - startinpos = ((const char *)q)-starts; - endinpos = startinpos+4; - goto utf32Error; + startinpos = ((const char *)q) - starts; + endinpos = startinpos + 4; } - if (unicode_putchar(&unicode, &outpos, ch) < 0) - goto onError; - q += 4; - continue; - utf32Error: - if (unicode_decode_call_errorhandler( + + /* The remaining input chars are ignored if the callback + chooses to skip the input */ + if (unicode_decode_call_errorhandler_writer( errors, &errorHandler, "utf32", errmsg, &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, @@@ -5108,11 -5237,10 +5111,10 @@@ PyUnicode_DecodeUTF16Stateful(const cha if (q == e) { if (consumed) *consumed = size; - Py_INCREF(unicode_empty); - return unicode_empty; + _Py_RETURN_UNICODE_EMPTY(); } -#ifdef BYTEORDER_IS_LITTLE_ENDIAN +#if PY_LITTLE_ENDIAN native_ordering = bo <= 0; #else native_ordering = bo >= 0; @@@ -5384,12 -5516,9 +5386,10 @@@ PyUnicode_DecodeUnicodeEscape(const cha PyObject *errorHandler = NULL; PyObject *exc = NULL; Py_ssize_t len; - Py_ssize_t i; len = length_of_escaped_ascii_string(s, size); - if (len == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (len == 0) ++ _Py_RETURN_UNICODE_EMPTY(); /* After length_of_escaped_ascii_string() there are two alternatives, either the string is pure ASCII with named escapes like \n, etc. @@@ -5781,11 -5915,6 +5781,9 @@@ PyUnicode_DecodeRawUnicodeEscape(const PyObject *errorHandler = NULL; PyObject *exc = NULL; - if (size == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (size == 0) ++ _Py_RETURN_UNICODE_EMPTY(); + /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the length after conversion to the true value. (But decoding error @@@ -5988,15 -6113,13 +5986,13 @@@ _PyUnicode_DecodeUnicodeInternal(const 1)) return NULL; - if (size == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } ++ if (size == 0) ++ _Py_RETURN_UNICODE_EMPTY(); + /* XXX overflow detection missing */ - v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127); - if (v == NULL) + _PyUnicodeWriter_Init(&writer, 0); + if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1) goto onError; - if (PyUnicode_GET_LENGTH(v) == 0) - return v; - outpos = 0; end = s + size; while (s < end) { @@@ -7298,14 -7422,12 +7291,12 @@@ PyUnicode_DecodeCharmap(const char *s if (mapping == NULL) return PyUnicode_DecodeLatin1(s, size, errors); - if (size == 0) { - Py_INCREF(unicode_empty); - return unicode_empty; - } - v = PyUnicode_New(size, 127); - if (v == NULL) - goto onError; + if (size == 0) - return v; - outpos = 0; ++ _Py_RETURN_UNICODE_EMPTY(); + _PyUnicodeWriter_Init(&writer, 0); + if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) + goto onError; + e = s + size; if (PyUnicode_CheckExact(mapping)) { Py_ssize_t maplen;