From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sat, 26 Jan 2013 10:18:17 +0000 (+0200)
Subject: Issue #10156: In the interpreter's initialization phase, unicode globals
X-Git-Tag: v3.4.0a1~1521^2~4
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=ed3c4128c061aef01a19bdfa4ac8b87e43f9d768;p=thirdparty%2FPython%2Fcpython.git

Issue #10156: In the interpreter's initialization phase, unicode globals
are now initialized dynamically as needed.
---

ed3c4128c061aef01a19bdfa4ac8b87e43f9d768
diff --cc Objects/unicodeobject.c
index 5030e8d6349e,c96a91c39732..b4f4185caacf
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -47,10 -47,19 +47,11 @@@ OF OR IN CONNECTION WITH THE USE OR PER
  #include <windows.h>
  #endif
  
 -/* Endianness switches; defaults to little endian */
 -
 -#ifdef WORDS_BIGENDIAN
 -# define BYTEORDER_IS_BIG_ENDIAN
 -#else
 -# define BYTEORDER_IS_LITTLE_ENDIAN
 -#endif
 -
  /* --- Globals ------------------------------------------------------------
  
-    The globals are initialized by the _PyUnicode_Init() API and should
-    not be used before calling that API.
+ NOTE: In the interpreter's initialization phase, some globals are currently
+       initialized dynamically as needed. In the process Unicode objects may
+       be created before the Unicode type is ready.
  
  */
  
@@@ -404,11 -432,12 +424,10 @@@ unicode_result_wchar(PyObject *unicode
  #ifndef Py_DEBUG
      Py_ssize_t len;
  
 -    assert(Py_REFCNT(unicode) == 1);
 -
      len = _PyUnicode_WSTR_LENGTH(unicode);
      if (len == 0) {
-         Py_INCREF(unicode_empty);
          Py_DECREF(unicode);
-         return unicode_empty;
+         _Py_RETURN_UNICODE_EMPTY();
      }
  
      if (len == 1) {
@@@ -4201,16 -4330,14 +4207,15 @@@ PyUnicode_DecodeUTF7Stateful(const cha
      if (size == 0) {
          if (consumed)
              *consumed = 0;
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
 -        return unicode;
++        _Py_RETURN_UNICODE_EMPTY();
      }
  
 -    shiftOutStart = outpos = 0;
 +    /* Start off assuming it's all ASCII. Widen later as necessary. */
 +    _PyUnicodeWriter_Init(&writer, 0);
 +    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
 +        goto onError;
 +
 +    shiftOutStart = 0;
      e = s + size;
  
      while (s < e) {
@@@ -4851,92 -4980,84 +4855,91 @@@ PyUnicode_DecodeUTF32Stateful(const cha
         byte order setting accordingly. In native mode, the leading BOM
         mark is skipped, in all other modes, it is copied to the output
         stream as-is (giving a ZWNBSP character). */
 -    if (bo == 0) {
 -        if (size >= 4) {
 -            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
 -                (q[iorder[1]] << 8) | q[iorder[0]];
 -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
 -            if (bom == 0x0000FEFF) {
 -                q += 4;
 -                bo = -1;
 -            }
 -            else if (bom == 0xFFFE0000) {
 -                q += 4;
 -                bo = 1;
 -            }
 -#else
 -            if (bom == 0x0000FEFF) {
 -                q += 4;
 -                bo = 1;
 -            }
 -            else if (bom == 0xFFFE0000) {
 -                q += 4;
 -                bo = -1;
 -            }
 -#endif
 +    if (bo == 0 && size >= 4) {
 +        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
 +        if (bom == 0x0000FEFF) {
 +            bo = -1;
 +            q += 4;
 +        }
 +        else if (bom == 0xFFFE0000) {
 +            bo = 1;
 +            q += 4;
          }
 +        if (byteorder)
 +            *byteorder = bo;
      }
  
 -    if (bo == -1) {
 -        /* force LE */
 -        iorder[0] = 0;
 -        iorder[1] = 1;
 -        iorder[2] = 2;
 -        iorder[3] = 3;
 -    }
 -    else if (bo == 1) {
 -        /* force BE */
 -        iorder[0] = 3;
 -        iorder[1] = 2;
 -        iorder[2] = 1;
 -        iorder[3] = 0;
 +    if (q == e) {
 +        if (consumed)
 +            *consumed = size;
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
++        _Py_RETURN_UNICODE_EMPTY();
      }
  
 -    /* This might be one to much, because of a BOM */
 -    unicode = PyUnicode_New((size+3)/4, 127);
 -    if (!unicode)
 -        return NULL;
 -    if (size == 0)
 -        return unicode;
 -    outpos = 0;
 +#ifdef WORDS_BIGENDIAN
 +    le = bo < 0;
 +#else
 +    le = bo <= 0;
 +#endif
  
 -    while (q < e) {
 -        Py_UCS4 ch;
 -        /* remaining bytes at the end? (size should be divisible by 4) */
 -        if (e-q<4) {
 -            if (consumed)
 +    _PyUnicodeWriter_Init(&writer, 0);
 +    if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
 +        goto onError;
 +
 +    while (1) {
 +        Py_UCS4 ch = 0;
 +        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
 +
 +        if (e - q >= 4) {
 +            enum PyUnicode_Kind kind = writer.kind;
 +            void *data = writer.data;
 +            const unsigned char *last = e - 4;
 +            Py_ssize_t pos = writer.pos;
 +            if (le) {
 +                do {
 +                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
 +                    if (ch > maxch)
 +                        break;
 +                    PyUnicode_WRITE(kind, data, pos++, ch);
 +                    q += 4;
 +                } while (q <= last);
 +            }
 +            else {
 +                do {
 +                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
 +                    if (ch > maxch)
 +                        break;
 +                    PyUnicode_WRITE(kind, data, pos++, ch);
 +                    q += 4;
 +                } while (q <= last);
 +            }
 +            writer.pos = pos;
 +        }
 +
 +        if (ch <= maxch) {
 +            if (q == e || consumed)
                  break;
 +            /* remaining bytes at the end? (size should be divisible by 4) */
              errmsg = "truncated data";
 -            startinpos = ((const char *)q)-starts;
 -            endinpos = ((const char *)e)-starts;
 -            goto utf32Error;
 -            /* The remaining input chars are ignored if the callback
 -               chooses to skip the input */
 +            startinpos = ((const char *)q) - starts;
 +            endinpos = ((const char *)e) - starts;
          }
 -        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
 -            (q[iorder[1]] << 8) | q[iorder[0]];
 -
 -        if (ch >= 0x110000)
 -        {
 +        else {
 +            if (ch < 0x110000) {
 +                if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
 +                    goto onError;
 +                PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
 +                writer.pos++;
 +                q += 4;
 +                continue;
 +            }
              errmsg = "codepoint not in range(0x110000)";
 -            startinpos = ((const char *)q)-starts;
 -            endinpos = startinpos+4;
 -            goto utf32Error;
 +            startinpos = ((const char *)q) - starts;
 +            endinpos = startinpos + 4;
          }
 -        if (unicode_putchar(&unicode, &outpos, ch) < 0)
 -            goto onError;
 -        q += 4;
 -        continue;
 -      utf32Error:
 -        if (unicode_decode_call_errorhandler(
 +
 +        /* The remaining input chars are ignored if the callback
 +           chooses to skip the input */
 +        if (unicode_decode_call_errorhandler_writer(
                  errors, &errorHandler,
                  "utf32", errmsg,
                  &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
@@@ -5108,11 -5237,10 +5111,10 @@@ PyUnicode_DecodeUTF16Stateful(const cha
      if (q == e) {
          if (consumed)
              *consumed = size;
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
+         _Py_RETURN_UNICODE_EMPTY();
      }
  
 -#ifdef BYTEORDER_IS_LITTLE_ENDIAN
 +#if PY_LITTLE_ENDIAN
      native_ordering = bo <= 0;
  #else
      native_ordering = bo >= 0;
@@@ -5384,12 -5516,9 +5386,10 @@@ PyUnicode_DecodeUnicodeEscape(const cha
      PyObject *errorHandler = NULL;
      PyObject *exc = NULL;
      Py_ssize_t len;
 -    Py_ssize_t i;
  
      len = length_of_escaped_ascii_string(s, size);
-     if (len == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (len == 0)
++        _Py_RETURN_UNICODE_EMPTY();
  
      /* After length_of_escaped_ascii_string() there are two alternatives,
         either the string is pure ASCII with named escapes like \n, etc.
@@@ -5781,11 -5915,6 +5781,9 @@@ PyUnicode_DecodeRawUnicodeEscape(const 
      PyObject *errorHandler = NULL;
      PyObject *exc = NULL;
  
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
 +
      /* Escaped strings will always be longer than the resulting
         Unicode string, so we start with size here and then reduce the
         length after conversion to the true value. (But decoding error
@@@ -5988,15 -6113,13 +5986,13 @@@ _PyUnicode_DecodeUnicodeInternal(const 
                       1))
          return NULL;
  
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
++    if (size == 0)
++        _Py_RETURN_UNICODE_EMPTY();
 +
      /* XXX overflow detection missing */
 -    v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
 -    if (v == NULL)
 +    _PyUnicodeWriter_Init(&writer, 0);
 +    if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
          goto onError;
 -    if (PyUnicode_GET_LENGTH(v) == 0)
 -        return v;
 -    outpos = 0;
      end = s + size;
  
      while (s < end) {
@@@ -7298,14 -7422,12 +7291,12 @@@ PyUnicode_DecodeCharmap(const char *s
      if (mapping == NULL)
          return PyUnicode_DecodeLatin1(s, size, errors);
  
-     if (size == 0) {
-         Py_INCREF(unicode_empty);
-         return unicode_empty;
-     }
 -    v = PyUnicode_New(size, 127);
 -    if (v == NULL)
 -        goto onError;
+     if (size == 0)
 -        return v;
 -    outpos = 0;
++        _Py_RETURN_UNICODE_EMPTY();
 +    _PyUnicodeWriter_Init(&writer, 0);
 +    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
 +        goto onError;
 +
      e = s + size;
      if (PyUnicode_CheckExact(mapping)) {
          Py_ssize_t maplen;