Close #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster

author Victor Stinner <victor.stinner@gmail.com>

Tue, 30 Oct 2012 22:12:47 +0000 (23:12 +0100)

committer Victor Stinner <victor.stinner@gmail.com>

Tue, 30 Oct 2012 22:12:47 +0000 (23:12 +0100)
author Victor Stinner <victor.stinner@gmail.com>
Tue, 30 Oct 2012 22:12:47 +0000 (23:12 +0100)
committer Victor Stinner <victor.stinner@gmail.com>
Tue, 30 Oct 2012 22:12:47 +0000 (23:12 +0100)
diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst

index 944d754aa35ece87f77587e0a433fcdbf30106f5..d2608384097c8582d63e8102396e16a300bb5983 100644 (file)
--- a/Doc/whatsnew/3.4.rst
+++ b/Doc/whatsnew/3.4.rst
@@ -157,7 +157,7 @@ Optimizations
  
  Major performance enhancements have been added:
  
-* None yet.
+* The UTF-32 decoder is now 3x to 4x faster.
  
  
  Build and C API Changes
diff --git a/Misc/NEWS b/Misc/NEWS

index c101aa5e9b7785f1588c4968228180dd4c3b477b..a6ad757fadcf00ee133bfc1843c3916685a977b2 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
  Core and Builtins
  -----------------
  
+- Issue #14625: Rewrite the UTF-32 decoder. It is now 3x to 4x faster. Patch
+  written by Serhiy Storchaka.
+
  - Issue #16197: Update winreg docstrings and documentation to match code.
    Patch by Zachary Ware.
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 3e2e8e3f264ff0785bf3321bd5618843fdf0487d..0f5bdfcb3666f0332a0c074b7e16b948a099d99d 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4804,14 +4804,8 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
      Py_ssize_t outpos;
      PyObject *unicode;
      const unsigned char *q, *e;
-    int bo = 0;       /* assume native ordering by default */
+    int le, bo = 0;       /* assume native ordering by default */
      const char *errmsg = "";
-    /* Offsets from q for retrieving bytes in the right order. */
-#if PY_LITTLE_ENDIAN
-    int iorder[] = {0, 1, 2, 3};
-#else
-    int iorder[] = {3, 2, 1, 0};
-#endif
      PyObject *errorHandler = NULL;
      PyObject *exc = NULL;
  
@@ -4825,83 +4819,88 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
         byte order setting accordingly. In native mode, the leading BOM
         mark is skipped, in all other modes, it is copied to the output
         stream as-is (giving a ZWNBSP character). */
-    if (bo == 0) {
-        if (size >= 4) {
-            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
-                (q[iorder[1]] << 8) | q[iorder[0]];
-#if PY_LITTLE_ENDIAN
-            if (bom == 0x0000FEFF) {
-                q += 4;
-                bo = -1;
-            }
-            else if (bom == 0xFFFE0000) {
-                q += 4;
-                bo = 1;
-            }
-#else
-            if (bom == 0x0000FEFF) {
-                q += 4;
-                bo = 1;
-            }
-            else if (bom == 0xFFFE0000) {
-                q += 4;
-                bo = -1;
-            }
-#endif
+    if (bo == 0 && size >= 4) {
+        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+        if (bom == 0x0000FEFF) {
+            bo = -1;
+            q += 4;
          }
+        else if (bom == 0xFFFE0000) {
+            bo = 1;
+            q += 4;
+        }
+        if (byteorder)
+            *byteorder = bo;
      }
  
-    if (bo == -1) {
-        /* force LE */
-        iorder[0] = 0;
-        iorder[1] = 1;
-        iorder[2] = 2;
-        iorder[3] = 3;
-    }
-    else if (bo == 1) {
-        /* force BE */
-        iorder[0] = 3;
-        iorder[1] = 2;
-        iorder[2] = 1;
-        iorder[3] = 0;
+    if (q == e) {
+        if (consumed)
+            *consumed = size;
+        Py_INCREF(unicode_empty);
+        return unicode_empty;
      }
  
-    /* This might be one to much, because of a BOM */
-    unicode = PyUnicode_New((size+3)/4, 127);
+#ifdef WORDS_BIGENDIAN
+    le = bo < 0;
+#else
+    le = bo <= 0;
+#endif
+
+    unicode = PyUnicode_New((e - q + 3) / 4, 127);
      if (!unicode)
          return NULL;
-    if (size == 0)
-        return unicode;
+
      outpos = 0;
+    while (1) {
+        Py_UCS4 ch = 0;
+        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(unicode);
+
+        if (e - q >= 4) {
+            enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
+            void *data = PyUnicode_DATA(unicode);
+            const unsigned char *last = e - 4;
+            if (le) {
+                do {
+                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
+                    if (ch > maxch)
+                        break;
+                    PyUnicode_WRITE(kind, data, outpos++, ch);
+                    q += 4;
+                } while (q <= last);
+            }
+            else {
+                do {
+                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
+                    if (ch > maxch)
+                        break;
+                    PyUnicode_WRITE(kind, data, outpos++, ch);
+                    q += 4;
+                } while (q <= last);
+            }
+        }
  
-    while (q < e) {
-        Py_UCS4 ch;
-        /* remaining bytes at the end? (size should be divisible by 4) */
-        if (e-q<4) {
-            if (consumed)
+        if (ch <= maxch) {
+            if (q == e || consumed)
                  break;
+            /* remaining bytes at the end? (size should be divisible by 4) */
              errmsg = "truncated data";
-            startinpos = ((const char *)q)-starts;
-            endinpos = ((const char *)e)-starts;
-            goto utf32Error;
-            /* The remaining input chars are ignored if the callback
-               chooses to skip the input */
+            startinpos = ((const char *)q) - starts;
+            endinpos = ((const char *)e) - starts;
          }
-        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
-            (q[iorder[1]] << 8) | q[iorder[0]];
-
-        if (ch >= 0x110000)
-        {
+        else {
+            if (ch < 0x110000) {
+                if (unicode_putchar(&unicode, &outpos, ch) < 0)
+                    goto onError;
+                q += 4;
+                continue;
+            }
              errmsg = "codepoint not in range(0x110000)";
-            startinpos = ((const char *)q)-starts;
-            endinpos = startinpos+4;
-            goto utf32Error;
+            startinpos = ((const char *)q) - starts;
+            endinpos = startinpos + 4;
          }
-        if (unicode_putchar(&unicode, &outpos, ch) < 0)
-            goto onError;
-        q += 4;
-        continue;
-      utf32Error:
+
+        /* The remaining input chars are ignored if the callback
+           chooses to skip the input */
          if (unicode_decode_call_errorhandler(
                  errors, &errorHandler,
                  "utf32", errmsg,
@@ -4910,9 +4909,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
              goto onError;
      }
  
-    if (byteorder)
-        *byteorder = bo;
-
      if (consumed)
          *consumed = (const char *)q-starts;
author	Victor Stinner <victor.stinner@gmail.com>
	Tue, 30 Oct 2012 22:12:47 +0000 (23:12 +0100)
committer	Victor Stinner <victor.stinner@gmail.com>
	Tue, 30 Oct 2012 22:12:47 +0000 (23:12 +0100)
Doc/whatsnew/3.4.rst		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history