gh-119396: Optimize unicode_repr() (#119617)

author Victor Stinner <vstinner@python.org>

Tue, 28 May 2024 16:05:20 +0000 (18:05 +0200)

committer GitHub <noreply@github.com>

Tue, 28 May 2024 16:05:20 +0000 (18:05 +0200)
author Victor Stinner <vstinner@python.org>
Tue, 28 May 2024 16:05:20 +0000 (18:05 +0200)
committer GitHub <noreply@github.com>
Tue, 28 May 2024 16:05:20 +0000 (18:05 +0200)
diff --git a/Makefile.pre.in b/Makefile.pre.in

index 9e99c95e2af0427f2d9b1bb8ae7bbac639085f41..a80d9334ba5134b1a04b0fe63f2779d3ce37bacf 100644 (file)
--- a/Makefile.pre.in
+++ b/Makefile.pre.in
@@ -1841,6 +1841,7 @@ UNICODE_DEPS = \
                 $(srcdir)/Objects/stringlib/localeutil.h \
                 $(srcdir)/Objects/stringlib/partition.h \
                 $(srcdir)/Objects/stringlib/replace.h \
+               $(srcdir)/Objects/stringlib/repr.h \
                 $(srcdir)/Objects/stringlib/split.h \
                 $(srcdir)/Objects/stringlib/ucs1lib.h \
                 $(srcdir)/Objects/stringlib/ucs2lib.h \
diff --git a/Objects/stringlib/repr.h b/Objects/stringlib/repr.h

new file mode 100644 (file)

index 0000000..87b1a8b
--- /dev/null
+++ b/Objects/stringlib/repr.h
@@ -0,0 +1,95 @@
+/* stringlib: repr() implementation (one copy per STRINGLIB_CHAR output type) */
+
+#ifndef STRINGLIB_FASTSEARCH_H
+#error must include "stringlib/fastsearch.h" before including this module
+#endif
+/* Write quote, the escaped characters of `unicode`, then quote again to odata.
+   No bounds check is done and no terminator is written: the caller sizes odata. */
+static void
+STRINGLIB(repr)(PyObject *unicode, Py_UCS4 quote,
+                STRINGLIB_CHAR *odata)
+{
+    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
+    const void *idata = PyUnicode_DATA(unicode);
+    int ikind = PyUnicode_KIND(unicode);
+    /* Opening quote; every write below advances odata by one output unit */
+    *odata++ = quote;
+    for (Py_ssize_t i = 0; i < isize; i++) {
+        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
+
+        /* Escape the selected quote character and backslashes with a backslash */
+        if ((ch == quote) || (ch == '\\')) {
+            *odata++ = '\\';
+            *odata++ = ch;
+            continue;
+        }
+
+        /* Map special whitespace to '\t', '\n', '\r' */
+        if (ch == '\t') {
+            *odata++ = '\\';
+            *odata++ = 't';
+        }
+        else if (ch == '\n') {
+            *odata++ = '\\';
+            *odata++ = 'n';
+        }
+        else if (ch == '\r') {
+            *odata++ = '\\';
+            *odata++ = 'r';
+        }
+
+        /* Map non-printable US ASCII (below space, or DEL 0x7F) to '\xhh' */
+        else if (ch < ' ' || ch == 0x7F) {
+            *odata++ = '\\';
+            *odata++ = 'x';
+            *odata++ = Py_hexdigits[(ch >> 4) & 0x000F];
+            *odata++ = Py_hexdigits[ch & 0x000F];
+        }
+
+        /* Copy the remaining ASCII characters (0x20..0x7E) as-is */
+        else if (ch < 0x7F) {
+            *odata++ = ch;
+        }
+
+        /* Non-ASCII characters (ch > 0x7F at this point) */
+        else {
+            /* Map Unicode whitespace and control characters
+               (categories Z* and C* except ASCII space)
+            */
+            if (!Py_UNICODE_ISPRINTABLE(ch)) {
+                *odata++ = '\\';
+                /* Map 8-bit characters to '\xhh' */
+                if (ch <= 0xff) {
+                    *odata++ = 'x';
+                    *odata++ = Py_hexdigits[(ch >> 4) & 0x000F];
+                    *odata++ = Py_hexdigits[ch & 0x000F];
+                }
+                /* Map 16-bit characters to '\uxxxx' */
+                else if (ch <= 0xffff) {
+                    *odata++ = 'u';
+                    *odata++ = Py_hexdigits[(ch >> 12) & 0xF];
+                    *odata++ = Py_hexdigits[(ch >> 8) & 0xF];
+                    *odata++ = Py_hexdigits[(ch >> 4) & 0xF];
+                    *odata++ = Py_hexdigits[ch & 0xF];
+                }
+                /* Map 21-bit characters to '\U00xxxxxx' (eight hex digits in total) */
+                else {
+                    *odata++ = 'U';
+                    *odata++ = Py_hexdigits[(ch >> 28) & 0xF];
+                    *odata++ = Py_hexdigits[(ch >> 24) & 0xF];
+                    *odata++ = Py_hexdigits[(ch >> 20) & 0xF];
+                    *odata++ = Py_hexdigits[(ch >> 16) & 0xF];
+                    *odata++ = Py_hexdigits[(ch >> 12) & 0xF];
+                    *odata++ = Py_hexdigits[(ch >> 8) & 0xF];
+                    *odata++ = Py_hexdigits[(ch >> 4) & 0xF];
+                    *odata++ = Py_hexdigits[ch & 0xF];
+                }
+            }
+            /* Printable: copy as-is; presumably the caller chose a STRINGLIB_CHAR wide enough (verify) */
+            else {
+                *odata++ = ch;
+            }
+        }
+    }
+    *odata = quote;  /* closing quote; nothing is written after it */
+}
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 92db31f1e498f97fa5ec38b086e1c5de10bb6d60..eb37b478cc4de1daca4fa4032a66a19ae76f69fe 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -899,6 +899,7 @@ ensure_unicode(PyObject *obj)
  #include "stringlib/count.h"
  #include "stringlib/find.h"
  #include "stringlib/replace.h"
+#include "stringlib/repr.h"
  #include "stringlib/find_max_char.h"
  #include "stringlib/undef.h"
  
@@ -909,6 +910,7 @@ ensure_unicode(PyObject *obj)
  #include "stringlib/count.h"
  #include "stringlib/find.h"
  #include "stringlib/replace.h"
+#include "stringlib/repr.h"
  #include "stringlib/find_max_char.h"
  #include "stringlib/undef.h"
  
@@ -919,6 +921,7 @@ ensure_unicode(PyObject *obj)
  #include "stringlib/count.h"
  #include "stringlib/find.h"
  #include "stringlib/replace.h"
+#include "stringlib/repr.h"
  #include "stringlib/find_max_char.h"
  #include "stringlib/undef.h"
  
@@ -12336,24 +12339,17 @@ unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
  static PyObject *
  unicode_repr(PyObject *unicode)
  {
-    PyObject *repr;
-    Py_ssize_t isize;
-    Py_ssize_t osize, squote, dquote, i, o;
-    Py_UCS4 max, quote;
-    int ikind, okind, unchanged;
-    const void *idata;
-    void *odata;
-
-    isize = PyUnicode_GET_LENGTH(unicode);
-    idata = PyUnicode_DATA(unicode);
+    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
+    const void *idata = PyUnicode_DATA(unicode);
  
      /* Compute length of output, quote characters, and
         maximum character */
-    osize = 0;
-    max = 127;
-    squote = dquote = 0;
-    ikind = PyUnicode_KIND(unicode);
-    for (i = 0; i < isize; i++) {
+    Py_ssize_t osize = 0;
+    Py_UCS4 maxch = 127;
+    Py_ssize_t squote = 0;
+    Py_ssize_t dquote = 0;
+    int ikind = PyUnicode_KIND(unicode);
+    for (Py_ssize_t i = 0; i < isize; i++) {
          Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
          Py_ssize_t incr = 1;
          switch (ch) {
@@ -12369,7 +12365,7 @@ unicode_repr(PyObject *unicode)
              else if (ch < 0x7f)
                  ;
              else if (Py_UNICODE_ISPRINTABLE(ch))
-                max = ch > max ? ch : max;
+                maxch = (ch > maxch) ? ch : maxch;
              else if (ch < 0x100)
                  incr = 4; /* \xHH */
              else if (ch < 0x10000)
@@ -12385,10 +12381,10 @@ unicode_repr(PyObject *unicode)
          osize += incr;
      }
  
-    quote = '\'';
-    unchanged = (osize == isize);
+    Py_UCS4 quote = '\'';
+    int changed = (osize != isize);
      if (squote) {
-        unchanged = 0;
+        changed = 1;
          if (dquote)
              /* Both squote and dquote present. Use squote,
                 and escape them */
@@ -12398,99 +12394,35 @@ unicode_repr(PyObject *unicode)
      }
      osize += 2;   /* quotes */
  
-    repr = PyUnicode_New(osize, max);
+    PyObject *repr = PyUnicode_New(osize, maxch);
      if (repr == NULL)
          return NULL;
-    okind = PyUnicode_KIND(repr);
-    odata = PyUnicode_DATA(repr);
+    int okind = PyUnicode_KIND(repr);
+    void *odata = PyUnicode_DATA(repr);
+
+    if (!changed) {
+        PyUnicode_WRITE(okind, odata, 0, quote);
  
-    PyUnicode_WRITE(okind, odata, 0, quote);
-    PyUnicode_WRITE(okind, odata, osize-1, quote);
-    if (unchanged) {
          _PyUnicode_FastCopyCharacters(repr, 1,
                                        unicode, 0,
                                        isize);
+
+        PyUnicode_WRITE(okind, odata, osize-1, quote);
      }
      else {
-        for (i = 0, o = 1; i < isize; i++) {
-            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
-
-            /* Escape quotes and backslashes */
-            if ((ch == quote) || (ch == '\\')) {
-                PyUnicode_WRITE(okind, odata, o++, '\\');
-                PyUnicode_WRITE(okind, odata, o++, ch);
-                continue;
-            }
-
-            /* Map special whitespace to '\t', \n', '\r' */
-            if (ch == '\t') {
-                PyUnicode_WRITE(okind, odata, o++, '\\');
-                PyUnicode_WRITE(okind, odata, o++, 't');
-            }
-            else if (ch == '\n') {
-                PyUnicode_WRITE(okind, odata, o++, '\\');
-                PyUnicode_WRITE(okind, odata, o++, 'n');
-            }
-            else if (ch == '\r') {
-                PyUnicode_WRITE(okind, odata, o++, '\\');
-                PyUnicode_WRITE(okind, odata, o++, 'r');
-            }
-
-            /* Map non-printable US ASCII to '\xhh' */
-            else if (ch < ' ' || ch == 0x7F) {
-                PyUnicode_WRITE(okind, odata, o++, '\\');
-                PyUnicode_WRITE(okind, odata, o++, 'x');
-                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
-                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
-            }
-
-            /* Copy ASCII characters as-is */
-            else if (ch < 0x7F) {
-                PyUnicode_WRITE(okind, odata, o++, ch);
-            }
-
-            /* Non-ASCII characters */
-            else {
-                /* Map Unicode whitespace and control characters
-                   (categories Z* and C* except ASCII space)
-                */
-                if (!Py_UNICODE_ISPRINTABLE(ch)) {
-                    PyUnicode_WRITE(okind, odata, o++, '\\');
-                    /* Map 8-bit characters to '\xhh' */
-                    if (ch <= 0xff) {
-                        PyUnicode_WRITE(okind, odata, o++, 'x');
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
-                    }
-                    /* Map 16-bit characters to '\uxxxx' */
-                    else if (ch <= 0xffff) {
-                        PyUnicode_WRITE(okind, odata, o++, 'u');
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
-                    }
-                    /* Map 21-bit characters to '\U00xxxxxx' */
-                    else {
-                        PyUnicode_WRITE(okind, odata, o++, 'U');
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
-                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
-                    }
-                }
-                /* Copy characters as-is */
-                else {
-                    PyUnicode_WRITE(okind, odata, o++, ch);
-                }
-            }
+        switch (okind) {
+        case PyUnicode_1BYTE_KIND:
+            ucs1lib_repr(unicode, quote, odata);
+            break;
+        case PyUnicode_2BYTE_KIND:
+            ucs2lib_repr(unicode, quote, odata);
+            break;
+        default:
+            assert(okind == PyUnicode_4BYTE_KIND);
+            ucs4lib_repr(unicode, quote, odata);
          }
      }
-    /* Closing quote already added at the beginning */
+
      assert(_PyUnicode_CheckConsistency(repr, 1));
      return repr;
  }
diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py

index 12010f0e9c054918e7daa08860cc33b27e679399..4623f2c8d671bdb4605e639808ae5809e2b0cb1b 100644 (file)
--- a/Tools/c-analyzer/cpython/_parser.py
+++ b/Tools/c-analyzer/cpython/_parser.py
@@ -167,6 +167,7 @@ Objects/stringlib/count.h   Objects/stringlib/fastsearch.h
  Objects/stringlib/find.h       Objects/stringlib/fastsearch.h
  Objects/stringlib/partition.h  Objects/stringlib/fastsearch.h
  Objects/stringlib/replace.h    Objects/stringlib/fastsearch.h
+Objects/stringlib/repr.h       Objects/stringlib/fastsearch.h
  Objects/stringlib/split.h      Objects/stringlib/fastsearch.h
  
  # @end=tsv@
author	Victor Stinner <vstinner@python.org>
	Tue, 28 May 2024 16:05:20 +0000 (18:05 +0200)
committer	GitHub <noreply@github.com>
	Tue, 28 May 2024 16:05:20 +0000 (18:05 +0200)
Makefile.pre.in		patch \| blob \| blame \| history
Objects/stringlib/repr.h	[new file with mode: 0644]	patch \| blob
Objects/unicodeobject.c		patch \| blob \| blame \| history
Tools/c-analyzer/cpython/_parser.py		patch \| blob \| blame \| history