Unicode: optimize creating of 1-character strings

author Victor Stinner <victor.stinner@gmail.com>

Thu, 3 May 2012 00:17:04 +0000 (02:17 +0200)

committer Victor Stinner <victor.stinner@gmail.com>

Thu, 3 May 2012 00:17:04 +0000 (02:17 +0200)
author Victor Stinner <victor.stinner@gmail.com>
Thu, 3 May 2012 00:17:04 +0000 (02:17 +0200)
committer Victor Stinner <victor.stinner@gmail.com>
Thu, 3 May 2012 00:17:04 +0000 (02:17 +0200)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index e22fcfd02b126b5ea4254dc179aa9445a01c5f0c..f427fd3d11ccf6b17ea2047c3a14a3589357e48d 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1919,8 +1919,18 @@ _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
          return unicode_empty;
      }
      assert(size > 0);
-    if (size == 1 && u[0] < 256)
-        return get_latin1_char((unsigned char)u[0]);
+    if (size == 1) {
+        Py_UCS4 ch = u[0];
+        if (ch < 256)
+            return get_latin1_char((unsigned char)ch);
+
+        res = PyUnicode_New(1, ch);
+        if (res == NULL)
+            return NULL;
+        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        return res;
+    }
  
      max_char = ucs2lib_find_max_char(u, u + size);
      res = PyUnicode_New(size, max_char);
@@ -1947,8 +1957,18 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
          return unicode_empty;
      }
      assert(size > 0);
-    if (size == 1 && u[0] < 256)
-        return get_latin1_char((unsigned char)u[0]);
+    if (size == 1) {
+        Py_UCS4 ch = u[0];
+        if (ch < 256)
+            return get_latin1_char((unsigned char)ch);
+
+        res = PyUnicode_New(1, ch);
+        if (res == NULL)
+            return NULL;
+        PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
+        assert(_PyUnicode_CheckConsistency(res, 1));
+        return res;
+    }
  
      max_char = ucs4lib_find_max_char(u, u + size);
      res = PyUnicode_New(size, max_char);
@@ -11368,10 +11388,33 @@ unicode_find(PyObject *self, PyObject *args)
  static PyObject *
  unicode_getitem(PyObject *self, Py_ssize_t index)
  {
-    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
-    if (ch == (Py_UCS4)-1)
+    void *data;
+    enum PyUnicode_Kind kind;
+    Py_UCS4 ch;
+    PyObject *res;
+
+    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
+        PyErr_SetString(PyExc_IndexError, "string index out of range");
          return NULL;
-    return PyUnicode_FromOrdinal(ch);
+    }
+    kind = PyUnicode_KIND(self);
+    data = PyUnicode_DATA(self);
+    ch = PyUnicode_READ(kind, data, index);
+    if (ch < 256)
+        return get_latin1_char(ch);
+
+    res = PyUnicode_New(1, ch);
+    if (res == NULL)
+        return NULL;
+    kind = PyUnicode_KIND(res);
+    data = PyUnicode_DATA(res);
+    PyUnicode_WRITE(kind, data, 0, ch);
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    return res;
  }
  
  /* Believe it or not, this produces the same value for ASCII strings
@@ -12039,7 +12082,6 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
      }
  
      if (PyUnicode_IS_ASCII(self)) {
-        kind = PyUnicode_KIND(self);
          data = PyUnicode_1BYTE_DATA(self);
          return unicode_fromascii(data + start, length);
      }
author	Victor Stinner <victor.stinner@gmail.com>
	Thu, 3 May 2012 00:17:04 +0000 (02:17 +0200)
committer	Victor Stinner <victor.stinner@gmail.com>
	Thu, 3 May 2012 00:17:04 +0000 (02:17 +0200)