gh-91576: Speed up iteration of strings (#91574)

author Kumar Aditya <59607654+kumaraditya303@users.noreply.github.com>

Mon, 18 Apr 2022 14:18:27 +0000 (19:48 +0530)

committer GitHub <noreply@github.com>

Mon, 18 Apr 2022 14:18:27 +0000 (07:18 -0700)
author Kumar Aditya <59607654+kumaraditya303@users.noreply.github.com>
Mon, 18 Apr 2022 14:18:27 +0000 (19:48 +0530)
committer GitHub <noreply@github.com>
Mon, 18 Apr 2022 14:18:27 +0000 (07:18 -0700)
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h

index c7f06051a622fca94487eb47ced91bb0a3fa2ffb..75b90501db1568835c8a600a0ad799f7faf61694 100644 (file)
--- a/Include/internal/pycore_unicodeobject.h
+++ b/Include/internal/pycore_unicodeobject.h
@@ -20,6 +20,7 @@ extern void _PyUnicode_Fini(PyInterpreterState *);
  extern void _PyUnicode_FiniTypes(PyInterpreterState *);
  extern void _PyStaticUnicode_Dealloc(PyObject *);
  
+extern PyTypeObject _PyUnicodeASCIIIter_Type;
  
  /* other API */
  
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py

index df7afd5046a56b41e8857e0a9147333c5928b240..c98fabf8bc9b5a733470395aea1153b5a0ef3168 100644 (file)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -9,6 +9,7 @@ import _string
  import codecs
  import itertools
  import operator
+import pickle
  import struct
  import sys
  import textwrap
@@ -185,6 +186,36 @@ class UnicodeTest(string_tests.CommonTest,
          self.assertEqual(next(it), "\u3333")
          self.assertRaises(StopIteration, next, it)
  
+    def test_iterators_invocation(self):  # Calling a str iterator type directly must raise TypeError.
+        cases = [type(iter('abc')), type(iter('🚀'))]  # iterator types obtained from an ASCII and a non-ASCII string
+        for cls in cases:
+            with self.subTest(cls=cls):
+                self.assertRaises(TypeError, cls)  # cls() is invoked with no arguments
+
+    def test_iteration(self):  # Joining the items produced by iter() must rebuild the original string.
+        cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]  # ASCII, emoji and \uXXXX-escaped samples
+        for case in cases:
+            with self.subTest(string=case):
+                self.assertEqual(case, "".join(iter(case)))
+
+    def test_exhausted_iterator(self):  # A fully consumed str iterator must raise StopIteration on next().
+        cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
+        for case in cases:
+            with self.subTest(case=case):
+                iterator = iter(case)
+                tuple(iterator)  # consume every item; the resulting tuple is discarded
+                self.assertRaises(StopIteration, next, iterator)
+
+    def test_pickle_iterator(self):  # A str iterator must survive a pickle round trip under every protocol.
+        cases = ['abc', '🚀🚀🚀', "\u1111\u2222\u3333"]
+        for case in cases:
+            with self.subTest(case=case):
+                for proto in range(pickle.HIGHEST_PROTOCOL + 1):  # protocols 0 .. HIGHEST_PROTOCOL inclusive
+                    it = iter(case)  # fresh, unconsumed iterator for each protocol
+                    with self.subTest(proto=proto):
+                        pickled = "".join(pickle.loads(pickle.dumps(it, proto)))  # the unpickled iterator is drained into a string
+                        self.assertEqual(case, pickled)
+
      def test_count(self):
          string_tests.CommonTest.test_count(self)
          # check mixed argument types
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst b/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst

new file mode 100644 (file)

index 0000000..b792f3e
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst
@@ -0,0 +1 @@
+Speed up iteration of ASCII strings by 50%. Patch by Kumar Aditya.
diff --git a/Objects/object.c b/Objects/object.c

index 33dab5ecbf205fd5ab7c9c9ae2bb389a8d830f65..fe2d76f578e2ada0ee0279303db275d5e46f0310 100644 (file)
--- a/Objects/object.c
+++ b/Objects/object.c
@@ -1936,6 +1936,7 @@ static PyTypeObject* static_types[] = {
      &_PyNamespace_Type,
      &_PyNone_Type,
      &_PyNotImplemented_Type,
+    &_PyUnicodeASCIIIter_Type,
      &_PyUnion_Type,
      &_PyWeakref_CallableProxyType,
      &_PyWeakref_ProxyType,
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index d35a671a816802c335b5d2f46b099233da76c27c..6b05c37faabfc129da87b68e3f030710ec45850c 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -15697,7 +15697,7 @@ unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
  static PyObject *
  unicodeiter_next(unicodeiterobject *it)
  {
-    PyObject *seq, *item;
+    PyObject *seq;
  
      assert(it != NULL);
      seq = it->it_seq;
@@ -15709,10 +15709,8 @@ unicodeiter_next(unicodeiterobject *it)
          int kind = PyUnicode_KIND(seq);
          const void *data = PyUnicode_DATA(seq);
          Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
-        item = PyUnicode_FromOrdinal(chr);
-        if (item != NULL)
-            ++it->it_index;
-        return item;
+        it->it_index++;
+        return unicode_char(chr);
      }
  
      it->it_seq = NULL;
@@ -15720,6 +15718,29 @@ unicodeiter_next(unicodeiterobject *it)
      return NULL;
  }
  
+static PyObject *  /* tp_iternext of _PyUnicodeASCIIIter_Type: returns the next character of it->it_seq, or NULL when there is none. */
+unicode_ascii_iter_next(unicodeiterobject *it)
+{
+    assert(it != NULL);
+    PyObject *seq = it->it_seq;
+    if (seq == NULL) {  /* it_seq is cleared below once the end of the string has been reached */
+        return NULL;
+    }
+    assert(_PyUnicode_CHECK(seq));
+    assert(PyUnicode_IS_COMPACT_ASCII(seq));  /* unicode_iter() selects this iterator type only for compact ASCII strings */
+    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
+        const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));  /* character data is read from directly after the PyASCIIObject header */
+        Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
+                                              data, it->it_index);
+        it->it_index++;  /* advance past the character just read */
+        PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];  /* entry of the ascii[] singleton table indexed by the character value */
+        return Py_NewRef(item);  /* caller receives its own reference to the table entry */
+    }
+    it->it_seq = NULL;  /* exhausted: detach the string so later calls take the early return above */
+    Py_DECREF(seq);  /* release the reference the iterator held on the string */
+    return NULL;
+}
+
  static PyObject *
  unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
  {
@@ -15808,6 +15829,19 @@ PyTypeObject PyUnicodeIter_Type = {
      0,
  };
  
+PyTypeObject _PyUnicodeASCIIIter_Type = {  /* Iterator type that unicode_iter() instantiates for compact ASCII strings. */
+    PyVarObject_HEAD_INIT(&PyType_Type, 0)
+    .tp_name = "str_ascii_iterator",
+    .tp_basicsize = sizeof(unicodeiterobject),  /* instances are plain unicodeiterobject structs */
+    .tp_dealloc = (destructor)unicodeiter_dealloc,  /* reuses the unicodeiter_* helper rather than a dedicated destructor */
+    .tp_getattro = PyObject_GenericGetAttr,
+    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
+    .tp_traverse = (traverseproc)unicodeiter_traverse,  /* reuses the unicodeiter_* traverse helper */
+    .tp_iter = PyObject_SelfIter,
+    .tp_iternext = (iternextfunc)unicode_ascii_iter_next,  /* the only slot here that uses an ASCII-specific function */
+    .tp_methods = unicodeiter_methods,  /* reuses the unicodeiter_* method table */
+};
+
  static PyObject *
  unicode_iter(PyObject *seq)
  {
@@ -15819,7 +15853,12 @@ unicode_iter(PyObject *seq)
      }
      if (PyUnicode_READY(seq) == -1)
          return NULL;
-    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
+    if (PyUnicode_IS_COMPACT_ASCII(seq)) {
+        it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
+    }
+    else {
+        it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
+    }
      if (it == NULL)
          return NULL;
      it->it_index = 0;
author	Kumar Aditya <59607654+kumaraditya303@users.noreply.github.com>
	Mon, 18 Apr 2022 14:18:27 +0000 (19:48 +0530)
committer	GitHub <noreply@github.com>
	Mon, 18 Apr 2022 14:18:27 +0000 (07:18 -0700)
Include/internal/pycore_unicodeobject.h		patch \| blob \| blame \| history
Lib/test/test_unicode.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core and Builtins/2022-04-15-16-57-23.gh-issue-91576.adoDj_.rst	[new file with mode: 0644]	patch \| blob
Objects/object.c		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history