]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-103285: Rewrite _splitlines_no_ff to improve performance (#103307)
authorTian Gao <gaogaotiantian@hotmail.com>
Mon, 24 Apr 2023 05:03:49 +0000 (22:03 -0700)
committerGitHub <noreply@github.com>
Mon, 24 Apr 2023 05:03:49 +0000 (23:03 -0600)
Lib/ast.py
Lib/test/test_ast.py
Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst [new file with mode: 0644]

index 2cbc80a9835aa56d8c793f220e8f46069d35902b..d9733a79d3a78fb9d0cd017dd5c8278c63cd9927 100644 (file)
@@ -25,6 +25,7 @@
     :license: Python License.
 """
 import sys
+import re
 from _ast import *
 from contextlib import contextmanager, nullcontext
 from enum import IntEnum, auto, _simple_enum
@@ -305,28 +306,17 @@ def get_docstring(node, clean=True):
     return text
 
 
-def _splitlines_no_ff(source):
+_line_pattern = re.compile(r"(.*?(?:\r\n|\n|\r|$))")
+def _splitlines_no_ff(source, maxlines=None):
     """Split a string into lines ignoring form feed and other chars.
 
     This mimics how the Python parser splits source code.
     """
-    idx = 0
     lines = []
-    next_line = ''
-    while idx < len(source):
-        c = source[idx]
-        next_line += c
-        idx += 1
-        # Keep \r\n together
-        if c == '\r' and idx < len(source) and source[idx] == '\n':
-            next_line += '\n'
-            idx += 1
-        if c in '\r\n':
-            lines.append(next_line)
-            next_line = ''
-
-    if next_line:
-        lines.append(next_line)
+    for lineno, match in enumerate(_line_pattern.finditer(source), 1):
+        if maxlines is not None and lineno > maxlines:
+            break
+        lines.append(match[0])
     return lines
 
 
@@ -360,7 +350,7 @@ def get_source_segment(source, node, *, padded=False):
     except AttributeError:
         return None
 
-    lines = _splitlines_no_ff(source)
+    lines = _splitlines_no_ff(source, maxlines=end_lineno+1)
     if end_lineno == lineno:
         return lines[lineno].encode()[col_offset:end_col_offset].decode()
 
index a579bfd793078419c7813c3bea984693b2092aa5..8eef7baec7011832f7c1991c2dc4260a9de90e5f 100644 (file)
@@ -2293,6 +2293,17 @@ class EndPositionTests(unittest.TestCase):
         cdef = ast.parse(s).body[0]
         self.assertEqual(ast.get_source_segment(s, cdef.body[0], padded=True), s_method)
 
+    def test_source_segment_newlines(self):
+        s = 'def f():\n  pass\ndef g():\r  pass\r\ndef h():\r\n  pass\r\n'
+        f, g, h = ast.parse(s).body
+        self._check_content(s, f, 'def f():\n  pass')
+        self._check_content(s, g, 'def g():\r  pass')
+        self._check_content(s, h, 'def h():\r\n  pass')
+
+        s = 'def f():\n  a = 1\r  b = 2\r\n  c = 3\n'
+        f = ast.parse(s).body[0]
+        self._check_content(s, f, s.rstrip())
+
     def test_source_segment_missing_info(self):
         s = 'v = 1\r\nw = 1\nx = 1\n\ry = 1\r\n'
         v, w, x, y = ast.parse(s).body
diff --git a/Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst b/Misc/NEWS.d/next/Library/2023-04-06-04-35-59.gh-issue-103285.rCZ9-G.rst
new file mode 100644 (file)
index 0000000..62b4364
--- /dev/null
@@ -0,0 +1 @@
+Improve performance of :func:`ast.get_source_segment`.