gh-130167: Optimise ``textwrap.dedent()`` (#131919)

author Adam Turner <9087854+AA-Turner@users.noreply.github.com>

Mon, 31 Mar 2025 00:35:12 +0000 (01:35 +0100)

committer GitHub <noreply@github.com>

Mon, 31 Mar 2025 00:35:12 +0000 (00:35 +0000)
author Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Mon, 31 Mar 2025 00:35:12 +0000 (01:35 +0100)
committer GitHub <noreply@github.com>
Mon, 31 Mar 2025 00:35:12 +0000 (00:35 +0000)
diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py

index dfbc2b93dfc0d6dc5a77c67bb88b3972f6cda4c9..77366988b57fa750ad28d4b42689a5f6e573b106 100644 (file)
--- a/Lib/test/test_textwrap.py
+++ b/Lib/test/test_textwrap.py
@@ -769,6 +769,56 @@ class DedentTestCase(unittest.TestCase):
          """assert that dedent() has no effect on 'text'"""
          self.assertEqual(text, dedent(text))
  
+    def test_dedent_whitespace(self):
+        # The empty string.
+        text = ""
+        self.assertUnchanged(text)
+
+        # Only spaces.
+        text = "    "
+        expect = ""
+        self.assertEqual(expect, dedent(text))
+
+        # Only tabs.
+        text = "\t\t\t\t"
+        expect = ""
+        self.assertEqual(expect, dedent(text))
+
+        # A mixture.
+        text = " \t  \t\t  \t "
+        expect = ""
+        self.assertEqual(expect, dedent(text))
+
+        # ASCII whitespace.
+        text = "\f\n\r\t\v "
+        expect = "\n"
+        self.assertEqual(expect, dedent(text))
+
+        # One newline.
+        text = "\n"
+        expect = "\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Windows-style newlines.
+        text = "\r\n" * 5
+        expect = "\n" * 5
+        self.assertEqual(expect, dedent(text))
+
+        # Whitespace mixture.
+        text = "    \n\t\n  \n\t\t\n\n\n       "
+        expect = "\n\n\n\n\n\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Lines consisting only of whitespace are always normalised
+        text = "a\n \n\t\n"
+        expect = "a\n\n\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Whitespace characters on non-empty lines are retained
+        text = "a\r\n\r\n\r\n"
+        expect = "a\r\n\n\n"
+        self.assertEqual(expect, dedent(text))
+
      def test_dedent_nomargin(self):
          # No lines indented.
          text = "Hello there.\nHow are you?\nOh good, I'm glad."
diff --git a/Lib/textwrap.py b/Lib/textwrap.py

index 1bf07aa46cad99f9e08e8618166469456ad5647c..bb6a1186316275073e5b0b098fe14eac278a9906 100644 (file)
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -413,9 +413,6 @@ def shorten(text, width, **kwargs):
  
  # -- Loosely related functionality -------------------------------------
  
-_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
-_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
-
  def dedent(text):
      """Remove any common leading whitespace from every line in `text`.
  
@@ -429,42 +426,21 @@ def dedent(text):
  
      Entirely blank lines are normalized to a newline character.
      """
-    # Look for the longest leading string of spaces and tabs common to
-    # all lines.
-    margin = None
-    text = _whitespace_only_re.sub('', text)
-    indents = _leading_whitespace_re.findall(text)
-    for indent in indents:
-        if margin is None:
-            margin = indent
-
-        # Current line more deeply indented than previous winner:
-        # no change (previous winner is still on top).
-        elif indent.startswith(margin):
-            pass
-
-        # Current line consistent with and no deeper than previous winner:
-        # it's the new winner.
-        elif margin.startswith(indent):
-            margin = indent
-
-        # Find the largest common whitespace between current line and previous
-        # winner.
-        else:
-            for i, (x, y) in enumerate(zip(margin, indent)):
-                if x != y:
-                    margin = margin[:i]
-                    break
+    if not text:
+        return text
+
+    lines = text.split('\n')
  
-    # sanity check (testing/debugging only)
-    if 0 and margin:
-        for line in text.split("\n"):
-            assert not line or line.startswith(margin), \
-                   "line = %r, margin = %r" % (line, margin)
+    # Get length of leading whitespace, inspired by ``os.path.commonprefix()``.
+    non_blank_lines = [l for l in lines if l and not l.isspace()]
+    l1 = min(non_blank_lines, default='')
+    l2 = max(non_blank_lines, default='')
+    margin = 0
+    for margin, c in enumerate(l1):
+        if c != l2[margin] or c not in ' \t':
+            break
  
-    if margin:
-        text = re.sub(r'(?m)^' + margin, '', text)
-    return text
+    return '\n'.join([l[margin:] if not l.isspace() else '' for l in lines])
  
  
  def indent(text, prefix, predicate=None):
diff --git a/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst

new file mode 100644 (file)

index 0000000..62b619c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst
@@ -0,0 +1,5 @@
+Improved performance of :func:`textwrap.dedent` by an average of ~2.4x
+(with improvements of up to 4x for large inputs),
+and fixed a bug where blank lines with whitespace characters other than space
+or horizontal tab were not normalised to a newline character.
+Patch by Adam Turner, Marius Juston, and Pieter Eendebak.
author	Adam Turner <9087854+AA-Turner@users.noreply.github.com>
	Mon, 31 Mar 2025 00:35:12 +0000 (01:35 +0100)
committer	GitHub <noreply@github.com>
	Mon, 31 Mar 2025 00:35:12 +0000 (00:35 +0000)
Lib/test/test_textwrap.py		patch \| blob \| blame \| history
Lib/textwrap.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2025-03-30-19-55-10.gh-issue-131792.NNjzFA.rst	[new file with mode: 0644]	patch \| blob