git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.13] gh-125553: Fix backslash continuation in `untokenize` (GH-126010) (#129153)
author    Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Tue, 21 Jan 2025 21:04:55 +0000 (22:04 +0100)
committer GitHub <noreply@github.com>
Tue, 21 Jan 2025 21:04:55 +0000 (21:04 +0000)
gh-125553: Fix backslash continuation in `untokenize` (GH-126010)
(cherry picked from commit 7ad793e5dbdf07e51a71b70d20f3e6e3ab60244d)

Co-authored-by: Tomas R <tomas.roun8@gmail.com>
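
For context, the following is a minimal round-trip reproduction of the behavior this commit fixes; it is a sketch for illustration, not part of the commit. Before the change, Untokenizer emitted bare "\n"-terminated continuations with no leading whitespace (see the lines removed from Lib/tokenize.py below), so a source containing a backslash continuation did not survive tokenize/untokenize byte-for-byte.

    import io
    import tokenize

    # A backslash continuation preceded by a space.
    source = b"a = 1 + \\\n    2\n"
    readline = io.BytesIO(source).readline

    # With this fix applied, untokenize() reproduces the input exactly;
    # previously the space before the backslash was dropped.
    result = tokenize.untokenize(tokenize.tokenize(readline))
    assert result == source
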
Lib/test/test_tokenize.py
Lib/tokenize.py
Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst [new file with mode: 0644]

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 75710db7d053756eae51070c8deef3f67366c012..480bff743a9f8a32fa67ecdf004a66b2ea2b7b31 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,4 +1,5 @@
 import os
+import re
 import token
 import tokenize
 import unittest
@@ -1819,6 +1820,22 @@ class UntokenizeTest(TestCase):
         self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
 
 
+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None
+
+
 class TestRoundtrip(TestCase):
 
     def check_roundtrip(self, f):
@@ -1829,6 +1846,9 @@ class TestRoundtrip(TestCase):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.
 
+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces.  A proper test should test this.
@@ -1852,6 +1872,13 @@ class TestRoundtrip(TestCase):
         tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
     def check_line_extraction(self, f):
         if isinstance(f, str):
             code = f.encode('utf-8')
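
As a side note, the contains_ambiguous_backslash() helper added above can be exercised on its own. This sketch reuses the regex from the patch; the sample inputs are illustrative, not taken from the test suite:

    import re

    # A line holding only a backslash produces no token, so its indent
    # cannot be recovered and the exact round-trip check is skipped.
    pattern = re.compile(br'\n\s*\\\r?\n')

    assert pattern.search(b"a = (1\n    \\\n)\n") is not None  # ambiguous
    assert pattern.search(b"a = 1 + \\\n    2\n") is None      # round-trippable
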
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 430447d35e1adf0f3fc8daf4c15a90bcc039df10..7ca552c4fc590edfe74ead58ca76be1aa9b9f06b 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -169,6 +169,7 @@ class Untokenizer:
         self.prev_row = 1
         self.prev_col = 0
         self.prev_type = None
+        self.prev_line = ""
         self.encoding = None
 
     def add_whitespace(self, start):
@@ -176,14 +177,28 @@ class Untokenizer:
         if row < self.prev_row or row == self.prev_row and col < self.prev_col:
             raise ValueError("start ({},{}) precedes previous end ({},{})"
                              .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
+        self.add_backslash_continuation(start)
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)
 
+    def add_backslash_continuation(self, start):
+        """Add backslash continuation characters if the row has increased
+        without encountering a newline token.
+
+        This also inserts the correct amount of whitespace before the backslash.
+        """
+        row = start[0]
+        row_offset = row - self.prev_row
+        if row_offset == 0:
+            return
+
+        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
+        line = self.prev_line.rstrip('\\\r\n')
+        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
+        self.tokens.append(ws + f"\\{newline}" * row_offset)
+        self.prev_col = 0
+
     def escape_brackets(self, token):
         characters = []
         consume_until_next_bracket = False
@@ -243,8 +258,6 @@ class Untokenizer:
                     end_line, end_col = end
                     extra_chars = last_line.count("{{") + last_line.count("}}")
                     end = (end_line, end_col + extra_chars)
-            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
-                self.tokens.append(" ")
 
             self.add_whitespace(start)
             self.tokens.append(token)
@@ -253,6 +266,7 @@ class Untokenizer:
                 self.prev_row += 1
                 self.prev_col = 0
             self.prev_type = tok_type
+            self.prev_line = line
         return "".join(self.tokens)
 
     def compat(self, token, iterable):
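
To see how add_backslash_continuation() recovers the indentation, here is a toy walk-through of the same steps outside the Untokenizer class (the variable names are illustrative):

    import itertools

    # The previous physical line, ending in a backslash continuation.
    prev_line = "x = 1 +   \\\r\n"

    # Preserve the original line ending and the whitespace before "\".
    newline = '\r\n' if prev_line.endswith('\r\n') else '\n'
    stripped = prev_line.rstrip('\\\r\n')  # -> "x = 1 +   "
    ws = ''.join(itertools.takewhile(str.isspace, reversed(stripped)))

    print(repr(ws + "\\" + newline))  # prints '   \\\r\n'
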
diff --git a/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst
new file mode 100644
index 0000000..291c5e6
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-10-26-16-59-02.gh-issue-125553.4pDLzt.rst
@@ -0,0 +1,2 @@
+Fix round-trip invariance for backslash continuations in
+:func:`tokenize.untokenize`.
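
Finally, the BOM caveat noted in check_roundtrip() can be demonstrated directly. A sketch, assuming a UTF-8 BOM prefix as in the test:

    import io
    import tokenize

    source = b"\xef\xbb\xbfx = 1\n"

    # The BOM produces no token, so the exact comparison is made against
    # the BOM-stripped source, mirroring the new assertion in the test.
    without_bom = source.removeprefix(b"\xef\xbb\xbf")
    readline = io.BytesIO(without_bom).readline
    assert tokenize.untokenize(tokenize.tokenize(readline)) == without_bom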