git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.12] gh-115154: Fix untokenize handling of unicode named literals (GH-115171)
author     Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
           Mon, 19 Feb 2024 16:38:43 +0000 (17:38 +0100)
committer  GitHub <noreply@github.com>
           Mon, 19 Feb 2024 16:38:43 +0000 (16:38 +0000)
gh-115154: Fix untokenize handling of unicode named literals (GH-115171)
(cherry picked from commit ecf16ee50e42f979624e55fa343a8522942db2e7)

Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
Lib/test/test_tokenize.py
Lib/tokenize.py
Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst [new file with mode: 0644]
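
The bug in a nutshell: when untokenizing PEP 701 f-string tokens, every brace in an FSTRING_MIDDLE token was doubled, including the braces that belong to a unicode named escape such as \N{SNAKE}. A minimal reproduction sketch (assumes Python 3.12 before this fix; src is just an illustrative input):

    import io
    import tokenize

    src = r"f'\N{SNAKE}'"
    tokens = list(tokenize.generate_tokens(io.StringIO(src).readline))
    print(tokenize.untokenize(tokens))
    # Before the fix the escape came back with doubled braces, f'\N{{SNAKE}}',
    # which is no longer a valid \N escape; after the fix the source
    # round-trips unchanged.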

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 2886bceb7ba931988ae84e02908bd5bc6d3ffe57..c52b58b4ffdbca2dc837e7632b7e36cbf3596091 100644
@@ -1874,6 +1874,43 @@ class TestRoundtrip(TestCase):
                              "    print('Can not import' # comment2\n)"
                              "else:   print('Loaded')\n")
 
+        self.check_roundtrip("f'\\N{EXCLAMATION MARK}'")
+        self.check_roundtrip(r"f'\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\N{{SNAKE}}'")
+        self.check_roundtrip(r"f'\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\\\\\N{SNAKE}'")
+
+        self.check_roundtrip(r"f'\\N{1}'")
+        self.check_roundtrip(r"f'\\\\N{2}'")
+        self.check_roundtrip(r"f'\\\\\\N{3}'")
+        self.check_roundtrip(r"f'\\\\\\\\N{4}'")
+
+        self.check_roundtrip(r"f'\\N{{'")
+        self.check_roundtrip(r"f'\\\\N{{'")
+        self.check_roundtrip(r"f'\\\\\\N{{'")
+        self.check_roundtrip(r"f'\\\\\\\\N{{'")
+        cases = [
+    """
+if 1:
+    "foo"
+"bar"
+""",
+    """
+if 1:
+    ("foo"
+     "bar")
+""",
+    """
+if 1:
+    "foo"
+    "bar"
+""" ]
+        for case in cases:
+            self.check_roundtrip(case)
+
+
     def test_continuation(self):
         # Balancing continuation
         self.check_roundtrip("a = (3,4, \n"
@@ -1908,9 +1945,6 @@ class TestRoundtrip(TestCase):
         tempdir = os.path.dirname(__file__) or os.curdir
         testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py"))
 
-        # TODO: Remove this once we can untokenize PEP 701 syntax
-        testfiles.remove(os.path.join(tempdir, "test_fstring.py"))
-
         if not support.is_resource_enabled("cpu"):
             testfiles = random.sample(testfiles, 10)
 
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 49e8144edddab7e729ad19d5ddf62af6686f82ab..7af7a5cc1cd6809d9cb532c96c4f94367ce9296c 100644
@@ -170,6 +170,7 @@ class Untokenizer:
         self.tokens = []
         self.prev_row = 1
         self.prev_col = 0
+        self.prev_type = None
         self.encoding = None
 
     def add_whitespace(self, start):
@@ -185,6 +186,29 @@ class Untokenizer:
         if col_offset:
             self.tokens.append(" " * col_offset)
 
+    def escape_brackets(self, token):
+        characters = []
+        consume_until_next_bracket = False
+        for character in token:
+            if character == "}":
+                if consume_until_next_bracket:
+                    consume_until_next_bracket = False
+                else:
+                    characters.append(character)
+            if character == "{":
+                n_backslashes = sum(
+                    1 for char in _itertools.takewhile(
+                        "\\".__eq__,
+                        characters[-2::-1]
+                    )
+                )
+                if n_backslashes % 2 == 0:
+                    characters.append(character)
+                else:
+                    consume_until_next_bracket = True
+            characters.append(character)
+        return "".join(characters)
+
     def untokenize(self, iterable):
         it = iter(iterable)
         indents = []
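
The helper doubles only the braces that are not part of a \N{...} escape: a "{" starts a named escape when the character just before it (normally the "N") is preceded by an odd number of backslashes, which is why the characters[-2::-1] slice skips that character before counting. An illustrative session (Untokenizer is an internal class, instantiated here purely for demonstration):

    from tokenize import Untokenizer

    u = Untokenizer()
    print(u.escape_brackets(r"\N{SNAKE}"))  # \N{SNAKE}  -- named escape, kept intact
    print(u.escape_brackets("a{b}c"))       # a{{b}}c    -- ordinary braces are doubled
    print(u.escape_brackets(r"\\N"))        # \\N        -- no brackets, unchanged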
@@ -216,11 +240,13 @@ class Untokenizer:
                 startline = False
             elif tok_type == FSTRING_MIDDLE:
                 if '{' in token or '}' in token:
+                    token = self.escape_brackets(token)
+                    last_line = token.splitlines()[-1]
                     end_line, end_col = end
-                    end = (end_line, end_col + token.count('{') + token.count('}'))
-                    token = re.sub('{', '{{', token)
-                    token = re.sub('}', '}}', token)
-
+                    extra_chars = last_line.count("{{") + last_line.count("}}")
+                    end = (end_line, end_col + extra_chars)
+            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
+                self.tokens.append(" ")
 
             self.add_whitespace(start)
             self.tokens.append(token)
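
The end-position fix above counts only the doublings on the token's last physical line: an FSTRING_MIDDLE token can span several lines, and escaping a brace on an earlier line does not move the token's (row, column) end coordinate, which lives on the final line. A hypothetical multi-line token value shows the difference from the old count-the-whole-token logic:

    # Stand-in for what escape_brackets does to plain (non-escape) braces.
    token = "pre{\npost}"
    escaped = token.replace("{", "{{").replace("}", "}}")
    last_line = escaped.splitlines()[-1]               # "post}}"
    extra_chars = last_line.count("{{") + last_line.count("}}")
    print(extra_chars)  # 1 -- the brace doubled on the first line is ignored,
                        # since it does not shift the end column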
@@ -228,6 +254,7 @@ class Untokenizer:
             if tok_type in (NEWLINE, NL):
                 self.prev_row += 1
                 self.prev_col = 0
+            self.prev_type = tok_type
         return "".join(self.tokens)
 
     def compat(self, token, iterable):
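
The new prev_type field lets untokenize() notice a string literal that immediately follows another string and keep the two separated, which is what the multi-line cases added to the tests above pin down. A quick round-trip check of one such shape:

    import io
    import tokenize

    src = 'if 1:\n    "foo"\n"bar"\n'
    tokens = list(tokenize.generate_tokens(io.StringIO(src).readline))
    out = tokenize.untokenize(tokens)
    # The reconstructed source must tokenize to the same stream as the original.
    assert ([t[:2] for t in tokenize.generate_tokens(io.StringIO(out).readline)]
            == [t[:2] for t in tokens])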
@@ -235,6 +262,7 @@ class Untokenizer:
         toks_append = self.tokens.append
         startline = token[0] in (NEWLINE, NL)
         prevstring = False
+        in_fstring = 0
 
         for tok in _itertools.chain([token], iterable):
             toknum, tokval = tok[:2]
@@ -253,6 +281,10 @@ class Untokenizer:
             else:
                 prevstring = False
 
+            if toknum == FSTRING_START:
+                in_fstring += 1
+            elif toknum == FSTRING_END:
+                in_fstring -= 1
             if toknum == INDENT:
                 indents.append(tokval)
                 continue
@@ -265,11 +297,18 @@ class Untokenizer:
                 toks_append(indents[-1])
                 startline = False
             elif toknum == FSTRING_MIDDLE:
-                if '{' in tokval or '}' in tokval:
-                    tokval = re.sub('{', '{{', tokval)
-                    tokval = re.sub('}', '}}', tokval)
+                tokval = self.escape_brackets(tokval)
+
+            # Insert a space between two consecutive brackets if we are in an f-string
+            if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring:
+                tokval = ' ' + tokval
+
+            # Insert a space between two consecutive f-strings
+            if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
+                self.tokens.append(" ")
 
             toks_append(tokval)
+            self.prev_type = toknum
 
 
 def untokenize(iterable):
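
compat() receives bare (type, string) pairs with no position information, so it cannot lean on column bookkeeping; the new in_fstring counter handles the one spot where naive concatenation changes meaning inside an f-string, namely two identical brackets landing back to back. A sketch exercising that path:

    import io
    import tokenize

    # A set display inside a replacement field yields two consecutive "{"
    # tokens (and two consecutive "}" tokens).
    src = "f'{ {1, 2} }'"
    pairs = [tok[:2] for tok in tokenize.generate_tokens(io.StringIO(src).readline)]
    print(tokenize.untokenize(pairs))
    # The braces come out as "{ {" and "} }" rather than "{{" / "}}", which
    # the tokenizer would otherwise read as escaped literal braces.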
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst b/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst
new file mode 100644
index 0000000..045596b
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2024-02-08-16-01-18.gh-issue-115154.ji96FV.rst
@@ -0,0 +1,2 @@
+Fix a bug that was causing the :func:`tokenize.untokenize` function to
+handle unicode named literals incorrectly. Patch by Pablo Galindo.