git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-134675: Add t-string prefixes to tokenizer module, lexical analysis doc, and add...
author    Eric V. Smith <ericvsmith@users.noreply.github.com>
          Mon, 26 May 2025 17:49:39 +0000 (13:49 -0400)
committer GitHub <noreply@github.com>
          Mon, 26 May 2025 17:49:39 +0000 (13:49 -0400)
* Add t-string prefixes to _all_string_prefixes, and add a test to make sure we catch this error in the future.

* Update lexical analysis docs for t-string prefixes.
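
As a quick check of the change, tokenize.StringPrefix (the regex fragment the tokenizer builds from _all_string_prefixes(); an undocumented internal, as the new test's comments note) should now match the t-string prefixes. A minimal sketch:

    import re
    import tokenize

    # StringPrefix is an alternation such as "(|b|rT|...|tr|...)",
    # including the empty prefix.
    prefix_re = re.compile(tokenize.StringPrefix)
    for candidate in ("t", "T", "rt", "tR", "fr", "not"):
        # fullmatch: the whole candidate must be a single alternative
        print(candidate, bool(prefix_re.fullmatch(candidate)))
    # Expect True for all of these except "not", which is a keyword,
    # not a string prefix.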

Doc/reference/lexical_analysis.rst
Lib/test/test_tokenize.py
Lib/tokenize.py

diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst
index 6c4a4ea81afe2946e81a0d6fb2dc3dd8f09418a9..b22eb4db7945d1de558634872e1745614e3af624 100644
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -489,8 +489,9 @@ String literals are described by the following lexical definitions:
 
 .. productionlist:: python-grammar
    stringliteral: [`stringprefix`](`shortstring` | `longstring`)
-   stringprefix: "r" | "u" | "R" | "U" | "f" | "F"
+   stringprefix: "r" | "u" | "R" | "U" | "f" | "F" | "t" | "T"
                : | "fr" | "Fr" | "fR" | "FR" | "rf" | "rF" | "Rf" | "RF"
+               : | "tr" | "Tr" | "tR" | "TR" | "rt" | "rT" | "Rt" | "RT"
    shortstring: "'" `shortstringitem`* "'" | '"' `shortstringitem`* '"'
    longstring: "'''" `longstringitem`* "'''" | '"""' `longstringitem`* '"""'
    shortstringitem: `shortstringchar` | `stringescapeseq`
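
Concretely, the new alternatives accept "t"/"T", optionally combined with "r"/"R" in either order and any case. A small sketch of literals the updated grammar admits (assumes an interpreter with PEP 750 t-strings, i.e. Python 3.14+):

    name = "world"
    t1 = t"hello {name}"    # evaluates to a Template, not a str
    t2 = rt"raw \n {name}"  # raw t-string: the backslash-n stays literal
    t3 = TR"prefix case and order do not matter"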
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index e6b19fe1812d441d1b261dbf621501c1e38dd207..d4b51841891b283786b2f0f18fd05f97fe6fcd65 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,6 +1,8 @@
 import contextlib
+import itertools
 import os
 import re
+import string
 import tempfile
 import token
 import tokenize
@@ -3238,5 +3240,59 @@ class CommandLineTest(unittest.TestCase):
             self.check_output(source, expect, flag)
 
 
+class StringPrefixTest(unittest.TestCase):
+    def test_prefixes(self):
+        # Get the list of defined string prefixes.  I don't see an
+        # obvious documented way of doing this, but probably the best
+        # thing is to split apart tokenize.StringPrefix.
+
+        # Make sure StringPrefix begins and ends in parens.
+        self.assertEqual(tokenize.StringPrefix[0], '(')
+        self.assertEqual(tokenize.StringPrefix[-1], ')')
+
+        # Then split apart everything else by '|'.
+        defined_prefixes = set(tokenize.StringPrefix[1:-1].split('|'))
+
+        # Now compute the actual string prefixes, by eval-ing all
+        # valid prefix combinations, each followed by an empty string.
+
+        # Try all prefix lengths until we find a length that has zero
+        # valid prefixes.  This will miss the case where for example
+        # there are no valid 3 character prefixes, but there are valid
+        # 4 character prefixes.  That seems extremely unlikely.
+
+        # Note that the empty prefix is being included, because length
+        # starts at 0.  That's expected, since StringPrefix includes
+        # the empty prefix.
+
+        valid_prefixes = set()
+        for length in itertools.count():
+            num_at_this_length = 0
+            for prefix in (
+                "".join(chars) for chars in itertools.combinations(string.ascii_lowercase, length)
+            ):
+                for t in itertools.permutations(prefix):
+                    for u in itertools.product(*[(c, c.upper()) for c in t]):
+                        p = ''.join(u)
+                        if p == "not":
+                            # 'not' can never be a string prefix,
+                            # because it's a valid expression: not ""
+                            continue
+                        try:
+                            eval(f'{p}""')
+
+                            # No syntax error, so p is a valid string
+                            # prefix.
+
+                            valid_prefixes.add(p)
+                            num_at_this_length += 1
+                        except SyntaxError:
+                            pass
+            if num_at_this_length == 0:
+                break
+
+        self.assertEqual(defined_prefixes, valid_prefixes)
+
+
 if __name__ == "__main__":
     unittest.main()
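
Once applied, the new test class can be run on its own (a hypothetical invocation from a built CPython checkout, where ./python is the freshly built interpreter):

    ./python -m unittest -v test.test_tokenize.StringPrefixTest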
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 559a7aecbde2d187b698b09ee8339ab106631ad1..7e71755068e1df39f02d94d6eef42ef57e15b8e3 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -86,7 +86,7 @@ def _all_string_prefixes():
     # The valid string prefixes. Only contain the lower case versions,
     #  and don't contain any permutations (include 'fr', but not
     #  'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
+    _valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
     # if we add binary f-strings, add: ['fb', 'fbr']
     result = {''}
     for prefix in _valid_string_prefixes:
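
For illustration, each lowercase seed in _valid_string_prefixes is expanded into every casing and ordering, so adding 't' and 'tr' yields all of the t-string spellings. A sketch using the private helper touched above (not a public API):

    import tokenize

    prefixes = tokenize._all_string_prefixes()
    print(sorted(p for p in prefixes if set(p.lower()) == {"t", "r"}))
    # -> ['RT', 'Rt', 'TR', 'Tr', 'rT', 'rt', 'tR', 'tr']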