gh-102856: Python tokenizer implementation for PEP 701 (#104323)

author Marta Gómez Macías <mgmacias@google.com>
Sun, 21 May 2023 00:03:02 +0000 (02:03 +0200)
committer GitHub <noreply@github.com>
Sun, 21 May 2023 00:03:02 +0000 (01:03 +0100)
diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc

index 3b345099bf54b56cc1ca0eaa7a0695c26c5ee564..e885de88cad9ae0be28fd3be4e44977ecff2d49e 100644 (file)
--- a/Doc/library/token-list.inc
+++ b/Doc/library/token-list.inc
@@ -223,6 +223,10 @@
  
  .. data:: FSTRING_END
  
+.. data:: COMMENT
+
+.. data:: NL
+
  .. data:: ERRORTOKEN
  
  .. data:: N_TOKENS
diff --git a/Doc/library/token.rst b/Doc/library/token.rst

index a1aceba96ce030698a0842c297bdf92b832c7d2d..903847bb206d62ad4eafe19fa3dae68413c5b4b4 100644 (file)
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -50,11 +50,13 @@ The following token type values aren't used by the C tokenizer but are needed fo
  the :mod:`tokenize` module.
  
  .. data:: COMMENT
+   :noindex:
  
     Token value used to indicate a comment.
  
  
  .. data:: NL
+   :noindex:
  
     Token value used to indicate a non-terminating newline.  The
     :data:`NEWLINE` token indicates the end of a logical line of Python code;
diff --git a/Grammar/Tokens b/Grammar/Tokens

index 096876fdd130f8e52945c9ce2a78467b35c7cbd8..618ae811d824b0911a16b85f83f52d95d5a1f439 100644 (file)
--- a/Grammar/Tokens
+++ b/Grammar/Tokens
@@ -64,9 +64,9 @@ SOFT_KEYWORD
  FSTRING_START
  FSTRING_MIDDLE
  FSTRING_END
+COMMENT
+NL
  ERRORTOKEN
  
  # These aren't used by the C tokenizer but are needed for tokenize.py
-COMMENT
-NL
  ENCODING
diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h

index 8ca3545d8b3fbca1f5727fb08a20d8bcc3cf5a72..5a1993eac23a8afe45c8af8cafdaa118d50f58be 100644 (file)
--- a/Include/internal/pycore_global_objects_fini_generated.h
+++ b/Include/internal/pycore_global_objects_fini_generated.h
@@ -918,6 +918,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exp));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extend));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extra_tokens));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(facility));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(factory));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(false));
diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h

index 8e429bbfa26f698392531c6c1ed3ca729d1bc66b..61967877ab4ac88016a7d791a806d3004837bd55 100644 (file)
--- a/Include/internal/pycore_global_strings.h
+++ b/Include/internal/pycore_global_strings.h
@@ -406,6 +406,7 @@ struct _Py_global_strings {
          STRUCT_FOR_ID(exception)
          STRUCT_FOR_ID(exp)
          STRUCT_FOR_ID(extend)
+        STRUCT_FOR_ID(extra_tokens)
          STRUCT_FOR_ID(facility)
          STRUCT_FOR_ID(factory)
          STRUCT_FOR_ID(false)
diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h

index 3edf076696d980dcf9aa6a97e9df9459c4367621..59ec49af358f2ead90dc6007b3aad3128395b817 100644 (file)
--- a/Include/internal/pycore_runtime_init_generated.h
+++ b/Include/internal/pycore_runtime_init_generated.h
@@ -912,6 +912,7 @@ extern "C" {
      INIT_ID(exception), \
      INIT_ID(exp), \
      INIT_ID(extend), \
+    INIT_ID(extra_tokens), \
      INIT_ID(facility), \
      INIT_ID(factory), \
      INIT_ID(false), \
diff --git a/Include/internal/pycore_token.h b/Include/internal/pycore_token.h

index b9df8766736adf6e3f595967b440ba0b25ba73e3..c02e637fee1ee27de0247812be24e07f54e1d345 100644 (file)
--- a/Include/internal/pycore_token.h
+++ b/Include/internal/pycore_token.h
@@ -77,7 +77,9 @@ extern "C" {
  #define FSTRING_START   61
  #define FSTRING_MIDDLE  62
  #define FSTRING_END     63
-#define ERRORTOKEN      64
+#define COMMENT         64
+#define NL              65
+#define ERRORTOKEN      66
  #define N_TOKENS        68
  #define NT_OFFSET       256
  
diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h

index 0e1f71798a6ee3d206c0c651b9db1cace99f1a65..8f8a067e4c18080b6451490107e472c1b547afdc 100644 (file)
--- a/Include/internal/pycore_unicodeobject_generated.h
+++ b/Include/internal/pycore_unicodeobject_generated.h
@@ -1059,6 +1059,9 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
      string = &_Py_ID(extend);
      assert(_PyUnicode_CheckConsistency(string, 1));
      _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(extra_tokens);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
      string = &_Py_ID(facility);
      assert(_PyUnicode_CheckConsistency(string, 1));
      _PyUnicode_InternInPlace(interp, &string);
diff --git a/Lib/inspect.py b/Lib/inspect.py

index 63f5aa91d270b74e85643d136effb552c26a9780..7709a95003efbd751bd41716ad5b69750c27128b 100644 (file)
--- a/Lib/inspect.py
+++ b/Lib/inspect.py
@@ -2187,7 +2187,7 @@ def _signature_strip_non_python_syntax(signature):
              if string == ',':
                  current_parameter += 1
  
-        if (type == ERRORTOKEN) and (string == '$'):
+        if (type == OP) and (string == '$'):
              assert self_parameter is None
              self_parameter = current_parameter
              continue
@@ -2195,7 +2195,7 @@ def _signature_strip_non_python_syntax(signature):
          add(string)
          if (string == ','):
              add(' ')
-    clean_signature = ''.join(text)
+    clean_signature = ''.join(text).strip()
      return clean_signature, self_parameter
  
  
diff --git a/Lib/tabnanny.py b/Lib/tabnanny.py

index 9d2df59d36ff4771461ea1f3e92f8250b4e051de..e2ac6837f157d537b4a1070da906e0ee2e0d74ae 100755 (executable)
--- a/Lib/tabnanny.py
+++ b/Lib/tabnanny.py
@@ -107,6 +107,10 @@ def check(file):
          errprint("%r: Token Error: %s" % (file, msg))
          return
  
+    except SyntaxError as msg:
+        errprint("%r: Token Error: %s" % (file, msg))
+        return
+
      except IndentationError as msg:
          errprint("%r: Indentation Error: %s" % (file, msg))
          return
@@ -272,6 +276,12 @@ def format_witnesses(w):
      return prefix + " " + ', '.join(firsts)
  
  def process_tokens(tokens):
+    try:
+        _process_tokens(tokens)
+    except TabError as e:
+        raise NannyNag(e.lineno, e.msg, e.text)
+
+def _process_tokens(tokens):
      INDENT = tokenize.INDENT
      DEDENT = tokenize.DEDENT
      NEWLINE = tokenize.NEWLINE
diff --git a/Lib/test/test_tabnanny.py b/Lib/test/test_tabnanny.py

index afb8da719b0eeda68bdceaf853c7c2446a28894d..dac47318011d9dd4191edf4b742242127719255b 100644 (file)
--- a/Lib/test/test_tabnanny.py
+++ b/Lib/test/test_tabnanny.py
@@ -223,7 +223,7 @@ class TestCheck(TestCase):
          with TemporaryPyFile(SOURCE_CODES["nannynag_errored"]) as file_path:
              out = f"{file_path!r}: *** Line 3: trouble in tab city! ***\n"
              out += "offending line: '\\tprint(\"world\")\\n'\n"
-            out += "indent not equal e.g. at tab size 1\n"
+            out += "inconsistent use of tabs and spaces in indentation\n"
  
              tabnanny.verbose = 1
              self.verify_tabnanny_check(file_path, out=out)
@@ -315,7 +315,7 @@ class TestCommandLine(TestCase):
      def test_with_errored_file(self):
          """Should displays error when errored python file is given."""
          with TemporaryPyFile(SOURCE_CODES["wrong_indented"]) as file_path:
-            stderr  = f"{file_path!r}: Indentation Error: "
+            stderr  = f"{file_path!r}: Token Error: "
              stderr += ('unindent does not match any outer indentation level'
                      ' (<tokenize>, line 3)')
              self.validate_cmd(file_path, stderr=stderr, expect_failure=True)
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 911b53e58165881425bf1217f2ffffd126032829..dda7243bfa19fe4bb36abf23543d05cc12fbbc5c 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3,7 +3,7 @@ from test.support import os_helper
  from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                       STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                       open as tokenize_open, Untokenizer, generate_tokens,
-                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT)
+                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
  from io import BytesIO, StringIO
  import unittest
  from textwrap import dedent
@@ -82,7 +82,7 @@ class TokenizeTest(TestCase):
      NAME       'False'       (4, 11) (4, 16)
      COMMENT    '# NEWLINE'   (4, 17) (4, 26)
      NEWLINE    '\\n'          (4, 26) (4, 27)
-    DEDENT     ''            (5, 0) (5, 0)
+    DEDENT     ''            (4, 27) (4, 27)
      """)
          indent_error_file = b"""\
  def k(x):
@@ -230,6 +230,10 @@ def k(x):
                  continue
              self.assertEqual(number_token(lit), lit)
          for lit in INVALID_UNDERSCORE_LITERALS:
+            try:
+                number_token(lit)
+            except SyntaxError:
+                continue
              self.assertNotEqual(number_token(lit), lit)
  
      def test_string(self):
@@ -381,21 +385,119 @@ c"""', """\
      STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
      """)
          self.check_tokenize('f"abc"', """\
-    STRING     'f"abc"'      (1, 0) (1, 6)
+    FSTRING_START 'f"'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'abc'         (1, 2) (1, 5)
+    FSTRING_END '"'           (1, 5) (1, 6)
      """)
          self.check_tokenize('fR"a{b}c"', """\
-    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
+    FSTRING_START 'fR"'         (1, 0) (1, 3)
+    FSTRING_MIDDLE 'a'           (1, 3) (1, 4)
+    OP         '{'           (1, 4) (1, 5)
+    NAME       'b'           (1, 5) (1, 6)
+    OP         '}'           (1, 6) (1, 7)
+    FSTRING_MIDDLE 'c'           (1, 7) (1, 8)
+    FSTRING_END '"'           (1, 8) (1, 9)
+    """)
+        self.check_tokenize('fR"a{{{b!r}}}c"', """\
+    FSTRING_START 'fR"'         (1, 0) (1, 3)
+    FSTRING_MIDDLE 'a{'          (1, 3) (1, 5)
+    OP         '{'           (1, 6) (1, 7)
+    NAME       'b'           (1, 7) (1, 8)
+    OP         '!'           (1, 8) (1, 9)
+    NAME       'r'           (1, 9) (1, 10)
+    OP         '}'           (1, 10) (1, 11)
+    FSTRING_MIDDLE '}'           (1, 11) (1, 12)
+    FSTRING_MIDDLE 'c'           (1, 13) (1, 14)
+    FSTRING_END '"'           (1, 14) (1, 15)
+    """)
+        self.check_tokenize('f"{{{1+1}}}"', """\
+    FSTRING_START 'f"'          (1, 0) (1, 2)
+    FSTRING_MIDDLE '{'           (1, 2) (1, 3)
+    OP         '{'           (1, 4) (1, 5)
+    NUMBER     '1'           (1, 5) (1, 6)
+    OP         '+'           (1, 6) (1, 7)
+    NUMBER     '1'           (1, 7) (1, 8)
+    OP         '}'           (1, 8) (1, 9)
+    FSTRING_MIDDLE '}'           (1, 9) (1, 10)
+    FSTRING_END '"'           (1, 11) (1, 12)
+    """)
+        self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    OP         '{'           (1, 4) (1, 5)
+    FSTRING_START "f'''"        (1, 5) (1, 9)
+    OP         '{'           (1, 9) (1, 10)
+    FSTRING_START "f'"          (1, 10) (1, 12)
+    OP         '{'           (1, 12) (1, 13)
+    FSTRING_START 'f"'          (1, 13) (1, 15)
+    OP         '{'           (1, 15) (1, 16)
+    NUMBER     '1'           (1, 16) (1, 17)
+    OP         '+'           (1, 17) (1, 18)
+    NUMBER     '1'           (1, 18) (1, 19)
+    OP         '}'           (1, 19) (1, 20)
+    FSTRING_END '"'           (1, 20) (1, 21)
+    OP         '}'           (1, 21) (1, 22)
+    FSTRING_END "'"           (1, 22) (1, 23)
+    OP         '}'           (1, 23) (1, 24)
+    FSTRING_END "'''"         (1, 24) (1, 27)
+    OP         '}'           (1, 27) (1, 28)
+    FSTRING_END '\"""'         (1, 28) (1, 31)
+    """)
+        self.check_tokenize('f"""     x\nstr(data, encoding={invalid!r})\n"""', """\
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    FSTRING_MIDDLE '     x\\nstr(data, encoding=' (1, 4) (2, 19)
+    OP         '{'           (2, 19) (2, 20)
+    NAME       'invalid'     (2, 20) (2, 27)
+    OP         '!'           (2, 27) (2, 28)
+    NAME       'r'           (2, 28) (2, 29)
+    OP         '}'           (2, 29) (2, 30)
+    FSTRING_MIDDLE ')\\n'         (2, 30) (3, 0)
+    FSTRING_END '\"""'         (3, 0) (3, 3)
+    """)
+        self.check_tokenize('f"""123456789\nsomething{None}bad"""', """\
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    FSTRING_MIDDLE '123456789\\nsomething' (1, 4) (2, 9)
+    OP         '{'           (2, 9) (2, 10)
+    NAME       'None'        (2, 10) (2, 14)
+    OP         '}'           (2, 14) (2, 15)
+    FSTRING_MIDDLE 'bad'         (2, 15) (2, 18)
+    FSTRING_END '\"""'         (2, 18) (2, 21)
      """)
          self.check_tokenize('f"""abc"""', """\
-    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    FSTRING_MIDDLE 'abc'         (1, 4) (1, 7)
+    FSTRING_END '\"""'         (1, 7) (1, 10)
      """)
          self.check_tokenize(r'f"abc\
  def"', """\
-    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
+    FSTRING_START 'f"'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 2) (2, 3)
+    FSTRING_END '"'           (2, 3) (2, 4)
      """)
          self.check_tokenize(r'Rf"abc\
  def"', """\
-    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
+    FSTRING_START 'Rf"'         (1, 0) (1, 3)
+    FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 3) (2, 3)
+    FSTRING_END '"'           (2, 3) (2, 4)
+    """)
+        self.check_tokenize("f'some words {a+b:.3f} more words {c+d=} final words'", """\
+    FSTRING_START "f'"          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'some words ' (1, 2) (1, 13)
+    OP         '{'           (1, 13) (1, 14)
+    NAME       'a'           (1, 14) (1, 15)
+    OP         '+'           (1, 15) (1, 16)
+    NAME       'b'           (1, 16) (1, 17)
+    OP         ':'           (1, 17) (1, 18)
+    FSTRING_MIDDLE '.3f'         (1, 18) (1, 21)
+    OP         '}'           (1, 21) (1, 22)
+    FSTRING_MIDDLE ' more words ' (1, 22) (1, 34)
+    OP         '{'           (1, 34) (1, 35)
+    NAME       'c'           (1, 35) (1, 36)
+    OP         '+'           (1, 36) (1, 37)
+    NAME       'd'           (1, 37) (1, 38)
+    OP         '='           (1, 38) (1, 39)
+    OP         '}'           (1, 39) (1, 40)
+    FSTRING_MIDDLE ' final words' (1, 40) (1, 52)
+    FSTRING_END "'"           (1, 52) (1, 53)
      """)
  
      def test_function(self):
@@ -644,8 +746,8 @@ def"', """\
      NEWLINE    '\\n'          (2, 5) (2, 6)
      INDENT     '        \\t'  (3, 0) (3, 9)
      NAME       'pass'        (3, 9) (3, 13)
-    DEDENT     ''            (4, 0) (4, 0)
-    DEDENT     ''            (4, 0) (4, 0)
+    DEDENT     ''            (3, 14) (3, 14)
+    DEDENT     ''            (3, 14) (3, 14)
      """)
  
      def test_non_ascii_identifiers(self):
@@ -857,7 +959,7 @@ async def foo():
      NUMBER     '1'           (2, 17) (2, 18)
      OP         ':'           (2, 18) (2, 19)
      NAME       'pass'        (2, 20) (2, 24)
-    DEDENT     ''            (3, 0) (3, 0)
+    DEDENT     ''            (2, 25) (2, 25)
      """)
  
          self.check_tokenize('''async def foo(async): await''', """\
@@ -905,7 +1007,7 @@ def f():
      NAME       'await'       (6, 2) (6, 7)
      OP         '='           (6, 8) (6, 9)
      NUMBER     '2'           (6, 10) (6, 11)
-    DEDENT     ''            (7, 0) (7, 0)
+    DEDENT     ''            (6, 12) (6, 12)
      """)
  
          self.check_tokenize('''\
@@ -943,7 +1045,7 @@ async def f():
      NAME       'await'       (6, 2) (6, 7)
      OP         '='           (6, 8) (6, 9)
      NUMBER     '2'           (6, 10) (6, 11)
-    DEDENT     ''            (7, 0) (7, 0)
+    DEDENT     ''            (6, 12) (6, 12)
      """)
  
  class GenerateTokensTest(TokenizeTest):
@@ -968,7 +1070,7 @@ def decistmt(s):
              ])
          else:
              result.append((toknum, tokval))
-    return untokenize(result).decode('utf-8')
+    return untokenize(result).decode('utf-8').strip()
  
  class TestMisc(TestCase):
  
@@ -1040,33 +1142,16 @@ class Test_Tokenize(TestCase):
              nonlocal first
              if not first:
                  first = True
-                return line
+                yield line
              else:
-                return b''
+                yield b''
  
          # skip the initial encoding token and the end tokens
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
-        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+        tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
          self.assertEqual(tokens, expected_tokens,
                           "bytes not decoded with encoding")
  
-    def test__tokenize_does_not_decode_with_encoding_none(self):
-        literal = '"ЉЊЈЁЂ"'
-        first = False
-        def readline():
-            nonlocal first
-            if not first:
-                first = True
-                return literal
-            else:
-                return b''
-
-        # skip the end tokens
-        tokens = list(_tokenize(readline, encoding=None))[:-2]
-        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
-        self.assertEqual(tokens, expected_tokens,
-                         "string not tokenized when encoding is None")
-
  
  class TestDetectEncoding(TestCase):
  
@@ -1326,7 +1411,7 @@ class TestTokenize(TestCase):
  
      def test_tokenize(self):
          import tokenize as tokenize_module
-        encoding = object()
+        encoding = "utf-8"
          encoding_used = None
          def mock_detect_encoding(readline):
              return encoding, [b'first', b'second']
@@ -1336,7 +1421,10 @@ class TestTokenize(TestCase):
              encoding_used = encoding
              out = []
              while True:
-                next_line = readline()
+                try:
+                    next_line = next(readline)
+                except StopIteration:
+                    return out
                  if next_line:
                      out.append(next_line)
                      continue
@@ -1356,7 +1444,7 @@ class TestTokenize(TestCase):
          tokenize_module._tokenize = mock__tokenize
          try:
              results = tokenize(mock_readline)
-            self.assertEqual(list(results),
+            self.assertEqual(list(results)[1:],
                               [b'first', b'second', b'1', b'2', b'3', b'4'])
          finally:
              tokenize_module.detect_encoding = orig_detect_encoding
@@ -1652,8 +1740,8 @@ class TestRoundtrip(TestCase):
              if support.verbose >= 2:
                  print('tokenize', testfile)
              with open(testfile, 'rb') as f:
-                with self.subTest(file=testfile):
-                    self.check_roundtrip(f)
+                # with self.subTest(file=testfile):
+                self.check_roundtrip(f)
  
  
      def roundtrip(self, code):
@@ -2496,13 +2584,13 @@ async def f():
      def test_unicode(self):
  
          self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
-    NAME       'Örter'       (1, 0) (1, 6)
-    EQUAL      '='           (1, 7) (1, 8)
-    STRING     "u'places'"   (1, 9) (1, 18)
-    NEWLINE    ''            (1, 18) (1, 18)
-    NAME       'grün'        (2, 0) (2, 5)
-    EQUAL      '='           (2, 6) (2, 7)
-    STRING     "U'green'"    (2, 8) (2, 16)
+    NAME       'Örter'       (1, 0) (1, 5)
+    EQUAL      '='           (1, 6) (1, 7)
+    STRING     "u'places'"   (1, 8) (1, 17)
+    NEWLINE    ''            (1, 17) (1, 17)
+    NAME       'grün'        (2, 0) (2, 4)
+    EQUAL      '='           (2, 5) (2, 6)
+    STRING     "U'green'"    (2, 7) (2, 15)
      """)
  
      def test_invalid_syntax(self):
@@ -2559,8 +2647,7 @@ async def f():
          compile(valid, "<string>", "exec")
  
          invalid = generate_source(MAXINDENT)
-        tokens = list(_generate_tokens_from_c_tokenizer(invalid))
-        self.assertEqual(tokens[-1].type, NEWLINE)
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
          self.assertRaises(
              IndentationError, compile, invalid, "<string>", "exec"
          )
diff --git a/Lib/token.py b/Lib/token.py

index 1459d12b376f827ae869d2381edd55dfd2b19125..487f6edd3c951cabb802998c8595f00eff2a17a1 100644 (file)
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -67,10 +67,10 @@ SOFT_KEYWORD = 60
  FSTRING_START = 61
  FSTRING_MIDDLE = 62
  FSTRING_END = 63
+COMMENT = 64
+NL = 65
  # These aren't used by the C tokenizer but are needed for tokenize.py
-ERRORTOKEN = 64
-COMMENT = 65
-NL = 66
+ERRORTOKEN = 66
  ENCODING = 67
  N_TOKENS = 68
  # Special definitions for cooperation with parser
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index 46d2224f5cc0833e4b38819d0255367bb3625ac4..bfe40c627fde5739cde067d273d1cb2bc63cb0eb 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -56,112 +56,11 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
          else:
              return self.type
  
-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
-
-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'\w+'
-
-Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
-Binnumber = r'0[bB](?:_?[01])+'
-Octnumber = r'0[oO](?:_?[0-7])+'
-Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
-Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
-Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
-                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
-Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
-Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
-
-# Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes():
-    # The valid string prefixes. Only contain the lower case versions,
-    #  and don't contain any permutations (include 'fr', but not
-    #  'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
-    # if we add binary f-strings, add: ['fb', 'fbr']
-    result = {''}
-    for prefix in _valid_string_prefixes:
-        for t in _itertools.permutations(prefix):
-            # create a list with upper and lower versions of each
-            #  character
-            for u in _itertools.product(*[(c, c.upper()) for c in t]):
-                result.add(''.join(u))
-    return result
-
-@functools.lru_cache
-def _compile(expr):
-    return re.compile(expr, re.UNICODE)
-
-# Note that since _all_string_prefixes includes the empty string,
-#  StringPrefix can be the empty string (making it optional).
-StringPrefix = group(*_all_string_prefixes())
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group(StringPrefix + "'''", StringPrefix + '"""')
-# Single-line ' or " string.
-String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-
-# Sorting in reverse order puts the long operators before their prefixes.
-# Otherwise if = came before ==, == would get recognized as two instances
-# of =.
-Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
-Funny = group(r'\r?\n', Special)
-
-PlainToken = group(Number, Funny, String, Name)
-Token = Ignore + PlainToken
-
-# First (or only) line of ' or " string.
-ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
-PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-
-# For a given string prefix plus quotes, endpats maps it to a regex
-#  to match the remainder of that string. _prefix can be empty, for
-#  a normal single or triple quoted string (with no prefix).
-endpats = {}
-for _prefix in _all_string_prefixes():
-    endpats[_prefix + "'"] = Single
-    endpats[_prefix + '"'] = Double
-    endpats[_prefix + "'''"] = Single3
-    endpats[_prefix + '"""'] = Double3
-del _prefix
-
-# A set of all of the single and triple quoted string prefixes,
-#  including the opening quotes.
-single_quoted = set()
-triple_quoted = set()
-for t in _all_string_prefixes():
-    for u in (t + '"', t + "'"):
-        single_quoted.add(u)
-    for u in (t + '"""', t + "'''"):
-        triple_quoted.add(u)
-del t, u
-
-tabsize = 8
  
  class TokenError(Exception): pass
  
-class StopTokenizing(Exception): pass
  
+class StopTokenizing(Exception): pass
  
  class Untokenizer:
  
@@ -213,6 +112,14 @@ class Untokenizer:
                      self.tokens.append(indent)
                      self.prev_col = len(indent)
                  startline = False
+            elif tok_type == FSTRING_MIDDLE:
+                if '{' in token or '}' in token:
+                    end_line, end_col = end
+                    end = (end_line, end_col + token.count('{') + token.count('}'))
+                    token = re.sub('{', '{{', token)
+                    token = re.sub('}', '}}', token)
+
+
              self.add_whitespace(start)
              self.tokens.append(token)
              self.prev_row, self.prev_col = end
@@ -255,6 +162,11 @@ class Untokenizer:
              elif startline and indents:
                  toks_append(indents[-1])
                  startline = False
+            elif toknum == FSTRING_MIDDLE:
+                if '{' in tokval or '}' in tokval:
+                    tokval = re.sub('{', '{{', tokval)
+                    tokval = re.sub('}', '}}', tokval)
+
              toks_append(tokval)
  
  
@@ -404,7 +316,6 @@ def open(filename):
          buffer.close()
          raise
  
-
  def tokenize(readline):
      """
      The tokenize() generator requires one argument, readline, which
@@ -425,192 +336,32 @@ def tokenize(readline):
      which tells you which encoding was used to decode the bytes stream.
      """
      encoding, consumed = detect_encoding(readline)
-    empty = _itertools.repeat(b"")
-    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
-    return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
+    rl_gen = _itertools.chain(consumed, iter(readline, b""))
      if encoding is not None:
          if encoding == "utf-8-sig":
              # BOM will already have been stripped.
              encoding = "utf-8"
          yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    last_line = b''
-    line = b''
-    while True:                                # loop over lines in stream
-        try:
-            # We capture the value of the line variable here because
-            # readline uses the empty string '' to signal end of input,
-            # hence `line` itself will always be overwritten at the end
-            # of this loop.
-            last_line = line
-            line = readline()
-        except StopIteration:
-            line = b''
-
-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                           strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    yield TokenInfo(COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    pos += len(comment_token)
-
-                yield TokenInfo(NL, line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or                 # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    if parenlev > 0:
-                        yield TokenInfo(NL, token, spos, epos, line)
-                    else:
-                        yield TokenInfo(NEWLINE, token, spos, epos, line)
-
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-
-                # Check up to the first 3 chars of the token to see if
-                #  they're in the single_quoted set. If so, they start
-                #  a string.
-                # We're using the first 3, because we're looking for
-                #  "rb'" (for example) at the start of the token. If
-                #  we switch to longer prefixes, this needs to be
-                #  adjusted.
-                # Note that initial == token[:1].
-                # Also note that single quote checking must come after
-                #  triple quote checking (above).
-                elif (initial in single_quoted or
-                      token[:2] in single_quoted or
-                      token[:3] in single_quoted):
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        # Again, using the first 3 chars of the
-                        #  token. This is looking for the matching end
-                        #  regex for the correct type of quote
-                        #  character. So it's really looking for
-                        #  endpats["'"] or endpats['"'], by trying to
-                        #  skip string prefix characters, if any.
-                        endprog = _compile(endpats.get(initial) or
-                                           endpats.get(token[1]) or
-                                           endpats.get(token[2]))
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                           (lnum, pos), (lnum, pos+1), line)
-                pos += 1
-
-    # Add an implicit NEWLINE if the input doesn't end in one
-    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
-        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+    yield from _tokenize(rl_gen, encoding)
+
+def _tokenize(rl_gen, encoding):
+    source = b"".join(rl_gen).decode(encoding)
+    token = None
+    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+        # TODO: Marta -> limpiar esto
+        if 6 < token.type <= 54:
+            token = token._replace(type=OP)
+        if token.type in {ASYNC, AWAIT}:
+            token = token._replace(type=NAME)
+        if token.type == NEWLINE:
+            l_start, c_start = token.start
+            l_end, c_end = token.end
+            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
+
+        yield token
+    if token is not None:
+        last_line, _ = token.start
+        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
  
  
  def generate_tokens(readline):
@@ -619,7 +370,16 @@ def generate_tokens(readline):
      This has the same API as tokenize(), except that it expects the *readline*
      callable to return str objects instead of bytes.
      """
-    return _tokenize(readline, None)
+    def _gen():
+        while True:
+            try:
+                line = readline()
+            except StopIteration:
+                return
+            if not line:
+                return
+            yield line.encode()
+    return _tokenize(_gen(), 'utf-8')
  
  def main():
      import argparse
@@ -656,7 +416,10 @@ def main():
                  tokens = list(tokenize(f.readline))
          else:
              filename = "<stdin>"
-            tokens = _tokenize(sys.stdin.readline, None)
+            tokens = _tokenize(
+                (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
+            ), "utf-8")
+
  
          # Output the tokenization
          for token in tokens:
@@ -682,10 +445,10 @@ def main():
          perror("unexpected error: %s" % err)
          raise
  
-def _generate_tokens_from_c_tokenizer(source):
+def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
      """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
      import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source):
+    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
          tok, type, lineno, end_lineno, col_off, end_col_off, line = info
          yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
  
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst

new file mode 100644 (file)

index 0000000..ff831c9
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst
@@ -0,0 +1 @@
+Implement PEP 701 changes in the :mod:`tokenize` module. Patch by Marta Gómez Macías and Pablo Galindo Salgado.
diff --git a/Parser/pegen.c b/Parser/pegen.c

index da410ea84ecb8e56b5fbf4092d7a05b00998ed96..b031a6f5d440e85a833e47e2ea4ac99fd08dda29 100644 (file)
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -208,7 +208,7 @@ int
  _PyPegen_fill_token(Parser *p)
  {
      struct token new_token;
-    new_token.metadata = NULL;
+    _PyToken_Init(&new_token);
      int type = _PyTokenizer_Get(p->tok, &new_token);
  
      // Record and skip '# type: ignore' comments
@@ -251,7 +251,7 @@ _PyPegen_fill_token(Parser *p)
      Token *t = p->tokens[p->fill];
      return initialize_token(p, t, &new_token, type);
  error:
-    Py_XDECREF(new_token.metadata);
+    _PyToken_Free(&new_token);
      return -1;
  }
  
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c

index 1f227da0194e3cc980e344dca889c361f40eb610..af529057f50e70203a11b0cb348cd4abc5cccf2a 100644 (file)
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -165,7 +165,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
  
      int ret = 0;
      struct token new_token;
-    new_token.metadata = NULL;
+    _PyToken_Init(&new_token);
  
      for (;;) {
          switch (_PyTokenizer_Get(p->tok, &new_token)) {
@@ -193,7 +193,7 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
  
  
  exit:
-    Py_XDECREF(new_token.metadata);
+    _PyToken_Free(&new_token);
      // If we're in an f-string, we want the syntax error in the expression part
      // to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
      // do not swallow it.
diff --git a/Parser/token.c b/Parser/token.c

index 82267fbfcd0c54e133d838d3e57964b4d8968f3a..2bc963a91c7701d3c2f1e4a64276ce5d265e522b 100644 (file)
--- a/Parser/token.c
+++ b/Parser/token.c
@@ -70,9 +70,9 @@ const char * const _PyParser_TokenNames[] = {
      "FSTRING_START",
      "FSTRING_MIDDLE",
      "FSTRING_END",
+    "COMMENT",
+    "NL",
      "<ERRORTOKEN>",
-    "<COMMENT>",
-    "<NL>",
      "<ENCODING>",
      "<N_TOKENS>",
  };
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index c5dc9e706fe474f9171fbb9d9e0da99dd742b5fb..fb94fbeac42bae64686e3b2a6c3a219b15548b4d 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -111,6 +111,8 @@ tok_new(void)
      tok->interactive_underflow = IUNDERFLOW_NORMAL;
      tok->str = NULL;
      tok->report_warnings = 1;
+    tok->tok_extra_tokens = 0;
+    tok->comment_newline = 0;
      tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0};
      tok->tok_mode_stack_index = 0;
      tok->tok_report_warnings = 1;
@@ -980,6 +982,16 @@ _PyTokenizer_Free(struct tok_state *tok)
      PyMem_Free(tok);
  }
  
+/* Release the reference a token holds in its metadata field.
+ *
+ * Py_CLEAR() also resets the field to NULL, so freeing the same token
+ * twice, or freeing it again after reuse without re-initialisation,
+ * cannot drop the reference a second time. */
+void
+_PyToken_Free(struct token *token) {
+    Py_CLEAR(token->metadata);
+}
+
+/* Put a token into a state that is safe to pass to _PyToken_Free():
+ * the metadata pointer is set to NULL, so no reference is held yet. */
+void
+_PyToken_Init(struct token *token) {
+    token->metadata = NULL;
+}
+
  static int
  tok_readline_raw(struct tok_state *tok)
  {
@@ -1636,6 +1648,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
      return type;
  }
  
+
  static int
  tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
  {
@@ -1649,6 +1662,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
      tok->starting_col_offset = -1;
      blankline = 0;
  
+
      /* Get indentation level */
      if (tok->atbol) {
          int col = 0;
@@ -1749,12 +1763,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
      tok->starting_col_offset = tok->col_offset;
  
      /* Return pending indents/dedents */
-    if (tok->pendin != 0) {
+   if (tok->pendin != 0) {
          if (tok->pendin < 0) {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->cur;
+                p_end = tok->cur;
+            }
              tok->pendin++;
              return MAKE_TOKEN(DEDENT);
          }
          else {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->buf;
+                p_end = tok->cur;
+            }
              tok->pendin--;
              return MAKE_TOKEN(INDENT);
          }
@@ -1803,13 +1825,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
              return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
          }
  
-        const char *prefix, *p, *type_start;
+        const char* p = NULL;
+        const char *prefix, *type_start;
          int current_starting_col_offset;
  
          while (c != EOF && c != '\n') {
              c = tok_nextc(tok);
          }
  
+        if (tok->tok_extra_tokens) {
+            p = tok->start;
+        }
+
          if (tok->type_comments) {
              p = tok->start;
              current_starting_col_offset = tok->starting_col_offset;
@@ -1864,6 +1891,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                  }
              }
          }
+        if (tok->tok_extra_tokens) {
+            tok_backup(tok, c);  /* don't eat the newline or EOF */
+            p_start = p;
+            p_end = tok->cur;
+            tok->comment_newline = blankline;
+            return MAKE_TOKEN(COMMENT);
+        }
      }
  
      if (tok->done == E_INTERACT_STOP) {
@@ -1949,6 +1983,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
  
                  struct tok_state ahead_tok;
                  struct token ahead_token;
+                _PyToken_Init(&ahead_token);
                  int ahead_tok_kind;
  
                  memcpy(&ahead_tok, tok, sizeof(ahead_tok));
@@ -1964,8 +1999,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                         returning a plain NAME token, return ASYNC. */
                      tok->async_def_indent = tok->indent;
                      tok->async_def = 1;
+                    _PyToken_Free(&ahead_token);
                      return MAKE_TOKEN(ASYNC);
                  }
+                _PyToken_Free(&ahead_token);
              }
          }
  
@@ -1976,8 +2013,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
      if (c == '\n') {
          tok->atbol = 1;
          if (blankline || tok->level > 0) {
+            if (tok->tok_extra_tokens) {
+                p_start = tok->start;
+                p_end = tok->cur;
+                return MAKE_TOKEN(NL);
+            }
              goto nextline;
          }
+        if (tok->comment_newline && tok->tok_extra_tokens) {
+            tok->comment_newline = 0;
+                p_start = tok->start;
+                p_end = tok->cur;
+                return MAKE_TOKEN(NL);
+        }
          p_start = tok->start;
          p_end = tok->cur - 1; /* Leave '\n' out of the string */
          tok->cont_line = 0;
@@ -2563,6 +2611,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
  
  f_string_middle:
  
+    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
+    // this.
+    tok->multi_line_start = tok->line_start;
      while (end_quote_size != current_tok->f_string_quote_size) {
          int c = tok_nextc(tok);
          if (tok->done == E_ERROR) {
@@ -2788,7 +2839,9 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
      // if fetching the encoding shows a warning.
      tok->report_warnings = 0;
      while (tok->lineno < 2 && tok->done == E_OK) {
+        _PyToken_Init(&token);
          _PyTokenizer_Get(tok, &token);
+        _PyToken_Free(&token);
      }
      fclose(fp);
      if (tok->encoding) {
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h

index fd169cf3d1bab65aa44cf41fd4326fe3e6768d1b..3f34763239acdaa075db09d12a34c1c259f62ff3 100644 (file)
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -128,6 +128,8 @@ struct tok_state {
      tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL];
      int tok_mode_stack_index;
      int tok_report_warnings;
+    int tok_extra_tokens;
+    int comment_newline;
  #ifdef Py_DEBUG
      int debug;
  #endif
@@ -138,6 +140,8 @@ extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
  extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
  extern void _PyTokenizer_Free(struct tok_state *);
+extern void _PyToken_Free(struct token *);
+extern void _PyToken_Init(struct token *);
  extern int _PyTokenizer_Get(struct tok_state *, struct token *);
  
  #define tok_dump _Py_tok_dump
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c

index 3394a5108cb5355c170a66dd78208e72d664f61c..ece238672e34fdef1740a86a0aa97b0688cfe79b 100644 (file)
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -1,5 +1,8 @@
  #include "Python.h"
+#include "errcode.h"
  #include "../Parser/tokenizer.h"
+#include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset()
+#include "../Parser/pegen.h"      // _PyPegen_byte_offset_to_character_offset()
  
  static struct PyModuleDef _tokenizemodule;
  
@@ -34,11 +37,14 @@ typedef struct
  _tokenizer.tokenizeriter.__new__ as tokenizeriter_new
  
      source: str
+    *
+    extra_tokens: bool
  [clinic start generated code]*/
  
  static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source)
-/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
+tokenizeriter_new_impl(PyTypeObject *type, const char *source,
+                       int extra_tokens)
+/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
  {
      tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
      if (self == NULL) {
@@ -54,20 +60,123 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source)
          return NULL;
      }
      self->tok->filename = filename;
+    if (extra_tokens) {
+        self->tok->tok_extra_tokens = 1;
+    }
      return (PyObject *)self;
  }
  
+/* Set a Python exception describing the tokenizer failure in tok->done.
+ *
+ * If an exception is already pending it is left untouched and -1 is
+ * returned.  Otherwise tok->done selects the exception type and message.
+ * The E_EOF, E_DEDENT, E_INTR and E_NOMEM cases raise directly and
+ * return -1.  Every other case builds a (message, (filename, lineno,
+ * offset, line, end_lineno, end_offset)) value and raises it with
+ * PyErr_SetObject(), returning 0 on success or -1 if building that value
+ * failed.
+ */
+static int
+_tokenizer_error(struct tok_state *tok)
+{
+    if (PyErr_Occurred()) {
+        return -1;
+    }
+
+    const char *msg = NULL;
+    PyObject* errtype = PyExc_SyntaxError;
+    switch (tok->done) {
+        case E_TOKEN:
+            msg = "invalid token";
+            break;
+        case E_EOF:
+            if (tok->level) {
+                PyErr_Format(PyExc_SyntaxError,
+                             "parenthesis '%c' was never closed",
+                             tok->parenstack[tok->level-1]);
+            } else {
+                PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
+            }
+            return -1;
+        case E_DEDENT:
+            PyErr_Format(PyExc_IndentationError,
+                        "unindent does not match any outer indentation level "
+                        "(<tokenize>, line %d)",
+                        tok->lineno);
+            return -1;
+        case E_INTR:
+            /* No exception can be pending here: that was checked on entry
+             * and nothing since then can have raised one. */
+            PyErr_SetNone(PyExc_KeyboardInterrupt);
+            return -1;
+        case E_NOMEM:
+            PyErr_NoMemory();
+            return -1;
+        case E_TABSPACE:
+            errtype = PyExc_TabError;
+            msg = "inconsistent use of tabs and spaces in indentation";
+            break;
+        case E_TOODEEP:
+            errtype = PyExc_IndentationError;
+            msg = "too many levels of indentation";
+            break;
+        case E_LINECONT:
+            msg = "unexpected character after line continuation character";
+            break;
+        default:
+            msg = "unknown tokenization error";
+    }
+
+    PyObject* errstr = NULL;
+    PyObject* error_line = NULL;
+    PyObject* tmp = NULL;
+    PyObject* value = NULL;
+    /* Stays -1 unless the exception value is fully built and raised. */
+    int result = -1;
+
+    /* Text currently held in the tokenizer buffer, reported as the
+     * offending line. */
+    Py_ssize_t size = tok->inp - tok->buf;
+    error_line = PyUnicode_DecodeUTF8(tok->buf, size, "replace");
+    if (!error_line) {
+        goto exit;
+    }
+
+    /* The "n" format unit reads a Py_ssize_t from the variadic arguments,
+     * so the arguments are cast explicitly: the literal 0 is an int, and
+     * the cast keeps the line number correct whatever its declared type. */
+    tmp = Py_BuildValue("(OnnOii)", tok->filename, (Py_ssize_t)tok->lineno,
+                        (Py_ssize_t)0, error_line, 0, 0);
+    if (!tmp) {
+        goto exit;
+    }
+
+    errstr = PyUnicode_FromString(msg);
+    if (!errstr) {
+        goto exit;
+    }
+
+    value = PyTuple_Pack(2, errstr, tmp);
+    if (!value) {
+        goto exit;
+    }
+
+    PyErr_SetObject(errtype, value);
+    result = 0;
+
+exit:
+    Py_XDECREF(errstr);
+    Py_XDECREF(error_line);
+    Py_XDECREF(tmp);
+    Py_XDECREF(value);
+    return result;
+}
+
  static PyObject *
  tokenizeriter_next(tokenizeriterobject *it)
  {
+    PyObject* result = NULL;
      struct token token;
+    _PyToken_Init(&token);
+
      int type = _PyTokenizer_Get(it->tok, &token);
-    if (type == ERRORTOKEN && PyErr_Occurred()) {
-        return NULL;
+    if (type == ERRORTOKEN) {
+        if(!PyErr_Occurred()) {
+            _tokenizer_error(it->tok);
+            assert(PyErr_Occurred());
+        }
+        goto exit;
      }
      if (type == ERRORTOKEN || type == ENDMARKER) {
          PyErr_SetString(PyExc_StopIteration, "EOF");
-        return NULL;
+        goto exit;
      }
      PyObject *str = NULL;
      if (token.start == NULL || token.end == NULL) {
@@ -77,28 +186,31 @@ tokenizeriter_next(tokenizeriterobject *it)
          str = PyUnicode_FromStringAndSize(token.start, token.end - token.start);
      }
      if (str == NULL) {
-        return NULL;
+        goto exit;
      }
  
      Py_ssize_t size = it->tok->inp - it->tok->buf;
      PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
      if (line == NULL) {
          Py_DECREF(str);
-        return NULL;
+        goto exit;
      }
      const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
-    int lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
-    int end_lineno = it->tok->lineno;
-    int col_offset = -1;
-    int end_col_offset = -1;
+    Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
+    Py_ssize_t end_lineno = it->tok->lineno;
+    Py_ssize_t col_offset = -1;
+    Py_ssize_t end_col_offset = -1;
      if (token.start != NULL && token.start >= line_start) {
-        col_offset = (int)(token.start - line_start);
+        col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
      }
      if (token.end != NULL && token.end >= it->tok->line_start) {
-        end_col_offset = (int)(token.end - it->tok->line_start);
+        end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
      }
  
-    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+    result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+exit:
+    _PyToken_Free(&token);
+    return result;
  }
  
  static void
diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h

index 6af93743f40dabd7eb1aa134bc3d48084ebfd49d..7e779388a92dbf3e17f0fb873007425f3e41f41a 100644 (file)
--- a/Python/clinic/Python-tokenize.c.h
+++ b/Python/clinic/Python-tokenize.c.h
@@ -9,7 +9,8 @@ preserve
  
  
  static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source);
+tokenizeriter_new_impl(PyTypeObject *type, const char *source,
+                       int extra_tokens);
  
  static PyObject *
  tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
@@ -17,14 +18,14 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
      PyObject *return_value = NULL;
      #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
  
-    #define NUM_KEYWORDS 1
+    #define NUM_KEYWORDS 2
      static struct {
          PyGC_Head _this_is_not_used;
          PyObject_VAR_HEAD
          PyObject *ob_item[NUM_KEYWORDS];
      } _kwtuple = {
          .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
-        .ob_item = { &_Py_ID(source), },
+        .ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), },
      };
      #undef NUM_KEYWORDS
      #define KWTUPLE (&_kwtuple.ob_base.ob_base)
@@ -33,19 +34,20 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
      #  define KWTUPLE NULL
      #endif  // !Py_BUILD_CORE
  
-    static const char * const _keywords[] = {"source", NULL};
+    static const char * const _keywords[] = {"source", "extra_tokens", NULL};
      static _PyArg_Parser _parser = {
          .keywords = _keywords,
          .fname = "tokenizeriter",
          .kwtuple = KWTUPLE,
      };
      #undef KWTUPLE
-    PyObject *argsbuf[1];
+    PyObject *argsbuf[2];
      PyObject * const *fastargs;
      Py_ssize_t nargs = PyTuple_GET_SIZE(args);
      const char *source;
+    int extra_tokens;
  
-    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
+    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf);
      if (!fastargs) {
          goto exit;
      }
@@ -62,9 +64,13 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
          PyErr_SetString(PyExc_ValueError, "embedded null character");
          goto exit;
      }
-    return_value = tokenizeriter_new_impl(type, source);
+    extra_tokens = PyObject_IsTrue(fastargs[1]);
+    if (extra_tokens < 0) {
+        goto exit;
+    }
+    return_value = tokenizeriter_new_impl(type, source, extra_tokens);
  
  exit:
      return return_value;
  }
-/*[clinic end generated code: output=8c2c09f651961986 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/
author	Marta Gómez Macías <mgmacias@google.com>
	Sun, 21 May 2023 00:03:02 +0000 (02:03 +0200)
committer	GitHub <noreply@github.com>
	Sun, 21 May 2023 00:03:02 +0000 (01:03 +0100)
Doc/library/token-list.inc		patch \| blob \| blame \| history
Doc/library/token.rst		patch \| blob \| blame \| history
Grammar/Tokens		patch \| blob \| blame \| history
Include/internal/pycore_global_objects_fini_generated.h		patch \| blob \| blame \| history
Include/internal/pycore_global_strings.h		patch \| blob \| blame \| history
Include/internal/pycore_runtime_init_generated.h		patch \| blob \| blame \| history
Include/internal/pycore_token.h		patch \| blob \| blame \| history
Include/internal/pycore_unicodeobject_generated.h		patch \| blob \| blame \| history
Lib/inspect.py		patch \| blob \| blame \| history
Lib/tabnanny.py		patch \| blob \| blame \| history
Lib/test/test_tabnanny.py		patch \| blob \| blame \| history
Lib/test/test_tokenize.py		patch \| blob \| blame \| history
Lib/token.py		patch \| blob \| blame \| history
Lib/tokenize.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core and Builtins/2023-05-20-23-08-48.gh-issue-102856.Knv9WT.rst	[new file with mode: 0644]	patch \| blob
Parser/pegen.c		patch \| blob \| blame \| history
Parser/pegen_errors.c		patch \| blob \| blame \| history
Parser/token.c		patch \| blob \| blame \| history
Parser/tokenizer.c		patch \| blob \| blame \| history
Parser/tokenizer.h		patch \| blob \| blame \| history
Python/Python-tokenize.c		patch \| blob \| blame \| history
Python/clinic/Python-tokenize.c.h		patch \| blob \| blame \| history