NAME 'False' (4, 11) (4, 16)
COMMENT '# NEWLINE' (4, 17) (4, 26)
NEWLINE '\\n' (4, 26) (4, 27)
- DEDENT '' (4, 27) (4, 27)
+ DEDENT '' (5, 0) (5, 0)
""")
indent_error_file = b"""\
def k(x):
NEWLINE '\\n' (2, 5) (2, 6)
INDENT ' \\t' (3, 0) (3, 9)
NAME 'pass' (3, 9) (3, 13)
- DEDENT '' (3, 14) (3, 14)
- DEDENT '' (3, 14) (3, 14)
+ DEDENT '' (4, 0) (4, 0)
+ DEDENT '' (4, 0) (4, 0)
""")
def test_non_ascii_identifiers(self):
NUMBER '1' (2, 17) (2, 18)
OP ':' (2, 18) (2, 19)
NAME 'pass' (2, 20) (2, 24)
- DEDENT '' (2, 25) (2, 25)
+ DEDENT '' (3, 0) (3, 0)
""")
self.check_tokenize('''async def foo(async): await''', """\
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
- DEDENT '' (6, 12) (6, 12)
+ DEDENT '' (7, 0) (7, 0)
""")
self.check_tokenize('''\
NAME 'await' (6, 2) (6, 7)
OP '=' (6, 8) (6, 9)
NUMBER '2' (6, 10) (6, 11)
- DEDENT '' (6, 12) (6, 12)
+ DEDENT '' (7, 0) (7, 0)
""")
def test_newline_after_parenthesized_block_with_comment(self):
valid = generate_source(MAXINDENT - 1)
tokens = list(_generate_tokens_from_c_tokenizer(valid))
- self.assertEqual(tokens[-1].type, DEDENT)
+ self.assertEqual(tokens[-2].type, DEDENT)
+ self.assertEqual(tokens[-1].type, ENDMARKER)
compile(valid, "<string>", "exec")
invalid = generate_source(MAXINDENT)
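
These expectation updates all encode the same rule: trailing DEDENT tokens, and the ENDMARKER that follows them, are now reported at column 0 of the line after the last input line, matching what the pure-Python tokenizer used to emit. A small check, not part of the patch, assuming an interpreter with this change applied:

    import io
    from tokenize import generate_tokens, tok_name

    source = "def k(x):\n    pass\n"          # two input lines
    tokens = list(generate_tokens(io.StringIO(source).readline))
    for tok in tokens[-2:]:
        print(tok_name[tok.type], tok.start, tok.end)
    # Expected with this change, one line past the input:
    #   DEDENT (3, 0) (3, 0)
    #   ENDMARKER (3, 0) (3, 0)
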
def _tokenize(rl_gen, encoding):
source = b"".join(rl_gen).decode(encoding)
- token = None
for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
yield token
- if token is not None:
- last_line, _ = token.start
- yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
-
def generate_tokens(readline):
"""Tokenize a source reading Python code as unicode strings.
typedef struct
{
PyObject_HEAD struct tok_state *tok;
+ int done;
} tokenizeriterobject;
/*[clinic input]
if (extra_tokens) {
self->tok->tok_extra_tokens = 1;
}
+ self->done = 0;
return (PyObject *)self;
}
}
goto exit;
}
- if (type == ERRORTOKEN || type == ENDMARKER) {
+ if (it->done || type == ERRORTOKEN) {
PyErr_SetString(PyExc_StopIteration, "EOF");
+ it->done = 1;
goto exit;
}
PyObject *str = NULL;
goto exit;
}
+ int is_trailing_token = 0;
+ if (type == ENDMARKER || (type == DEDENT && it->tok->done == E_EOF)) {
+ is_trailing_token = 1;
+ }
+
const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
- Py_ssize_t size = it->tok->inp - line_start;
- PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+ PyObject* line = NULL;
+ if (it->tok->tok_extra_tokens && is_trailing_token) {
+ line = PyUnicode_FromString("");
+ } else {
+ Py_ssize_t size = it->tok->inp - line_start;
+ line = PyUnicode_DecodeUTF8(line_start, size, "replace");
+ }
if (line == NULL) {
Py_DECREF(str);
goto exit;
}
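
Observable effect of the is_trailing_token branch: the synthesised trailing DEDENT/ENDMARKER lie past the end of the input, so their line attribute becomes an empty string instead of a slice of the source buffer. A quick illustration, again assuming the change is applied:

    import io
    from tokenize import generate_tokens

    toks = list(generate_tokens(io.StringIO("if x:\n    pass\n").readline))
    dedent, endmarker = toks[-2], toks[-1]
    assert dedent.string == "" and dedent.line == ""
    assert endmarker.line == "" and endmarker.start == (3, 0)
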
if (it->tok->tok_extra_tokens) {
+ if (is_trailing_token) {
+ lineno = end_lineno = lineno + 1;
+ col_offset = end_col_offset = 0;
+ }
// Necessary adjustments to match the original Python tokenize
// implementation
if (type > DEDENT && type < OP) {
result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
_PyToken_Free(&token);
+ if (type == ENDMARKER) {
+ it->done = 1;
+ }
return result;
}
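
Taken together, the done flag changes when iteration stops: ENDMARKER is yielded to the caller, and only the next call raises StopIteration, which is why the manual ENDMARKER synthesis in Lib/tokenize.py could be dropped. Roughly, in Python terms (an illustrative sketch of the control flow, not the C code):

    from token import ENDMARKER

    class _TokenizerIterSketch:
        """Stand-in for tokenizeriterobject; takes an iterable of TokenInfo."""
        def __init__(self, tokens):
            self._tokens = iter(tokens)
            self._done = False            # plays the role of it->done

        def __iter__(self):
            return self

        def __next__(self):
            if self._done:                # previous call already emitted ENDMARKER
                raise StopIteration
            tok = next(self._tokens)
            if tok.type == ENDMARKER:     # emit it now, stop on the next call
                self._done = True
            return tok

Previously the C iterator raised StopIteration as soon as it produced ENDMARKER, so the wrapper in Lib/tokenize.py had to append one by hand; with the flag it is emitted like any other token.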