gh-105390: Correctly raise TokenError instead of SyntaxError for tokenize errors...

author Pablo Galindo Salgado <Pablogsal@gmail.com>

Wed, 7 Jun 2023 11:04:40 +0000 (12:04 +0100)

committer GitHub <noreply@github.com>

Wed, 7 Jun 2023 11:04:40 +0000 (12:04 +0100)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Wed, 7 Jun 2023 11:04:40 +0000 (12:04 +0100)
committer GitHub <noreply@github.com>
Wed, 7 Jun 2023 11:04:40 +0000 (12:04 +0100)
diff --git a/Doc/library/tokenize.rst b/Doc/library/tokenize.rst

index 11f569df2e7cde774f2d14f120796e373208824f..41222a771d1b47dc35908f822a8e493a7dd6f944 100644 (file)
--- a/Doc/library/tokenize.rst
+++ b/Doc/library/tokenize.rst
@@ -139,11 +139,6 @@ function it uses to do this is available:
         2,
         3
  
-Note that unclosed single-quoted strings do not cause an error to be
-raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
-tokenization of their contents.
-
-
  .. _tokenize-cli:
  
  Command-Line Usage
diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst

index 79491b4bfdfd42a8f2f3e370860af7dba0702fe7..358467499e018c5415b712d7e9f40d7580abf80c 100644 (file)
--- a/Doc/whatsnew/3.12.rst
+++ b/Doc/whatsnew/3.12.rst
@@ -1490,14 +1490,15 @@ Changes in the Python API
    Additionally, there may be some minor behavioral changes as a consequence of the
    changes required to support :pep:`701`. Some of these changes include:
  
-  * Some final ``DEDENT`` tokens are now emitted within the bounds of the
-    input. This means that for a file containing 3 lines, the old version of the
-    tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
-    the token in line 3.
-
    * The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
      characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.
  
+  * Incomplete single-line strings now also raise :exc:`tokenize.TokenError` as incomplete
+    multiline strings do.
+
+  * Some incomplete or invalid Python code now raises :exc:`tokenize.TokenError` instead of
+    returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.
+
  Build Changes
  =============
  
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 5ac17095b185f598e3874eaae9bfcaf845df82b8..f2847b2fb327f86c0df7caa104ddf50bce80e759 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -3,7 +3,8 @@ from test.support import os_helper
  from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                       STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                       open as tokenize_open, Untokenizer, generate_tokens,
-                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
+                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
+                     TokenError)
  from io import BytesIO, StringIO
  import unittest
  from textwrap import dedent
@@ -286,7 +287,7 @@ def k(x):
          for lit in INVALID_UNDERSCORE_LITERALS:
              try:
                  number_token(lit)
-            except SyntaxError:
+            except TokenError:
                  continue
              self.assertNotEqual(number_token(lit), lit)
  
@@ -1379,7 +1380,7 @@ class TestDetectEncoding(TestCase):
                  self.assertEqual(found, "iso-8859-1")
  
      def test_syntaxerror_latin1(self):
-        # Issue 14629: need to raise SyntaxError if the first
+        # Issue 14629: need to raise TokenError if the first
          # line(s) have non-UTF-8 characters
          lines = (
              b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
@@ -2754,7 +2755,7 @@ async def f():
              "]",
          ]:
              with self.subTest(case=case):
-                self.assertRaises(SyntaxError, get_tokens, case)
+                self.assertRaises(TokenError, get_tokens, case)
  
      def test_max_indent(self):
          MAXINDENT = 100
@@ -2773,7 +2774,7 @@ async def f():
  
          invalid = generate_source(MAXINDENT)
          the_input = StringIO(invalid)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
+        self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
          self.assertRaises(
              IndentationError, compile, invalid, "<string>", "exec"
          )
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index a07a8bf45891ac6d9fd682875cbc1f3c1d38437a..49e8144edddab7e729ad19d5ddf62af6686f82ab 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -517,14 +517,30 @@ def main():
          perror("unexpected error: %s" % err)
          raise
  
+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
  def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
      """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
      if encoding is None:
          it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
      else:
          it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
-    for info in it:
-        yield TokenInfo._make(info)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None
  
  
  if __name__ == "__main__":
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-06-06-17-10-42.gh-issue-105390.DvqI-e.rst b/Misc/NEWS.d/next/Core and Builtins/2023-06-06-17-10-42.gh-issue-105390.DvqI-e.rst

new file mode 100644 (file)

index 0000000..de59b54
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-06-06-17-10-42.gh-issue-105390.DvqI-e.rst
@@ -0,0 +1,3 @@
+Correctly raise :exc:`tokenize.TokenError` exceptions instead of
+:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
+Pablo Galindo
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c

index 223de54d658507792d9d2cb3f8d67698e0ffc5a5..4d2179348eed20d768b0bf8e52f8912f3c1a88fb 100644 (file)
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
              msg = "invalid token";
              break;
          case E_EOF:
-            if (tok->level > 0) {
-                    PyErr_Format(PyExc_SyntaxError,
-                                 "parenthesis '%c' was never closed",
-                                tok->parenstack[tok->level-1]);
-            } else {
-                PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
-            }
+            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+            PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
              return -1;
          case E_DEDENT:
              msg = "unindent does not match any outer indentation level";
author	Pablo Galindo Salgado <Pablogsal@gmail.com>
	Wed, 7 Jun 2023 11:04:40 +0000 (12:04 +0100)
committer	GitHub <noreply@github.com>
	Wed, 7 Jun 2023 11:04:40 +0000 (12:04 +0100)
Doc/library/tokenize.rst		patch | blob | blame | history
Doc/whatsnew/3.12.rst		patch | blob | blame | history
Lib/test/test_tokenize.py		patch | blob | blame | history
Lib/tokenize.py		patch | blob | blame | history
Misc/NEWS.d/next/Core and Builtins/2023-06-06-17-10-42.gh-issue-105390.DvqI-e.rst	[new file with mode: 0644]	patch | blob
Python/Python-tokenize.c		patch | blob | blame | history