2,
3
-Note that unclosed single-quoted strings do not cause an error to be
-raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
-tokenization of their contents.
-
-
.. _tokenize-cli:
Command-Line Usage
Additionally, there may be some minor behavioral changes as a consequence of the
changes required to support :pep:`701`. Some of these changes include:
- * Some final ``DEDENT`` tokens are now emitted within the bounds of the
- input. This means that for a file containing 3 lines, the old version of the
- tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
- the token in line 3.
-
* The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.
+ * Incomplete single-line strings now also raise :exc:`tokenize.TokenError` as incomplete
+ multiline strings do.
+
+ * Some incomplete or invalid Python code now raises :exc:`tokenize.TokenError` instead of
+ returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.
+
Build Changes
=============
from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
open as tokenize_open, Untokenizer, generate_tokens,
- NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
+ NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
+ TokenError)
from io import BytesIO, StringIO
import unittest
from textwrap import dedent
for lit in INVALID_UNDERSCORE_LITERALS:
try:
number_token(lit)
- except SyntaxError:
+ except TokenError:
continue
self.assertNotEqual(number_token(lit), lit)
self.assertEqual(found, "iso-8859-1")
def test_syntaxerror_latin1(self):
- # Issue 14629: need to raise SyntaxError if the first
+ # Issue 14629: need to raise TokenError if the first
# line(s) have non-UTF-8 characters
lines = (
b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
"]",
]:
with self.subTest(case=case):
- self.assertRaises(SyntaxError, get_tokens, case)
+ self.assertRaises(TokenError, get_tokens, case)
def test_max_indent(self):
MAXINDENT = 100
invalid = generate_source(MAXINDENT)
the_input = StringIO(invalid)
- self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
+ self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
self.assertRaises(
IndentationError, compile, invalid, "<string>", "exec"
)
perror("unexpected error: %s" % err)
raise
+def _transform_msg(msg):
+ """Transform error messages from the C tokenizer into the Python tokenize format.
+
+ The C tokenizer is more picky than the Python one, so we need to massage
+ the error messages a bit for backwards compatibility.
+ """
+ if "unterminated triple-quoted string literal" in msg:
+ return "EOF in multi-line string"
+ return msg
+
def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
if encoding is None:
it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
else:
it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
- for info in it:
- yield TokenInfo._make(info)
+ try:
+ for info in it:
+ yield TokenInfo._make(info)
+ except SyntaxError as e:
+ if type(e) != SyntaxError:
+ raise e from None
+ msg = _transform_msg(e.msg)
+ raise TokenError(msg, (e.lineno, e.offset)) from None
if __name__ == "__main__":
--- /dev/null
+Correctly raise :exc:`tokenize.TokenError` exceptions instead of
+:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
+Pablo Galindo.
msg = "invalid token";
break;
case E_EOF:
- if (tok->level > 0) {
- PyErr_Format(PyExc_SyntaxError,
- "parenthesis '%c' was never closed",
- tok->parenstack[tok->level-1]);
- } else {
- PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
- }
+ PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+ PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
return -1;
case E_DEDENT:
msg = "unindent does not match any outer indentation level";