gh-105549: Tokenize separately NUMBER and NAME tokens and allow 0-prefixed literals...
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Fri, 9 Jun 2023 20:39:01 +0000 (21:39 +0100)
committer GitHub <noreply@github.com>
Fri, 9 Jun 2023 20:39:01 +0000 (21:39 +0100)
Lib/test/test_tokenize.py
Misc/NEWS.d/next/Core and Builtins/2023-06-09-12-59-18.gh-issue-105549.PYfTNp.rst [new file with mode: 0644]
Parser/tokenizer.c
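
What the change does, in brief: when the C tokenizer runs in tok_extra_tokens mode (the mode used by the Python-level tokenize module), it now emits NUMBER and NAME as separate tokens for inputs such as "2sin(x)" and accepts 0-prefixed literals instead of raising. The snippet below is a minimal sketch of that behavior, mirroring the test_number_followed_by_name test added in this commit; it assumes a CPython build (3.12+) that contains this change.

import io
import tokenize

# With this patch, tokenize splits "2sin(x)" into NUMBER '2' and NAME 'sin'
# instead of raising a tokenization error.
for tok in tokenize.generate_tokens(io.StringIO("2sin(x)").readline):
    print(tokenize.tok_name[tok.exact_type], repr(tok.string))
# NUMBER '2', NAME 'sin', OP '(', NAME 'x', OP ')', NEWLINE '', ENDMARKER ''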

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 2c124f062e7fd64f48ae992c0a97a084c0e6700b..df9c9db322dc948a9c215717aba3dfce90857def 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -284,7 +284,12 @@ def k(x):
                 # this won't work with compound complex inputs
                 continue
             self.assertEqual(number_token(lit), lit)
+        # Valid cases with extra underscores in the tokenize module
+        # See gh-105549 for context
+        extra_valid_cases = {"0_7", "09_99"}
         for lit in INVALID_UNDERSCORE_LITERALS:
+            if lit in extra_valid_cases:
+                continue
             try:
                 number_token(lit)
             except TokenError:
@@ -1873,6 +1878,34 @@ class TestRoundtrip(TestCase):
         self.check_roundtrip(code)
 
 
+class InvalidPythonTests(TestCase):
+    def test_number_followed_by_name(self):
+        # See gh-105549
+        source = "2sin(x)"
+        expected_tokens = [
+            TokenInfo(type=token.NUMBER, string='2', start=(1, 0), end=(1, 1), line='2sin(x)'),
+            TokenInfo(type=token.NAME, string='sin', start=(1, 1), end=(1, 4), line='2sin(x)'),
+            TokenInfo(type=token.OP, string='(', start=(1, 4), end=(1, 5), line='2sin(x)'),
+            TokenInfo(type=token.NAME, string='x', start=(1, 5), end=(1, 6), line='2sin(x)'),
+            TokenInfo(type=token.OP, string=')', start=(1, 6), end=(1, 7), line='2sin(x)'),
+            TokenInfo(type=token.NEWLINE, string='', start=(1, 7), end=(1, 8), line='2sin(x)'),
+            TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+
+        tokens = list(generate_tokens(StringIO(source).readline))
+        self.assertEqual(tokens, expected_tokens)
+
+    def test_number_starting_with_zero(self):
+        source = "01234"
+        expected_tokens = [
+            TokenInfo(type=token.NUMBER, string='01234', start=(1, 0), end=(1, 5), line='01234'),
+            TokenInfo(type=token.NEWLINE, string='', start=(1, 5), end=(1, 6), line='01234'),
+            TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+
+        tokens = list(generate_tokens(StringIO(source).readline))
+        self.assertEqual(tokens, expected_tokens)
+
 class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-06-09-12-59-18.gh-issue-105549.PYfTNp.rst b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-12-59-18.gh-issue-105549.PYfTNp.rst
new file mode 100644 (file)
index 0000000..c3dcaee
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-12-59-18.gh-issue-105549.PYfTNp.rst
@@ -0,0 +1,2 @@
+Tokenize separately ``NUMBER`` and ``NAME`` tokens that are not ambiguous. Patch
+by Pablo Galindo
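
Context for the extra_valid_cases skip in the test hunk above: "0_7" and "09_99" are still invalid Python literals (the compiler rejects them), but with this patch the tokenize module lexes each as a single NUMBER token rather than raising. A small sketch, again assuming a build with this commit:

import io
import tokenize

toks = list(tokenize.generate_tokens(io.StringIO("0_7").readline))
# The whole literal comes back as one NUMBER token.
assert toks[0].type == tokenize.NUMBER and toks[0].string == "0_7"

# Compilation is unaffected: the literal is still a SyntaxError.
try:
    compile("0_7", "<example>", "eval")
except SyntaxError as exc:
    print("compiler still rejects it:", exc.msg)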
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index b349f59abbce999389a08f9c2169ecfec2930f85..bf6bfd92d3a47d496d94e98aca811de811f52db2 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1600,8 +1600,12 @@ lookahead(struct tok_state *tok, const char *test)
 }
 
 static int
-verify_end_of_number(struct tok_state *tok, int c, const char *kind)
-{
+verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
+    if (tok->tok_extra_tokens) {
+        // When we are parsing extra tokens, we don't want to emit warnings
+        // about invalid literals, because we want to be a bit more liberal.
+        return 1;
+    }
     /* Emit a deprecation warning only if the numeric literal is immediately
      * followed by one of keywords which can occur after a numeric literal
      * in valid code: "and", "else", "for", "if", "in", "is" and "or".
@@ -1659,6 +1663,9 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind)
 static int
 verify_identifier(struct tok_state *tok)
 {
+    if (tok->tok_extra_tokens) {
+        return 1;
+    }
     PyObject *s;
     if (tok->decoding_erred)
         return 0;
@@ -2318,7 +2325,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
                 else if (c == 'j' || c == 'J') {
                     goto imaginary;
                 }
-                else if (nonzero) {
+                else if (nonzero && !tok->tok_extra_tokens) {
                     /* Old-style octal: now disallowed. */
                     tok_backup(tok, c);
                     return MAKE_TOKEN(syntaxerror_known_range(
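
For context on the tokenizer.c hunks: tok_extra_tokens is the flag set when the C tokenizer backs the Python tokenize module, and these changes make that mode deliberately liberal, short-circuiting verify_end_of_number and verify_identifier and skipping the old-style octal error, while the compiler path (flag unset) is unchanged. A quick demonstration of the two paths diverging, assuming a build with this commit:

import ast
import io
import tokenize

# tokenize (tok_extra_tokens set): "01234" is lexed as a single NUMBER
# token, matching test_number_starting_with_zero above.
print(list(tokenize.generate_tokens(io.StringIO("01234").readline))[0])

# The compiler (tok_extra_tokens unset) still rejects it.
try:
    ast.parse("01234")
except SyntaxError as exc:
    print("ast.parse still raises:", exc.msg)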