bpo-25324: copy tok_name before changing it (#1608)

author Albert-Jan Nijburg <albertjan@trinket.io>

Wed, 31 May 2017 14:00:21 +0000 (15:00 +0100)

committer Victor Stinner <victor.stinner@gmail.com>

Wed, 31 May 2017 14:00:21 +0000 (16:00 +0200)
author Albert-Jan Nijburg <albertjan@trinket.io>
Wed, 31 May 2017 14:00:21 +0000 (15:00 +0100)
committer Victor Stinner <victor.stinner@gmail.com>
Wed, 31 May 2017 14:00:21 +0000 (16:00 +0200)
diff --git a/Doc/library/token.rst b/Doc/library/token.rst

index effb7113230e742fb0ceb52c349e47ba57f80fbf..4bf15d5a81c50b14041b033a861367fa753ade9f 100644 (file)
--- a/Doc/library/token.rst
+++ b/Doc/library/token.rst
@@ -101,6 +101,9 @@ The token constants are:
            AWAIT
            ASYNC
            ERRORTOKEN
+          COMMENT
+          NL
+          ENCODING
            N_TOKENS
            NT_OFFSET
  
@@ -108,3 +111,8 @@ The token constants are:
        Added :data:`AWAIT` and :data:`ASYNC` tokens. Starting with
        Python 3.7, "async" and "await" will be tokenized as :data:`NAME`
        tokens, and :data:`AWAIT` and :data:`ASYNC` will be removed.
+
+   .. versionchanged:: 3.7
+      Added :data:`COMMENT`, :data:`NL` and :data:`ENCODING` to bring
+      the tokens in the C code in line with the tokens needed in the
+      :mod:`tokenize` module. These tokens aren't used by the C tokenizer.
\ No newline at end of file
diff --git a/Include/token.h b/Include/token.h

index 595afa01580888ea996ab69b8078ff5e179df998..b28830b8b40387c55ebcf3c7adaa22030ab12922 100644 (file)
--- a/Include/token.h
+++ b/Include/token.h
@@ -67,7 +67,11 @@ extern "C" {
  #define AWAIT          54
  #define ASYNC          55
  #define ERRORTOKEN     56
-#define N_TOKENS       57
+/* These aren't used by the C tokenizer but are needed for tokenize.py */
+#define COMMENT                57
+#define NL             58
+#define ENCODING               59
+#define N_TOKENS       60
  
  /* Special definitions for cooperation with parser */
  
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index dcaf58f5272bacfab5ed754074d0e8116e8ca362..538612cf94ea30d5bbca34accc52226af886827c 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1343,13 +1343,13 @@ class TestTokenize(TestCase):
          tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
          num_optypes = len(optypes)
          self.assertEqual(len(tokens), 2 + num_optypes)
-        self.assertEqual(token.tok_name[tokens[0].exact_type],
-                         token.tok_name[ENCODING])
+        self.assertEqual(tok_name[tokens[0].exact_type],
+                         tok_name[ENCODING])
          for i in range(num_optypes):
-            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
-                             token.tok_name[optypes[i]])
-        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
-                         token.tok_name[token.ENDMARKER])
+            self.assertEqual(tok_name[tokens[i + 1].exact_type],
+                             tok_name[optypes[i]])
+        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.ENDMARKER])
  
      def test_exact_type(self):
          self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
diff --git a/Lib/token.py b/Lib/token.py

index 5fdb2221333e2c855461df5e33f32883e41f381d..091f80bf191b9a8163fc7ea8cc860aec50338d8f 100644 (file)
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -63,11 +63,17 @@ AT = 49
  ATEQUAL = 50
  RARROW = 51
  ELLIPSIS = 52
+# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
  OP = 53
  AWAIT = 54
  ASYNC = 55
  ERRORTOKEN = 56
-N_TOKENS = 57
+# These aren't used by the C tokenizer but are needed for tokenize.py
+COMMENT = 57
+NL = 58
+ENCODING = 59
+N_TOKENS = 60
+# Special definitions for cooperation with parser
  NT_OFFSET = 256
  #--end constants--
  
@@ -102,15 +108,26 @@ def _main():
      with fp:
          lines = fp.read().split("\n")
      prog = re.compile(
-        "#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
+        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
          re.IGNORECASE)
+    comment_regex = re.compile(
+        r"^\s*/\*\s*(.+?)\s*\*/\s*$",
+        re.IGNORECASE)
+
      tokens = {}
+    prev_val = None
      for line in lines:
          match = prog.match(line)
          if match:
              name, val = match.group(1, 2)
              val = int(val)
-            tokens[val] = name          # reverse so we can sort them...
+            tokens[val] = {'token': name}          # reverse so we can sort them...
+            prev_val = val
+        else:
+            comment_match = comment_regex.match(line)
+            if comment_match and prev_val is not None:
+                comment = comment_match.group(1)
+                tokens[prev_val]['comment'] = comment
      keys = sorted(tokens.keys())
      # load the output skeleton from the target:
      try:
@@ -127,8 +144,10 @@ def _main():
          sys.stderr.write("target does not contain format markers")
          sys.exit(3)
      lines = []
-    for val in keys:
-        lines.append("%s = %d" % (tokens[val], val))
+    for key in keys:
+        lines.append("%s = %d" % (tokens[key]["token"], key))
+        if "comment" in tokens[key]:
+            lines.append("# %s" % tokens[key]["comment"])
      format[start:end] = lines
      try:
          fp = open(outFileName, 'w')
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index 9017bb13e78dcdce0b7c61ed0bc2c4be63646567..5fa4152609378a968a5f5f552e778e8f891ffeb1 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -38,17 +38,10 @@ cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
  blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
  
  import token
-__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
-                           "NL", "untokenize", "ENCODING", "TokenInfo"]
+__all__ = token.__all__ + ["tokenize", "detect_encoding",
+                           "untokenize", "TokenInfo"]
  del token
  
-COMMENT = N_TOKENS
-tok_name[COMMENT] = 'COMMENT'
-NL = N_TOKENS + 1
-tok_name[NL] = 'NL'
-ENCODING = N_TOKENS + 2
-tok_name[ENCODING] = 'ENCODING'
-N_TOKENS += 3
  EXACT_TOKEN_TYPES = {
      '(':   LPAR,
      ')':   RPAR,
diff --git a/Misc/NEWS b/Misc/NEWS

index 6f90175bf7713f19b17f32f76336f4e61bbb6ce0..cda5ce0311ed5b49321a42b3a38c9ff262f5ae8e 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1?
  Core and Builtins
  -----------------
  
+- bpo-25324: The ``COMMENT``, ``NL`` and ``ENCODING`` tokens used by tokenize
+  are now defined in C. This way the tokens and ``tok_name`` in the token
+  module don't get changed when you import the tokenize module.
+
  - bpo-29104: Fixed parsing backslashes in f-strings.
  
  - bpo-27945: Fixed various segfaults with dict when input collections are
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 5cc9533c3cd7bae320522beb20d1d00ff3b11e54..7f2f3e6028480dc685c99408de861993b7c2a9dd 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -106,6 +106,9 @@ const char *_PyParser_TokenNames[] = {
      "AWAIT",
      "ASYNC",
      "<ERRORTOKEN>",
+    "COMMENT",
+    "NL",
+    "ENCODING",
      "<N_TOKENS>"
  };
author	Albert-Jan Nijburg <albertjan@trinket.io>
	Wed, 31 May 2017 14:00:21 +0000 (15:00 +0100)
committer	Victor Stinner <victor.stinner@gmail.com>
	Wed, 31 May 2017 14:00:21 +0000 (16:00 +0200)
Doc/library/token.rst		patch \| blob \| blame \| history
Include/token.h		patch \| blob \| blame \| history
Lib/test/test_tokenize.py		patch \| blob \| blame \| history
Lib/token.py		patch \| blob \| blame \| history
Lib/tokenize.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history
Parser/tokenizer.c		patch \| blob \| blame \| history