gh-102856: Tokenize performance improvement (#104731)

author Marta Gómez Macías <mgmacias@google.com>

Mon, 22 May 2023 00:29:04 +0000 (02:29 +0200)

committer GitHub <noreply@github.com>

Mon, 22 May 2023 00:29:04 +0000 (00:29 +0000)
author Marta Gómez Macías <mgmacias@google.com>
Mon, 22 May 2023 00:29:04 +0000 (02:29 +0200)
committer GitHub <noreply@github.com>
Mon, 22 May 2023 00:29:04 +0000 (00:29 +0000)
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index cef2773feac24b10fc500bcda74ab778d3e562d0..911f0f12f9bb7e83365ad4477ecc2acb86f427f4 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -449,16 +449,6 @@ def _tokenize(rl_gen, encoding):
      source = b"".join(rl_gen).decode(encoding)
      token = None
      for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        # TODO: Marta -> limpiar esto
-        if 6 < token.type <= 54:
-            token = token._replace(type=OP)
-        if token.type in {ASYNC, AWAIT}:
-            token = token._replace(type=NAME)
-        if token.type == NEWLINE:
-            l_start, c_start = token.start
-            l_end, c_end = token.end
-            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
-
          yield token
      if token is not None:
          last_line, _ = token.start
@@ -550,8 +540,7 @@ def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
      """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
      import _tokenize as c_tokenizer
      for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
-        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
-        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
+        yield TokenInfo._make(info)
  
  
  if __name__ == "__main__":
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c

index ece238672e34fdef1740a86a0aa97b0688cfe79b..43b44be94583ee0bd6d7d466ec15a554d4f254c8 100644 (file)
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -207,7 +207,22 @@ tokenizeriter_next(tokenizeriterobject *it)
          end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
      }
  
-    result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
+    if (it->tok->tok_extra_tokens) {
+        // Necessary adjustments to match the original Python tokenize
+        // implementation
+        if (type > DEDENT && type < OP) {
+            type = OP;
+        }
+        else if (type == ASYNC || type == AWAIT) {
+            type = NAME;
+        }
+        else if (type == NEWLINE) {
+            str = PyUnicode_FromString("\n");
+            end_col_offset++;
+        }
+    }
+
+    result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
  exit:
      _PyToken_Free(&token);
      return result;
author	Marta Gómez Macías <mgmacias@google.com>
	Mon, 22 May 2023 00:29:04 +0000 (02:29 +0200)
committer	GitHub <noreply@github.com>
	Mon, 22 May 2023 00:29:04 +0000 (00:29 +0000)
Lib/tokenize.py		patch | blob | blame | history
Python/Python-tokenize.c		patch | blob | blame | history