}
static int
-_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
+_get_keyword_or_name_type(Parser *p, struct token *new_token)
{
+ int name_len = new_token->end_col_offset - new_token->col_offset;
assert(name_len > 0);
+
if (name_len >= p->n_keyword_lists ||
p->keywords[name_len] == NULL ||
p->keywords[name_len]->type == -1) {
return NAME;
}
for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
- if (strncmp(k->str, name, name_len) == 0) {
+ if (strncmp(k->str, new_token->start, name_len) == 0) {
return k->type;
}
}
}
static int
-initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
- assert(token != NULL);
+initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
+ assert(parser_token != NULL);
- token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
- token->bytes = PyBytes_FromStringAndSize(start, end - start);
- if (token->bytes == NULL) {
+ parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
+ parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
+ if (parser_token->bytes == NULL) {
return -1;
}
-
- if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
- Py_DECREF(token->bytes);
+ if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
+ Py_DECREF(parser_token->bytes);
return -1;
}
- token->level = p->tok->level;
-
- const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
- int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
- int end_lineno = p->tok->lineno;
-
- int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
- int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;
-
- token->lineno = lineno;
- token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset;
- token->end_lineno = end_lineno;
- token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset;
+ parser_token->level = new_token->level;
+ parser_token->lineno = new_token->lineno;
+ parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
+ : new_token->col_offset;
+ parser_token->end_lineno = new_token->end_lineno;
+ parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
+ : new_token->end_col_offset;
p->fill += 1;
int
_PyPegen_fill_token(Parser *p)
{
- const char *start;
- const char *end;
- int type = _PyTokenizer_Get(p->tok, &start, &end);
+ struct token new_token;
+ int type = _PyTokenizer_Get(p->tok, &new_token);
// Record and skip '# type: ignore' comments
while (type == TYPE_IGNORE) {
- Py_ssize_t len = end - start;
+ Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
char *tag = PyMem_Malloc(len + 1);
if (tag == NULL) {
PyErr_NoMemory();
return -1;
}
- strncpy(tag, start, len);
+ strncpy(tag, new_token.start, len);
tag[len] = '\0';
// Ownership of tag passes to the growable array
if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
PyErr_NoMemory();
return -1;
}
- type = _PyTokenizer_Get(p->tok, &start, &end);
+ type = _PyTokenizer_Get(p->tok, &new_token);
}
// If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
}
Token *t = p->tokens[p->fill];
- return initialize_token(p, t, start, end, type);
+ return initialize_token(p, t, &new_token, type);
}
#if defined(Py_DEBUG)
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
+#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
+
/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
return ret;
}
-
-
static int
indenterror(struct tok_state *tok)
{
}
+/* Fill in the location metadata for `token` from the tokenizer state and
+   return `type` unchanged, so call sites can write `return MAKE_TOKEN(type)`.
+   `start`/`end` delimit the token text in the input buffer; both may be
+   NULL (for synthetic tokens), in which case the column offsets stay -1. */
static int
-tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
+token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
+{
+ assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
+ token->level = tok->level;
+ /* A STRING literal may span several lines: report where it began. */
+ token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
+ token->end_lineno = tok->lineno;
+ token->col_offset = -1;
+ token->end_col_offset = -1;
+ token->start = start;
+ token->end = end;
+ if (start != NULL && end != NULL) {
+ /* Offsets are relative to the start of the line the token begins on
+    (multi_line_start for strings); -1 means "could not be computed". */
+ const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
+ token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
+ token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
+ }
+ return type;
+}
+
+static int
+tok_get(struct tok_state *tok, struct token *token)
{
int c;
int blankline, nonascii;
- *p_start = *p_end = NULL;
+ const char *p_start = NULL;
+ const char *p_end = NULL;
nextline:
tok->start = NULL;
blankline = 0;
// the level of indentation of whatever comes next.
cont_line_col = cont_line_col ? cont_line_col : col;
if ((c = tok_continuation_line(tok)) == -1) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else {
if (col == tok->indstack[tok->indent]) {
/* No change */
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
}
else if (col > tok->indstack[tok->indent]) {
if (tok->indent+1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (altcol <= tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
tok->pendin++;
tok->indstack[++tok->indent] = col;
if (col != tok->indstack[tok->indent]) {
tok->done = E_DEDENT;
tok->cur = tok->inp;
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
}
}
if (tok->pendin != 0) {
if (tok->pendin < 0) {
tok->pendin++;
- return DEDENT;
+ return MAKE_TOKEN(DEDENT);
}
else {
tok->pendin--;
- return INDENT;
+ return MAKE_TOKEN(INDENT);
}
}
&& ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
if (is_type_ignore) {
- *p_start = ignore_end;
- *p_end = tok->cur;
+ p_start = ignore_end;
+ p_end = tok->cur;
/* If this type ignore is the only thing on the line, consume the newline also. */
if (blankline) {
tok_nextc(tok);
tok->atbol = 1;
}
- return TYPE_IGNORE;
+ return MAKE_TOKEN(TYPE_IGNORE);
} else {
- *p_start = type_start; /* after type_comment_prefix */
- *p_end = tok->cur;
- return TYPE_COMMENT;
+ p_start = type_start; /* after type_comment_prefix */
+ p_end = tok->cur;
+ return MAKE_TOKEN(TYPE_COMMENT);
}
}
}
}
if (tok->done == E_INTERACT_STOP) {
- return ENDMARKER;
+ return MAKE_TOKEN(ENDMARKER);
}
/* Check for EOF and errors now */
if (c == EOF) {
if (tok->level) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
- return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
+ return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
}
/* Identifier (most frequent token!) */
}
tok_backup(tok, c);
if (nonascii && !verify_identifier(tok)) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
- *p_start = tok->start;
- *p_end = tok->cur;
+ p_start = tok->start;
+ p_end = tok->cur;
/* async/await parsing block. */
if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
if (!tok->async_hacks || tok->async_def) {
/* Always recognize the keywords. */
if (memcmp(tok->start, "async", 5) == 0) {
- return ASYNC;
+ return MAKE_TOKEN(ASYNC);
}
if (memcmp(tok->start, "await", 5) == 0) {
- return AWAIT;
+ return MAKE_TOKEN(AWAIT);
}
}
else if (memcmp(tok->start, "async", 5) == 0) {
Look ahead one token to see if that is 'def'. */
struct tok_state ahead_tok;
- const char *ahead_tok_start = NULL;
- const char *ahead_tok_end = NULL;
+ struct token ahead_token;
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
- ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
- &ahead_tok_end);
+ ahead_tok_kind = tok_get(&ahead_tok, &ahead_token);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
- return ASYNC;
+ return MAKE_TOKEN(ASYNC);
}
}
}
- return NAME;
+ return MAKE_TOKEN(NAME);
}
/* Newline */
if (blankline || tok->level > 0) {
goto nextline;
}
- *p_start = tok->start;
- *p_end = tok->cur - 1; /* Leave '\n' out of the string */
+ p_start = tok->start;
+ p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
if (tok->async_def) {
/* We're somewhere inside an 'async def' function, and
we've encountered a NEWLINE after its signature. */
tok->async_def_nl = 1;
}
- return NEWLINE;
+ return MAKE_TOKEN(NEWLINE);
}
/* Period or number starting with period? */
} else if (c == '.') {
c = tok_nextc(tok);
if (c == '.') {
- *p_start = tok->start;
- *p_end = tok->cur;
- return ELLIPSIS;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(ELLIPSIS);
}
else {
tok_backup(tok, c);
else {
tok_backup(tok, c);
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return DOT;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(DOT);
}
/* Number */
}
if (!isxdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid hexadecimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal"));
}
do {
c = tok_nextc(tok);
} while (isxdigit(c));
} while (c == '_');
if (!verify_end_of_number(tok, c, "hexadecimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == 'o' || c == 'O') {
}
if (c < '0' || c >= '8') {
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ return MAKE_TOKEN(syntaxerror(tok,
+ "invalid digit '%c' in octal literal", c));
}
else {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid octal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal"));
}
}
do {
} while ('0' <= c && c < '8');
} while (c == '_');
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ return MAKE_TOKEN(syntaxerror(tok,
+ "invalid digit '%c' in octal literal", c));
}
if (!verify_end_of_number(tok, c, "octal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == 'b' || c == 'B') {
}
if (c != '0' && c != '1') {
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
}
else {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid binary literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal"));
}
}
do {
} while (c == '0' || c == '1');
} while (c == '_');
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
}
if (!verify_end_of_number(tok, c, "binary")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else {
c = tok_nextc(tok);
if (!isdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
}
}
if (c != '0') {
nonzero = 1;
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == '.') {
else if (nonzero) {
/* Old-style octal: now disallowed. */
tok_backup(tok, c);
- return syntaxerror_known_range(
+ return MAKE_TOKEN(syntaxerror_known_range(
tok, (int)(tok->start + 1 - tok->line_start),
(int)(zeros_end - tok->line_start),
"leading zeros in decimal integer "
"literals are not permitted; "
- "use an 0o prefix for octal integers");
+ "use an 0o prefix for octal integers"));
}
if (!verify_end_of_number(tok, c, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
/* Decimal */
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
{
/* Accept floating point numbers. */
if (isdigit(c)) {
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
c = tok_nextc(tok);
if (!isdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
}
} else if (!isdigit(c)) {
tok_backup(tok, c);
if (!verify_end_of_number(tok, e, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
tok_backup(tok, e);
- *p_start = tok->start;
- *p_end = tok->cur;
- return NUMBER;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NUMBER);
}
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == 'j' || c == 'J') {
imaginary:
c = tok_nextc(tok);
if (!verify_end_of_number(tok, c, "imaginary")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (!verify_end_of_number(tok, c, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
tok_backup(tok, c);
- *p_start = tok->start;
- *p_end = tok->cur;
- return NUMBER;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NUMBER);
}
letter_quote:
if (c != '\n') {
tok->done = E_EOFS;
}
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
else {
syntaxerror(tok, "unterminated string literal (detected at"
if (c != '\n') {
tok->done = E_EOLS;
}
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == quote) {
}
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return STRING;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(STRING);
}
/* Line continuation */
if (c == '\\') {
if ((c = tok_continuation_line(tok)) == -1) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
tok->cont_line = 1;
goto again; /* Read next line */
/* Check for two-character token */
{
int c2 = tok_nextc(tok);
- int token = _PyToken_TwoChars(c, c2);
- if (token != OP) {
+ int current_token = _PyToken_TwoChars(c, c2);
+ if (current_token != OP) {
int c3 = tok_nextc(tok);
- int token3 = _PyToken_ThreeChars(c, c2, c3);
- if (token3 != OP) {
- token = token3;
+ int current_token3 = _PyToken_ThreeChars(c, c2, c3);
+ if (current_token3 != OP) {
+ current_token = current_token3;
}
else {
tok_backup(tok, c3);
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return token;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(current_token);
}
tok_backup(tok, c2);
}
case '[':
case '{':
if (tok->level >= MAXLEVEL) {
- return syntaxerror(tok, "too many nested parentheses");
+ return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses"));
}
tok->parenstack[tok->level] = c;
tok->parenlinenostack[tok->level] = tok->lineno;
case ']':
case '}':
if (!tok->level) {
- return syntaxerror(tok, "unmatched '%c'", c);
+ return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
}
tok->level--;
int opening = tok->parenstack[tok->level];
(opening == '{' && c == '}')))
{
if (tok->parenlinenostack[tok->level] != tok->lineno) {
- return syntaxerror(tok,
+ return MAKE_TOKEN(syntaxerror(tok,
"closing parenthesis '%c' does not match "
"opening parenthesis '%c' on line %d",
- c, opening, tok->parenlinenostack[tok->level]);
+ c, opening, tok->parenlinenostack[tok->level]));
}
else {
- return syntaxerror(tok,
+ return MAKE_TOKEN(syntaxerror(tok,
"closing parenthesis '%c' does not match "
"opening parenthesis '%c'",
- c, opening);
+ c, opening));
}
}
break;
if (!Py_UNICODE_ISPRINTABLE(c)) {
char hex[9];
(void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
- return syntaxerror(tok, "invalid non-printable character U+%s", hex);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex));
}
/* Punctuation character */
- *p_start = tok->start;
- *p_end = tok->cur;
- return _PyToken_OneChar(c);
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(_PyToken_OneChar(c));
}
int
-_PyTokenizer_Get(struct tok_state *tok,
- const char **p_start, const char **p_end)
+_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{
- int result = tok_get(tok, p_start, p_end);
+ int result = tok_get(tok, token);
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
{
struct tok_state *tok;
FILE *fp;
- const char *p_start = NULL;
- const char *p_end = NULL;
char *encoding = NULL;
fp = fdopen_borrow(fd);
return encoding;
}
}
+ struct token token;
while (tok->lineno < 2 && tok->done == E_OK) {
- _PyTokenizer_Get(tok, &p_start, &p_end);
+ _PyTokenizer_Get(tok, &token);
}
fclose(fp);
if (tok->encoding) {