}
static int
-_get_keyword_or_name_type(Parser *p, const char *name, int name_len)
+_get_keyword_or_name_type(Parser *p, struct token *new_token)
{
+ int name_len = new_token->end_col_offset - new_token->col_offset;
assert(name_len > 0);
+
if (name_len >= p->n_keyword_lists ||
p->keywords[name_len] == NULL ||
p->keywords[name_len]->type == -1) {
return NAME;
}
for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
- if (strncmp(k->str, name, name_len) == 0) {
+ if (strncmp(k->str, new_token->start, name_len) == 0) {
return k->type;
}
}
}
static int
-initialize_token(Parser *p, Token *token, const char *start, const char *end, int token_type) {
- assert(token != NULL);
+initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
+ assert(parser_token != NULL);
- token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, start, (int)(end - start)) : token_type;
- token->bytes = PyBytes_FromStringAndSize(start, end - start);
- if (token->bytes == NULL) {
+ parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
+ parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
+ if (parser_token->bytes == NULL) {
return -1;
}
-
- if (_PyArena_AddPyObject(p->arena, token->bytes) < 0) {
- Py_DECREF(token->bytes);
+ if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
+ Py_DECREF(parser_token->bytes);
return -1;
}
- token->level = p->tok->level;
-
- const char *line_start = token_type == STRING ? p->tok->multi_line_start : p->tok->line_start;
- int lineno = token_type == STRING ? p->tok->first_lineno : p->tok->lineno;
- int end_lineno = p->tok->lineno;
-
- int col_offset = (start != NULL && start >= line_start) ? (int)(start - line_start) : -1;
- int end_col_offset = (end != NULL && end >= p->tok->line_start) ? (int)(end - p->tok->line_start) : -1;
-
- token->lineno = lineno;
- token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + col_offset : col_offset;
- token->end_lineno = end_lineno;
- token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + end_col_offset : end_col_offset;
+ parser_token->level = new_token->level;
+ parser_token->lineno = new_token->lineno;
+ parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
+ : new_token->col_offset;
+ parser_token->end_lineno = new_token->end_lineno;
+ parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
+ : new_token->end_col_offset;
p->fill += 1;
int
_PyPegen_fill_token(Parser *p)
{
- const char *start;
- const char *end;
- int type = _PyTokenizer_Get(p->tok, &start, &end);
+ struct token new_token;
+ int type = _PyTokenizer_Get(p->tok, &new_token);
// Record and skip '# type: ignore' comments
while (type == TYPE_IGNORE) {
- Py_ssize_t len = end - start;
+ Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
char *tag = PyMem_Malloc(len + 1);
if (tag == NULL) {
PyErr_NoMemory();
return -1;
}
- strncpy(tag, start, len);
+ strncpy(tag, new_token.start, len);
tag[len] = '\0';
// Ownership of tag passes to the growable array
if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
PyErr_NoMemory();
return -1;
}
- type = _PyTokenizer_Get(p->tok, &start, &end);
+ type = _PyTokenizer_Get(p->tok, &new_token);
}
// If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
}
Token *t = p->tokens[p->fill];
- return initialize_token(p, t, start, end, type);
+ return initialize_token(p, t, &new_token, type);
}
#if defined(Py_DEBUG)
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
+#define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end)
+
/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
return ret;
}
-
-
static int
indenterror(struct tok_state *tok)
{
}
+/* Fill in the location metadata for `token` from the tokenizer state and
+   return `type` unchanged, so call sites can write `return MAKE_TOKEN(type)`.
+   `start`/`end` delimit the token text in the input buffer; both may be
+   NULL (for synthetic tokens), in which case the column offsets stay -1. */
static int
-tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
+token_setup(struct tok_state *tok, struct token *token, int type, const char *start, const char *end)
+{
+ assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
+ token->level = tok->level;
+ /* A STRING literal may span several lines: report where it began. */
+ token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
+ token->end_lineno = tok->lineno;
+ token->col_offset = -1;
+ token->end_col_offset = -1;
+ token->start = start;
+ token->end = end;
+ if (start != NULL && end != NULL) {
+ /* Offsets are relative to the start of the line the token begins on
+    (multi_line_start for strings); -1 means "could not be computed". */
+ const char *line_start = type == STRING ? tok->multi_line_start : tok->line_start;
+ token->col_offset = (start >= line_start) ? (int)(start - line_start) : -1;
+ token->end_col_offset = (end >= tok->line_start) ? (int)(end - tok->line_start) : -1;
+ }
+ return type;
+}
+
+static int
+tok_get(struct tok_state *tok, struct token *token)
{
int c;
int blankline, nonascii;
- *p_start = *p_end = NULL;
+ const char *p_start = NULL;
+ const char *p_end = NULL;
nextline:
tok->start = NULL;
blankline = 0;
// the level of indentation of whatever comes next.
cont_line_col = cont_line_col ? cont_line_col : col;
if ((c = tok_continuation_line(tok)) == -1) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else {
if (col == tok->indstack[tok->indent]) {
/* No change */
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
}
else if (col > tok->indstack[tok->indent]) {
if (tok->indent+1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (altcol <= tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
tok->pendin++;
tok->indstack[++tok->indent] = col;
if (col != tok->indstack[tok->indent]) {
tok->done = E_DEDENT;
tok->cur = tok->inp;
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
if (altcol != tok->altindstack[tok->indent]) {
- return indenterror(tok);
+ return MAKE_TOKEN(indenterror(tok));
}
}
}
if (tok->pendin != 0) {
if (tok->pendin < 0) {
tok->pendin++;
- return DEDENT;
+ return MAKE_TOKEN(DEDENT);
}
else {
tok->pendin--;
- return INDENT;
+ return MAKE_TOKEN(INDENT);
}
}
&& ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
if (is_type_ignore) {
- *p_start = ignore_end;
- *p_end = tok->cur;
+ p_start = ignore_end;
+ p_end = tok->cur;
/* If this type ignore is the only thing on the line, consume the newline also. */
if (blankline) {
tok_nextc(tok);
tok->atbol = 1;
}
- return TYPE_IGNORE;
+ return MAKE_TOKEN(TYPE_IGNORE);
} else {
- *p_start = type_start; /* after type_comment_prefix */
- *p_end = tok->cur;
- return TYPE_COMMENT;
+ p_start = type_start; /* after type_comment_prefix */
+ p_end = tok->cur;
+ return MAKE_TOKEN(TYPE_COMMENT);
}
}
}
}
if (tok->done == E_INTERACT_STOP) {
- return ENDMARKER;
+ return MAKE_TOKEN(ENDMARKER);
}
/* Check for EOF and errors now */
if (c == EOF) {
if (tok->level) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
- return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
+ return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
}
/* Identifier (most frequent token!) */
}
tok_backup(tok, c);
if (nonascii && !verify_identifier(tok)) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
- *p_start = tok->start;
- *p_end = tok->cur;
+ p_start = tok->start;
+ p_end = tok->cur;
/* async/await parsing block. */
if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
if (!tok->async_hacks || tok->async_def) {
/* Always recognize the keywords. */
if (memcmp(tok->start, "async", 5) == 0) {
- return ASYNC;
+ return MAKE_TOKEN(ASYNC);
}
if (memcmp(tok->start, "await", 5) == 0) {
- return AWAIT;
+ return MAKE_TOKEN(AWAIT);
}
}
else if (memcmp(tok->start, "async", 5) == 0) {
Look ahead one token to see if that is 'def'. */
struct tok_state ahead_tok;
- const char *ahead_tok_start = NULL;
- const char *ahead_tok_end = NULL;
+ struct token ahead_token;
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
- ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
- &ahead_tok_end);
+ ahead_tok_kind = tok_get(&ahead_tok, &ahead_token);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
returning a plain NAME token, return ASYNC. */
tok->async_def_indent = tok->indent;
tok->async_def = 1;
- return ASYNC;
+ return MAKE_TOKEN(ASYNC);
}
}
}
- return NAME;
+ return MAKE_TOKEN(NAME);
}
/* Newline */
if (blankline || tok->level > 0) {
goto nextline;
}
- *p_start = tok->start;
- *p_end = tok->cur - 1; /* Leave '\n' out of the string */
+ p_start = tok->start;
+ p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
if (tok->async_def) {
/* We're somewhere inside an 'async def' function, and
we've encountered a NEWLINE after its signature. */
tok->async_def_nl = 1;
}
- return NEWLINE;
+ return MAKE_TOKEN(NEWLINE);
}
/* Period or number starting with period? */
} else if (c == '.') {
c = tok_nextc(tok);
if (c == '.') {
- *p_start = tok->start;
- *p_end = tok->cur;
- return ELLIPSIS;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(ELLIPSIS);
}
else {
tok_backup(tok, c);
else {
tok_backup(tok, c);
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return DOT;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(DOT);
}
/* Number */
}
if (!isxdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid hexadecimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal"));
}
do {
c = tok_nextc(tok);
} while (isxdigit(c));
} while (c == '_');
if (!verify_end_of_number(tok, c, "hexadecimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == 'o' || c == 'O') {
}
if (c < '0' || c >= '8') {
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ return MAKE_TOKEN(syntaxerror(tok,
+ "invalid digit '%c' in octal literal", c));
}
else {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid octal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal"));
}
}
do {
} while ('0' <= c && c < '8');
} while (c == '_');
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in octal literal", c);
+ return MAKE_TOKEN(syntaxerror(tok,
+ "invalid digit '%c' in octal literal", c));
}
if (!verify_end_of_number(tok, c, "octal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (c == 'b' || c == 'B') {
}
if (c != '0' && c != '1') {
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
}
else {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid binary literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal"));
}
}
do {
} while (c == '0' || c == '1');
} while (c == '_');
if (isdigit(c)) {
- return syntaxerror(tok,
- "invalid digit '%c' in binary literal", c);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c));
}
if (!verify_end_of_number(tok, c, "binary")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else {
c = tok_nextc(tok);
if (!isdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
}
}
if (c != '0') {
nonzero = 1;
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == '.') {
else if (nonzero) {
/* Old-style octal: now disallowed. */
tok_backup(tok, c);
- return syntaxerror_known_range(
+ return MAKE_TOKEN(syntaxerror_known_range(
tok, (int)(tok->start + 1 - tok->line_start),
(int)(zeros_end - tok->line_start),
"leading zeros in decimal integer "
"literals are not permitted; "
- "use an 0o prefix for octal integers");
+ "use an 0o prefix for octal integers"));
}
if (!verify_end_of_number(tok, c, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
/* Decimal */
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
{
/* Accept floating point numbers. */
if (isdigit(c)) {
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
c = tok_nextc(tok);
if (!isdigit(c)) {
tok_backup(tok, c);
- return syntaxerror(tok, "invalid decimal literal");
+ return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal"));
}
} else if (!isdigit(c)) {
tok_backup(tok, c);
if (!verify_end_of_number(tok, e, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
tok_backup(tok, e);
- *p_start = tok->start;
- *p_end = tok->cur;
- return NUMBER;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NUMBER);
}
c = tok_decimal_tail(tok);
if (c == 0) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == 'j' || c == 'J') {
imaginary:
c = tok_nextc(tok);
if (!verify_end_of_number(tok, c, "imaginary")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
else if (!verify_end_of_number(tok, c, "decimal")) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
}
tok_backup(tok, c);
- *p_start = tok->start;
- *p_end = tok->cur;
- return NUMBER;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(NUMBER);
}
letter_quote:
if (c != '\n') {
tok->done = E_EOFS;
}
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
else {
syntaxerror(tok, "unterminated string literal (detected at"
if (c != '\n') {
tok->done = E_EOLS;
}
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
}
if (c == quote) {
}
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return STRING;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(STRING);
}
/* Line continuation */
if (c == '\\') {
if ((c = tok_continuation_line(tok)) == -1) {
- return ERRORTOKEN;
+ return MAKE_TOKEN(ERRORTOKEN);
}
tok->cont_line = 1;
goto again; /* Read next line */
/* Check for two-character token */
{
int c2 = tok_nextc(tok);
- int token = _PyToken_TwoChars(c, c2);
- if (token != OP) {
+ int current_token = _PyToken_TwoChars(c, c2);
+ if (current_token != OP) {
int c3 = tok_nextc(tok);
- int token3 = _PyToken_ThreeChars(c, c2, c3);
- if (token3 != OP) {
- token = token3;
+ int current_token3 = _PyToken_ThreeChars(c, c2, c3);
+ if (current_token3 != OP) {
+ current_token = current_token3;
}
else {
tok_backup(tok, c3);
}
- *p_start = tok->start;
- *p_end = tok->cur;
- return token;
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(current_token);
}
tok_backup(tok, c2);
}
case '[':
case '{':
if (tok->level >= MAXLEVEL) {
- return syntaxerror(tok, "too many nested parentheses");
+ return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses"));
}
tok->parenstack[tok->level] = c;
tok->parenlinenostack[tok->level] = tok->lineno;
case ']':
case '}':
if (!tok->level) {
- return syntaxerror(tok, "unmatched '%c'", c);
+ return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
}
tok->level--;
int opening = tok->parenstack[tok->level];
(opening == '{' && c == '}')))
{
if (tok->parenlinenostack[tok->level] != tok->lineno) {
- return syntaxerror(tok,
+ return MAKE_TOKEN(syntaxerror(tok,
"closing parenthesis '%c' does not match "
"opening parenthesis '%c' on line %d",
- c, opening, tok->parenlinenostack[tok->level]);
+ c, opening, tok->parenlinenostack[tok->level]));
}
else {
- return syntaxerror(tok,
+ return MAKE_TOKEN(syntaxerror(tok,
"closing parenthesis '%c' does not match "
"opening parenthesis '%c'",
- c, opening);
+ c, opening));
}
}
break;
if (!Py_UNICODE_ISPRINTABLE(c)) {
char hex[9];
(void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);
- return syntaxerror(tok, "invalid non-printable character U+%s", hex);
+ return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex));
}
/* Punctuation character */
- *p_start = tok->start;
- *p_end = tok->cur;
- return _PyToken_OneChar(c);
+ p_start = tok->start;
+ p_end = tok->cur;
+ return MAKE_TOKEN(_PyToken_OneChar(c));
}
int
-_PyTokenizer_Get(struct tok_state *tok,
- const char **p_start, const char **p_end)
+_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{
- int result = tok_get(tok, p_start, p_end);
+ int result = tok_get(tok, token);
if (tok->decoding_erred) {
result = ERRORTOKEN;
tok->done = E_DECODE;
{
struct tok_state *tok;
FILE *fp;
- const char *p_start = NULL;
- const char *p_end = NULL;
char *encoding = NULL;
fp = fdopen_borrow(fd);
return encoding;
}
}
+ struct token token;
while (tok->lineno < 2 && tok->done == E_OK) {
- _PyTokenizer_Get(tok, &p_start, &p_end);
+ _PyTokenizer_Get(tok, &token);
}
fclose(fp);
if (tok->encoding) {