bpo-42827: Fix crash on SyntaxError in multiline expressions (GH-24140)

author Lysandros Nikolaou <lisandrosnik@gmail.com>

Thu, 14 Jan 2021 21:36:30 +0000 (23:36 +0200)

committer GitHub <noreply@github.com>

Thu, 14 Jan 2021 21:36:30 +0000 (21:36 +0000)
author Lysandros Nikolaou <lisandrosnik@gmail.com>
Thu, 14 Jan 2021 21:36:30 +0000 (23:36 +0200)
committer GitHub <noreply@github.com>
Thu, 14 Jan 2021 21:36:30 +0000 (21:36 +0000)
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py

index 864422390ad3025d4dd8958171adf9783e46f6e6..eb70d7b4e4972418be7f61aeafa957af4d003a73 100644 (file)
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -209,6 +209,9 @@ class ExceptionTests(unittest.TestCase):
          check('x = "a', 1, 7)
          check('lambda x: x = 2', 1, 1)
          check('f{a + b + c}', 1, 2)
+        check('[file for str(file) in []\n])', 1, 11)
+        check('[\nfile\nfor str(file)\nin\n[]\n]', 3, 5)
+        check('[file for\n str(file) in []]', 2, 2)
  
          # Errors thrown by compile.c
          check('class foo:return 1', 1, 11)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-01-06-17-06-37.bpo-42827.jtRR0D.rst b/Misc/NEWS.d/next/Core and Builtins/2021-01-06-17-06-37.bpo-42827.jtRR0D.rst

new file mode 100644 (file)

index 0000000..8e40ab6
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-01-06-17-06-37.bpo-42827.jtRR0D.rst
@@ -0,0 +1,2 @@
+Fix a crash when working out the error line of a :exc:`SyntaxError` in some
+multi-line expressions.
diff --git a/Parser/pegen.c b/Parser/pegen.c

index 188fd282b7604360e3917c4c29cdf182438b7374..a6f97929255ac244a164d1a80a187f4600657b48 100644 (file)
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -380,6 +380,27 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
      return NULL;
  }
  
+static PyObject *
+get_error_line(Parser *p, Py_ssize_t lineno)
+{
+    /* Return line `lineno` (1-based) of the source being parsed as a new str,
+       or NULL with an exception set. With p->tok->fp == NULL we are parsing
+       from a string and the whole source is in p->tok->str; otherwise we are
+       in the REPL and the lines of the current (multi-line) statement are in
+       p->tok->stdin_content. If the source has fewer than `lineno` lines, the
+       last line is returned instead of walking past the end of the buffer. */
+    assert(p->tok->fp == NULL || p->tok->fp == stdin);
+
+    const char *cur_line = p->tok->fp == NULL ? p->tok->str : p->tok->stdin_content;
+    const char *next_newline = strchr(cur_line, '\n');
+    for (Py_ssize_t i = 1; i < lineno && next_newline != NULL; i++) {
+        cur_line = next_newline + 1;
+        next_newline = strchr(cur_line, '\n');
+    }
+    Py_ssize_t size = next_newline ? next_newline - cur_line : (Py_ssize_t)strlen(cur_line);
+    return PyUnicode_DecodeUTF8(cur_line, size, "replace");
+}
+
  void *
  _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
                                      Py_ssize_t lineno, Py_ssize_t col_offset,
@@ -416,8 +437,22 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
      }
  
      if (!error_line) {
-        Py_ssize_t size = p->tok->inp - p->tok->buf;
-        error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
+        /* PyErr_ProgramTextObject was not called or returned NULL. If it was not called,
+           then we need to find the error line from some other source, because
+           p->start_rule != Py_file_input. If it returned NULL, then it either unexpectedly
+           failed or we're parsing from a string or the REPL. There's a third edge case where
+           we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
+           `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
+           does not physically exist */
+        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
+
+        if (p->tok->lineno == lineno) {
+            Py_ssize_t size = p->tok->inp - p->tok->buf;
+            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
+        }
+        else {
+            error_line = get_error_line(p, lineno);
+        }
          if (!error_line) {
              goto error;
          }
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 96539bd556529aa952b556c63ee184fe8e8b6e47..62cd2966231b8a30ba864bbad759f3cc2b17d7b2 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -81,6 +81,7 @@ tok_new(void)
      tok->decoding_readline = NULL;
      tok->decoding_buffer = NULL;
      tok->type_comments = 0;
+    tok->stdin_content = NULL;
  
      tok->async_hacks = 0;
      tok->async_def = 0;
@@ -816,6 +817,8 @@ PyTokenizer_Free(struct tok_state *tok)
          PyMem_Free(tok->buf);
      if (tok->input)
          PyMem_Free(tok->input);
+    if (tok->stdin_content)
+        PyMem_Free(tok->stdin_content);
      PyMem_Free(tok);
  }
  
@@ -856,6 +859,24 @@ tok_nextc(struct tok_state *tok)
                  if (translated == NULL)
                      return EOF;
                  newtok = translated;
+                if (tok->stdin_content == NULL) {
+                    tok->stdin_content = PyMem_Malloc(strlen(translated) + 1);
+                    if (tok->stdin_content == NULL) {
+                        tok->done = E_NOMEM;
+                        return EOF;
+                    }
+                    sprintf(tok->stdin_content, "%s", translated);
+                }
+                else {
+                    char *new_str = PyMem_Malloc(strlen(tok->stdin_content) + strlen(translated) + 1);
+                    if (new_str == NULL) {
+                        tok->done = E_NOMEM;
+                        return EOF;
+                    }
+                    sprintf(new_str, "%s%s", tok->stdin_content, translated);
+                    PyMem_Free(tok->stdin_content);
+                    tok->stdin_content = new_str;
+                }
              }
              if (tok->encoding && newtok && *newtok) {
                  /* Recode to UTF-8 */
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h

index 5660ea38e9443d44f0b2729ce2a927d4a7ffea80..b659f34796e42415c81ad26ad63f0747f5c9b878 100644 (file)
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -37,6 +37,7 @@ struct tok_state {
      int atbol;          /* Nonzero if at begin of new line */
      int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
      const char *prompt, *nextprompt;          /* For interactive prompting */
+    char *stdin_content;
      int lineno;         /* Current line number */
      int first_lineno;   /* First line of a single line or multi line string
                             expression (cf. issue 16806) */
author	Lysandros Nikolaou <lisandrosnik@gmail.com>
	Thu, 14 Jan 2021 21:36:30 +0000 (23:36 +0200)
committer	GitHub <noreply@github.com>
	Thu, 14 Jan 2021 21:36:30 +0000 (21:36 +0000)
Lib/test/test_exceptions.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core and Builtins/2021-01-06-17-06-37.bpo-42827.jtRR0D.rst	[new file with mode: 0644]	patch \| blob
Parser/pegen.c		patch \| blob \| blame \| history
Parser/tokenizer.c		patch \| blob \| blame \| history
Parser/tokenizer.h		patch \| blob \| blame \| history