bpo-40334: Fix error location upon parsing an invalid string literal (GH-19962)

author Lysandros Nikolaou <lisandrosnik@gmail.com>

Thu, 7 May 2020 10:37:51 +0000 (13:37 +0300)

committer GitHub <noreply@github.com>

Thu, 7 May 2020 10:37:51 +0000 (11:37 +0100)
author Lysandros Nikolaou <lisandrosnik@gmail.com>
Thu, 7 May 2020 10:37:51 +0000 (13:37 +0300)
committer GitHub <noreply@github.com>
Thu, 7 May 2020 10:37:51 +0000 (11:37 +0100)
diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py

index 1fc9500738f3522cbb540e520e768a5ea6c094c7..171340581af228b05b75ca0ae2488ffe3bd9cf04 100644 (file)
--- a/Lib/test/test_cmd_line_script.py
+++ b/Lib/test/test_cmd_line_script.py
@@ -648,7 +648,7 @@ class CmdLineTest(unittest.TestCase):
              self.assertEqual(
                  stderr.splitlines()[-3:],
                  [   b'    foo = """\\q"""',
-                    b'                 ^',
+                    b'          ^',
                      b'SyntaxError: invalid escape sequence \\q'
                  ],
              )
diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py

index 5b5477d14d467dd6168d165a9ca6783f4738d96b..9565ee2485afd10f1179e79911eff2b8c160552a 100644 (file)
--- a/Lib/test/test_string_literals.py
+++ b/Lib/test/test_string_literals.py
@@ -118,8 +118,7 @@ class TestLiterals(unittest.TestCase):
              eval("'''\n\\z'''")
          self.assertEqual(len(w), 1)
          self.assertEqual(w[0].filename, '<string>')
-        if use_old_parser():
-            self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 1)
  
          with warnings.catch_warnings(record=True) as w:
              warnings.simplefilter('error', category=DeprecationWarning)
@@ -128,8 +127,8 @@ class TestLiterals(unittest.TestCase):
              exc = cm.exception
          self.assertEqual(w, [])
          self.assertEqual(exc.filename, '<string>')
-        if use_old_parser():
-            self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.offset, 1)
  
      def test_eval_str_raw(self):
          self.assertEqual(eval(""" r'x' """), 'x')
diff --git a/Parser/pegen/parse_string.c b/Parser/pegen/parse_string.c

index d96303dc183fa79e579742d127d40f1685c77bf1..ca4b733c153b57b427cb73b2d3756725942d2dde 100644 (file)
--- a/Parser/pegen/parse_string.c
+++ b/Parser/pegen/parse_string.c
@@ -12,7 +12,7 @@
  // file (like "_PyPegen_raise_syntax_error").
  
  static int
-warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char)
+warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
  {
      PyObject *msg =
          PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
@@ -20,11 +20,16 @@ warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char)
          return -1;
      }
      if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
-                                 p->tok->lineno, NULL, NULL) < 0) {
+                                 t->lineno, NULL, NULL) < 0) {
          if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
              /* Replace the DeprecationWarning exception with a SyntaxError
                 to get a more accurate error report */
              PyErr_Clear();
+
+            /* This is needed, in order for the SyntaxError to point to the token t,
+               since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
+               error location, if p->known_err_token is not set. */
+            p->known_err_token = t;
              RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
          }
          Py_DECREF(msg);
@@ -47,7 +52,7 @@ decode_utf8(const char **sPtr, const char *end)
  }
  
  static PyObject *
-decode_unicode_with_escapes(Parser *parser, const char *s, size_t len)
+decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
  {
      PyObject *v, *u;
      char *buf;
@@ -110,7 +115,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len)
      v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
  
      if (v != NULL && first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(parser, *first_invalid_escape) < 0) {
+        if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
              /* We have not decref u before because first_invalid_escape points
                 inside u. */
              Py_XDECREF(u);
@@ -123,7 +128,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len)
  }
  
  static PyObject *
-decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len)
+decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
  {
      const char *first_invalid_escape;
      PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
@@ -132,7 +137,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len)
      }
  
      if (first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(p, *first_invalid_escape) < 0) {
+        if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
              Py_DECREF(result);
              return NULL;
          }
@@ -146,9 +151,14 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len)
     If the string is an f-string, set *fstr and *fstrlen to the unparsed
     string object.  Return 0 if no errors occurred.  */
  int
-_PyPegen_parsestr(Parser *p, const char *s, int *bytesmode, int *rawmode, PyObject **result,
-         const char **fstr, Py_ssize_t *fstrlen)
+_PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
+                  const char **fstr, Py_ssize_t *fstrlen, Token *t)
  {
+    const char *s = PyBytes_AsString(t->bytes);
+    if (s == NULL) {
+        return -1;
+    }
+
      size_t len;
      int quote = Py_CHARMASK(*s);
      int fmode = 0;
@@ -245,7 +255,7 @@ _PyPegen_parsestr(Parser *p, const char *s, int *bytesmode, int *rawmode, PyObje
              *result = PyBytes_FromStringAndSize(s, len);
          }
          else {
-            *result = decode_bytes_with_escapes(p, s, len);
+            *result = decode_bytes_with_escapes(p, s, len, t);
          }
      }
      else {
@@ -253,7 +263,7 @@ _PyPegen_parsestr(Parser *p, const char *s, int *bytesmode, int *rawmode, PyObje
              *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
          }
          else {
-            *result = decode_unicode_with_escapes(p, s, len);
+            *result = decode_unicode_with_escapes(p, s, len, t);
          }
      }
      return *result == NULL ? -1 : 0;
@@ -637,7 +647,7 @@ exit:
  */
  static int
  fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
-                     PyObject **literal, int recurse_lvl)
+                     PyObject **literal, int recurse_lvl, Token *t)
  {
      /* Get any literal string. It ends when we hit an un-doubled left
         brace (which isn't part of a unicode name escape such as
@@ -660,7 +670,7 @@ fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
                  }
                  break;
              }
-            if (ch == '{' && warn_invalid_escape_sequence(p, ch) < 0) {
+            if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
                  return -1;
              }
          }
@@ -704,7 +714,7 @@ done:
                                                      NULL, NULL);
          else
              *literal = decode_unicode_with_escapes(p, literal_start,
-                                                   s - literal_start);
+                                                   s - literal_start, t);
          if (!*literal)
              return -1;
      }
@@ -1041,7 +1051,7 @@ fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int
      assert(*literal == NULL && *expression == NULL);
  
      /* Get any literal string. */
-    result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl);
+    result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
      if (result < 0)
          goto error;
  
diff --git a/Parser/pegen/parse_string.h b/Parser/pegen/parse_string.h

index 4f2aa94fc19b057660063106bad8272a400b82fd..cd85bd57d0a383308eb2d012a3eb84358997adf5 100644 (file)
--- a/Parser/pegen/parse_string.h
+++ b/Parser/pegen/parse_string.h
@@ -34,8 +34,8 @@ typedef struct {
  } FstringParser;
  
  void _PyPegen_FstringParser_Init(FstringParser *);
-int _PyPegen_parsestr(Parser *, const char *, int *, int *, PyObject **,
-             const char **, Py_ssize_t *);
+int _PyPegen_parsestr(Parser *, int *, int *, PyObject **,
+                      const char **, Py_ssize_t *, Token *);
  int _PyPegen_FstringParser_ConcatFstring(Parser *, FstringParser *, const char **,
                                  const char *, int, int, Token *, Token *,
                                  Token *);
diff --git a/Parser/pegen/pegen.c b/Parser/pegen/pegen.c

index c311593af70f58ea1a2d0c4a8c131038b74ddb35..06af53b3597f7438a4d0d99c5a7e60a39222dbc0 100644 (file)
--- a/Parser/pegen/pegen.c
+++ b/Parser/pegen/pegen.c
@@ -383,7 +383,7 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, int with_col_number, const ch
      PyObject *errstr = NULL;
      PyObject *loc = NULL;
      PyObject *tmp = NULL;
-    Token *t = p->tokens[p->fill - 1];
+    Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
      Py_ssize_t col_number = !with_col_number;
      va_list va;
      p->error_indicator = 1;
@@ -1053,6 +1053,7 @@ _PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
      p->starting_col_offset = 0;
      p->flags = flags;
      p->feature_version = feature_version;
+    p->known_err_token = NULL;
  
      return p;
  }
@@ -1972,12 +1973,7 @@ _PyPegen_concatenate_strings(Parser *p, asdl_seq *strings)
          const char *fstr;
          Py_ssize_t fstrlen = -1;
  
-        char *this_str = PyBytes_AsString(t->bytes);
-        if (!this_str) {
-            goto error;
-        }
-
-        if (_PyPegen_parsestr(p, this_str, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen) != 0) {
+        if (_PyPegen_parsestr(p, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen, t) != 0) {
              goto error;
          }
  
diff --git a/Parser/pegen/pegen.h b/Parser/pegen/pegen.h

index cbe6f197ac742366318c19f098cebdd1101ffc82..ffb18e47e4a9a8e87ceccae80713dcc678e1a2df 100644 (file)
--- a/Parser/pegen/pegen.h
+++ b/Parser/pegen/pegen.h
@@ -71,6 +71,7 @@ typedef struct {
      int flags;
      int feature_version;
      growable_comment_array type_ignore_comments;
+    Token *known_err_token;
  } Parser;
  
  typedef struct {
author	Lysandros Nikolaou <lisandrosnik@gmail.com>
	Thu, 7 May 2020 10:37:51 +0000 (13:37 +0300)
committer	GitHub <noreply@github.com>
	Thu, 7 May 2020 10:37:51 +0000 (11:37 +0100)
Lib/test/test_cmd_line_script.py		patch | blob | blame | history
Lib/test/test_string_literals.py		patch | blob | blame | history
Parser/pegen/parse_string.c		patch | blob | blame | history
Parser/pegen/parse_string.h		patch | blob | blame | history
Parser/pegen/pegen.c		patch | blob | blame | history
Parser/pegen/pegen.h		patch | blob | blame | history