[3.12] gh-116042: Fix location for SyntaxErrors of invalid escapes in the tokenizer...

author Pablo Galindo Salgado <Pablogsal@gmail.com>

Thu, 13 Feb 2025 01:42:24 +0000 (01:42 +0000)

committer GitHub <noreply@github.com>

Thu, 13 Feb 2025 01:42:24 +0000 (01:42 +0000)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Thu, 13 Feb 2025 01:42:24 +0000 (01:42 +0000)
committer GitHub <noreply@github.com>
Thu, 13 Feb 2025 01:42:24 +0000 (01:42 +0000)
diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py

index 1b588826010717bb0a01a99e2aeb7f20bcfb7b12..7109e3d164e4858c776114acbda2064d3071142f 100644 (file)
--- a/Lib/test/test_cmd_line_script.py
+++ b/Lib/test/test_cmd_line_script.py
@@ -652,7 +652,7 @@ class CmdLineTest(unittest.TestCase):
              self.assertEqual(
                  stderr.splitlines()[-3:],
                  [   b'    foo = """\\q"""',
-                    b'          ^^^^^^^^',
+                    b'             ^^',
                      b'SyntaxError: invalid escape sequence \'\\q\''
                  ],
              )
diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py

index c7c6f684cd33f0777abd83c1d56e3cc2b5bc6a33..3d793427c9ab5d758e8bb386a1574aa54422b9d4 100644 (file)
--- a/Lib/test/test_string_literals.py
+++ b/Lib/test/test_string_literals.py
@@ -118,7 +118,7 @@ class TestLiterals(unittest.TestCase):
          self.assertEqual(len(w), 1)
          self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'")
          self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
  
          with warnings.catch_warnings(record=True) as w:
              warnings.simplefilter('error', category=SyntaxWarning)
@@ -128,7 +128,7 @@ class TestLiterals(unittest.TestCase):
          self.assertEqual(w, [])
          self.assertEqual(exc.msg, r"invalid escape sequence '\z'")
          self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
          self.assertEqual(exc.offset, 1)
  
          # Check that the warning is raised only once if there are syntax errors
@@ -155,7 +155,7 @@ class TestLiterals(unittest.TestCase):
          self.assertEqual(str(w[0].message),
                           r"invalid octal escape sequence '\407'")
          self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
  
          with warnings.catch_warnings(record=True) as w:
              warnings.simplefilter('error', category=SyntaxWarning)
@@ -165,9 +165,32 @@ class TestLiterals(unittest.TestCase):
          self.assertEqual(w, [])
          self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'")
          self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
          self.assertEqual(exc.offset, 1)
  
+    def test_invalid_escape_locations_with_offset(self):
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('error', category=SyntaxWarning)
+            with self.assertRaises(SyntaxError) as cm:
+                eval("\"'''''''''''''''''''''invalid\ Escape\"")
+            exc = cm.exception
+        self.assertEqual(w, [])
+        self.assertEqual(exc.msg, r"invalid escape sequence '\ '")
+        self.assertEqual(exc.filename, '<string>')
+        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.offset, 30)
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('error', category=SyntaxWarning)
+            with self.assertRaises(SyntaxError) as cm:
+                eval("\"''Incorrect \ logic?\"")
+            exc = cm.exception
+        self.assertEqual(w, [])
+        self.assertEqual(exc.msg, r"invalid escape sequence '\ '")
+        self.assertEqual(exc.filename, '<string>')
+        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.offset, 14)
+
      def test_eval_str_raw(self):
          self.assertEqual(eval(""" r'x' """), 'x')
          self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
@@ -207,7 +230,7 @@ class TestLiterals(unittest.TestCase):
          self.assertEqual(len(w), 1)
          self.assertEqual(str(w[0].message), r"invalid escape sequence '\z'")
          self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
  
          with warnings.catch_warnings(record=True) as w:
              warnings.simplefilter('error', category=SyntaxWarning)
@@ -217,7 +240,7 @@ class TestLiterals(unittest.TestCase):
          self.assertEqual(w, [])
          self.assertEqual(exc.msg, r"invalid escape sequence '\z'")
          self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
  
      def test_eval_bytes_invalid_octal_escape(self):
          for i in range(0o400, 0o1000):
@@ -231,7 +254,7 @@ class TestLiterals(unittest.TestCase):
          self.assertEqual(str(w[0].message),
                           r"invalid octal escape sequence '\407'")
          self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
  
          with warnings.catch_warnings(record=True) as w:
              warnings.simplefilter('error', category=SyntaxWarning)
@@ -241,7 +264,7 @@ class TestLiterals(unittest.TestCase):
          self.assertEqual(w, [])
          self.assertEqual(exc.msg, r"invalid octal escape sequence '\407'")
          self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
  
      def test_eval_bytes_raw(self):
          self.assertEqual(eval(""" br'x' """), b'x')
diff --git a/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst b/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst

new file mode 100644 (file)

index 0000000..098804f
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst
@@ -0,0 +1,2 @@
+Fix location for SyntaxErrors of invalid escapes in the tokenizer. Patch by
+Pablo Galindo
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c

index 72f1349897683dcf259b672d4e55acc9626d608f..d1cb91d299880e21d85e20adb76b4f90a2015a6d 100644 (file)
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -350,8 +350,8 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
          assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
  
          if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
-            Py_ssize_t size = p->tok->inp - p->tok->buf;
-            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
+            Py_ssize_t size = p->tok->inp - p->tok->line_start;
+            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
          }
          else if (p->tok->fp == NULL || p->tok->fp == stdin) {
              error_line = get_error_line_from_tokenizer_buffers(p, lineno);
diff --git a/Parser/string_parser.c b/Parser/string_parser.c

index 164f715e153eca9c58964dd0459e7b335439a550..751b56d0ee0e2ccbf860848c0ea99c38c76efd38 100644 (file)
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -9,7 +9,7 @@
  //// STRING HANDLING FUNCTIONS ////
  
  static int
-warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
+warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t)
  {
      if (p->call_invalid_rules) {
          // Do not report warnings if we are in the second pass of the parser
@@ -38,8 +38,46 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
      else {
          category = PyExc_DeprecationWarning;
      }
+
+    // Calculate the lineno and the col_offset of the invalid escape sequence
+    const char *start = buffer;
+    const char *end = first_invalid_escape;
+    int lineno = t->lineno;
+    int col_offset = t->col_offset;
+    while (start < end) {
+        if (*start == '\n') {
+            lineno++;
+            col_offset = 0;
+        }
+        else {
+            col_offset++;
+        }
+        start++;
+    }
+
+    // Count the number of quotes in the token
+    char first_quote = 0;
+    if (lineno == t->lineno) {
+        int quote_count = 0;
+        char* tok = PyBytes_AsString(t->bytes);
+        for (int i = 0; i < PyBytes_Size(t->bytes); i++) {
+            if (tok[i] == '\'' || tok[i] == '\"') {
+                if (quote_count == 0) {
+                    first_quote = tok[i];
+                }
+                if (tok[i] == first_quote) {
+                    quote_count++;
+                }
+            } else {
+                break;
+            }
+        }
+
+        col_offset += quote_count;
+    }
+
      if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
-                                 t->lineno, NULL, NULL) < 0) {
+                                 lineno, NULL, NULL) < 0) {
          if (PyErr_ExceptionMatches(category)) {
              /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
                 to get a more accurate error report */
@@ -50,11 +88,12 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
                 error location, if p->known_err_token is not set. */
              p->known_err_token = t;
              if (octal) {
-                RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
-                                   first_invalid_escape);
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
+                "invalid octal escape sequence '\\%.3s'", first_invalid_escape);
              }
              else {
-                RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
+                "invalid escape sequence '\\%c'", c);
              }
          }
          Py_DECREF(msg);
@@ -148,7 +187,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
      // HACK: later we can simply pass the line no, since we don't preserve the tokens
      // when we are decoding the string but we preserve the line numbers.
      if (v != NULL && first_invalid_escape != NULL && t != NULL) {
-        if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) {
              /* We have not decref u before because first_invalid_escape points
                 inside u. */
              Py_XDECREF(u);
@@ -170,7 +209,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
      }
  
      if (first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) {
              Py_DECREF(result);
              return NULL;
          }
author	Pablo Galindo Salgado <Pablogsal@gmail.com>
	Thu, 13 Feb 2025 01:42:24 +0000 (01:42 +0000)
committer	GitHub <noreply@github.com>
	Thu, 13 Feb 2025 01:42:24 +0000 (01:42 +0000)
Lib/test/test_cmd_line_script.py		patch | blob | blame | history
Lib/test/test_string_literals.py		patch | blob | blame | history
Misc/NEWS.d/next/Core and Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst	[new file with mode: 0644]	patch | blob
Parser/pegen_errors.c		patch | blob | blame | history
Parser/string_parser.c		patch | blob | blame | history