git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-116042: Fix location for SyntaxErrors of invalid escapes in the tokenizer (#116049)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Thu, 13 Feb 2025 01:07:37 +0000 (01:07 +0000)
committer GitHub <noreply@github.com>
Thu, 13 Feb 2025 01:07:37 +0000 (01:07 +0000)
Lib/test/test_cmd_line_script.py
Lib/test/test_string_literals.py
Misc/NEWS.d/next/Core_and_Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst [new file with mode: 0644]
Parser/pegen_errors.c
Parser/string_parser.c

diff --git a/Lib/test/test_cmd_line_script.py b/Lib/test/test_cmd_line_script.py
index e7f3e46c1868f759beb5ae85ed522d8da2b2badb..53dc9b1a7effb54cd73920a28bf96fc9f63e06b6 100644 (file)
@@ -660,7 +660,7 @@ class CmdLineTest(unittest.TestCase):
             self.assertEqual(
                 stderr.splitlines()[-3:],
                 [   b'    foo = """\\q"""',
-                    b'          ^^^^^^^^',
+                    b'             ^^',
                     b'SyntaxError: "\\q" is an invalid escape sequence. '
                     b'Did you mean "\\\\q"? A raw string is also an option.'
                 ],
diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py
index f56195ca27672c34706a4550c96fd2f55d604ea2..9d57233eb0882a0bb6cf16f45ead0805ef3740aa 100644 (file)
@@ -120,7 +120,7 @@ class TestLiterals(unittest.TestCase):
                          r'Such sequences will not work in the future. '
                          r'Did you mean "\\z"? A raw string is also an option.')
         self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
 
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter('error', category=SyntaxWarning)
@@ -131,7 +131,7 @@ class TestLiterals(unittest.TestCase):
         self.assertEqual(exc.msg, r'"\z" is an invalid escape sequence. '
                          r'Did you mean "\\z"? A raw string is also an option.')
         self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
         self.assertEqual(exc.offset, 1)
 
         # Check that the warning is raised only once if there are syntax errors
@@ -160,7 +160,7 @@ class TestLiterals(unittest.TestCase):
                          r'Such sequences will not work in the future. '
                          r'Did you mean "\\407"? A raw string is also an option.')
         self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
 
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter('error', category=SyntaxWarning)
@@ -171,9 +171,32 @@ class TestLiterals(unittest.TestCase):
         self.assertEqual(exc.msg, r'"\407" is an invalid octal escape sequence. '
                                  r'Did you mean "\\407"? A raw string is also an option.')
         self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
         self.assertEqual(exc.offset, 1)
 
+    def test_invalid_escape_locations_with_offset(self):
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always', category=SyntaxWarning)
+            eval("\"'''''''''''''''''''''invalid\ Escape\"")
+        self.assertEqual(len(w), 1)
+        self.assertEqual(str(w[0].message),
+                         r'"\ " is an invalid escape sequence. Such sequences '
+                         r'will not work in the future. Did you mean "\\ "? '
+                         r'A raw string is also an option.')
+        self.assertEqual(w[0].filename, '<string>')
+        self.assertEqual(w[0].lineno, 1)
+
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always', category=SyntaxWarning)
+            eval("\"''Incorrect \ logic?\"")
+        self.assertEqual(len(w), 1)
+        self.assertEqual(str(w[0].message),
+                            r'"\ " is an invalid escape sequence. Such sequences '
+                            r'will not work in the future. Did you mean "\\ "? '
+                            r'A raw string is also an option.')
+        self.assertEqual(w[0].filename, '<string>')
+        self.assertEqual(w[0].lineno, 1)
+
     def test_eval_str_raw(self):
         self.assertEqual(eval(""" r'x' """), 'x')
         self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
@@ -215,7 +238,7 @@ class TestLiterals(unittest.TestCase):
                          r'Such sequences will not work in the future. '
                          r'Did you mean "\\z"? A raw string is also an option.')
         self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
 
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter('error', category=SyntaxWarning)
@@ -226,7 +249,7 @@ class TestLiterals(unittest.TestCase):
         self.assertEqual(exc.msg, r'"\z" is an invalid escape sequence. '
                          r'Did you mean "\\z"? A raw string is also an option.')
         self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
 
     def test_eval_bytes_invalid_octal_escape(self):
         for i in range(0o400, 0o1000):
@@ -241,7 +264,7 @@ class TestLiterals(unittest.TestCase):
                          r'Such sequences will not work in the future. '
                          r'Did you mean "\\407"? A raw string is also an option.')
         self.assertEqual(w[0].filename, '<string>')
-        self.assertEqual(w[0].lineno, 1)
+        self.assertEqual(w[0].lineno, 2)
 
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter('error', category=SyntaxWarning)
@@ -252,7 +275,7 @@ class TestLiterals(unittest.TestCase):
         self.assertEqual(exc.msg, r'"\407" is an invalid octal escape sequence. '
                          r'Did you mean "\\407"? A raw string is also an option.')
         self.assertEqual(exc.filename, '<string>')
-        self.assertEqual(exc.lineno, 1)
+        self.assertEqual(exc.lineno, 2)
 
     def test_eval_bytes_raw(self):
         self.assertEqual(eval(""" br'x' """), b'x')
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst
new file mode 100644 (file)
index 0000000..098804f
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-02-13-00-28-43.gh-issue-116042.861juq.rst
@@ -0,0 +1,2 @@
+Fix location for SyntaxErrors of invalid escapes in the tokenizer. Patch by
+Pablo Galindo
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 6146f69912bfa3325c1aedc99b523f77c1c94b19..f62b8695995617f2c32903593891a54aa513a6d3 100644 (file)
@@ -352,8 +352,8 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
         assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
 
         if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
-            Py_ssize_t size = p->tok->inp - p->tok->buf;
-            error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
+            Py_ssize_t size = p->tok->inp - p->tok->line_start;
+            error_line = PyUnicode_DecodeUTF8(p->tok->line_start, size, "replace");
         }
         else if (p->tok->fp == NULL || p->tok->fp == stdin) {
             error_line = get_error_line_from_tokenizer_buffers(p, lineno);
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
index 9dd8f9ef28bd4f5a7a476661631abae1f29f60e6..b93300b00a8545bbb785bd2a7393e887db57a73e 100644 (file)
@@ -11,7 +11,7 @@
 //// STRING HANDLING FUNCTIONS ////
 
 static int
-warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
+warn_invalid_escape_sequence(Parser *p, const char* buffer, const char *first_invalid_escape, Token *t)
 {
     if (p->call_invalid_rules) {
         // Do not report warnings if we are in the second pass of the parser
@@ -48,8 +48,46 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
     else {
         category = PyExc_DeprecationWarning;
     }
+
+    // Calculate the lineno and the col_offset of the invalid escape sequence
+    const char *start = buffer;
+    const char *end = first_invalid_escape;
+    int lineno = t->lineno;
+    int col_offset = t->col_offset;
+    while (start < end) {
+        if (*start == '\n') {
+            lineno++;
+            col_offset = 0;
+        }
+        else {
+            col_offset++;
+        }
+        start++;
+    }
+
+    // Count the number of quotes in the token
+    char first_quote = 0;
+    if (lineno == t->lineno) {
+        int quote_count = 0;
+        char* tok = PyBytes_AsString(t->bytes);
+        for (int i = 0; i < PyBytes_Size(t->bytes); i++) {
+            if (tok[i] == '\'' || tok[i] == '\"') {
+                if (quote_count == 0) {
+                    first_quote = tok[i];
+                }
+                if (tok[i] == first_quote) {
+                    quote_count++;
+                }
+            } else {
+                break;
+            }
+        }
+
+        col_offset += quote_count;
+    }
+
     if (PyErr_WarnExplicitObject(category, msg, p->tok->filename,
-                                 t->lineno, NULL, NULL) < 0) {
+                                 lineno, NULL, NULL) < 0) {
         if (PyErr_ExceptionMatches(category)) {
             /* Replace the Syntax/DeprecationWarning exception with a SyntaxError
                to get a more accurate error report */
@@ -60,13 +98,13 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token
                error location, if p->known_err_token is not set. */
             p->known_err_token = t;
             if (octal) {
-                RAISE_SYNTAX_ERROR(
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
                     "\"\\%.3s\" is an invalid octal escape sequence. "
                     "Did you mean \"\\\\%.3s\"? A raw string is also an option.",
                     first_invalid_escape, first_invalid_escape);
             }
             else {
-                RAISE_SYNTAX_ERROR(
+                RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, col_offset-1, lineno, col_offset+1,
                     "\"\\%c\" is an invalid escape sequence. "
                     "Did you mean \"\\\\%c\"? A raw string is also an option.",
                     c, c);
@@ -163,7 +201,7 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
     // HACK: later we can simply pass the line no, since we don't preserve the tokens
     // when we are decoding the string but we preserve the line numbers.
     if (v != NULL && first_invalid_escape != NULL && t != NULL) {
-        if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) {
             /* We have not decref u before because first_invalid_escape points
                inside u. */
             Py_XDECREF(u);
@@ -185,7 +223,7 @@ decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
     }
 
     if (first_invalid_escape != NULL) {
-        if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) {
+        if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) {
             Py_DECREF(result);
             return NULL;
         }