git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
bpo-45848: Allow the parser to get error lines from encoded files (GH-29646)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Sat, 20 Nov 2021 14:36:07 +0000 (14:36 +0000)
committer GitHub <noreply@github.com>
Sat, 20 Nov 2021 14:36:07 +0000 (15:36 +0100)
Include/cpython/pyerrors.h
Lib/test/test_exceptions.py
Misc/NEWS.d/next/Core and Builtins/2021-11-19-22-57-42.bpo-45848.HgVBJ5.rst [new file with mode: 0644]
Parser/pegen.c
Python/errors.c

index 28ab565dde42377ea321c25849d342e58fe999c5..a07018abae0cf3aed69c798a95cc9c0b64ec0ca2 100644 (file)
@@ -149,6 +149,11 @@ PyAPI_FUNC(PyObject *) PyErr_ProgramTextObject(
     PyObject *filename,
     int lineno);
 
+PyAPI_FUNC(PyObject *) _PyErr_ProgramDecodedTextObject(
+    PyObject *filename,
+    int lineno,
+    const char* encoding);
+
 PyAPI_FUNC(PyObject *) _PyUnicodeTranslateError_Create(
     PyObject *object,
     Py_ssize_t start,
index 1341f77ac45cb35be809f55a853ad0045358d832..4c18a59f6e92cb3dbd0389aa77a8e64a979664d3 100644 (file)
@@ -2353,6 +2353,19 @@ class SyntaxErrorTests(unittest.TestCase):
         finally:
             unlink(TESTFN)
 
+        # Check backwards tokenizer errors
+        source = '# -*- coding: ascii -*-\n\n(\n'
+        try:
+            with open(TESTFN, 'w', encoding='ascii') as testfile:
+                testfile.write(source)
+            rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
+            err = err.decode('utf-8').splitlines()
+
+            self.assertEqual(err[-3], '    (')
+            self.assertEqual(err[-2], '    ^')
+        finally:
+            unlink(TESTFN)
+
     def test_attributes_new_constructor(self):
         args = ("bad.py", 1, 2, "abcdefg", 1, 100)
         the_exception = SyntaxError("bad bad", args)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-11-19-22-57-42.bpo-45848.HgVBJ5.rst b/Misc/NEWS.d/next/Core and Builtins/2021-11-19-22-57-42.bpo-45848.HgVBJ5.rst
new file mode 100644 (file)
index 0000000..d9394c9
--- /dev/null
@@ -0,0 +1,2 @@
+Allow the parser to obtain error lines directly from encoded files. Patch by
+Pablo Galindo
index 09c1a19a79364623e44162bc8d9b7a2fb2bbd9f1..b3fdae487d970f5dc622b7898df239a54fa5ba23 100644 (file)
@@ -482,14 +482,12 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
         goto error;
     }
 
-    // PyErr_ProgramTextObject assumes that the text is utf-8 so we cannot call it with a file
-    // with an arbitrary encoding or otherwise we could get some badly decoded text.
-    int uses_utf8_codec = (!p->tok->encoding || strcmp(p->tok->encoding, "utf-8") == 0);
     if (p->tok->fp_interactive) {
         error_line = get_error_line(p, lineno);
     }
-    else if (uses_utf8_codec && p->start_rule == Py_file_input) {
-        error_line = PyErr_ProgramTextObject(p->tok->filename, (int) lineno);
+    else if (p->start_rule == Py_file_input) {
+        error_line = _PyErr_ProgramDecodedTextObject(p->tok->filename,
+                                                     (int) lineno, p->tok->encoding);
     }
 
     if (!error_line) {
@@ -500,15 +498,18 @@ _PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
            we're actually parsing from a file, which has an E_EOF SyntaxError and in that case
            `PyErr_ProgramTextObject` fails because lineno points to last_file_line + 1, which
            does not physically exist */
-        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF || !uses_utf8_codec);
+        assert(p->tok->fp == NULL || p->tok->fp == stdin || p->tok->done == E_EOF);
 
         if (p->tok->lineno <= lineno && p->tok->inp > p->tok->buf) {
             Py_ssize_t size = p->tok->inp - p->tok->buf;
             error_line = PyUnicode_DecodeUTF8(p->tok->buf, size, "replace");
         }
-        else {
+        else if (p->tok->fp == NULL || p->tok->fp == stdin) {
             error_line = get_error_line(p, lineno);
         }
+        else {
+            error_line = PyUnicode_FromStringAndSize("", 0);
+        }
         if (!error_line) {
             goto error;
         }
index 519f2d459effd662412ad39941d831cd16f83e80..cb3938d20856fc23da8c330a389ae86defe20ea7 100644 (file)
@@ -1692,7 +1692,7 @@ PyErr_SyntaxLocationEx(const char *filename, int lineno, int col_offset)
    functionality in tb_displayline() in traceback.c. */
 
 static PyObject *
-err_programtext(PyThreadState *tstate, FILE *fp, int lineno)
+err_programtext(PyThreadState *tstate, FILE *fp, int lineno, const char* encoding)
 {
     int i;
     char linebuf[1000];
@@ -1720,7 +1720,11 @@ after_loop:
     fclose(fp);
     if (i == lineno) {
         PyObject *res;
-        res = PyUnicode_FromString(linebuf);
+        if (encoding != NULL) {
+            res = PyUnicode_Decode(linebuf, strlen(linebuf), encoding, "replace");
+        } else {
+            res = PyUnicode_FromString(linebuf);
+        }
         if (res == NULL)
             _PyErr_Clear(tstate);
         return res;
@@ -1746,7 +1750,7 @@ PyErr_ProgramText(const char *filename, int lineno)
 }
 
 PyObject *
-PyErr_ProgramTextObject(PyObject *filename, int lineno)
+_PyErr_ProgramDecodedTextObject(PyObject *filename, int lineno, const char* encoding)
 {
     if (filename == NULL || lineno <= 0) {
         return NULL;
@@ -1758,7 +1762,13 @@ PyErr_ProgramTextObject(PyObject *filename, int lineno)
         _PyErr_Clear(tstate);
         return NULL;
     }
-    return err_programtext(tstate, fp, lineno);
+    return err_programtext(tstate, fp, lineno, encoding);
+}
+
+PyObject *
+PyErr_ProgramTextObject(PyObject *filename, int lineno)
+{
+    return _PyErr_ProgramDecodedTextObject(filename, lineno, NULL);
 }
 
 #ifdef __cplusplus