bpo-46054: Fix parsing error when parsing non-utf8 characters in source files (GH...

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Sun, 12 Dec 2021 16:52:49 +0000 (08:52 -0800)

committer GitHub <noreply@github.com>

Sun, 12 Dec 2021 16:52:49 +0000 (16:52 +0000)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Sun, 12 Dec 2021 16:52:49 +0000 (08:52 -0800)
committer GitHub <noreply@github.com>
Sun, 12 Dec 2021 16:52:49 +0000 (16:52 +0000)
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py

index 9acb16c518210855fcb6ac6e22449e6dc916bf41..cc0640dda09802ee6fbd1993d6b3cb4b2231d39e 100644 (file)
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -2368,6 +2368,18 @@ class SyntaxErrorTests(unittest.TestCase):
          finally:
              unlink(TESTFN)
  
+    def test_non_utf8(self):
+        # Check non utf-8 characters
+        try:
+            with open(TESTFN, 'bw') as testfile:
+                testfile.write(b"\x89")
+            rc, out, err = script_helper.assert_python_failure('-Wd', '-X', 'utf8', TESTFN)
+            err = err.decode('utf-8').splitlines()
+
+            self.assertIn("SyntaxError: Non-UTF-8 code starting with '\\x89' in file", err[-1])
+        finally:
+            unlink(TESTFN)
+
      def test_attributes_new_constructor(self):
          args = ("bad.py", 1, 2, "abcdefg", 1, 100)
          the_exception = SyntaxError("bad bad", args)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst b/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst

new file mode 100644 (file)

index 0000000..6ca91f0
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst
@@ -0,0 +1,2 @@
+Fix parser error when parsing non-utf8 characters in source files. Patch by
+Pablo Galindo.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 672fdb92ec86f8adec8078f632858da12a214352..8e9c69d0785afd5cb40df95647274c0dbfafadf4 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -818,10 +818,10 @@ tok_readline_raw(struct tok_state *tok)
              tok_concatenate_interactive_new_line(tok, line) == -1) {
              return 0;
          }
-        if (*tok->inp == '\0') {
+        tok->inp = strchr(tok->inp, '\0');
+        if (tok->inp == tok->buf) {
              return 0;
          }
-        tok->inp = strchr(tok->inp, '\0');
      } while (tok->inp[-1] != '\n');
      return 1;
  }
@@ -983,12 +983,9 @@ tok_underflow_file(struct tok_state *tok) {
      }
      /* The default encoding is UTF-8, so make sure we don't have any
         non-UTF-8 sequences in it. */
-    if (!tok->encoding
-        && (tok->decoding_state != STATE_NORMAL || tok->lineno >= 2)) {
-        if (!ensure_utf8(tok->cur, tok)) {
-            error_ret(tok);
-            return 0;
-        }
+    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
+        error_ret(tok);
+        return 0;
      }
      assert(tok->done == E_OK);
      return tok->done == E_OK;
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Sun, 12 Dec 2021 16:52:49 +0000 (08:52 -0800)
committer	GitHub <noreply@github.com>
	Sun, 12 Dec 2021 16:52:49 +0000 (16:52 +0000)
Lib/test/test_exceptions.py		patch | blob | blame | history
Misc/NEWS.d/next/Core and Builtins/2021-12-12-05-30-21.bpo-46054.2P-foG.rst	[new file with mode: 0644]	patch | blob
Parser/tokenizer.c		patch | blob | blame | history