git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-145234: Normalize decoded CR in string tokenizer (#145281)
author: Pablo Galindo Salgado <Pablogsal@gmail.com>
Fri, 27 Feb 2026 12:44:54 +0000 (12:44 +0000)
committer: GitHub <noreply@github.com>
Fri, 27 Feb 2026 12:44:54 +0000 (12:44 +0000)
Lib/test/test_py_compile.py
Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst [new file with mode: 0644]
Parser/tokenizer/string_tokenizer.c

index 66de61930968e44d5e98e670c8f5d7212f53c4b5..da2d630d7ace7bb84f4dd0bb5c2809266be8a81d 100644 (file)
@@ -239,6 +239,14 @@ class PyCompileTestsBase:
             with self.assertRaises(py_compile.PyCompileError):
                 py_compile.compile(bad_coding, self.pyc_path, doraise=True, quiet=1)
 
+    def test_utf7_decoded_cr_compiles(self):
+        with open(self.source_path, 'wb') as file:
+            file.write(b"#coding=U7+AA0''\n")
+
+        pyc_path = py_compile.compile(self.source_path, self.pyc_path, doraise=True)
+        self.assertEqual(pyc_path, self.pyc_path)
+        self.assertTrue(os.path.exists(self.pyc_path))
+
 
 class PyCompileTestsWithSourceEpoch(PyCompileTestsBase,
                                     unittest.TestCase,
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-26-21-36-00.gh-issue-145234.w0mQ9n.rst
new file mode 100644 (file)
index 0000000..caeffff
--- /dev/null
@@ -0,0 +1,5 @@
+Fixed a ``SystemError`` in the parser when an encoding cookie (for example,
+UTF-7) decodes to carriage returns (``\r``). Newlines are now normalized after
+decoding in the string tokenizer.
+
+Patch by Pablo Galindo.
index 7299ecf483ccd9ae2c4c7e367804720fe7408f56..7f07cca37ee019197ab11c78f321d329edd0cb35 100644 (file)
@@ -108,6 +108,19 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
     else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
         return _PyTokenizer_error_ret(tok);
     }
+    if (utf8 != NULL) {
+        char *translated = _PyTokenizer_translate_newlines(
+            str, single, preserve_crlf, tok);
+        if (translated == NULL) {
+            Py_DECREF(utf8);
+            return _PyTokenizer_error_ret(tok);
+        }
+        PyMem_Free(tok->input);
+        tok->input = translated;
+        str = translated;
+        Py_CLEAR(utf8);
+    }
+    tok->str = str;
     assert(tok->decoding_buffer == NULL);
     tok->decoding_buffer = utf8; /* CAUTION */
     return str;