gh-144872: fix heap buffer overflow in `_PyTokenizer_ensure_utf8` (#144807)

author AdamKorcz <44787359+AdamKorcz@users.noreply.github.com>

Thu, 26 Feb 2026 22:35:08 +0000 (22:35 +0000)

committer GitHub <noreply@github.com>

Thu, 26 Feb 2026 22:35:08 +0000 (22:35 +0000)
author AdamKorcz <44787359+AdamKorcz@users.noreply.github.com>
Thu, 26 Feb 2026 22:35:08 +0000 (22:35 +0000)
committer GitHub <noreply@github.com>
Thu, 26 Feb 2026 22:35:08 +0000 (22:35 +0000)
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py

index 46b291192df4292e84f9d8592916b1414ba05771..8ac64b3105708f768a371048e979b39ef09efe5b 100644 (file)
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -65,6 +65,23 @@ class MiscSourceEncodingTest(unittest.TestCase):
          # two bytes in common with the UTF-8 BOM
          self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
  
+    def test_truncated_utf8_at_eof(self):
+        # Regression test for https://issues.oss-fuzz.com/issues/451112368
+        # Truncated multi-byte UTF-8 sequences at end of input caused an
+        # out-of-bounds read in Parser/tokenizer/helpers.c:valid_utf8().
+        truncated = [
+            b'\xc2',              # 2-byte lead, missing 1 continuation
+            b'\xdf',              # 2-byte lead, missing 1 continuation
+            b'\xe0',              # 3-byte lead, missing 2 continuations
+            b'\xe0\xa0',          # 3-byte lead, missing 1 continuation
+            b'\xf0\x90',          # 4-byte lead, missing 2 continuations
+            b'\xf0\x90\x80',      # 4-byte lead, missing 1 continuation
+            b'\xf3',              # 4-byte lead, missing 3 (the oss-fuzz reproducer)
+        ]
+        for seq in truncated:
+            with self.subTest(seq=seq):
+                self.assertRaises(SyntaxError, compile, seq, '<test>', 'exec')
+
      @support.requires_subprocess()
      def test_20731(self):
          sub = subprocess.Popen([sys.executable,
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-02-16-12-28-43.gh-issue-144872.k9_Q30.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-16-12-28-43.gh-issue-144872.k9_Q30.rst

new file mode 100644 (file)

index 0000000..c06bf01
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-16-12-28-43.gh-issue-144872.k9_Q30.rst
@@ -0,0 +1 @@
+Fix heap buffer overflow in the parser found by OSS-Fuzz.
diff --git a/Parser/tokenizer/helpers.c b/Parser/tokenizer/helpers.c

index fda8216a3005b9c7585a840158e6681aa99872a1..9542969ad3127b9519462ef01792a4a8a7318d03 100644 (file)
--- a/Parser/tokenizer/helpers.c
+++ b/Parser/tokenizer/helpers.c
@@ -494,9 +494,11 @@ valid_utf8(const unsigned char* s)
          return 0;
      }
      length = expected + 1;
-    for (; expected; expected--)
-        if (s[expected] < 0x80 || s[expected] >= 0xC0)
+    for (int i = 1; i <= expected; i++) {
+        if (s[i] < 0x80 || s[i] >= 0xC0) {
              return 0;
+        }
+    }
      return length;
  }
author	AdamKorcz <44787359+AdamKorcz@users.noreply.github.com>
	Thu, 26 Feb 2026 22:35:08 +0000 (22:35 +0000)
committer	GitHub <noreply@github.com>
	Thu, 26 Feb 2026 22:35:08 +0000 (22:35 +0000)
Lib/test/test_source_encoding.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core_and_Builtins/2026-02-16-12-28-43.gh-issue-144872.k9_Q30.rst	[new file with mode: 0644]	patch \| blob
Parser/tokenizer/helpers.c		patch \| blob \| blame \| history