]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-99581: Fix a buffer overflow in the tokenizer when copying lines that fill the...
authorPablo Galindo Salgado <Pablogsal@gmail.com>
Sun, 20 Nov 2022 20:20:03 +0000 (20:20 +0000)
committerGitHub <noreply@github.com>
Sun, 20 Nov 2022 20:20:03 +0000 (20:20 +0000)
Lib/test/test_tokenize.py
Misc/NEWS.d/next/Core and Builtins/2022-11-19-22-27-52.gh-issue-99581.yKYPbf.rst [new file with mode: 0644]
Parser/tokenizer.c

index 47f2c06685bcaa553902e071c86ea8a87f4c72ec..63c2501cfe2338d7040a0e390c2473ee163924b8 100644 (file)
@@ -10,6 +10,8 @@ from textwrap import dedent
 from unittest import TestCase, mock
 from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                                INVALID_UNDERSCORE_LITERALS)
+from test.support import os_helper
+from test.support.script_helper import run_test_script, make_script
 import os
 import token
 
@@ -2631,5 +2633,19 @@ async def f():
         self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
 
 
+class CTokenizerBufferTests(unittest.TestCase):
+    def test_newline_at_the_end_of_buffer(self):
+        # See issue 99581: Make sure that if we need to add a new line at the
+        # end of the buffer, we have enough space in the buffer, especially when
+        # the current line is as long as the buffer space available.
+        test_script = f"""\
+        #coding: latin-1
+        #{"a"*10000}
+        #{"a"*10002}"""
+        with os_helper.temp_dir() as temp_dir:
+            file_name = make_script(temp_dir, 'foo', test_script)
+            run_test_script(file_name)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-11-19-22-27-52.gh-issue-99581.yKYPbf.rst b/Misc/NEWS.d/next/Core and Builtins/2022-11-19-22-27-52.gh-issue-99581.yKYPbf.rst
new file mode 100644 (file)
index 0000000..8071fd1
--- /dev/null
@@ -0,0 +1,3 @@
+Fixed a bug that was causing a buffer overflow if the tokenizer copies a
+line missing the newline character from a file that is as long as the
+available tokenizer buffer. Patch by Pablo Galindo.
index f2131cf39b38da323768ac37c1d36a3509b8deac..ce72e1529024c138f499b9319283cd8ebaadd25c 100644 (file)
@@ -413,7 +413,11 @@ tok_readline_recode(struct tok_state *tok) {
         error_ret(tok);
         goto error;
     }
-    if (!tok_reserve_buf(tok, buflen + 1)) {
+    // Make room for the null terminator *and* potentially
+    // an extra newline character that we may need to artificially
+    // add.
+    size_t buffer_size = buflen + 2;
+    if (!tok_reserve_buf(tok, buffer_size)) {
         goto error;
     }
     memcpy(tok->inp, buf, buflen);
@@ -1000,6 +1004,7 @@ tok_underflow_file(struct tok_state *tok) {
         return 0;
     }
     if (tok->inp[-1] != '\n') {
+        assert(tok->inp + 1 < tok->end);
         /* Last line does not end in \n, fake one */
         *tok->inp++ = '\n';
         *tok->inp = '\0';