gh-107450: Check for overflow in the tokenizer and fix overflow test (#110832)

author Lysandros Nikolaou <lisandrosnik@gmail.com>

Mon, 16 Oct 2023 14:42:49 +0000 (16:42 +0200)

committer GitHub <noreply@github.com>

Mon, 16 Oct 2023 14:42:49 +0000 (16:42 +0200)
author Lysandros Nikolaou <lisandrosnik@gmail.com>
Mon, 16 Oct 2023 14:42:49 +0000 (16:42 +0200)
committer GitHub <noreply@github.com>
Mon, 16 Oct 2023 14:42:49 +0000 (16:42 +0200)
diff --git a/Include/errcode.h b/Include/errcode.h

index 8d44e9ae55919389df49c44b4f4db3bdaf3aea29..dac5cf068c99d6d4ab1b678c48da3e6f1e2660ee 100644 (file)
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -19,24 +19,25 @@
  extern "C" {
  #endif
  
-#define E_OK            10      /* No error */
-#define E_EOF           11      /* End Of File */
-#define E_INTR          12      /* Interrupted */
-#define E_TOKEN         13      /* Bad token */
-#define E_SYNTAX        14      /* Syntax error */
-#define E_NOMEM         15      /* Ran out of memory */
-#define E_DONE          16      /* Parsing complete */
-#define E_ERROR         17      /* Execution error */
-#define E_TABSPACE      18      /* Inconsistent mixing of tabs and spaces */
-#define E_OVERFLOW      19      /* Node had too many children */
-#define E_TOODEEP       20      /* Too many indentation levels */
-#define E_DEDENT        21      /* No matching outer block for dedent */
-#define E_DECODE        22      /* Error in decoding into Unicode */
-#define E_EOFS          23      /* EOF in triple-quoted string */
-#define E_EOLS          24      /* EOL in single-quoted string */
-#define E_LINECONT      25      /* Unexpected characters after a line continuation */
-#define E_BADSINGLE     27      /* Ill-formed single statement input */
-#define E_INTERACT_STOP 28      /* Interactive mode stopped tokenization */
+#define E_OK             10      /* No error */
+#define E_EOF            11      /* End Of File */
+#define E_INTR           12      /* Interrupted */
+#define E_TOKEN          13      /* Bad token */
+#define E_SYNTAX         14      /* Syntax error */
+#define E_NOMEM          15      /* Ran out of memory */
+#define E_DONE           16      /* Parsing complete */
+#define E_ERROR          17      /* Execution error */
+#define E_TABSPACE       18      /* Inconsistent mixing of tabs and spaces */
+#define E_OVERFLOW       19      /* Node had too many children */
+#define E_TOODEEP        20      /* Too many indentation levels */
+#define E_DEDENT         21      /* No matching outer block for dedent */
+#define E_DECODE         22      /* Error in decoding into Unicode */
+#define E_EOFS           23      /* EOF in triple-quoted string */
+#define E_EOLS           24      /* EOL in single-quoted string */
+#define E_LINECONT       25      /* Unexpected characters after a line continuation */
+#define E_BADSINGLE      27      /* Ill-formed single statement input */
+#define E_INTERACT_STOP  28      /* Interactive mode stopped tokenization */
+#define E_COLUMNOVERFLOW 29      /* Column offset overflow */
  
  #ifdef __cplusplus
  }
diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py

index eafa7d84638b765f0c9c614f035b703483d62a79..4031c5ca76c7055182d5a9f7270df3acc1e6c4d1 100644 (file)
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -18,6 +18,12 @@ from test.support.os_helper import TESTFN, unlink
  from test.support.warnings_helper import check_warnings
  from test import support
  
+try:
+    from _testcapi import INT_MAX
+except ImportError:
+    INT_MAX = 2**31 - 1
+
+
  
  class NaiveException(Exception):
      def __init__(self, x):
@@ -318,11 +324,13 @@ class ExceptionTests(unittest.TestCase):
          check('(yield i) = 2', 1, 2)
          check('def f(*):\n  pass', 1, 7)
  
+    @unittest.skipIf(INT_MAX >= sys.maxsize, "Downcasting to int is safe for col_offset")
      @support.requires_resource('cpu')
-    @support.bigmemtest(support._2G, memuse=1.5)
-    def testMemoryErrorBigSource(self, _size):
-        with self.assertRaises(OverflowError):
-            exec(f"if True:\n {' ' * 2**31}print('hello world')")
+    @support.bigmemtest(INT_MAX, memuse=2, dry_run=False)
+    def testMemoryErrorBigSource(self, size):
+        src = b"if True:\n%*s" % (size, b"pass")
+        with self.assertRaisesRegex(OverflowError, "Parser column offset overflow"):
+            compile(src, '<fragment>', 'exec')
  
      @cpython_only
      def testSettingException(self):
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c

index c7134ab868bfbd66f82d5fe76d63316eedeea91b..1a01bb0352a7b110fbb4bd8f4d4a24c547ffb6b2 100644 (file)
--- a/Parser/lexer/lexer.c
+++ b/Parser/lexer/lexer.c
@@ -59,6 +59,10 @@ tok_nextc(struct tok_state *tok)
      int rc;
      for (;;) {
          if (tok->cur != tok->inp) {
+            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
+                tok->done = E_COLUMNOVERFLOW;
+                return EOF;
+            }
              tok->col_offset++;
              return Py_CHARMASK(*tok->cur++); /* Fast path */
          }
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c

index 15e99e23d8490f21d01ce72048c5cccf722e3fe2..057bf5515199352dace046b0cb2e19fd8398122c 100644 (file)
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -68,6 +68,7 @@ _Pypegen_tokenizer_error(Parser *p)
      const char *msg = NULL;
      PyObject* errtype = PyExc_SyntaxError;
      Py_ssize_t col_offset = -1;
+    p->error_indicator = 1;
      switch (p->tok->done) {
          case E_TOKEN:
              msg = "invalid token";
@@ -103,6 +104,10 @@ _Pypegen_tokenizer_error(Parser *p)
              msg = "unexpected character after line continuation character";
              break;
          }
+        case E_COLUMNOVERFLOW:
+            PyErr_SetString(PyExc_OverflowError,
+                    "Parser column offset overflow - source line is too big");
+            return -1;
          default:
              msg = "unknown parsing error";
      }
author	Lysandros Nikolaou <lisandrosnik@gmail.com>
	Mon, 16 Oct 2023 14:42:49 +0000 (16:42 +0200)
committer	GitHub <noreply@github.com>
	Mon, 16 Oct 2023 14:42:49 +0000 (16:42 +0200)
Include/errcode.h		patch | blob | blame | history
Lib/test/test_exceptions.py		patch | blob | blame | history
Parser/lexer/lexer.c		patch | blob | blame | history
Parser/pegen_errors.c		patch | blob | blame | history