bpo-46521: Fix codeop to use a new partial-input mode of the parser (GH-31010)

author Pablo Galindo Salgado <Pablogsal@gmail.com>

Tue, 8 Feb 2022 11:54:37 +0000 (11:54 +0000)

committer GitHub <noreply@github.com>

Tue, 8 Feb 2022 11:54:37 +0000 (11:54 +0000)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Tue, 8 Feb 2022 11:54:37 +0000 (11:54 +0000)
committer GitHub <noreply@github.com>
Tue, 8 Feb 2022 11:54:37 +0000 (11:54 +0000)
diff --git a/Include/cpython/compile.h b/Include/cpython/compile.h

index a202c0b0e655086cd26fdca707ec611a7a3adb0b..518a3764992954f0a7d46274ea0a45d6e91a161e 100644 (file)
--- a/Include/cpython/compile.h
+++ b/Include/cpython/compile.h
@@ -18,8 +18,10 @@
  #define PyCF_IGNORE_COOKIE 0x0800
  #define PyCF_TYPE_COMMENTS 0x1000
  #define PyCF_ALLOW_TOP_LEVEL_AWAIT 0x2000
+#define PyCF_ALLOW_INCOMPLETE_INPUT 0x4000
  #define PyCF_COMPILE_MASK (PyCF_ONLY_AST | PyCF_ALLOW_TOP_LEVEL_AWAIT | \
-                           PyCF_TYPE_COMMENTS | PyCF_DONT_IMPLY_DEDENT)
+                           PyCF_TYPE_COMMENTS | PyCF_DONT_IMPLY_DEDENT | \
+                           PyCF_ALLOW_INCOMPLETE_INPUT)
  
  typedef struct {
      int cf_flags;  /* bitmask of CO_xxx flags relevant to future */
diff --git a/Include/errcode.h b/Include/errcode.h

index 2e07fc2c963280bf6dd99c69a1919fe5da2692a5..54ae929bf258703845c0c832585ffa9595ee7da0 100644 (file)
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -26,6 +26,8 @@ extern "C" {
  #define E_TOODEEP       20      /* Too many indentation levels */
  #define E_DEDENT        21      /* No matching outer block for dedent */
  #define E_DECODE        22      /* Error in decoding into Unicode */
+#define E_EOFS          23      /* EOF in triple-quoted string */
+#define E_EOLS          24      /* EOL in single-quoted string */
  #define E_LINECONT      25      /* Unexpected characters after a line continuation */
  #define E_BADSINGLE     27      /* Ill-formed single statement input */
  #define E_INTERACT_STOP 28      /* Interactive mode stopped tokenization */
diff --git a/Lib/codeop.py b/Lib/codeop.py

index 6b56be488eeb0319a8621d2edf33bcfe8f07c2ec..568e9bbc118050d68f4b06e156f10c63a23496ba 100644 (file)
--- a/Lib/codeop.py
+++ b/Lib/codeop.py
@@ -10,30 +10,6 @@ and:
    syntax error (OverflowError and ValueError can be produced by
    malformed literals).
  
-Approach:
-
-First, check if the source consists entirely of blank lines and
-comments; if so, replace it with 'pass', because the built-in
-parser doesn't always do the right thing for these.
-
-Compile three times: as is, with \n, and with \n\n appended.  If it
-compiles as is, it's complete.  If it compiles with one \n appended,
-we expect more.  If it doesn't compile either way, we compare the
-error we get when compiling with \n or \n\n appended.  If the errors
-are the same, the code is broken.  But if the errors are different, we
-expect more.  Not intuitive; not even guaranteed to hold in future
-releases; but this matches the compiler's behavior from Python 1.4
-through 2.2, at least.
-
-Caveat:
-
-It is possible (but not likely) that the parser stops parsing with a
-successful outcome before reaching the end of the source; in this
-case, trailing symbols may be ignored instead of causing an error.
-For example, a backslash followed by two newlines may be followed by
-arbitrary garbage.  This will be fixed once the API for the parser is
-better.
-
  The two interfaces are:
  
  compile_command(source, filename, symbol):
@@ -64,7 +40,11 @@ _features = [getattr(__future__, fname)
  
  __all__ = ["compile_command", "Compile", "CommandCompiler"]
  
-PyCF_DONT_IMPLY_DEDENT = 0x200          # Matches pythonrun.h.
+# The following flags match the values from Include/cpython/compile.h
+# Caveat emptor: These flags are undocumented on purpose and depending
+# on their effect outside the standard library is **unsupported**.
+PyCF_DONT_IMPLY_DEDENT = 0x200          
+PyCF_ALLOW_INCOMPLETE_INPUT = 0x4000
  
  def _maybe_compile(compiler, source, filename, symbol):
      # Check for source consisting of only blank lines and comments.
@@ -86,24 +66,12 @@ def _maybe_compile(compiler, source, filename, symbol):
      with warnings.catch_warnings():
          warnings.simplefilter("error")
  
-        code1 = err1 = err2 = None
-        try:
-            code1 = compiler(source + "\n", filename, symbol)
-        except SyntaxError as e:
-            err1 = e
-
          try:
-            code2 = compiler(source + "\n\n", filename, symbol)
+            compiler(source + "\n", filename, symbol)
          except SyntaxError as e:
-            err2 = e
-
-    try:
-        if not code1 and _is_syntax_error(err1, err2):
-            raise err1
-        else:
-            return None
-    finally:
-        err1 = err2 = None
+            if "incomplete input" in str(e):
+                return None
+            raise
  
  def _is_syntax_error(err1, err2):
      rep1 = repr(err1)
@@ -115,7 +83,7 @@ def _is_syntax_error(err1, err2):
      return False
  
  def _compile(source, filename, symbol):
-    return compile(source, filename, symbol, PyCF_DONT_IMPLY_DEDENT)
+    return compile(source, filename, symbol, PyCF_DONT_IMPLY_DEDENT | PyCF_ALLOW_INCOMPLETE_INPUT)
  
  def compile_command(source, filename="<input>", symbol="single"):
      r"""Compile a command and determine whether it is incomplete.
@@ -144,7 +112,7 @@ class Compile:
      statement, it "remembers" and compiles all subsequent program texts
      with the statement in force."""
      def __init__(self):
-        self.flags = PyCF_DONT_IMPLY_DEDENT
+        self.flags = PyCF_DONT_IMPLY_DEDENT | PyCF_ALLOW_INCOMPLETE_INPUT
  
      def __call__(self, source, filename, symbol):
          codeob = compile(source, filename, symbol, self.flags, True)
diff --git a/Misc/NEWS.d/next/Library/2022-02-01-19-34-28.bpo-46521.IMUIrs.rst b/Misc/NEWS.d/next/Library/2022-02-01-19-34-28.bpo-46521.IMUIrs.rst

new file mode 100644 (file)

index 0000000..4e9fa08
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-02-01-19-34-28.bpo-46521.IMUIrs.rst
@@ -0,0 +1,2 @@
+Fix a bug in the :mod:`codeop` module that was incorrectly identifying
+invalid code involving string quotes as valid code.
diff --git a/Parser/pegen.c b/Parser/pegen.c

index 470c2cbd7438b2b0b2d621391625dec6855414da..6adde8432307ddce7d72b7348e226f8e2f555c13 100644 (file)
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -726,6 +726,9 @@ compute_parser_flags(PyCompilerFlags *flags)
      if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
          parser_flags |= PyPARSE_ASYNC_HACKS;
      }
+    if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
+        parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
+    }
      return parser_flags;
  }
  
@@ -811,16 +814,26 @@ reset_parser_state_for_error_pass(Parser *p)
      p->tok->interactive_underflow = IUNDERFLOW_STOP;
  }
  
+static inline int
+_is_end_of_source(Parser *p) {
+    int err = p->tok->done;
+    return err == E_EOF || err == E_EOFS || err == E_EOLS;
+}
+
  void *
  _PyPegen_run_parser(Parser *p)
  {
      void *res = _PyPegen_parse(p);
      assert(p->level == 0);
      if (res == NULL) {
+        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) &&  _is_end_of_source(p)) {
+            PyErr_Clear();
+            return RAISE_SYNTAX_ERROR("incomplete input");
+        }
          if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
              return NULL;
          }
-        // Make a second parser pass. In this pass we activate heavier and slower checks
+       // Make a second parser pass. In this pass we activate heavier and slower checks
          // to produce better error messages and more complete diagnostics. Extra "invalid_*"
          // rules will be active during parsing.
          Token *last_token = p->tokens[p->fill - 1];
diff --git a/Parser/pegen.h b/Parser/pegen.h

index caba34e535b6aefc1a3f479ab36b2ae0d2772582..061ca3a2013cb6a4a94816d0006fcea73914cf7a 100644 (file)
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -22,6 +22,7 @@
  #define PyPARSE_BARRY_AS_BDFL 0x0020
  #define PyPARSE_TYPE_COMMENTS 0x0040
  #define PyPARSE_ASYNC_HACKS   0x0080
+#define PyPARSE_ALLOW_INCOMPLETE_INPUT 0x0100
  
  #define CURRENT_POS (-5)
  
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 5b5cbdb809ebeeb092863922ec9094e6c5c6fd1e..d38df66c69ed03fdebd9ec9f85fbdba81d4febdf 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -40,7 +40,7 @@
  static struct tok_state *tok_new(void);
  static int tok_nextc(struct tok_state *tok);
  static void tok_backup(struct tok_state *tok, int c);
-
+static int syntaxerror(struct tok_state *tok, const char *format, ...);
  
  /* Spaces in this constant are treated as "zero or more spaces or tabs" when
     tokenizing. */
@@ -1031,8 +1031,9 @@ tok_nextc(struct tok_state *tok)
          if (tok->cur != tok->inp) {
              return Py_CHARMASK(*tok->cur++); /* Fast path */
          }
-        if (tok->done != E_OK)
-            return EOF;
+        if (tok->done != E_OK) {
+           return EOF;
+        }
          if (tok->fp == NULL) {
              rc = tok_underflow_string(tok);
          }
@@ -1964,16 +1965,21 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
                  tok->line_start = tok->multi_line_start;
                  int start = tok->lineno;
                  tok->lineno = tok->first_lineno;
-
                  if (quote_size == 3) {
-                    return syntaxerror(tok,
-                                       "unterminated triple-quoted string literal"
-                                       " (detected at line %d)", start);
+                    syntaxerror(tok, "unterminated triple-quoted string literal"
+                                     " (detected at line %d)", start);
+                    if (c != '\n') {
+                        tok->done = E_EOFS;
+                    }
+                    return ERRORTOKEN;
                  }
                  else {
-                    return syntaxerror(tok,
-                                       "unterminated string literal (detected at"
-                                       " line %d)", start);
+                    syntaxerror(tok, "unterminated string literal (detected at"
+                                     " line %d)", start);
+                    if (c != '\n') {
+                        tok->done = E_EOLS;
+                    }
+                    return ERRORTOKEN;
                  }
              }
              if (c == quote) {
author	Pablo Galindo Salgado <Pablogsal@gmail.com>
	Tue, 8 Feb 2022 11:54:37 +0000 (11:54 +0000)
committer	GitHub <noreply@github.com>
	Tue, 8 Feb 2022 11:54:37 +0000 (11:54 +0000)
Include/cpython/compile.h		patch | blob | blame | history
Include/errcode.h		patch | blob | blame | history
Lib/codeop.py		patch | blob | blame | history
Misc/NEWS.d/next/Library/2022-02-01-19-34-28.bpo-46521.IMUIrs.rst	[new file with mode: 0644]	patch | blob
Parser/pegen.c		patch | blob | blame | history
Parser/pegen.h		patch | blob | blame | history
Parser/tokenizer.c		patch | blob | blame | history