bpo-46091: Correctly calculate indentation levels for whitespace lines with continuation characters

author Pablo Galindo Salgado <Pablogsal@gmail.com>

Tue, 25 Jan 2022 22:12:14 +0000 (22:12 +0000)

committer GitHub <noreply@github.com>

Tue, 25 Jan 2022 22:12:14 +0000 (22:12 +0000)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Tue, 25 Jan 2022 22:12:14 +0000 (22:12 +0000)
committer GitHub <noreply@github.com>
Tue, 25 Jan 2022 22:12:14 +0000 (22:12 +0000)
diff --git a/Lib/test/test_ast.py b/Lib/test/test_ast.py

index 314b360c58ba95fea8afca8a794a243b797d0713..039d1c1010b6d19dcfe2a7390ce50b3a01c2fffc 100644 (file)
--- a/Lib/test/test_ast.py
+++ b/Lib/test/test_ast.py
@@ -1078,8 +1078,7 @@ Module(
              ast.literal_eval(node)
  
      def test_literal_eval_syntax_errors(self):
-        msg = "unexpected character after line continuation character"
-        with self.assertRaisesRegex(SyntaxError, msg):
+        with self.assertRaisesRegex(SyntaxError, "unexpected indent"):
              ast.literal_eval(r'''
                  \
                  (\
diff --git a/Lib/test/test_syntax.py b/Lib/test/test_syntax.py

index 968d34809ce431b178be30a883ca4b69eec384de..a6ff319af2ac8f93157ea943b39db443a77effe4 100644 (file)
--- a/Lib/test/test_syntax.py
+++ b/Lib/test/test_syntax.py
@@ -1613,6 +1613,36 @@ pass
          except SyntaxError:
              self.fail("Empty line after a line continuation character is valid.")
  
+        # See issue-46091
+        s1 = r"""\
+def fib(n):
+    \
+'''Print a Fibonacci series up to n.'''
+    \
+a, b = 0, 1
+"""
+        s2 = r"""\
+def fib(n):
+    '''Print a Fibonacci series up to n.'''
+    a, b = 0, 1
+"""
+        try:
+            self.assertEqual(compile(s1, '<string>', 'exec'), compile(s2, '<string>', 'exec'))
+        except SyntaxError:
+            self.fail("Indented statement over multiple lines is valid")
+    
+    def test_continuation_bad_indentation(self): 
+        # Check that code that breaks indentation across multiple lines raises a syntax error
+
+        code = r"""\
+if x:
+    y = 1
+  \
+  foo = 1
+        """
+
+        self.assertRaises(IndentationError, exec, code)
+
      @support.cpython_only
      def test_nested_named_except_blocks(self):
          code = ""
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index ca2821de7c08166907664152f36198bfd1ea7bc4..334390abaa2de6f0b6fe6bd337cd1d553d38a2ad 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -6,6 +6,7 @@ from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                       NEWLINE, _generate_tokens_from_c_tokenizer)
  from io import BytesIO, StringIO
  import unittest
+from textwrap import dedent
  from unittest import TestCase, mock
  from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                                 INVALID_UNDERSCORE_LITERALS)
@@ -44,7 +45,6 @@ class TokenizeTest(TestCase):
          # The ENDMARKER and final NEWLINE are omitted.
          f = BytesIO(s.encode('utf-8'))
          result = stringify_tokens_from_source(tokenize(f.readline), s)
-
          self.assertEqual(result,
                           ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                           expected.rstrip().splitlines())
@@ -2511,7 +2511,105 @@ async def f():
  
          self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
          self.assertRaises(SyntaxError, get_tokens, "]")
+    
+    def test_continuation_lines_indentation(self): 
+        def get_tokens(string):
+            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
  
+        code = dedent("""
+            def fib(n):
+                \\
+            '''Print a Fibonacci series up to n.'''
+                \\
+            a, b = 0, 1
+        """)
+
+        self.check_tokenize(code, """\
+    NAME       'def'         (2, 0) (2, 3)
+    NAME       'fib'         (2, 4) (2, 7)
+    LPAR       '('           (2, 7) (2, 8)
+    NAME       'n'           (2, 8) (2, 9)
+    RPAR       ')'           (2, 9) (2, 10)
+    COLON      ':'           (2, 10) (2, 11)
+    NEWLINE    ''            (2, 11) (2, 11)
+    INDENT     ''            (4, -1) (4, -1)
+    STRING     "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39)
+    NEWLINE    ''            (4, 39) (4, 39)
+    NAME       'a'           (6, 0) (6, 1)
+    COMMA      ','           (6, 1) (6, 2)
+    NAME       'b'           (6, 3) (6, 4)
+    EQUAL      '='           (6, 5) (6, 6)
+    NUMBER     '0'           (6, 7) (6, 8)
+    COMMA      ','           (6, 8) (6, 9)
+    NUMBER     '1'           (6, 10) (6, 11)
+    NEWLINE    ''            (6, 11) (6, 11)
+    DEDENT     ''            (6, -1) (6, -1)
+        """)
+
+        code_no_cont = dedent("""
+            def fib(n):
+                '''Print a Fibonacci series up to n.'''
+                a, b = 0, 1
+        """)
+        
+        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+
+        code = dedent("""
+            pass
+                \\
+
+            pass
+        """)
+
+        self.check_tokenize(code, """\
+    NAME       'pass'        (2, 0) (2, 4)
+    NEWLINE    ''            (2, 4) (2, 4)
+    NAME       'pass'        (5, 0) (5, 4)
+    NEWLINE    ''            (5, 4) (5, 4)
+        """)
+
+        code_no_cont = dedent("""
+            pass
+            pass
+        """)
+        
+        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+
+        code = dedent("""
+            if x:
+                y = 1
+                \\
+                        \\
+                    \\
+                \\
+                foo = 1
+        """)
+
+        self.check_tokenize(code, """\
+    NAME       'if'          (2, 0) (2, 2)
+    NAME       'x'           (2, 3) (2, 4)
+    COLON      ':'           (2, 4) (2, 5)
+    NEWLINE    ''            (2, 5) (2, 5)
+    INDENT     ''            (3, -1) (3, -1)
+    NAME       'y'           (3, 4) (3, 5)
+    EQUAL      '='           (3, 6) (3, 7)
+    NUMBER     '1'           (3, 8) (3, 9)
+    NEWLINE    ''            (3, 9) (3, 9)
+    NAME       'foo'         (8, 4) (8, 7)
+    EQUAL      '='           (8, 8) (8, 9)
+    NUMBER     '1'           (8, 10) (8, 11)
+    NEWLINE    ''            (8, 11) (8, 11)
+    DEDENT     ''            (8, -1) (8, -1)
+        """)
+
+        code_no_cont = dedent("""
+            if x:
+                y = 1
+                foo = 1
+        """)
+
+        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+ 
  
  if __name__ == "__main__":
      unittest.main()
diff --git a/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst b/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst

new file mode 100644 (file)

index 0000000..a2eee0f
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst
@@ -0,0 +1,2 @@
+Correctly calculate indentation levels for lines with whitespace characters
+that are ended by line continuation characters. Patch by Pablo Galindo.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index 5e35d6fa621b1a640e746fcdab52f11a2e4045a8..cd4254f8b9077e5125f36e47bfd0ec74a3a52f3a 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1347,6 +1347,24 @@ tok_decimal_tail(struct tok_state *tok)
  
  /* Get next token, after space stripping etc. */
  
+static inline int
+tok_continuation_line(struct tok_state *tok) {
+    int c = tok_nextc(tok);
+    if (c != '\n') {
+        tok->done = E_LINECONT;
+        return -1;
+    }
+    c = tok_nextc(tok);
+    if (c == EOF) {
+        tok->done = E_EOF;
+        tok->cur = tok->inp;
+        return -1;
+    } else {
+        tok_backup(tok, c);
+    }
+    return c;
+}
+
  static int
  tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
  {
@@ -1363,6 +1381,7 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
          int col = 0;
          int altcol = 0;
          tok->atbol = 0;
+        int cont_line_col = 0;
          for (;;) {
              c = tok_nextc(tok);
              if (c == ' ') {
@@ -1375,14 +1394,23 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
              else if (c == '\014')  {/* Control-L (formfeed) */
                  col = altcol = 0; /* For Emacs users */
              }
+            else if (c == '\\') {
+                // Indentation cannot be split over multiple physical lines
+                // using backslashes. This means that if we found a backslash
+                // preceded by whitespace, **the first one we find** determines
+                // the level of indentation of whatever comes next.
+                cont_line_col = cont_line_col ? cont_line_col : col;
+                if ((c = tok_continuation_line(tok)) == -1) {
+                    return ERRORTOKEN;
+                }
+            }
              else {
                  break;
              }
          }
          tok_backup(tok, c);
-        if (c == '#' || c == '\n' || c == '\\') {
+        if (c == '#' || c == '\n') {
              /* Lines with only whitespace and/or comments
-               and/or a line continuation character
                 shouldn't affect the indentation and are
                 not passed to the parser as NEWLINE tokens,
                 except *totally* empty lines in interactive
@@ -1403,6 +1431,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
                 may need to skip to the end of a comment */
          }
          if (!blankline && tok->level == 0) {
+            col = cont_line_col ? cont_line_col : col;
+            altcol = cont_line_col ? cont_line_col : altcol;
              if (col == tok->indstack[tok->indent]) {
                  /* No change */
                  if (altcol != tok->altindstack[tok->indent]) {
@@ -1964,19 +1994,9 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
  
      /* Line continuation */
      if (c == '\\') {
-        c = tok_nextc(tok);
-        if (c != '\n') {
-            tok->done = E_LINECONT;
+        if ((c = tok_continuation_line(tok)) == -1) {
              return ERRORTOKEN;
          }
-        c = tok_nextc(tok);
-        if (c == EOF) {
-            tok->done = E_EOF;
-            tok->cur = tok->inp;
-            return ERRORTOKEN;
-        } else {
-            tok_backup(tok, c);
-        }
          tok->cont_line = 1;
          goto again; /* Read next line */
      }
author	Pablo Galindo Salgado <Pablogsal@gmail.com>
	Tue, 25 Jan 2022 22:12:14 +0000 (22:12 +0000)
committer	GitHub <noreply@github.com>
	Tue, 25 Jan 2022 22:12:14 +0000 (22:12 +0000)
Lib/test/test_ast.py		patch | blob | blame | history
Lib/test/test_syntax.py		patch | blob | blame | history
Lib/test/test_tokenize.py		patch | blob | blame | history
Misc/NEWS.d/next/Core and Builtins/2021-12-16-00-24-00.bpo-46091.rJ_e_e.rst	[new file with mode: 0644]	patch | blob
Parser/tokenizer.c		patch | blob | blame | history