gh-105564: Don't include artificial newlines in the line attribute of tokens (#105565)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Fri, 9 Jun 2023 16:01:26 +0000 (17:01 +0100)
committer GitHub <noreply@github.com>
Fri, 9 Jun 2023 16:01:26 +0000 (17:01 +0100)
Lib/test/test_peg_generator/test_pegen.py
Lib/test/test_tokenize.py
Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst [new file with mode: 0644]
Python/Python-tokenize.c

diff --git a/Lib/test/test_peg_generator/test_pegen.py b/Lib/test/test_peg_generator/test_pegen.py
index 876bf789f482829a3ed210c0f417399a07f28ff4..3af2c0cf47d20a1604398ac98f7e1244dcdcfb32 100644
--- a/Lib/test/test_peg_generator/test_pegen.py
+++ b/Lib/test/test_peg_generator/test_pegen.py
@@ -552,14 +552,14 @@ class TestPegen(unittest.TestCase):
                                 string="D",
                                 start=(1, 0),
                                 end=(1, 1),
-                                line="D A C A E\n",
+                                line="D A C A E",
                             ),
                             TokenInfo(
                                 type=NAME,
                                 string="A",
                                 start=(1, 2),
                                 end=(1, 3),
-                                line="D A C A E\n",
+                                line="D A C A E",
                             ),
                         ],
                         TokenInfo(
@@ -567,7 +567,7 @@ class TestPegen(unittest.TestCase):
                             string="C",
                             start=(1, 4),
                             end=(1, 5),
-                            line="D A C A E\n",
+                            line="D A C A E",
                         ),
                     ],
                     TokenInfo(
@@ -575,11 +575,11 @@ class TestPegen(unittest.TestCase):
                         string="A",
                         start=(1, 6),
                         end=(1, 7),
-                        line="D A C A E\n",
+                        line="D A C A E",
                     ),
                 ],
                 TokenInfo(
-                    type=NAME, string="E", start=(1, 8), end=(1, 9), line="D A C A E\n"
+                    type=NAME, string="E", start=(1, 8), end=(1, 9), line="D A C A E"
                 ),
             ],
         )
@@ -594,22 +594,22 @@ class TestPegen(unittest.TestCase):
                             string="B",
                             start=(1, 0),
                             end=(1, 1),
-                            line="B C A E\n",
+                            line="B C A E",
                         ),
                         TokenInfo(
                             type=NAME,
                             string="C",
                             start=(1, 2),
                             end=(1, 3),
-                            line="B C A E\n",
+                            line="B C A E",
                         ),
                     ],
                     TokenInfo(
-                        type=NAME, string="A", start=(1, 4), end=(1, 5), line="B C A E\n"
+                        type=NAME, string="A", start=(1, 4), end=(1, 5), line="B C A E"
                     ),
                 ],
                 TokenInfo(
-                    type=NAME, string="E", start=(1, 6), end=(1, 7), line="B C A E\n"
+                    type=NAME, string="E", start=(1, 6), end=(1, 7), line="B C A E"
                 ),
             ],
         )
@@ -655,10 +655,10 @@ class TestPegen(unittest.TestCase):
             node,
             [
                 TokenInfo(
-                    NAME, string="foo", start=(1, 0), end=(1, 3), line="foo = 12 + 12 .\n"
+                    NAME, string="foo", start=(1, 0), end=(1, 3), line="foo = 12 + 12 ."
                 ),
                 TokenInfo(
-                    OP, string="=", start=(1, 4), end=(1, 5), line="foo = 12 + 12 .\n"
+                    OP, string="=", start=(1, 4), end=(1, 5), line="foo = 12 + 12 ."
                 ),
                 [
                     TokenInfo(
@@ -666,7 +666,7 @@ class TestPegen(unittest.TestCase):
                         string="12",
                         start=(1, 6),
                         end=(1, 8),
-                        line="foo = 12 + 12 .\n",
+                        line="foo = 12 + 12 .",
                     ),
                     [
                         [
@@ -675,14 +675,14 @@ class TestPegen(unittest.TestCase):
                                 string="+",
                                 start=(1, 9),
                                 end=(1, 10),
-                                line="foo = 12 + 12 .\n",
+                                line="foo = 12 + 12 .",
                             ),
                             TokenInfo(
                                 NUMBER,
                                 string="12",
                                 start=(1, 11),
                                 end=(1, 13),
-                                line="foo = 12 + 12 .\n",
+                                line="foo = 12 + 12 .",
                             ),
                         ]
                     ],
@@ -734,9 +734,9 @@ class TestPegen(unittest.TestCase):
         self.assertEqual(
             node,
             [
-                TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line="(1)\n"),
-                TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line="(1)\n"),
-                TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line="(1)\n"),
+                TokenInfo(OP, string="(", start=(1, 0), end=(1, 1), line="(1)"),
+                TokenInfo(NUMBER, string="1", start=(1, 1), end=(1, 2), line="(1)"),
+                TokenInfo(OP, string=")", start=(1, 2), end=(1, 3), line="(1)"),
             ],
         )
 
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 6747b0d8f65a17b1f4806c237df95f4fff859a3b..2c124f062e7fd64f48ae992c0a97a084c0e6700b 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1229,7 +1229,7 @@ class Test_Tokenize(TestCase):
         # skip the initial encoding token and the end tokens
         tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
                       extra_tokens=True))[:-2]
-        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
 
@@ -1638,8 +1638,8 @@ class TestTokenize(TestCase):
             TokenInfo(type=token.NUMBER, string='1', start=(1, 4), end=(1, 5), line='b = 1\n'),
             TokenInfo(type=token.NEWLINE, string='\n', start=(1, 5), end=(1, 6), line='b = 1\n'),
             TokenInfo(type=token.NL, string='\n', start=(2, 0), end=(2, 1), line='\n'),
-            TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test\n'),
-            TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test\n'),
+            TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test'),
+            TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test'),
             TokenInfo(type=token.ENDMARKER, string='', start=(4, 0), end=(4, 0), line='')
         ]
 
@@ -1653,7 +1653,7 @@ class TestTokenize(TestCase):
             TokenInfo(token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''),
             TokenInfo(token.NAME, string='a', start=(1, 0), end=(1, 1), line='a\n'),
             TokenInfo(token.NEWLINE, string='\n', start=(1, 1), end=(1, 2), line='a\n'),
-            TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' \n'),
+            TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' '),
             TokenInfo(token.ENDMARKER, string='', start=(3, 0), end=(3, 0), line='')
         ]
 
@@ -1889,10 +1889,10 @@ class CTokenizeTest(TestCase):
             yield "1+1".encode(encoding)
 
         expected = [
-            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
-            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
-            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
-            TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1\n'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1'),
+            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1'),
+            TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1'),
             TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
         ]
         for encoding in ["utf-8", "latin-1", "utf-16"]:
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst
new file mode 100644
index 0000000..9809fac
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-06-09-15-25-12.gh-issue-105564.sFdUu4.rst
@@ -0,0 +1,2 @@
+Don't include artificial newlines in the ``line`` attribute of tokens in the
+APIs of the :mod:`tokenize` module. Patch by Pablo Galindo
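
The new behaviour can be observed directly from Python. A minimal sketch (assuming CPython 3.12+, where the tokenize module is backed by this C tokenizer); the expected output mirrors the updated CTokenizeTest expectations above:

    import io
    import tokenize

    # Source text deliberately missing its trailing newline: the tokenizer
    # appends one internally (the "implicit newline") so it still sees a
    # terminated line.
    src = "1+1"
    for tok in tokenize.generate_tokens(io.StringIO(src).readline):
        print(tokenize.tok_name[tok.type], repr(tok.string), repr(tok.line))

    # With this fix, the NUMBER, OP and NEWLINE tokens report line='1+1'
    # rather than line='1+1\n'. Newlines actually present in the source
    # are still included in `line`.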
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 2cf052a0cdeb3b860973b438a84728bc07efc89a..1938562706914c7e347d1a5256c01d7d61b26505 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -206,6 +206,9 @@ tokenizeriter_next(tokenizeriterobject *it)
         line = PyUnicode_FromString("");
     } else {
         Py_ssize_t size = it->tok->inp - line_start;
+        if (size >= 1 && it->tok->implicit_newline) {
+            size -= 1;
+        }
         line = PyUnicode_DecodeUTF8(line_start, size, "replace");
     }
     if (line == NULL) {
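
The fix is the three added lines above: the span from line_start to it->tok->inp delimits the current line as stored by the C tokenizer, which includes the '\n' the tokenizer synthesizes when the input does not end with one (flagged by implicit_newline). Trimming size by one byte in that case keeps the artificial newline out of the UTF-8 decode that produces the token's line attribute, and the size >= 1 guard prevents the trim from underflowing on an empty line. Genuine newlines in the source are unaffected, which is why expectations such as line='b = 1\n' in the tests above stay unchanged.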