gh-104972: Ensure that line attributes in tokens in the tokenize module are correct...
Author:     Pablo Galindo Salgado <Pablogsal@gmail.com>
AuthorDate: Fri, 26 May 2023 14:46:22 +0000 (15:46 +0100)
Commit:     GitHub <noreply@github.com>
CommitDate: Fri, 26 May 2023 14:46:22 +0000 (15:46 +0100)
Lib/idlelib/idle_test/test_editor.py
Lib/test/test_tokenize.py
Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst [new file with mode: 0644]
Python/Python-tokenize.c
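
In short: after this change, the `line` attribute of every tokenize.TokenInfo
holds the exact physical source line the token came from, trailing newline
included, rather than a version with the newline stripped. A minimal sketch of
the fixed behaviour through the public API (illustrative, not part of the
commit):

    import io
    from tokenize import tokenize

    source = b"if 1:\n    pass\n"
    for tok in tokenize(io.BytesIO(source).readline):
        # With this fix, `line` is the full physical line, newline included,
        # so the start/end column offsets index into it exactly.
        print(tok.type, repr(tok.string), repr(tok.line))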

diff --git a/Lib/idlelib/idle_test/test_editor.py b/Lib/idlelib/idle_test/test_editor.py
index ba59c40dc6dde5012f873ee9af8f875959090e57..9296a6d235fbbe3b61c96fe19c6cb91734949330 100644
--- a/Lib/idlelib/idle_test/test_editor.py
+++ b/Lib/idlelib/idle_test/test_editor.py
@@ -201,8 +201,8 @@ class IndentSearcherTest(unittest.TestCase):
         test_info = (# text, (block, indent))
                      ("", (None, None)),
                      ("[1,", (None, None)),  # TokenError
-                     ("if 1:\n", ('if 1:', None)),
-                     ("if 1:\n  2\n  3\n", ('if 1:', '  2')),
+                     ("if 1:\n", ('if 1:\n', None)),
+                     ("if 1:\n  2\n  3\n", ('if 1:\n', '  2\n')),
                      )
         for code, expected_pair in test_info:
             with self.subTest(code=code):
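
The expectations above change because IndentSearcher reads its (block, indent)
strings out of token `line` attributes, which now keep their trailing
newlines. The same effect is visible directly with tokenize (sketch, not part
of the commit):

    import io
    from tokenize import tokenize

    toks = list(tokenize(io.BytesIO(b"if 1:\n  2\n  3\n").readline))
    name_tok = toks[1]                 # toks[0] is the ENCODING token
    assert name_tok.string == "if"
    assert name_tok.line == "if 1:\n"  # trailing newline now preserved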
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 251ce2b864a9d8c1e193dbd2f10155ec36825797..0b7c25838d6782e3f9113904342ec39602f62432 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1174,7 +1174,7 @@ class Test_Tokenize(TestCase):
 
         # skip the initial encoding token and the end tokens
         tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
-        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+        expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
 
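
The trailing newline is reported even when the source itself does not end with
one, because the tokenizer supplies it; the expected token above can be
reproduced with the public API (sketch, not part of the commit):

    import io
    from tokenize import tokenize, STRING

    src = '"ЉЊЈЁЂ"'.encode('utf-8')  # note: no trailing newline in the source
    strings = [t for t in tokenize(io.BytesIO(src).readline) if t.type == STRING]
    assert strings[0].line == '"ЉЊЈЁЂ"\n'  # the tokenizer appends the '\n'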
@@ -1657,7 +1657,6 @@ class TestRoundtrip(TestCase):
             code = f.encode('utf-8')
         else:
             code = f.read()
-            f.close()
         readline = iter(code.splitlines(keepends=True)).__next__
         tokens5 = list(tokenize(readline))
         tokens2 = [tok[:2] for tok in tokens5]
@@ -1672,6 +1671,17 @@ class TestRoundtrip(TestCase):
         tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
+    def check_line_extraction(self, f):
+        if isinstance(f, str):
+            code = f.encode('utf-8')
+        else:
+            code = f.read()
+        readline = iter(code.splitlines(keepends=True)).__next__
+        for tok in tokenize(readline):
+            if tok.type in {ENCODING, ENDMARKER}:
+                continue
+            self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]])
+
     def test_roundtrip(self):
         # There are some standard formatting practices that are easy to get right.
 
@@ -1768,6 +1778,7 @@ class TestRoundtrip(TestCase):
             with open(testfile, 'rb') as f:
                 # with self.subTest(file=testfile):
                 self.check_roundtrip(f)
+                self.check_line_extraction(f)
 
 
     def roundtrip(self, code):
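
The new check_line_extraction helper encodes the invariant this commit
restores: a token's text must be recoverable by slicing its `line` with the
start/end column offsets. A standalone sketch of the same check, kept to
single-line tokens:

    import io
    from tokenize import tokenize, ENCODING, ENDMARKER

    code = b"x = 1\nif x:\n    y = x + 2\n"
    for tok in tokenize(io.BytesIO(code).readline):
        if tok.type in {ENCODING, ENDMARKER}:
            continue
        # The token text is exactly the slice of its source line.
        assert tok.string == tok.line[tok.start[1]:tok.end[1]]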
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst
new file mode 100644
index 0000000..05d50c1
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst
@@ -0,0 +1,2 @@
+Ensure that the ``line`` attribute in :class:`tokenize.TokenInfo` objects in
+the :mod:`tokenize` module is always correct. Patch by Pablo Galindo.
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c
index 0023e303b96e836230630e0dcac8d8fd38f3718c..88087c12562413e0818948b61187af417b4435df 100644
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -194,15 +194,14 @@ tokenizeriter_next(tokenizeriterobject *it)
         goto exit;
     }
 
-    Py_ssize_t size = it->tok->inp - it->tok->buf;
-    assert(it->tok->buf[size-1] == '\n');
-    size -= 1; // Remove the newline character from the end of the line
-    PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
+    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+    Py_ssize_t size = it->tok->inp - line_start;
+    PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace");
     if (line == NULL) {
         Py_DECREF(str);
         goto exit;
     }
-    const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start;
+
     Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno;
     Py_ssize_t end_lineno = it->tok->lineno;
     Py_ssize_t col_offset = -1;
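
This C hunk is the heart of the fix. Previously the line was decoded from
tok->buf with its final newline chopped off; now it is decoded from
line_start, which for string literals points at multi_line_start, the first
physical line of the literal. For a multi-line string the token therefore
carries all of its lines (sketch, not part of the commit):

    import io
    from tokenize import tokenize, STRING

    src = b"x = '''first\nsecond'''\n"
    (tok,) = [t for t in tokenize(io.BytesIO(src).readline) if t.type == STRING]
    assert tok.start == (1, 4) and tok.end == (2, 9)
    assert tok.line == "x = '''first\nsecond'''\n"  # both physical lines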