[3.12] gh-112943: Correctly compute end offsets for multiline tokens in the tokenize module

author Pablo Galindo Salgado <Pablogsal@gmail.com>

Mon, 11 Dec 2023 12:48:19 +0000 (12:48 +0000)

committer GitHub <noreply@github.com>

Mon, 11 Dec 2023 12:48:19 +0000 (12:48 +0000)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Mon, 11 Dec 2023 12:48:19 +0000 (12:48 +0000)
committer GitHub <noreply@github.com>
Mon, 11 Dec 2023 12:48:19 +0000 (12:48 +0000)
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index bbbc337b1883a9decce532b831900ef2b52d45f8..2886bceb7ba931988ae84e02908bd5bc6d3ffe57 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -620,6 +620,16 @@ f'__{
      OP         '}'           (3, 0) (3, 1)
      FSTRING_MIDDLE '__'          (3, 1) (3, 3)
      FSTRING_END "'"           (3, 3) (3, 4)
+    """)
+
+        self.check_tokenize("""\
+    '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli
+    aktualni pracownicy, obecni pracownicy'''
+""", """\
+    INDENT     '    '        (1, 0) (1, 4)
+    STRING     "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n    aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45)
+    NEWLINE    '\\n'          (2, 45) (2, 46)
+    DEDENT     ''            (3, 0) (3, 0)
      """)
  
      def test_function(self):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst b/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst

new file mode 100644 (file)

index 0000000..4bc2fe7
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst
@@ -0,0 +1,2 @@
+Correctly compute end column offsets for multiline tokens in the
+:mod:`tokenize` module. Patch by Pablo Galindo
diff --git a/Parser/pegen.c b/Parser/pegen.c

index ff02e88cee753d0195d03993152929dea38f6802..cbceaae599d2073e7cac03ee42c8eaa20ddef90b 100644 (file)
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -18,12 +18,8 @@ _PyPegen_interactive_exit(Parser *p)
  }
  
  Py_ssize_t
-_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
  {
-    const char *str = PyUnicode_AsUTF8(line);
-    if (!str) {
-        return -1;
-    }
      Py_ssize_t len = strlen(str);
      if (col_offset > len + 1) {
          col_offset = len + 1;
@@ -93,6 +89,16 @@ _PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
      return width;
  }
  
+Py_ssize_t
+_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+{
+    const char *str = PyUnicode_AsUTF8(line);
+    if (!str) {
+        return -1;
+    }
+    return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
+}
+
  // Here, mark is the start of the node, while p->mark is the end.
  // If node==NULL, they should be the same.
  int
diff --git a/Parser/pegen.h b/Parser/pegen.h

index 268f380262b80ea2fb8c4c5e5de274bb256e83f4..c2a3e02b2e0aad51113e8877f1952081397d8201 100644 (file)
--- a/Parser/pegen.h
+++ b/Parser/pegen.h
@@ -151,6 +151,7 @@ expr_ty _PyPegen_name_token(Parser *p);
  expr_ty _PyPegen_number_token(Parser *p);
  void *_PyPegen_string_token(Parser *p);
  Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
  Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset);
  
  // Error handling functions and APIs
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c

index 1938562706914c7e347d1a5256c01d7d61b26505..179f71aa1f56351f6e8b93c5d66f1677b818a1ad 100644 (file)
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -224,7 +224,7 @@ tokenizeriter_next(tokenizeriterobject *it)
          col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
      }
      if (token.end != NULL && token.end >= it->tok->line_start) {
-        end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
+        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
      }
  
      if (it->tok->tok_extra_tokens) {
author	Pablo Galindo Salgado <Pablogsal@gmail.com>
	Mon, 11 Dec 2023 12:48:19 +0000 (12:48 +0000)
committer	GitHub <noreply@github.com>
	Mon, 11 Dec 2023 12:48:19 +0000 (12:48 +0000)
Lib/test/test_tokenize.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core and Builtins/2023-12-11-00-50-00.gh-issue-112943.RHNZie.rst	[new file with mode: 0644]	patch \| blob
Parser/pegen.c		patch \| blob \| blame \| history
Parser/pegen.h		patch \| blob \| blame \| history
Python/Python-tokenize.c		patch \| blob \| blame \| history