git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.11] bpo-43950: handle wide unicode characters in tracebacks (GH-28150) (#111373)
authorPablo Galindo Salgado <Pablogsal@gmail.com>
Fri, 27 Oct 2023 00:46:20 +0000 (09:46 +0900)
committerGitHub <noreply@github.com>
Fri, 27 Oct 2023 00:46:20 +0000 (09:46 +0900)
Lib/test/test_traceback.py
Lib/traceback.py
Misc/NEWS.d/next/Core and Builtins/2023-10-26-15-34-11.gh-issue-88116.W9-vaQ.rst [new file with mode: 0644]
Parser/pegen.c
Parser/pegen.h
Python/traceback.c

index ccc59870c2a9520ee6a147d269166d33327bf35f..2881a3489084a08d9127d180d0dffb137f2a3acc 100644 (file)
@@ -893,7 +893,62 @@ class TracebackErrorLocationCaretTests(unittest.TestCase):
             f"    callable()",
             f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
             f"    print(1, www(",
-            f"             ^^^^",
+            f"             ^^^^^^^",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_with_wide_characters_term_highlight(self):
+        def f():
+            说明说明 = 1
+            şçöğıĤellö = 0 # not wide but still non-ascii
+            return 说明说明 / şçöğıĤellö
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 3}, in f",
+            f"    return 说明说明 / şçöğıĤellö",
+            f"           ~~~~~~~~~^~~~~~~~~~~~",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_with_emojis_term_highlight(self):
+        def f():
+            return "✨🐍" + func_说明说明("📗🚛",
+                "📗🚛") + "🐍"
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f",
+            f'    return "✨🐍" + func_说明说明("📗🚛",',
+            f"                    ^^^^^^^^^^^^^",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_wide_chars_subscript(self):
+        def f():
+            my_dct = {
+                "✨🚛✨": {
+                    "说明": {
+                        "🐍🐍🐍": None
+                    }
+                }
+            }
+            return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 8}, in f",
+            f'    return my_dct["✨🚛✨"]["说明"]["🐍"]["说明"]["🐍🐍"]',
+            f"           ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^",
         ]
         self.assertEqual(actual, expected)
 
index 0e229553cb5d253f2e24da31c3c33c0fbd2a2c8f..ea045e27610d4d54461441f6c1fa0c89c99936d3 100644 (file)
@@ -465,7 +465,8 @@ class StackSummary(list):
             stripped_line = frame_summary.line.strip()
             row.append('    {}\n'.format(stripped_line))
 
-            orig_line_len = len(frame_summary._original_line)
+            line = frame_summary._original_line
+            orig_line_len = len(line)
             frame_line_len = len(frame_summary.line.lstrip())
             stripped_characters = orig_line_len - frame_line_len
             if (
@@ -473,31 +474,40 @@ class StackSummary(list):
                 and frame_summary.end_colno is not None
             ):
                 start_offset = _byte_offset_to_character_offset(
-                    frame_summary._original_line, frame_summary.colno) + 1
+                    line, frame_summary.colno)
                 end_offset = _byte_offset_to_character_offset(
-                    frame_summary._original_line, frame_summary.end_colno) + 1
+                    line, frame_summary.end_colno)
+                code_segment = line[start_offset:end_offset]
 
                 anchors = None
                 if frame_summary.lineno == frame_summary.end_lineno:
                     with suppress(Exception):
-                        anchors = _extract_caret_anchors_from_line_segment(
-                            frame_summary._original_line[start_offset - 1:end_offset - 1]
-                        )
+                        anchors = _extract_caret_anchors_from_line_segment(code_segment)
                 else:
-                    end_offset = stripped_characters + len(stripped_line)
+                    # Don't count the newline since the anchors only need to
+                    # go up until the last character of the line.
+                    end_offset = len(line.rstrip())
 
                 # show indicators if primary char doesn't span the frame line
                 if end_offset - start_offset < len(stripped_line) or (
                         anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
+                    # When showing this on a terminal, some of the non-ASCII characters
+                    # might be rendered as double-width characters, so we need to take
+                    # that into account when calculating the length of the line.
+                    dp_start_offset = _display_width(line, start_offset) + 1
+                    dp_end_offset = _display_width(line, end_offset) + 1
+
                     row.append('    ')
-                    row.append(' ' * (start_offset - stripped_characters))
+                    row.append(' ' * (dp_start_offset - stripped_characters))
 
                     if anchors:
-                        row.append(anchors.primary_char * (anchors.left_end_offset))
-                        row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset))
-                        row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset))
+                        dp_left_end_offset = _display_width(code_segment, anchors.left_end_offset)
+                        dp_right_start_offset = _display_width(code_segment, anchors.right_start_offset)
+                        row.append(anchors.primary_char * dp_left_end_offset)
+                        row.append(anchors.secondary_char * (dp_right_start_offset - dp_left_end_offset))
+                        row.append(anchors.primary_char * (dp_end_offset - dp_start_offset - dp_right_start_offset))
                     else:
-                        row.append('^' * (end_offset - start_offset))
+                        row.append('^' * (dp_end_offset - dp_start_offset))
 
                     row.append('\n')
 
@@ -618,6 +628,25 @@ def _extract_caret_anchors_from_line_segment(segment):
 
     return None
 
+_WIDE_CHAR_SPECIFIERS = "WF"
+
+def _display_width(line, offset):
+    """Calculate the extra amount of width space the given source
+    code segment might take if it were to be displayed on a fixed
+    width output device. Supports wide unicode characters and emojis."""
+
+    # Fast track for ASCII-only strings
+    if line.isascii():
+        return offset
+
+    import unicodedata
+
+    return sum(
+        2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
+        for char in line[:offset]
+    )
+
+
 
 class _ExceptionPrintContext:
     def __init__(self):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-10-26-15-34-11.gh-issue-88116.W9-vaQ.rst b/Misc/NEWS.d/next/Core and Builtins/2023-10-26-15-34-11.gh-issue-88116.W9-vaQ.rst
new file mode 100644 (file)
index 0000000..12257ef
--- /dev/null
@@ -0,0 +1,3 @@
+Traceback location ranges involving wide unicode characters (like emoji and
+Asian characters) are now properly highlighted. Patch by Batuhan Taskaya and
+Pablo Galindo.
index 87b47bacec553f5448fcf66efe58db2243757660..3b85b095beb235d8569f13cff623804edf1bc84c 100644 (file)
@@ -38,6 +38,61 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
     return size;
 }
 
+// Calculate the extra amount of width space the given source
+// code segment might take if it were to be displayed on a fixed
+// width output device. Supports wide unicode characters and emojis.
+Py_ssize_t
+_PyPegen_calculate_display_width(PyObject *line, Py_ssize_t character_offset)
+{
+    PyObject *segment = PyUnicode_Substring(line, 0, character_offset);
+    if (!segment) {
+        return -1;
+    }
+
+    // Fast track for ascii strings
+    if (PyUnicode_IS_ASCII(segment)) {
+        Py_DECREF(segment);
+        return character_offset;
+    }
+
+    PyObject *width_fn = _PyImport_GetModuleAttrString("unicodedata", "east_asian_width");
+    if (!width_fn) {
+        return -1;
+    }
+
+    Py_ssize_t width = 0;
+    Py_ssize_t len = PyUnicode_GET_LENGTH(segment);
+    for (Py_ssize_t i = 0; i < len; i++) {
+        PyObject *chr = PyUnicode_Substring(segment, i, i + 1);
+        if (!chr) {
+            Py_DECREF(segment);
+            Py_DECREF(width_fn);
+            return -1;
+        }
+
+        PyObject *width_specifier = PyObject_CallOneArg(width_fn, chr);
+        Py_DECREF(chr);
+        if (!width_specifier) {
+            Py_DECREF(segment);
+            Py_DECREF(width_fn);
+            return -1;
+        }
+
+        if (_PyUnicode_EqualToASCIIString(width_specifier, "W") ||
+            _PyUnicode_EqualToASCIIString(width_specifier, "F")) {
+            width += 2;
+        }
+        else {
+            width += 1;
+        }
+        Py_DECREF(width_specifier);
+    }
+
+    Py_DECREF(segment);
+    Py_DECREF(width_fn);
+    return width;
+}
+
 // Here, mark is the start of the node, while p->mark is the end.
 // If node==NULL, they should be the same.
 int
index fe0c327b8755669390014faa444142d68f8fecc0..2c4b2c3dfc65c6d614762109264ea15347e04001 100644 (file)
@@ -143,6 +143,7 @@ expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+Py_ssize_t _PyPegen_calculate_display_width(PyObject *segment, Py_ssize_t character_offset);
 
 // Error handling functions and APIs
 typedef enum {
index c4f5ec877bba5da0806bd0e249f270b6b91f300f..130f945c29023432db2b5762623740698ac0d9ce 100644 (file)
@@ -907,8 +907,39 @@ tb_displayline(PyTracebackObject* tb, PyObject *f, PyObject *filename, int linen
         goto done;
     }
 
-    if (print_error_location_carets(f, truncation, start_offset, end_offset,
-                                    right_start_offset, left_end_offset,
+    // Convert all offsets to display offsets (e.g. the space they would take up if printed
+    // on the screen).
+    Py_ssize_t dp_start = _PyPegen_calculate_display_width(source_line, start_offset);
+    if (dp_start < 0) {
+        err = ignore_source_errors() < 0;
+        goto done;
+    }
+
+    Py_ssize_t dp_end = _PyPegen_calculate_display_width(source_line, end_offset);
+    if (dp_end < 0) {
+        err = ignore_source_errors() < 0;
+        goto done;
+    }
+
+    Py_ssize_t dp_left_end = -1;
+    Py_ssize_t dp_right_start = -1;
+    if (has_secondary_ranges) {
+        dp_left_end = _PyPegen_calculate_display_width(source_line, left_end_offset);
+        if (dp_left_end < 0) {
+            err = ignore_source_errors() < 0;
+            goto done;
+        }
+
+        dp_right_start = _PyPegen_calculate_display_width(source_line, right_start_offset);
+        if (dp_right_start < 0) {
+            err = ignore_source_errors() < 0;
+            goto done;
+        }
+    }
+
+
+    if (print_error_location_carets(f, truncation, dp_start, dp_end,
+                                    dp_right_start, dp_left_end,
                                     primary_error_char, secondary_error_char) < 0) {
         err = -1;
         goto done;