gh-130273: Fix traceback color output with Unicode characters (GH-142529)

author grayjk <grayjk@gmail.com>

Tue, 7 Apr 2026 13:05:23 +0000 (09:05 -0400)

committer GitHub <noreply@github.com>

Tue, 7 Apr 2026 13:05:23 +0000 (15:05 +0200)
author grayjk <grayjk@gmail.com>
Tue, 7 Apr 2026 13:05:23 +0000 (09:05 -0400)
committer GitHub <noreply@github.com>
Tue, 7 Apr 2026 13:05:23 +0000 (15:05 +0200)
diff --git a/Lib/_pyrepl/utils.py b/Lib/_pyrepl/utils.py

index 7175d57a9e319e3061745afe0481b140b1f8538e..d399b4cf53c82a1e07ef93210dfba47dba7be8a9 100644 (file)
--- a/Lib/_pyrepl/utils.py
+++ b/Lib/_pyrepl/utils.py
@@ -16,6 +16,7 @@ from typing import Iterable, Iterator, Match, NamedTuple, Self
  from .types import CharBuffer, CharWidths
  from .trace import trace
  
+
  ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@]*[A-~]")
  ZERO_WIDTH_BRACKET = re.compile(r"\x01.*?\x02")
  ZERO_WIDTH_TRANS = str.maketrans({"\x01": "", "\x02": ""})
diff --git a/Lib/test/test_traceback.py b/Lib/test/test_traceback.py

index 5dc11253e0d5c890755f4b021a192e3a3c4595a9..909808825f055e22a632f53912a8170cc9563e52 100644 (file)
--- a/Lib/test/test_traceback.py
+++ b/Lib/test/test_traceback.py
@@ -1790,6 +1790,7 @@ class TracebackErrorLocationCaretTestBase:
          ]
          self.assertEqual(result_lines, expected)
  
+
  class TestKeywordTypoSuggestions(unittest.TestCase):
      TYPO_CASES = [
          ("with block ad something:\n  pass", "and"),
@@ -5414,6 +5415,92 @@ class TestColorizedTraceback(unittest.TestCase):
          ]
          self.assertEqual(actual, expected(**colors))
  
+    def test_colorized_traceback_unicode(self):
+        try:
+            啊哈=1; 啊哈/0####
+        except Exception as e:
+            exc = traceback.TracebackException.from_exception(e)
+
+        actual = "".join(exc.format(colorize=True)).splitlines()
+        def expected(t, m, fn, l, f, E, e, z, n):
+            return [
+                f"    啊哈=1; {e}啊哈{z}{E}/{z}{e}0{z}####",
+                f"            {e}~~~~{z}{E}^{z}{e}~{z}",
+            ]
+        self.assertEqual(actual[2:4], expected(**colors))
+
+        try:
+            ééééé/0
+        except Exception as e:
+            exc = traceback.TracebackException.from_exception(e)
+
+        actual = "".join(exc.format(colorize=True)).splitlines()
+        def expected(t, m, fn, l, f, E, e, z, n):
+            return [
+                f"    {E}ééééé{z}/0",
+                f"    {E}^^^^^{z}",
+            ]
+        self.assertEqual(actual[2:4], expected(**colors))
+
+    def test_colorized_syntax_error_ascii_display_width(self):
+        """Caret alignment for ASCII edge cases handled by _wlen.
+
+        The old ASCII fast track in _display_width returned the raw character
+        offset for ASCII strings, which is wrong for CTRL-Z (display width 2)
+        and ANSI escape sequences (display width 0).
+        """
+        E = colors["E"]
+        z = colors["z"]
+        t = colors["t"]
+        m = colors["m"]
+        fn = colors["fn"]
+        l = colors["l"]
+
+        def _make_syntax_error(text, offset, end_offset):
+            err = SyntaxError("invalid syntax")
+            err.filename = "<string>"
+            err.lineno = 1
+            err.end_lineno = 1
+            err.text = text
+            err.offset = offset
+            err.end_offset = end_offset
+            return err
+
+        # CTRL-Z (\x1a) is ASCII but displayed as ^Z (2 columns).
+        # Verify caret aligns when CTRL-Z precedes the error.
+        err = _make_syntax_error("a\x1a$\n", offset=3, end_offset=4)
+        exc = traceback.TracebackException.from_exception(err)
+        actual = "".join(exc.format(colorize=True))
+        # 'a' (1 col) + '\x1a' (2 cols) = 3 cols before '$'
+        self.assertIn(
+            f'  File {fn}"<string>"{z}, line {l}1{z}\n'
+            f'    a\x1a{E}${z}\n'
+            f'    {" " * 3}{E}^{z}\n'
+            f'{t}SyntaxError{z}: {m}invalid syntax{z}\n',
+            actual,
+        )
+
+        # CTRL-Z in the highlighted (error) region counts as 2 columns.
+        err = _make_syntax_error("$\x1a\n", offset=1, end_offset=3)
+        exc = traceback.TracebackException.from_exception(err)
+        actual = "".join(exc.format(colorize=True))
+        # '$' (1 col) + '\x1a' (2 cols) = 3 columns of carets
+        self.assertIn(
+            f'    {E}$\x1a{z}\n'
+            f'    {E}{"^" * 3}{z}\n',
+            actual,
+        )
+
+        # ANSI escape sequences are ASCII but take 0 display columns.
+        err = _make_syntax_error("a\x1b[1mb$\n", offset=7, end_offset=8)
+        exc = traceback.TracebackException.from_exception(err)
+        actual = "".join(exc.format(colorize=True))
+        # 'a' (1 col) + '\x1b[1m' (0 cols) + 'b' (1 col) = 2 before '$'
+        self.assertIn(
+            f'    a\x1b[1mb{E}${z}\n'
+            f'    {" " * 2}{E}^{z}\n',
+            actual,
+        )
  
  class TestLazyImportSuggestions(unittest.TestCase):
      """Test that lazy imports are not reified when computing AttributeError suggestions."""
diff --git a/Lib/traceback.py b/Lib/traceback.py

index 1f9f151ebf5d39140366d71e01bac1594ac42d2e..343d0e5f108c3522492beecdfb292804037e3e5a 100644 (file)
--- a/Lib/traceback.py
+++ b/Lib/traceback.py
@@ -1,9 +1,11 @@
  """Extract, format and print information about Python stack traces."""
  
  import collections.abc
+import functools
  import itertools
  import linecache
  import os
+import re
  import sys
  import textwrap
  import types
@@ -684,12 +686,12 @@ class StackSummary(list):
                          colorized_line_parts = []
                          colorized_carets_parts = []
  
-                        for color, group in itertools.groupby(itertools.zip_longest(line, carets, fillvalue=""), key=lambda x: x[1]):
+                        for color, group in itertools.groupby(_zip_display_width(line, carets), key=lambda x: x[1]):
                              caret_group = list(group)
-                            if color == "^":
+                            if "^" in color:
                                  colorized_line_parts.append(theme.error_highlight + "".join(char for char, _ in caret_group) + theme.reset)
                                  colorized_carets_parts.append(theme.error_highlight + "".join(caret for _, caret in caret_group) + theme.reset)
-                            elif color == "~":
+                            elif "~" in color:
                                  colorized_line_parts.append(theme.error_range + "".join(char for char, _ in caret_group) + theme.reset)
                                  colorized_carets_parts.append(theme.error_range + "".join(caret for _, caret in caret_group) + theme.reset)
                              else:
@@ -971,7 +973,54 @@ def _extract_caret_anchors_from_line_segment(segment):
  
      return None
  
-_WIDE_CHAR_SPECIFIERS = "WF"
+
+def _zip_display_width(line, carets):
+    carets = iter(carets)
+    if line.isascii() and '\x1a' not in line:
+        for char in line:
+            yield char, next(carets, "")
+        return
+
+    import unicodedata
+    for char in unicodedata.iter_graphemes(line):
+        char = str(char)
+        char_width = _display_width(char)
+        yield char, "".join(itertools.islice(carets, char_width))
+
+
+@functools.cache
+def _str_width(c: str) -> int:
+    # copied from _pyrepl.utils to fix gh-130273
+
+    if ord(c) < 128:
+        return 1
+    import unicodedata
+    # gh-139246 for zero-width joiner and combining characters
+    if unicodedata.combining(c):
+        return 0
+    category = unicodedata.category(c)
+    if category == "Cf" and c != "\u00ad":
+        return 0
+    w = unicodedata.east_asian_width(c)
+    if w in ("N", "Na", "H", "A"):
+        return 1
+    return 2
+
+
+_ANSI_ESCAPE_SEQUENCE = re.compile(r"\x1b\[[ -@]*[A-~]")
+
+
+def _wlen(s: str) -> int:
+    # copied from _pyrepl.utils to fix gh-130273
+
+    if len(s) == 1 and s != "\x1a":
+        return _str_width(s)
+    length = sum(_str_width(i) for i in s)
+    # ANSI escape sequences take 0 columns; CTRL-Z is shown as ^Z (2 columns)
+    sequence = _ANSI_ESCAPE_SEQUENCE.findall(s)
+    ctrl_z_cnt = s.count("\x1a")
+    return length - sum(len(i) for i in sequence) + ctrl_z_cnt
+
  
  def _display_width(line, offset=None):
      """Calculate the extra amount of width space the given source
@@ -979,18 +1028,9 @@ def _display_width(line, offset=None):
      width output device. Supports wide unicode characters and emojis."""
  
      if offset is None:
-        offset = len(line)
-
-    # Fast track for ASCII-only strings
-    if line.isascii():
-        return offset
+        return _wlen(line)
  
-    import unicodedata
-
-    return sum(
-        2 if unicodedata.east_asian_width(char) in _WIDE_CHAR_SPECIFIERS else 1
-        for char in line[:offset]
-    )
+    return _wlen(line[:offset])
  
  
  def _format_note(note, indent, theme):
diff --git a/Misc/NEWS.d/next/Library/2025-12-10-15-15-09.gh-issue-130273.iCfiY5.rst b/Misc/NEWS.d/next/Library/2025-12-10-15-15-09.gh-issue-130273.iCfiY5.rst

new file mode 100644 (file)

index 0000000..2e06953
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-12-10-15-15-09.gh-issue-130273.iCfiY5.rst
@@ -0,0 +1 @@
+Fix traceback color output with Unicode characters.
author	grayjk <grayjk@gmail.com>
	Tue, 7 Apr 2026 13:05:23 +0000 (09:05 -0400)
committer	GitHub <noreply@github.com>
	Tue, 7 Apr 2026 13:05:23 +0000 (15:05 +0200)
Lib/_pyrepl/utils.py		patch \| blob \| blame \| history
Lib/test/test_traceback.py		patch \| blob \| blame \| history
Lib/traceback.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2025-12-10-15-15-09.gh-issue-130273.iCfiY5.rst	[new file with mode: 0644]	patch \| blob