gh-98744: Prevent column-level decoding crashes on traceback module (#98824)

author Batuhan Taskaya <isidentical@gmail.com>

Sat, 29 Oct 2022 12:28:20 +0000 (15:28 +0300)

committer GitHub <noreply@github.com>

Sat, 29 Oct 2022 12:28:20 +0000 (13:28 +0100)
author Batuhan Taskaya <isidentical@gmail.com>
Sat, 29 Oct 2022 12:28:20 +0000 (15:28 +0300)
committer GitHub <noreply@github.com>
Sat, 29 Oct 2022 12:28:20 +0000 (13:28 +0100)
diff --git a/Lib/test/test_traceback.py b/Lib/test/test_traceback.py

index cf52d17ff8fa97ae1dd14bfd192163a4cc4fac93..56b168735d15098c2b388f3a1ffa145369c45a3e 100644 (file)
--- a/Lib/test/test_traceback.py
+++ b/Lib/test/test_traceback.py
@@ -804,6 +804,56 @@ class TracebackErrorLocationCaretTestBase:
          ]
          self.assertEqual(actual, expected)
  
+    def test_wide_characters_unicode_with_problematic_byte_offset(self):
+        def f():
+            ｗｉｄｔｈ
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 1}, in f",
+            f"    ｗｉｄｔｈ",
+        ]
+        self.assertEqual(actual, expected)
+
+
+    def test_byte_offset_with_wide_characters_middle(self):
+        def f():
+            ｗｉｄｔｈ = 1
+            raise ValueError(ｗｉｄｔｈ)
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 2}, in f",
+            f"    raise ValueError(ｗｉｄｔｈ)",
+        ]
+        self.assertEqual(actual, expected)
+
+    def test_byte_offset_multiline(self):
+        def f():
+            ｗｗｗ = 1
+            ｔｈ = 0
+
+            print(1, ｗｗｗ(
+                    ｔｈ))
+
+        actual = self.get_exception(f)
+        expected = [
+            f"Traceback (most recent call last):",
+            f"  File \"{__file__}\", line {self.callable_line}, in get_exception",
+            f"    callable()",
+            f"  File \"{__file__}\", line {f.__code__.co_firstlineno + 4}, in f",
+            f"    print(1, ｗｗｗ(",
+            f"             ^^^^",
+        ]
+        self.assertEqual(actual, expected)
+
+
  
  @requires_debug_ranges()
  class PurePythonTracebackErrorCaretTests(
diff --git a/Lib/traceback.py b/Lib/traceback.py

index 6270100348a6a21098aa3d34c360b3b2e1957d33..0f0f2b317de264f73b1e428a9530ce9995452d94 100644 (file)
--- a/Lib/traceback.py
+++ b/Lib/traceback.py
@@ -476,32 +476,32 @@ class StackSummary(list):
                  frame_summary.colno is not None
                  and frame_summary.end_colno is not None
              ):
-                colno = _byte_offset_to_character_offset(
-                    frame_summary._original_line, frame_summary.colno)
-                end_colno = _byte_offset_to_character_offset(
-                    frame_summary._original_line, frame_summary.end_colno)
+                start_offset = _byte_offset_to_character_offset(
+                    frame_summary._original_line, frame_summary.colno) + 1
+                end_offset = _byte_offset_to_character_offset(
+                    frame_summary._original_line, frame_summary.end_colno) + 1
  
                  anchors = None
                  if frame_summary.lineno == frame_summary.end_lineno:
                      with suppress(Exception):
                          anchors = _extract_caret_anchors_from_line_segment(
-                            frame_summary._original_line[colno - 1:end_colno - 1]
+                            frame_summary._original_line[start_offset - 1:end_offset - 1]
                          )
                  else:
-                    end_colno = stripped_characters + len(stripped_line)
+                    end_offset = stripped_characters + len(stripped_line)
  
                  # show indicators if primary char doesn't span the frame line
-                if end_colno - colno < len(stripped_line) or (
+                if end_offset - start_offset < len(stripped_line) or (
                          anchors and anchors.right_start_offset - anchors.left_end_offset > 0):
                      row.append('    ')
-                    row.append(' ' * (colno - stripped_characters))
+                    row.append(' ' * (start_offset - stripped_characters))
  
                      if anchors:
                          row.append(anchors.primary_char * (anchors.left_end_offset))
                          row.append(anchors.secondary_char * (anchors.right_start_offset - anchors.left_end_offset))
-                        row.append(anchors.primary_char * (end_colno - colno - anchors.right_start_offset))
+                        row.append(anchors.primary_char * (end_offset - start_offset - anchors.right_start_offset))
                      else:
-                        row.append('^' * (end_colno - colno))
+                        row.append('^' * (end_offset - start_offset))
  
                      row.append('\n')
  
@@ -561,10 +561,7 @@ class StackSummary(list):
  
  def _byte_offset_to_character_offset(str, offset):
      as_utf8 = str.encode('utf-8')
-    if offset > len(as_utf8):
-        offset = len(as_utf8)
-
-    return len(as_utf8[:offset + 1].decode("utf-8"))
+    return len(as_utf8[:offset].decode("utf-8", errors="replace"))
  
  
  _Anchors = collections.namedtuple(
diff --git a/Misc/NEWS.d/next/Library/2022-10-28-23-44-17.gh-issue-98744.sGHDWm.rst b/Misc/NEWS.d/next/Library/2022-10-28-23-44-17.gh-issue-98744.sGHDWm.rst

new file mode 100644 (file)

index 0000000..cf99ea5
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-10-28-23-44-17.gh-issue-98744.sGHDWm.rst
@@ -0,0 +1,2 @@
+Prevent a crash in :mod:`traceback` when converting byte offsets to character
+offsets for source lines that contain certain Unicode characters.
author	Batuhan Taskaya <isidentical@gmail.com>
	Sat, 29 Oct 2022 12:28:20 +0000 (15:28 +0300)
committer	GitHub <noreply@github.com>
	Sat, 29 Oct 2022 12:28:20 +0000 (13:28 +0100)
Lib/test/test_traceback.py		patch | blob | blame | history
Lib/traceback.py		patch | blob | blame | history
Misc/NEWS.d/next/Library/2022-10-28-23-44-17.gh-issue-98744.sGHDWm.rst	[new file with mode: 0644]	patch | blob