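Issue #9188: The gdb extension now handles correctly narrow (UCS2) as well as wide (UCS4) unicode builds for both the host interpreter (embedded inside gdb) and the interpreter under test.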

author Antoine Pitrou <solipsis@pitrou.net>

Wed, 8 Sep 2010 20:57:48 +0000 (20:57 +0000)

committer Antoine Pitrou <solipsis@pitrou.net>

Wed, 8 Sep 2010 20:57:48 +0000 (20:57 +0000)
author Antoine Pitrou <solipsis@pitrou.net>
Wed, 8 Sep 2010 20:57:48 +0000 (20:57 +0000)
committer Antoine Pitrou <solipsis@pitrou.net>
Wed, 8 Sep 2010 20:57:48 +0000 (20:57 +0000)
diff --git a/Misc/NEWS b/Misc/NEWS

index 894983aef346207e7eea27affe38b74d396d712a..148ee791d477a0c6910c4ed5bd0fbca7f19ead99 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -76,6 +76,13 @@ Library
    guaranteed to exist in all Python implementations and the names of hash
    algorithms available in the current process.
  
+Tools/Demos
+-----------
+
+- Issue #9188: The gdb extension now handles correctly narrow (UCS2) as well
+  as wide (UCS4) unicode builds for both the host interpreter (embedded
+  inside gdb) and the interpreter under test.
+
  Build
  -----
  
diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py

index b23a22e393827fa039aaf39965071f6160d77d0b..79f21e3f86ab26aa640e04872797d64e75bb1c92 100644 (file)
--- a/Tools/gdb/libpython.py
+++ b/Tools/gdb/libpython.py
@@ -1065,7 +1065,19 @@ def _unichr_is_printable(char):
      if char == u" ":
          return True
      import unicodedata
-    return unicodedata.category(char)[0] not in ("C", "Z")
+    return unicodedata.category(char) not in ("C", "Z")
+
+if sys.maxunicode >= 0x10000:  # gdb's own Python is a wide (UCS4) build
+    _unichr = unichr
+else:
+    # Narrow (UCS2) gdb: return code points above 0xFFFF as a surrogate pair.
+    def _unichr(x):
+        if x < 0x10000:  # fits in a single code unit
+            return unichr(x)
+        x -= 0x10000  # offset above the Basic Multilingual Plane
+        ch1 = 0xD800 | (x >> 10)  # high surrogate: upper bits of the offset
+        ch2 = 0xDC00 | (x & 0x3FF)  # low surrogate: lower 10 bits of the offset
+        return unichr(ch1) + unichr(ch2)
  
  
  class PyUnicodeObjectPtr(PyObjectPtr):
@@ -1084,11 +1096,33 @@ class PyUnicodeObjectPtr(PyObjectPtr):
  
          # Gather a list of ints from the Py_UNICODE array; these are either
          # UCS-2 or UCS-4 code points:
-        Py_UNICODEs = [int(field_str[i]) for i in safe_range(field_length)]
+        if self.char_width() > 2:
+            Py_UNICODEs = [int(field_str[i]) for i in safe_range(field_length)]
+        else:
+            # A more elaborate routine if sizeof(Py_UNICODE) is 2 in the
+            # inferior process: we must join surrogate pairs.
+            Py_UNICODEs = []
+            i = 0
+            while i < field_length:
+                ucs = int(field_str[i])
+                i += 1
+                if ucs < 0xD800 or ucs >= 0xDC00 or i == field_length:
+                    Py_UNICODEs.append(ucs)
+                    continue
+                # This could be a surrogate pair.
+                ucs2 = int(field_str[i])
+                if ucs2 < 0xDC00 or ucs2 > 0xDFFF:
+                    continue
+                code = (ucs & 0x03FF) << 10
+                code |= ucs2 & 0x03FF
+                code += 0x00010000
+                Py_UNICODEs.append(code)
+                i += 1
  
          # Convert the int code points to unicode characters, and generate a
-        # local unicode instance:
-        result = u''.join([unichr(ucs) for ucs in Py_UNICODEs])
+        # local unicode instance.
+        # This splits surrogate pairs if sizeof(Py_UNICODE) is 2 here (in gdb).
+        result = u''.join([_unichr(ucs) for ucs in Py_UNICODEs])
          return result
  
      def write_repr(self, out, visited):
@@ -1137,20 +1171,16 @@ class PyUnicodeObjectPtr(PyObjectPtr):
              else:
                  ucs = ch
                  orig_ucs = None
+                ch2 = None
                  if self.char_width() == 2:
-                    # Get code point from surrogate pair
+                    # If sizeof(Py_UNICODE) is 2 here (in gdb), join
+                    # surrogate pairs before calling _unichr_is_printable.
                      if (i < len(proxy)
                      and 0xD800 <= ord(ch) < 0xDC00 \
                      and 0xDC00 <= ord(proxy[i]) <= 0xDFFF):
                          ch2 = proxy[i]
-                        code = (ord(ch) & 0x03FF) << 10
-                        code |= ord(ch2) & 0x03FF
-                        code += 0x00010000
-                        orig_ucs = ucs
-                        ucs = unichr(code)
+                        ucs = ch + ch2
                          i += 1
-                    else:
-                        ch2 = None
  
                  printable = _unichr_is_printable(ucs)
                  if printable:
@@ -1195,7 +1225,7 @@ class PyUnicodeObjectPtr(PyObjectPtr):
                  else:
                      # Copy characters as-is
                      out.write(ch)
-                    if self.char_width() == 2 and (ch2 is not None):
+                    if ch2 is not None:
                          out.write(ch2)
  
          out.write(quote)
author	Antoine Pitrou <solipsis@pitrou.net>
	Wed, 8 Sep 2010 20:57:48 +0000 (20:57 +0000)
committer	Antoine Pitrou <solipsis@pitrou.net>
	Wed, 8 Sep 2010 20:57:48 +0000 (20:57 +0000)
Misc/NEWS		patch \| blob \| blame \| history
Tools/gdb/libpython.py		patch \| blob \| blame \| history