From: Tom Tromey <tromey@adacore.com>
Date: Thu, 5 Feb 2026 20:51:07 +0000 (-0700)
Subject: Handle wide characters that cannot be converted
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5835be9c497e85e0b788cedeff99999dbd37efd1;p=thirdparty%2Fbinutils-gdb.git

Handle wide characters that cannot be converted

The internal AdaCore test suite found a bug with the char-printing
changes: on Windows, it is possible to have a wide character (in our
case, 0xBEEF) that is "printable" (as determined by iswprint) but
which cannot be converted to the current host charset.

This in turn would result in strange output like:

    $2 = 48879 '\357\276'

where what we would expect in Ada would be:

    $2 = 48879 '["00beef"]'

A similar problem could occur for C on Windows.  There, the character
boundaries were lost in the output, so rather than '\xbeef' the user
would see '\357\276'.

This patch fixes this problem by checking the convertibility of a wide
character before printing it.

New in v3: Correctly check result of wcrtomb
New in v2: Skip the new check if the host encoding is UTF-8.
---

diff --git a/gdb/char-print.c b/gdb/char-print.c
index fdf7fd0d40f..7c1ba85bbc1 100644
--- a/gdb/char-print.c
+++ b/gdb/char-print.c
@@ -23,8 +23,7 @@
 #include "valprint.h"
 #include "value.h"
 
-/* Return true if print_wchar can display W without resorting to a
-   numeric escape, false otherwise.  */
+/* See char-print.h.  */
 
 bool
 wchar_printer::printable (gdb_wchar_t w) const
@@ -45,6 +44,29 @@ wchar_printer::printable (gdb_wchar_t w) const
 
 /* See char-print.h.  */
 
+bool
+wchar_printer::printable_and_convertible (gdb_wchar_t w) const
+{
+  if (!printable (w))
+    return false;
+
+  /* We think it is printable -- but is it really?  It must also be
+     convertible from the intermediate charset (normally wchar_t) to
+     the host charset, which is not always possible.  Stateful
+     encodings are not handled; that seems difficult and somewhat
+     pointless.  If the host encoding is UTF-8, assume conversion
+     works; otherwise try wcrtomb and discard its output.  */
+  if (m_host_utf8)
+    return true;
+
+  mbstate_t state;
+  memset (&state, 0, sizeof (state));
+  char ignore[MB_LEN_MAX];
+  return wcrtomb (ignore, w, &state) != (size_t) -1;
+}
+
+/* See char-print.h.  */
+
 void
 wchar_printer::print_char (gdb_wchar_t w)
 {
@@ -183,7 +205,7 @@ wchar_printer::print (int c, ui_file *stream)
 
 	  need_escape = false;
 	  for (i = 0; i < num_chars; ++i)
-	    if (!printable (chars[i]))
+	    if (!printable_and_convertible (chars[i]))
 	      {
 		need_escape = true;
 		break;
@@ -349,7 +371,7 @@ wchar_printer::print_converted_chars_to_obstack
 	    for (j = 0; j < repeat_count; ++j)
 	      {
 		if (elem->result == wchar_iterate_ok
-		    && printable (elem->chars[0]))
+		    && printable_and_convertible (elem->chars[0]))
 		  print_char (elem->chars[0]);
 		else
 		  print_escape (elem->buf, elem->buflen);
@@ -377,7 +399,7 @@ wchar_printer::print_converted_chars_to_obstack
 	    /* Output the character and repeat string.  */
 	    m_file.write (LCST ("'"));
 	    if (elem->result == wchar_iterate_ok
-		&& printable (elem->chars[0]))
+		&& printable_and_convertible (elem->chars[0]))
 	      print_char (elem->chars[0]);
 	    else
 	      print_escape (elem->buf, elem->buflen);
diff --git a/gdb/char-print.h b/gdb/char-print.h
index 36b09c4d408..bbadb968c80 100644
--- a/gdb/char-print.h
+++ b/gdb/char-print.h
@@ -81,6 +81,7 @@ public:
     : m_encoding (encoding == nullptr
 		  ? get_default_encoding (ch_type)
 		  : encoding),
+      m_host_utf8 (streq (host_charset (), "UTF-8")),
       m_byte_order (type_byte_order (ch_type)),
       m_file (&m_wchar_buf),
       m_quoter (quoter),
@@ -169,10 +170,17 @@ protected:
 
 private:
 
+  /* Return true if C is both printable (as determined by the
+     'printable' method) and convertible to the host character set,
+     false otherwise.  */
+  bool printable_and_convertible (gdb_wchar_t c) const;
+
   /* Intermediate output is stored here.  */
   auto_obstack m_wchar_buf;
   /* The encoding.  */
   const char *m_encoding;
+  /* True if the host encoding is UTF-8.  */
+  bool m_host_utf8;
 
 protected: