]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blame - gdb/charset.c
Update copyright year range in header of all files managed by GDB
[thirdparty/binutils-gdb.git] / gdb / charset.c
CommitLineData
234b45d4 1/* Character set conversion support for GDB.
1bac305b 2
1d506c26 3 Copyright (C) 2001-2024 Free Software Foundation, Inc.
234b45d4
KB
4
5 This file is part of GDB.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
a9762ec7 9 the Free Software Foundation; either version 3 of the License, or
234b45d4
KB
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
a9762ec7 18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
234b45d4
KB
19
20#include "defs.h"
d55e5aa6 21#include "charset.h"
4de283e4 22#include "gdbcmd.h"
bf31fd38 23#include "gdbsupport/gdb_obstack.h"
268a13a5 24#include "gdbsupport/gdb_wait.h"
4de283e4 25#include "charset-list.h"
268a13a5 26#include "gdbsupport/environ.h"
4de283e4 27#include "arch-utils.h"
268a13a5 28#include "gdbsupport/gdb_vecs.h"
4de283e4 29#include <ctype.h>
234b45d4 30
43484f03
DJ
31#ifdef USE_WIN32API
32#include <windows.h>
33#endif
234b45d4
KB
34\f
35/* How GDB's character set support works
36
6c7a06a3 37 GDB has three global settings:
234b45d4
KB
38
39 - The `current host character set' is the character set GDB should
40 use in talking to the user, and which (hopefully) the user's
6c7a06a3
TT
41 terminal knows how to display properly. Most users should not
42 change this.
234b45d4
KB
43
44 - The `current target character set' is the character set the
45 program being debugged uses.
46
6c7a06a3
TT
47 - The `current target wide character set' is the wide character set
48 the program being debugged uses, that is, the encoding used for
49 wchar_t.
50
234b45d4
KB
51 There are commands to set each of these, and mechanisms for
52 choosing reasonable default values. GDB has a global list of
53 character sets that it can use as its host or target character
54 sets.
55
56 The header file `charset.h' declares various functions that
57 different pieces of GDB need to perform tasks like:
58
59 - printing target strings and characters to the user's terminal
60 (mostly target->host conversions),
61
62 - building target-appropriate representations of strings and
63 characters the user enters in expressions (mostly host->target
64 conversions),
65
6c7a06a3
TT
66 and so on.
67
68 To avoid excessive code duplication and maintenance efforts,
69 GDB simply requires a capable iconv function. Users on platforms
70 without a suitable iconv can use the GNU iconv library. */
234b45d4
KB
71
72\f
6c7a06a3 73#ifdef PHONY_ICONV
234b45d4 74
6c7a06a3
TT
/* Provide a phony iconv that does as little as possible.  Also,
   arrange for there to be a single available character set: the
   default host charset is also used as the default target charset
   and as the only entry in the list of known charset names.  */

#undef GDB_DEFAULT_HOST_CHARSET
#ifdef USE_WIN32API
# define GDB_DEFAULT_HOST_CHARSET "CP1252"
#else
# define GDB_DEFAULT_HOST_CHARSET "ISO-8859-1"
#endif
#define GDB_DEFAULT_TARGET_CHARSET GDB_DEFAULT_HOST_CHARSET
#define GDB_DEFAULT_TARGET_WIDE_CHARSET "UTF-32"
#undef DEFAULT_CHARSET_NAMES
#define DEFAULT_CHARSET_NAMES GDB_DEFAULT_HOST_CHARSET ,

/* Redirect the iconv API to the phony implementations below.  The
   descriptor type becomes a plain int.  */
#undef iconv_t
#define iconv_t int
#undef iconv_open
#define iconv_open phony_iconv_open
#undef iconv
#define iconv phony_iconv
#undef iconv_close
#define iconv_close phony_iconv_close

#undef ICONV_CONST
#define ICONV_CONST const
100
f74f61cb
SL
101/* We allow conversions from UTF-32, wchar_t, and the host charset.
102 We allow conversions to wchar_t and the host charset.
103 Return 1 if we are converting from UTF-32BE, 2 if from UTF32-LE,
104 0 otherwise. This is used as a flag in calls to iconv. */
105
a95babbf 106static iconv_t
62234ccc 107phony_iconv_open (const char *to, const char *from)
6c7a06a3 108{
6c7a06a3
TT
109 if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
110 return -1;
234b45d4 111
f74f61cb
SL
112 if (!strcmp (from, "UTF-32BE") || !strcmp (from, "UTF-32"))
113 return 1;
114
115 if (!strcmp (from, "UTF-32LE"))
116 return 2;
117
118 if (strcmp (from, "wchar_t") && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
119 return -1;
120
121 return 0;
6c7a06a3 122}
234b45d4 123
a95babbf 124static int
62234ccc 125phony_iconv_close (iconv_t arg)
6c7a06a3
TT
126{
127 return 0;
128}
234b45d4 129
a95babbf 130static size_t
62234ccc
TT
131phony_iconv (iconv_t utf_flag, const char **inbuf, size_t *inbytesleft,
132 char **outbuf, size_t *outbytesleft)
6c7a06a3 133{
b8899f2b 134 if (utf_flag)
6c7a06a3 135 {
f74f61cb
SL
136 enum bfd_endian endian
137 = utf_flag == 1 ? BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE;
6c7a06a3
TT
138 while (*inbytesleft >= 4)
139 {
f74f61cb
SL
140 unsigned long c
141 = extract_unsigned_integer ((const gdb_byte *)*inbuf, 4, endian);
6c7a06a3
TT
142
143 if (c >= 256)
144 {
145 errno = EILSEQ;
146 return -1;
147 }
f74f61cb
SL
148 if (*outbytesleft < 1)
149 {
150 errno = E2BIG;
151 return -1;
152 }
6c7a06a3
TT
153 **outbuf = c & 0xff;
154 ++*outbuf;
155 --*outbytesleft;
156
f74f61cb 157 *inbuf += 4;
6c7a06a3
TT
158 *inbytesleft -= 4;
159 }
f74f61cb 160 if (*inbytesleft)
6c7a06a3 161 {
f74f61cb 162 /* Partial sequence on input. */
6c7a06a3
TT
163 errno = EINVAL;
164 return -1;
165 }
166 }
167 else
168 {
169 /* In all other cases we simply copy input bytes to the
170 output. */
171 size_t amt = *inbytesleft;
c5504eaf 172
6c7a06a3
TT
173 if (amt > *outbytesleft)
174 amt = *outbytesleft;
175 memcpy (*outbuf, *inbuf, amt);
176 *inbuf += amt;
177 *outbuf += amt;
178 *inbytesleft -= amt;
179 *outbytesleft -= amt;
f74f61cb
SL
180 if (*inbytesleft)
181 {
182 errno = E2BIG;
183 return -1;
184 }
6c7a06a3 185 }
234b45d4 186
6c7a06a3
TT
187 /* The number of non-reversible conversions -- but they were all
188 reversible. */
189 return 0;
190}
234b45d4 191
83030110
PA
192#else /* PHONY_ICONV */
193
194/* On systems that don't have EILSEQ, GNU iconv's iconv.h defines it
195 to ENOENT, while gnulib defines it to a different value. Always
196 map ENOENT to gnulib's EILSEQ, leaving callers agnostic. */
197
198static size_t
199gdb_iconv (iconv_t utf_flag, ICONV_CONST char **inbuf, size_t *inbytesleft,
200 char **outbuf, size_t *outbytesleft)
201{
202 size_t ret;
203
204 ret = iconv (utf_flag, inbuf, inbytesleft, outbuf, outbytesleft);
205 if (errno == ENOENT)
206 errno = EILSEQ;
207 return ret;
208}
209
210#undef iconv
211#define iconv gdb_iconv
234b45d4 212
83030110 213#endif /* PHONY_ICONV */
234b45d4
KB
214
215\f
216/* The global lists of character sets and translations. */
217
218
e33d66ec
EZ
/* Fallback defaults for the target charsets, used when nothing
   earlier has defined them.  */
#ifndef GDB_DEFAULT_TARGET_CHARSET
#define GDB_DEFAULT_TARGET_CHARSET "ISO-8859-1"
#endif

#ifndef GDB_DEFAULT_TARGET_WIDE_CHARSET
#define GDB_DEFAULT_TARGET_WIDE_CHARSET "UTF-32"
#endif
226
227static const char *auto_host_charset_name = GDB_DEFAULT_HOST_CHARSET;
228static const char *host_charset_name = "auto";
920d2a44
AC
229static void
230show_host_charset_name (struct ui_file *file, int from_tty,
231 struct cmd_list_element *c,
232 const char *value)
233{
6c7a06a3 234 if (!strcmp (value, "auto"))
6cb06a8c
TT
235 gdb_printf (file,
236 _("The host character set is \"auto; currently %s\".\n"),
237 auto_host_charset_name);
6c7a06a3 238 else
6cb06a8c 239 gdb_printf (file, _("The host character set is \"%s\".\n"), value);
920d2a44
AC
240}
241
f870a310 242static const char *target_charset_name = "auto";
920d2a44
AC
243static void
244show_target_charset_name (struct ui_file *file, int from_tty,
245 struct cmd_list_element *c, const char *value)
246{
f870a310 247 if (!strcmp (value, "auto"))
6cb06a8c
TT
248 gdb_printf (file,
249 _("The target character set is \"auto; "
250 "currently %s\".\n"),
251 gdbarch_auto_charset (get_current_arch ()));
f870a310 252 else
6cb06a8c
TT
253 gdb_printf (file, _("The target character set is \"%s\".\n"),
254 value);
920d2a44
AC
255}
256
f870a310 257static const char *target_wide_charset_name = "auto";
6c7a06a3 258static void
aff410f1
MS
259show_target_wide_charset_name (struct ui_file *file,
260 int from_tty,
261 struct cmd_list_element *c,
262 const char *value)
e33d66ec 263{
f870a310 264 if (!strcmp (value, "auto"))
6cb06a8c
TT
265 gdb_printf (file,
266 _("The target wide character set is \"auto; "
267 "currently %s\".\n"),
268 gdbarch_auto_wide_charset (get_current_arch ()));
f870a310 269 else
6cb06a8c
TT
270 gdb_printf (file, _("The target wide character set is \"%s\".\n"),
271 value);
6c7a06a3 272}
e33d66ec 273
27087b7f 274static const char * const default_charset_names[] =
e33d66ec 275{
6c7a06a3 276 DEFAULT_CHARSET_NAMES
e33d66ec
EZ
277 0
278};
234b45d4 279
27087b7f 280static const char * const *charset_enum;
234b45d4 281
6c7a06a3
TT
282\f
283/* If the target wide character set has big- or little-endian
284 variants, these are the corresponding names. */
285static const char *target_wide_charset_be_name;
286static const char *target_wide_charset_le_name;
234b45d4 287
f870a310
TT
288/* The architecture for which the BE- and LE-names are valid. */
289static struct gdbarch *be_le_arch;
290
291/* A helper function which sets the target wide big- and little-endian
292 character set names, if possible. */
234b45d4 293
6c7a06a3 294static void
f870a310 295set_be_le_names (struct gdbarch *gdbarch)
234b45d4 296{
f870a310
TT
297 if (be_le_arch == gdbarch)
298 return;
299 be_le_arch = gdbarch;
234b45d4 300
f74f61cb
SL
301#ifdef PHONY_ICONV
302 /* Match the wide charset names recognized by phony_iconv_open. */
303 target_wide_charset_le_name = "UTF-32LE";
304 target_wide_charset_be_name = "UTF-32BE";
305#else
15766370
TT
306 int i, len;
307 const char *target_wide;
308
6c7a06a3
TT
309 target_wide_charset_le_name = NULL;
310 target_wide_charset_be_name = NULL;
234b45d4 311
f870a310
TT
312 target_wide = target_wide_charset_name;
313 if (!strcmp (target_wide, "auto"))
314 target_wide = gdbarch_auto_wide_charset (gdbarch);
315
316 len = strlen (target_wide);
6c7a06a3
TT
317 for (i = 0; charset_enum[i]; ++i)
318 {
f870a310 319 if (strncmp (target_wide, charset_enum[i], len))
6c7a06a3
TT
320 continue;
321 if ((charset_enum[i][len] == 'B'
322 || charset_enum[i][len] == 'L')
323 && charset_enum[i][len + 1] == 'E'
324 && charset_enum[i][len + 2] == '\0')
325 {
326 if (charset_enum[i][len] == 'B')
327 target_wide_charset_be_name = charset_enum[i];
328 else
329 target_wide_charset_le_name = charset_enum[i];
330 }
331 }
f74f61cb 332# endif /* PHONY_ICONV */
234b45d4
KB
333}
334
6c7a06a3
TT
335/* 'Set charset', 'set host-charset', 'set target-charset', 'set
336 target-wide-charset', 'set charset' sfunc's. */
234b45d4
KB
337
338static void
f870a310 339validate (struct gdbarch *gdbarch)
234b45d4 340{
6c7a06a3
TT
341 iconv_t desc;
342 const char *host_cset = host_charset ();
f870a310
TT
343 const char *target_cset = target_charset (gdbarch);
344 const char *target_wide_cset = target_wide_charset_name;
c5504eaf 345
f870a310
TT
346 if (!strcmp (target_wide_cset, "auto"))
347 target_wide_cset = gdbarch_auto_wide_charset (gdbarch);
234b45d4 348
f870a310 349 desc = iconv_open (target_wide_cset, host_cset);
6c7a06a3 350 if (desc == (iconv_t) -1)
a73c6dcd 351 error (_("Cannot convert between character sets `%s' and `%s'"),
f870a310 352 target_wide_cset, host_cset);
6c7a06a3 353 iconv_close (desc);
234b45d4 354
f870a310 355 desc = iconv_open (target_cset, host_cset);
6c7a06a3 356 if (desc == (iconv_t) -1)
a73c6dcd 357 error (_("Cannot convert between character sets `%s' and `%s'"),
f870a310 358 target_cset, host_cset);
6c7a06a3 359 iconv_close (desc);
234b45d4 360
f870a310
TT
361 /* Clear the cache. */
362 be_le_arch = NULL;
234b45d4
KB
363}
364
6c7a06a3
TT
365/* This is the sfunc for the 'set charset' command. */
366static void
eb4c3f4a 367set_charset_sfunc (const char *charset, int from_tty,
aff410f1 368 struct cmd_list_element *c)
234b45d4 369{
aff410f1 370 /* CAREFUL: set the target charset here as well. */
6c7a06a3 371 target_charset_name = host_charset_name;
f870a310 372 validate (get_current_arch ());
234b45d4
KB
373}
374
6c7a06a3
TT
375/* 'set host-charset' command sfunc. We need a wrapper here because
376 the function needs to have a specific signature. */
377static void
eb4c3f4a 378set_host_charset_sfunc (const char *charset, int from_tty,
6c7a06a3 379 struct cmd_list_element *c)
234b45d4 380{
f870a310 381 validate (get_current_arch ());
234b45d4
KB
382}
383
6c7a06a3
TT
384/* Wrapper for the 'set target-charset' command. */
385static void
eb4c3f4a 386set_target_charset_sfunc (const char *charset, int from_tty,
6c7a06a3 387 struct cmd_list_element *c)
234b45d4 388{
f870a310 389 validate (get_current_arch ());
234b45d4
KB
390}
391
6c7a06a3
TT
392/* Wrapper for the 'set target-wide-charset' command. */
393static void
eb4c3f4a 394set_target_wide_charset_sfunc (const char *charset, int from_tty,
6c7a06a3 395 struct cmd_list_element *c)
234b45d4 396{
f870a310 397 validate (get_current_arch ());
234b45d4
KB
398}
399
6c7a06a3
TT
400/* sfunc for the 'show charset' command. */
401static void
aff410f1
MS
402show_charset (struct ui_file *file, int from_tty,
403 struct cmd_list_element *c,
6c7a06a3 404 const char *name)
234b45d4 405{
6c7a06a3
TT
406 show_host_charset_name (file, from_tty, c, host_charset_name);
407 show_target_charset_name (file, from_tty, c, target_charset_name);
aff410f1
MS
408 show_target_wide_charset_name (file, from_tty, c,
409 target_wide_charset_name);
234b45d4
KB
410}
411
234b45d4 412\f
6c7a06a3 413/* Accessor functions. */
234b45d4 414
6c7a06a3
TT
415const char *
416host_charset (void)
234b45d4 417{
6c7a06a3
TT
418 if (!strcmp (host_charset_name, "auto"))
419 return auto_host_charset_name;
420 return host_charset_name;
234b45d4
KB
421}
422
6c7a06a3 423const char *
f870a310 424target_charset (struct gdbarch *gdbarch)
234b45d4 425{
f870a310
TT
426 if (!strcmp (target_charset_name, "auto"))
427 return gdbarch_auto_charset (gdbarch);
6c7a06a3 428 return target_charset_name;
234b45d4 429}
234b45d4 430
6c7a06a3 431const char *
f870a310 432target_wide_charset (struct gdbarch *gdbarch)
234b45d4 433{
f870a310
TT
434 enum bfd_endian byte_order = gdbarch_byte_order (gdbarch);
435
436 set_be_le_names (gdbarch);
e17a4113 437 if (byte_order == BFD_ENDIAN_BIG)
234b45d4 438 {
6c7a06a3
TT
439 if (target_wide_charset_be_name)
440 return target_wide_charset_be_name;
234b45d4 441 }
6c7a06a3 442 else
234b45d4 443 {
6c7a06a3
TT
444 if (target_wide_charset_le_name)
445 return target_wide_charset_le_name;
234b45d4
KB
446 }
447
f870a310
TT
448 if (!strcmp (target_wide_charset_name, "auto"))
449 return gdbarch_auto_wide_charset (gdbarch);
450
6c7a06a3 451 return target_wide_charset_name;
234b45d4
KB
452}
453
234b45d4 454\f
6c7a06a3
TT
/* Host character set management.  For the time being, we assume that
   the host character set is some superset of ASCII.  */

/* Return the control character corresponding to C: '?' maps to 0177;
   any other character is masked with 0237, which clears bits 5
   and 6.  */
char
host_letter_to_control_character (char c)
{
  if (c == '?')
    return 0177;
  return c & 0237;
}
465
234b45d4 466\f
6c7a06a3 467/* Public character management functions. */
234b45d4 468
80a3b8c5 469class iconv_wrapper
234b45d4 470{
80a3b8c5
TT
471public:
472
473 iconv_wrapper (const char *to, const char *from)
474 {
475 m_desc = iconv_open (to, from);
476 if (m_desc == (iconv_t) -1)
477 perror_with_name (_("Converting character sets"));
478 }
479
480 ~iconv_wrapper ()
481 {
482 iconv_close (m_desc);
483 }
484
485 size_t convert (ICONV_CONST char **inp, size_t *inleft, char **outp,
486 size_t *outleft)
487 {
488 return iconv (m_desc, inp, inleft, outp, outleft);
489 }
490
491private:
492
493 iconv_t m_desc;
494};
234b45d4 495
6c7a06a3
TT
496void
497convert_between_encodings (const char *from, const char *to,
498 const gdb_byte *bytes, unsigned int num_bytes,
499 int width, struct obstack *output,
500 enum transliterations translit)
501{
6c7a06a3 502 size_t inleft;
39086a0e 503 ICONV_CONST char *inp;
6c7a06a3
TT
504 unsigned int space_request;
505
506 /* Often, the host and target charsets will be the same. */
507 if (!strcmp (from, to))
508 {
509 obstack_grow (output, bytes, num_bytes);
510 return;
511 }
234b45d4 512
80a3b8c5 513 iconv_wrapper desc (to, from);
234b45d4 514
6c7a06a3 515 inleft = num_bytes;
39086a0e 516 inp = (ICONV_CONST char *) bytes;
234b45d4 517
6c7a06a3 518 space_request = num_bytes;
234b45d4 519
6c7a06a3 520 while (inleft > 0)
234b45d4 521 {
6c7a06a3
TT
522 char *outp;
523 size_t outleft, r;
524 int old_size;
525
526 old_size = obstack_object_size (output);
527 obstack_blank (output, space_request);
528
241fd515 529 outp = (char *) obstack_base (output) + old_size;
6c7a06a3
TT
530 outleft = space_request;
531
80a3b8c5 532 r = desc.convert (&inp, &inleft, &outp, &outleft);
6c7a06a3
TT
533
534 /* Now make sure that the object on the obstack only includes
535 bytes we have converted. */
89eb3c54 536 obstack_blank_fast (output, -(ssize_t) outleft);
6c7a06a3
TT
537
538 if (r == (size_t) -1)
539 {
540 switch (errno)
541 {
542 case EILSEQ:
543 {
544 int i;
545
546 /* Invalid input sequence. */
547 if (translit == translit_none)
3e43a32a
MS
548 error (_("Could not convert character "
549 "to `%s' character set"), to);
6c7a06a3
TT
550
551 /* We emit escape sequence for the bytes, skip them,
552 and try again. */
553 for (i = 0; i < width; ++i)
554 {
555 char octal[5];
556
08850b56 557 xsnprintf (octal, sizeof (octal), "\\%.3o", *inp & 0xff);
6c7a06a3
TT
558 obstack_grow_str (output, octal);
559
560 ++inp;
561 --inleft;
562 }
563 }
564 break;
565
566 case E2BIG:
567 /* We ran out of space in the output buffer. Make it
568 bigger next time around. */
569 space_request *= 2;
570 break;
571
572 case EINVAL:
573 /* Incomplete input sequence. FIXME: ought to report this
574 to the caller somehow. */
575 inleft = 0;
576 break;
577
578 default:
9b20d036
MS
579 perror_with_name (_("Internal error while "
580 "converting character sets"));
6c7a06a3
TT
581 }
582 }
234b45d4 583 }
234b45d4
KB
584}
585
e33d66ec 586\f
e33d66ec 587
6c7a06a3 588/* Create a new iterator. */
cda6c55b
TT
589wchar_iterator::wchar_iterator (const gdb_byte *input, size_t bytes,
590 const char *charset, size_t width)
591: m_input (input),
592 m_bytes (bytes),
593 m_width (width),
594 m_out (1)
234b45d4 595{
cda6c55b
TT
596 m_desc = iconv_open (INTERMEDIATE_ENCODING, charset);
597 if (m_desc == (iconv_t) -1)
9b20d036 598 perror_with_name (_("Converting character sets"));
234b45d4
KB
599}
600
cda6c55b 601wchar_iterator::~wchar_iterator ()
e33d66ec 602{
cda6c55b
TT
603 if (m_desc != (iconv_t) -1)
604 iconv_close (m_desc);
e33d66ec 605}
234b45d4 606
6c7a06a3 607int
cda6c55b
TT
608wchar_iterator::iterate (enum wchar_iterate_result *out_result,
609 gdb_wchar_t **out_chars,
610 const gdb_byte **ptr,
611 size_t *len)
6c7a06a3
TT
612{
613 size_t out_request;
614
615 /* Try to convert some characters. At first we try to convert just
616 a single character. The reason for this is that iconv does not
617 necessarily update its outgoing arguments when it encounters an
618 invalid input sequence -- but we want to reliably report this to
619 our caller so it can emit an escape sequence. */
620 out_request = 1;
cda6c55b 621 while (m_bytes > 0)
e33d66ec 622 {
cda6c55b
TT
623 ICONV_CONST char *inptr = (ICONV_CONST char *) m_input;
624 char *outptr = (char *) m_out.data ();
625 const gdb_byte *orig_inptr = m_input;
626 size_t orig_in = m_bytes;
6c7a06a3
TT
627 size_t out_avail = out_request * sizeof (gdb_wchar_t);
628 size_t num;
cda6c55b 629 size_t r = iconv (m_desc, &inptr, &m_bytes, &outptr, &out_avail);
39086a0e 630
cda6c55b 631 m_input = (gdb_byte *) inptr;
c5504eaf 632
6c7a06a3
TT
633 if (r == (size_t) -1)
634 {
635 switch (errno)
636 {
637 case EILSEQ:
aff410f1
MS
638 /* Invalid input sequence. We still might have
639 converted a character; if so, return it. */
1558ab4c
JK
640 if (out_avail < out_request * sizeof (gdb_wchar_t))
641 break;
642
aff410f1
MS
643 /* Otherwise skip the first invalid character, and let
644 the caller know about it. */
6c7a06a3 645 *out_result = wchar_iterate_invalid;
cda6c55b
TT
646 *ptr = m_input;
647 *len = m_width;
648 m_input += m_width;
649 m_bytes -= m_width;
6c7a06a3
TT
650 return 0;
651
652 case E2BIG:
653 /* We ran out of space. We still might have converted a
654 character; if so, return it. Otherwise, grow the
655 buffer and try again. */
656 if (out_avail < out_request * sizeof (gdb_wchar_t))
657 break;
658
659 ++out_request;
cda6c55b 660 if (out_request > m_out.size ())
d5722aa2 661 m_out.resize (out_request);
6c7a06a3
TT
662 continue;
663
664 case EINVAL:
665 /* Incomplete input sequence. Let the caller know, and
666 arrange for future calls to see EOF. */
667 *out_result = wchar_iterate_incomplete;
cda6c55b
TT
668 *ptr = m_input;
669 *len = m_bytes;
670 m_bytes = 0;
6c7a06a3
TT
671 return 0;
672
673 default:
9b20d036
MS
674 perror_with_name (_("Internal error while "
675 "converting character sets"));
6c7a06a3
TT
676 }
677 }
678
679 /* We converted something. */
680 num = out_request - out_avail / sizeof (gdb_wchar_t);
681 *out_result = wchar_iterate_ok;
cda6c55b 682 *out_chars = m_out.data ();
6c7a06a3 683 *ptr = orig_inptr;
cda6c55b 684 *len = orig_in - m_bytes;
6c7a06a3 685 return num;
e33d66ec 686 }
6c7a06a3
TT
687
688 /* Really done. */
689 *out_result = wchar_iterate_eof;
690 return -1;
234b45d4
KB
691}
692
ccb2231c
SM
693struct charset_vector
694{
695 ~charset_vector ()
696 {
4f92e10c
TV
697 /* Note that we do not call charset_vector::clear, which would also xfree
698 the elements. This destructor is only called after exit, at which point
699 those will be freed anyway on process exit, so not freeing them now is
700 not classified as a memory leak. OTOH, freeing them now might be
701 classified as a data race, because some worker thread might still be
702 accessing them. */
703 charsets.clear ();
ccb2231c
SM
704 }
705
706 void clear ()
707 {
708 for (char *c : charsets)
709 xfree (c);
234b45d4 710
ccb2231c
SM
711 charsets.clear ();
712 }
713
714 std::vector<char *> charsets;
715};
716
717static charset_vector charsets;
234b45d4 718
6c7a06a3 719#ifdef PHONY_ICONV
234b45d4 720
6c7a06a3
TT
721static void
722find_charset_names (void)
234b45d4 723{
ccb2231c
SM
724 charsets.charsets.push_back (xstrdup (GDB_DEFAULT_HOST_CHARSET));
725 charsets.charsets.push_back (NULL);
234b45d4
KB
726}
727
6c7a06a3 728#else /* PHONY_ICONV */
fc3b640d
TT
729
730/* Sometimes, libiconv redefines iconvlist as libiconvlist -- but
731 provides different symbols in the static and dynamic libraries.
732 So, configure may see libiconvlist but not iconvlist. But, calling
733 iconvlist is the right thing to do and will work. Hence we do a
734 check here but unconditionally call iconvlist below. */
735#if defined (HAVE_ICONVLIST) || defined (HAVE_LIBICONVLIST)
234b45d4 736
6c7a06a3
TT
737/* A helper function that adds some character sets to the vector of
738 all character sets. This is a callback function for iconvlist. */
739
740static int
741add_one (unsigned int count, const char *const *names, void *data)
234b45d4 742{
6c7a06a3 743 unsigned int i;
234b45d4 744
6c7a06a3 745 for (i = 0; i < count; ++i)
ccb2231c 746 charsets.charsets.push_back (xstrdup (names[i]));
234b45d4 747
6c7a06a3 748 return 0;
234b45d4
KB
749}
750
6c7a06a3
TT
751static void
752find_charset_names (void)
234b45d4 753{
6c7a06a3 754 iconvlist (add_one, NULL);
ccb2231c
SM
755
756 charsets.charsets.push_back (NULL);
234b45d4
KB
757}
758
6c7a06a3 759#else
234b45d4 760
40b5c9fb
DE
/* Return non-zero if LINE (output from iconv) should be ignored.
   Older iconv programs (e.g. 2.2.2) include the human readable
   introduction even when stdout is not a tty.  Newer versions omit
   the intro if stdout is not a tty.  */

static int
ignore_line_p (const char *line)
{
  /* This table is used to filter the output.  If this text appears
     anywhere in the line, it is ignored (strstr is used).  */
  static const char * const ignore_lines[] =
    {
      "The following",
      "not necessarily",
      "the FROM and TO",
      "listed with several",
      NULL
    };
  const char * const *entry;

  for (entry = ignore_lines; *entry != NULL; ++entry)
    if (strstr (line, *entry) != NULL)
      return 1;

  return 0;
}
789
6c7a06a3
TT
790static void
791find_charset_names (void)
234b45d4 792{
732f6a93 793 struct pex_obj *child;
a121b7c1 794 const char *args[3];
732f6a93
TT
795 int err, status;
796 int fail = 1;
478aac75 797 int flags;
9a6c7d9c 798 gdb_environ iconv_env = gdb_environ::from_host_environ ();
478aac75 799 char *iconv_program;
40b5c9fb 800
aff410f1
MS
801 /* Older iconvs, e.g. 2.2.2, don't omit the intro text if stdout is
802 not a tty. We need to recognize it and ignore it. This text is
803 subject to translation, so force LANGUAGE=C. */
9a6c7d9c
SDJ
804 iconv_env.set ("LANGUAGE", "C");
805 iconv_env.set ("LC_ALL", "C");
732f6a93 806
40618926 807 child = pex_init (PEX_USE_PIPES, "iconv", NULL);
732f6a93 808
478aac75
DE
809#ifdef ICONV_BIN
810 {
1834d45f
AT
811 std::string iconv_dir = relocate_gdb_directory (ICONV_BIN,
812 ICONV_BIN_RELOCATABLE);
38de8abe
TV
813 iconv_program
814 = concat (iconv_dir.c_str(), SLASH_STRING, "iconv", (char *) NULL);
478aac75
DE
815 }
816#else
817 iconv_program = xstrdup ("iconv");
818#endif
819 args[0] = iconv_program;
732f6a93
TT
820 args[1] = "-l";
821 args[2] = NULL;
478aac75
DE
822 flags = PEX_STDERR_TO_STDOUT;
823#ifndef ICONV_BIN
824 flags |= PEX_SEARCH;
825#endif
732f6a93 826 /* Note that we simply ignore errors here. */
478aac75 827 if (!pex_run_in_environment (child, flags,
a121b7c1 828 args[0], const_cast<char **> (args),
9a6c7d9c 829 iconv_env.envp (),
40b5c9fb 830 NULL, NULL, &err))
732f6a93
TT
831 {
832 FILE *in = pex_read_output (child, 0);
833
834 /* POSIX says that iconv -l uses an unspecified format. We
835 parse the glibc and libiconv formats; feel free to add others
836 as needed. */
40b5c9fb 837
1d6b2d2b 838 while (in != NULL && !feof (in))
732f6a93
TT
839 {
840 /* The size of buf is chosen arbitrarily. */
841 char buf[1024];
842 char *start, *r;
8ea13695 843 int len;
732f6a93
TT
844
845 r = fgets (buf, sizeof (buf), in);
846 if (!r)
847 break;
848 len = strlen (r);
849 if (len <= 3)
850 continue;
40b5c9fb
DE
851 if (ignore_line_p (r))
852 continue;
853
732f6a93
TT
854 /* Strip off the newline. */
855 --len;
856 /* Strip off one or two '/'s. glibc will print lines like
857 "8859_7//", but also "10646-1:1993/UCS4/". */
858 if (buf[len - 1] == '/')
859 --len;
860 if (buf[len - 1] == '/')
861 --len;
862 buf[len] = '\0';
863
864 /* libiconv will print multiple entries per line, separated
aff410f1
MS
865 by spaces. Older iconvs will print multiple entries per
866 line, indented by two spaces, and separated by ", "
40b5c9fb 867 (i.e. the human readable form). */
732f6a93
TT
868 start = buf;
869 while (1)
870 {
871 int keep_going;
872 char *p;
873
40b5c9fb
DE
874 /* Skip leading blanks. */
875 for (p = start; *p && *p == ' '; ++p)
876 ;
877 start = p;
878 /* Find the next space, comma, or end-of-line. */
879 for ( ; *p && *p != ' ' && *p != ','; ++p)
732f6a93
TT
880 ;
881 /* Ignore an empty result. */
882 if (p == start)
883 break;
884 keep_going = *p;
885 *p = '\0';
ccb2231c 886 charsets.charsets.push_back (xstrdup (start));
732f6a93
TT
887 if (!keep_going)
888 break;
889 /* Skip any extra spaces. */
890 for (start = p + 1; *start && *start == ' '; ++start)
891 ;
892 }
893 }
234b45d4 894
732f6a93
TT
895 if (pex_get_status (child, 1, &status)
896 && WIFEXITED (status) && !WEXITSTATUS (status))
897 fail = 0;
234b45d4 898
6c7a06a3 899 }
234b45d4 900
478aac75 901 xfree (iconv_program);
732f6a93 902 pex_free (child);
234b45d4 903
732f6a93
TT
904 if (fail)
905 {
906 /* Some error occurred, so drop the vector. */
ccb2231c 907 charsets.clear ();
732f6a93
TT
908 }
909 else
ccb2231c 910 charsets.charsets.push_back (NULL);
6c7a06a3 911}
234b45d4 912
fc3b640d 913#endif /* HAVE_ICONVLIST || HAVE_LIBICONVLIST */
6c7a06a3 914#endif /* PHONY_ICONV */
234b45d4 915
f870a310
TT
916/* The "auto" target charset used by default_auto_charset. */
917static const char *auto_target_charset_name = GDB_DEFAULT_TARGET_CHARSET;
918
919const char *
920default_auto_charset (void)
921{
922 return auto_target_charset_name;
923}
924
925const char *
926default_auto_wide_charset (void)
927{
928 return GDB_DEFAULT_TARGET_WIDE_CHARSET;
929}
930
bcb28afc
PM
931
#ifdef USE_INTERMEDIATE_ENCODING_FUNCTION
/* Macro used for UTF or UCS endianness suffix.  */
#if WORDS_BIGENDIAN
#define ENDIAN_SUFFIX "BE"
#else
#define ENDIAN_SUFFIX "LE"
#endif

/* GDB cannot handle strings correctly if this size is different.  */

static_assert (sizeof (gdb_wchar_t) == 2 || sizeof (gdb_wchar_t) == 4);

/* intermediate_encoding returns the charset used internally by
   GDB to convert between target and host encodings.  As the test above
   compiled, sizeof (gdb_wchar_t) is either 2 or 4 bytes.
   UTF-16/32 is tested first, UCS-2/4 is tested as a second option,
   otherwise an error is generated.  The first name that iconv_open
   accepts is remembered and returned by all later calls.  */

const char *
intermediate_encoding (void)
{
  iconv_t desc;
  static const char *stored_result = NULL;
  gdb::unique_xmalloc_ptr<char> result;

  if (stored_result)
    return stored_result;

  /* First try, with the UTF name; the number is the width in bits.  */
  result = xstrprintf ("UTF-%d%s", (int) (sizeof (gdb_wchar_t) * 8),
		       ENDIAN_SUFFIX);
  /* Check that the name is supported by iconv_open.  */
  desc = iconv_open (result.get (), host_charset ());
  if (desc != (iconv_t) -1)
    {
      iconv_close (desc);
      stored_result = result.release ();
      return stored_result;
    }

  /* Second try, with UCS-2 type; the number is the width in bytes.  */
  result = xstrprintf ("UCS-%d%s", (int) sizeof (gdb_wchar_t),
		       ENDIAN_SUFFIX);
  /* Check that the name is supported by iconv_open.  */
  desc = iconv_open (result.get (), host_charset ());
  if (desc != (iconv_t) -1)
    {
      iconv_close (desc);
      stored_result = result.release ();
      return stored_result;
    }

  /* No valid charset found, generate error here.  */
  error (_("Unable to find a valid charset for string conversions"));
}

#endif /* USE_INTERMEDIATE_ENCODING_FUNCTION */
985
6c265988 986void _initialize_charset ();
234b45d4 987void
6c265988 988_initialize_charset ()
234b45d4 989{
f870a310 990 /* The first element is always "auto". */
ccb2231c 991 charsets.charsets.push_back (xstrdup ("auto"));
6c7a06a3
TT
992 find_charset_names ();
993
ccb2231c 994 if (charsets.charsets.size () > 1)
27087b7f 995 charset_enum = (const char * const *) charsets.charsets.data ();
6c7a06a3
TT
996 else
997 charset_enum = default_charset_names;
998
999#ifndef PHONY_ICONV
1000#ifdef HAVE_LANGINFO_CODESET
f870a310
TT
1001 /* The result of nl_langinfo may be overwritten later. This may
1002 leak a little memory, if the user later changes the host charset,
1003 but that doesn't matter much. */
1004 auto_host_charset_name = xstrdup (nl_langinfo (CODESET));
aff410f1
MS
1005 /* Solaris will return `646' here -- but the Solaris iconv then does
1006 not accept this. Darwin (and maybe FreeBSD) may return "" here,
06be6983
TG
1007 which GNU libiconv doesn't like (infinite loop). */
1008 if (!strcmp (auto_host_charset_name, "646") || !*auto_host_charset_name)
58720494 1009 auto_host_charset_name = "ASCII";
f870a310
TT
1010 auto_target_charset_name = auto_host_charset_name;
1011#elif defined (USE_WIN32API)
1012 {
3e43a32a
MS
1013 /* "CP" + x<=5 digits + paranoia. */
1014 static char w32_host_default_charset[16];
f870a310
TT
1015
1016 snprintf (w32_host_default_charset, sizeof w32_host_default_charset,
1017 "CP%d", GetACP());
1018 auto_host_charset_name = w32_host_default_charset;
1019 auto_target_charset_name = auto_host_charset_name;
1020 }
6c7a06a3
TT
1021#endif
1022#endif
e33d66ec 1023
dedb7102
TT
1024 /* Recall that the first element is always "auto". */
1025 host_charset_name = charset_enum[0];
1026 gdb_assert (strcmp (host_charset_name, "auto") == 0);
7ab04401 1027 add_setshow_enum_cmd ("charset", class_support,
f870a310 1028 charset_enum, &host_charset_name, _("\
7ab04401
AC
1029Set the host and target character sets."), _("\
1030Show the host and target character sets."), _("\
3d263c1d
BI
1031The `host character set' is the one used by the system GDB is running on.\n\
1032The `target character set' is the one used by the program being debugged.\n\
1033You may only use supersets of ASCII for your host character set; GDB does\n\
1034not support any others.\n\
1035To see a list of the character sets GDB supports, type `set charset <TAB>'."),
7ab04401
AC
1036 /* Note that the sfunc below needs to set
1037 target_charset_name, because the 'set
1038 charset' command sets two variables. */
1039 set_charset_sfunc,
1040 show_charset,
1041 &setlist, &showlist);
1042
1043 add_setshow_enum_cmd ("host-charset", class_support,
6c7a06a3 1044 charset_enum, &host_charset_name, _("\
7ab04401
AC
1045Set the host character set."), _("\
1046Show the host character set."), _("\
3d263c1d
BI
1047The `host character set' is the one used by the system GDB is running on.\n\
1048You may only use supersets of ASCII for your host character set; GDB does\n\
ac74f770
MS
1049not support any others.\n\
1050To see a list of the character sets GDB supports, type `set host-charset <TAB>'."),
7ab04401 1051 set_host_charset_sfunc,
920d2a44 1052 show_host_charset_name,
7ab04401
AC
1053 &setlist, &showlist);
1054
dedb7102
TT
1055 /* Recall that the first element is always "auto". */
1056 target_charset_name = charset_enum[0];
1057 gdb_assert (strcmp (target_charset_name, "auto") == 0);
7ab04401 1058 add_setshow_enum_cmd ("target-charset", class_support,
f870a310 1059 charset_enum, &target_charset_name, _("\
7ab04401
AC
1060Set the target character set."), _("\
1061Show the target character set."), _("\
3d263c1d
BI
1062The `target character set' is the one used by the program being debugged.\n\
1063GDB translates characters and strings between the host and target\n\
b670013c 1064character sets as needed.\n\
ac74f770 1065To see a list of the character sets GDB supports, type `set target-charset'<TAB>"),
7ab04401 1066 set_target_charset_sfunc,
920d2a44 1067 show_target_charset_name,
7ab04401 1068 &setlist, &showlist);
6c7a06a3 1069
dedb7102
TT
1070 /* Recall that the first element is always "auto". */
1071 target_wide_charset_name = charset_enum[0];
1072 gdb_assert (strcmp (target_wide_charset_name, "auto") == 0);
6c7a06a3 1073 add_setshow_enum_cmd ("target-wide-charset", class_support,
f870a310 1074 charset_enum, &target_wide_charset_name,
6c7a06a3
TT
1075 _("\
1076Set the target wide character set."), _("\
1077Show the target wide character set."), _("\
3e43a32a
MS
1078The `target wide character set' is the one used by the program being debugged.\
1079\nIn particular it is the encoding used by `wchar_t'.\n\
6c7a06a3
TT
1080GDB translates characters and strings between the host and target\n\
1081character sets as needed.\n\
1082To see a list of the character sets GDB supports, type\n\
1083`set target-wide-charset'<TAB>"),
1084 set_target_wide_charset_sfunc,
1085 show_target_wide_charset_name,
1086 &setlist, &showlist);
234b45d4 1087}