]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blame - binutils/strings.c
Add note about adding ChangeLog.git to src-release.sh
[thirdparty/binutils-gdb.git] / binutils / strings.c
CommitLineData
252b5132 1/* strings -- print the strings of printable characters in files
d87bef3a 2 Copyright (C) 1993-2023 Free Software Foundation, Inc.
252b5132
RH
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
32866df7 6 the Free Software Foundation; either version 3, or (at your option)
252b5132
RH
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
b43b5d5f
NC
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
252b5132
RH
18\f
19/* Usage: strings [options] file...
20
21 Options:
22 --all
23 -a
7fac9594
NC
24 - Scan each file in its entirety.
25
26 --data
27 -d Scan only the initialized data section(s) of object files.
252b5132
RH
28
29 --print-file-name
30 -f Print the name of the file before each string.
31
32 --bytes=min-len
33 -n min-len
34 -min-len Print graphic char sequences, MIN-LEN or more bytes long,
8fee99c3
NC
35 that are followed by a NUL or a non-displayable character.
36 Default is 4.
252b5132
RH
37
38 --radix={o,x,d}
39 -t {o,x,d} Print the offset within the file before each string,
40 in octal/hex/decimal.
41
334ac421
EA
42 --include-all-whitespace
43 -w By default tab and space are the only whitespace included in graphic
44 char sequences. This option considers all of isspace() valid.
45
252b5132
RH
46 -o Like -to. (Some other implementations have -o like -to,
47 others like -td. We chose one arbitrarily.)
48
8745eafa
NC
49 --encoding={s,S,b,l,B,L}
50 -e {s,S,b,l,B,L}
51 Select character encoding: 7-bit-character, 8-bit-character,
52 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
53 littleendian 32-bit.
d132876a 54
252b5132 55 --target=BFDNAME
3bf31ec9 56 -T {bfdname}
252b5132
RH
57 Specify a non-default object file format.
58
b3aa80b4 59 --unicode={default|locale|invalid|hex|escape|highlight}
584294c4 60 -U {d|l|i|x|e|h}
795588ae 61 Determine how to handle UTF-8 unicode characters. The default
b3aa80b4
NC
62 is no special treatment. All other versions of this option
63 only apply if the encoding is valid and enabling the option
64 implies --encoding=S.
65 The 'locale' option displays the characters according to the
66 current locale. The 'invalid' option treats them as
67 non-string characters. The 'hex' option displays them as hex
68 byte sequences. The 'escape' option displays them as escape
69 sequences and the 'highlight' option displays them as
70 coloured escape sequences.
71
55edd97b
EA
72 --output-separator=sep_string
73 -s sep_string String used to separate parsed strings in output.
74 Default is newline.
75
252b5132
RH
76 --help
77 -h Print the usage message on the standard output.
78
79 --version
ffbe5983 80 -V
252b5132
RH
81 -v Print the program version number.
82
83 Written by Richard Stallman <rms@gnu.ai.mit.edu>
84 and David MacKenzie <djm@gnu.ai.mit.edu>. */
85
3db64b00 86#include "sysdep.h"
252b5132 87#include "bfd.h"
e9792343 88#include "getopt.h"
252b5132 89#include "libiberty.h"
3882b010 90#include "safe-ctype.h"
3db64b00 91#include "bucomm.h"
252b5132 92
b3aa80b4
NC
/* String equality test; only defined here if no header provided one.  */
#ifndef streq
#define streq(a,b) (strcmp ((a),(b)) == 0)
#endif

/* The ways in which UTF-8 encoded characters can be treated.  The value
   is selected with the -U/--unicode option; anything other than
   unicode_default also forces the 'S' encoding (see main).  */
typedef enum unicode_display_type
{
  unicode_default = 0,	/* No special treatment.  */
  unicode_locale,	/* Print the bytes of the character as they are.  */
  unicode_escape,	/* Print as a \u escape sequence.  */
  unicode_hex,		/* Print as <0x...> hex bytes.  */
  unicode_highlight,	/* Like unicode_escape, coloured on a terminal.  */
  unicode_invalid	/* Treat as not being part of a string.  */
} unicode_display_type;

static unicode_display_type unicode_display = unicode_default;

/* Non-zero if C, a value in the range 0..255, may be part of a string.
   TAB and printable characters always qualify.  Values above 127 qualify
   when the 'S' (8-bit) encoding is selected, and any whitespace qualifies
   when include_all_whitespace is set.  C is evaluated more than once.  */
#define STRING_ISGRAPHIC(c)						\
  ((c) >= 0								\
   && (c) <= 255							\
   && ((c) == '\t'							\
       || ISPRINT (c)							\
       || (encoding == 'S' && (c) > 127)				\
       || (include_all_whitespace && ISSPACE (c))))
252b5132
RH
115
116#ifndef errno
117extern int errno;
118#endif
119
120/* The BFD section flags that identify an initialized data section. */
121#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)
122
123/* Radix for printing addresses (must be 8, 10 or 16). */
124static int address_radix;
125
126/* Minimum length of sequence of graphic chars to trigger output. */
795588ae 127static unsigned int string_min;
252b5132 128
334ac421 129/* Whether or not we include all whitespace as a graphic char. */
015dc7e1 130static bool include_all_whitespace;
334ac421 131
b34976b6 132/* TRUE means print address within file for each string. */
015dc7e1 133static bool print_addresses;
252b5132 134
b34976b6 135/* TRUE means print filename for each string. */
015dc7e1 136static bool print_filenames;
252b5132 137
b34976b6 138/* TRUE means for object files scan only the data section. */
015dc7e1 139static bool datasection_only;
252b5132 140
252b5132
RH
141/* The BFD object file format. */
142static char *target;
143
d132876a
NC
144/* The character encoding format. */
145static char encoding;
146static int encoding_bytes;
147
55edd97b
EA
148/* Output string used to separate parsed strings */
149static char *output_separator;
150
252b5132
RH
151static struct option long_options[] =
152{
153 {"all", no_argument, NULL, 'a'},
b3aa80b4 154 {"bytes", required_argument, NULL, 'n'},
7fac9594 155 {"data", no_argument, NULL, 'd'},
b3aa80b4
NC
156 {"encoding", required_argument, NULL, 'e'},
157 {"help", no_argument, NULL, 'h'},
158 {"include-all-whitespace", no_argument, NULL, 'w'},
159 {"output-separator", required_argument, NULL, 's'},
252b5132 160 {"print-file-name", no_argument, NULL, 'f'},
252b5132
RH
161 {"radix", required_argument, NULL, 't'},
162 {"target", required_argument, NULL, 'T'},
b3aa80b4 163 {"unicode", required_argument, NULL, 'U'},
252b5132
RH
164 {"version", no_argument, NULL, 'v'},
165 {NULL, 0, NULL, 0}
166};
167
015dc7e1 168static bool strings_file (char *);
b3aa80b4 169static void print_strings (const char *, FILE *, file_ptr, int, char *);
1e0f0b4d 170static void usage (FILE *, int) ATTRIBUTE_NORETURN;
252b5132 171\f
2da42df6 172int main (int, char **);
65de42c0 173
252b5132 174int
2da42df6 175main (int argc, char **argv)
252b5132
RH
176{
177 int optc;
178 int exit_status = 0;
015dc7e1 179 bool files_given = false;
508e676d 180 char *s;
e36aef42 181 int numeric_opt = 0;
252b5132 182
1c529ca6 183 setlocale (LC_ALL, "");
252b5132
RH
184 bindtextdomain (PACKAGE, LOCALEDIR);
185 textdomain (PACKAGE);
186
187 program_name = argv[0];
188 xmalloc_set_program_name (program_name);
86eafac0 189 bfd_set_error_program_name (program_name);
869b9d07
MM
190
191 expandargv (&argc, &argv);
192
c904a764 193 string_min = 4;
015dc7e1
AM
194 include_all_whitespace = false;
195 print_addresses = false;
196 print_filenames = false;
7fac9594 197 if (DEFAULT_STRINGS_ALL)
015dc7e1 198 datasection_only = false;
7fac9594 199 else
015dc7e1 200 datasection_only = true;
252b5132 201 target = NULL;
d132876a 202 encoding = 's';
55edd97b 203 output_separator = NULL;
252b5132 204
b3aa80b4 205 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
252b5132
RH
206 long_options, (int *) 0)) != EOF)
207 {
208 switch (optc)
209 {
210 case 'a':
015dc7e1 211 datasection_only = false;
252b5132
RH
212 break;
213
7fac9594 214 case 'd':
015dc7e1 215 datasection_only = true;
7fac9594
NC
216 break;
217
252b5132 218 case 'f':
015dc7e1 219 print_filenames = true;
252b5132
RH
220 break;
221
8b53311e 222 case 'H':
252b5132
RH
223 case 'h':
224 usage (stdout, 0);
225
226 case 'n':
508e676d
JK
227 string_min = (int) strtoul (optarg, &s, 0);
228 if (s != NULL && *s != 0)
229 fatal (_("invalid integer argument %s"), optarg);
252b5132
RH
230 break;
231
334ac421 232 case 'w':
015dc7e1 233 include_all_whitespace = true;
334ac421
EA
234 break;
235
252b5132 236 case 'o':
015dc7e1 237 print_addresses = true;
252b5132
RH
238 address_radix = 8;
239 break;
240
241 case 't':
015dc7e1 242 print_addresses = true;
252b5132
RH
243 if (optarg[1] != '\0')
244 usage (stderr, 1);
245 switch (optarg[0])
246 {
247 case 'o':
248 address_radix = 8;
249 break;
250
251 case 'd':
252 address_radix = 10;
253 break;
254
255 case 'x':
256 address_radix = 16;
257 break;
258
259 default:
260 usage (stderr, 1);
261 }
262 break;
263
264 case 'T':
265 target = optarg;
266 break;
267
d132876a
NC
268 case 'e':
269 if (optarg[1] != '\0')
270 usage (stderr, 1);
271 encoding = optarg[0];
272 break;
273
55edd97b
EA
274 case 's':
275 output_separator = optarg;
795588ae 276 break;
55edd97b 277
b3aa80b4
NC
278 case 'U':
279 if (streq (optarg, "default") || streq (optarg, "d"))
280 unicode_display = unicode_default;
281 else if (streq (optarg, "locale") || streq (optarg, "l"))
282 unicode_display = unicode_locale;
283 else if (streq (optarg, "escape") || streq (optarg, "e"))
284 unicode_display = unicode_escape;
285 else if (streq (optarg, "invalid") || streq (optarg, "i"))
286 unicode_display = unicode_invalid;
287 else if (streq (optarg, "hex") || streq (optarg, "x"))
288 unicode_display = unicode_hex;
289 else if (streq (optarg, "highlight") || streq (optarg, "h"))
290 unicode_display = unicode_highlight;
291 else
292 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
293 break;
294
8b53311e 295 case 'V':
252b5132
RH
296 case 'v':
297 print_version ("strings");
298 break;
299
300 case '?':
301 usage (stderr, 1);
302
303 default:
e36aef42 304 numeric_opt = optind;
252b5132
RH
305 break;
306 }
307 }
308
b3aa80b4
NC
309 if (unicode_display != unicode_default)
310 encoding = 'S';
311
e36aef42
AM
312 if (numeric_opt != 0)
313 {
314 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
315 if (s != NULL && *s != 0)
316 fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
317 }
c904a764
NC
318 if (string_min < 1)
319 fatal (_("invalid minimum string length %d"), string_min);
252b5132 320
d132876a
NC
321 switch (encoding)
322 {
8745eafa 323 case 'S':
d132876a
NC
324 case 's':
325 encoding_bytes = 1;
326 break;
327 case 'b':
328 case 'l':
329 encoding_bytes = 2;
330 break;
331 case 'B':
332 case 'L':
333 encoding_bytes = 4;
334 break;
335 default:
336 usage (stderr, 1);
337 }
338
bf2dd8d7
AM
339 if (bfd_init () != BFD_INIT_MAGIC)
340 fatal (_("fatal error: libbfd ABI mismatch"));
252b5132
RH
341 set_default_bfd_target ();
342
343 if (optind >= argc)
344 {
015dc7e1 345 datasection_only = false;
5af11cab 346 SET_BINARY (fileno (stdin));
b3aa80b4 347 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
015dc7e1 348 files_given = true;
252b5132
RH
349 }
350 else
351 {
352 for (; optind < argc; ++optind)
353 {
b3aa80b4 354 if (streq (argv[optind], "-"))
015dc7e1 355 datasection_only = false;
252b5132
RH
356 else
357 {
015dc7e1 358 files_given = true;
535b785f 359 exit_status |= !strings_file (argv[optind]);
252b5132
RH
360 }
361 }
362 }
363
b34976b6 364 if (!files_given)
252b5132
RH
365 usage (stderr, 1);
366
367 return (exit_status);
368}
369\f
19871f45
AM
370/* Scan section SECT of the file ABFD, whose printable name is
371 FILENAME. If it contains initialized data set GOT_A_SECTION and
372 print the strings in it. */
252b5132
RH
373
374static void
19871f45 375strings_a_section (bfd *abfd, asection *sect, const char *filename,
015dc7e1 376 bool *got_a_section)
252b5132 377{
06803313 378 bfd_size_type sectsize;
19871f45 379 bfd_byte *mem;
3aade688 380
06803313
NC
381 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
382 return;
383
fd361982 384 sectsize = bfd_section_size (sect);
19871f45 385 if (sectsize == 0)
06803313
NC
386 return;
387
19871f45 388 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
252b5132 389 {
19871f45
AM
390 non_fatal (_("%s: Reading section %s failed: %s"),
391 filename, sect->name, bfd_errmsg (bfd_get_error ()));
392 return;
252b5132 393 }
06803313 394
015dc7e1 395 *got_a_section = true;
b3aa80b4 396 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
06803313 397 free (mem);
252b5132
RH
398}
399
400/* Scan all of the sections in FILE, and print the strings
401 in the initialized data section(s).
402
b34976b6
AM
403 Return TRUE if successful,
404 FALSE if not (such as if FILE is not an object file). */
252b5132 405
015dc7e1 406static bool
2da42df6 407strings_object_file (const char *file)
252b5132 408{
06803313 409 bfd *abfd;
19871f45 410 asection *s;
015dc7e1 411 bool got_a_section;
06803313
NC
412
413 abfd = bfd_openr (file, target);
252b5132
RH
414
415 if (abfd == NULL)
8745eafa 416 /* Treat the file as a non-object file. */
015dc7e1 417 return false;
252b5132
RH
418
419 /* This call is mainly for its side effect of reading in the sections.
420 We follow the traditional behavior of `strings' in that we don't
421 complain if we don't recognize a file to be an object file. */
b34976b6 422 if (!bfd_check_format (abfd, bfd_object))
252b5132
RH
423 {
424 bfd_close (abfd);
015dc7e1 425 return false;
252b5132
RH
426 }
427
015dc7e1 428 got_a_section = false;
19871f45
AM
429 for (s = abfd->sections; s != NULL; s = s->next)
430 strings_a_section (abfd, s, file, &got_a_section);
252b5132
RH
431
432 if (!bfd_close (abfd))
433 {
434 bfd_nonfatal (file);
015dc7e1 435 return false;
252b5132
RH
436 }
437
438 return got_a_section;
439}
440
b34976b6 441/* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
252b5132 442
015dc7e1 443static bool
2da42df6 444strings_file (char *file)
252b5132 445{
ee2fb9eb
JK
446 struct stat st;
447
448 /* get_file_size does not support non-S_ISREG files. */
fb5b5478 449
ee2fb9eb 450 if (stat (file, &st) < 0)
fb5b5478
JJ
451 {
452 if (errno == ENOENT)
453 non_fatal (_("'%s': No such file"), file);
454 else
455 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
456 file, strerror (errno));
015dc7e1 457 return false;
fb5b5478 458 }
0e158763
NC
459 else if (S_ISDIR (st.st_mode))
460 {
461 non_fatal (_("Warning: '%s' is a directory"), file);
015dc7e1 462 return false;
0e158763 463 }
f24ddbdd 464
252b5132
RH
465 /* If we weren't told to scan the whole file,
466 try to open it as an object file and only look at
467 initialized data sections. If that fails, fall back to the
468 whole file. */
469 if (!datasection_only || !strings_object_file (file))
470 {
471 FILE *stream;
472
ee2fb9eb 473 stream = fopen (file, FOPEN_RB);
252b5132
RH
474 if (stream == NULL)
475 {
476 fprintf (stderr, "%s: ", program_name);
477 perror (file);
015dc7e1 478 return false;
252b5132
RH
479 }
480
b3aa80b4 481 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
252b5132
RH
482
483 if (fclose (stream) == EOF)
484 {
485 fprintf (stderr, "%s: ", program_name);
486 perror (file);
015dc7e1 487 return false;
252b5132
RH
488 }
489 }
490
015dc7e1 491 return true;
252b5132
RH
492}
493\f
d132876a
NC
494/* Read the next character, return EOF if none available.
495 Assume that STREAM is positioned so that the next byte read
496 is at address ADDRESS in the file.
497
498 If STREAM is NULL, do not read from it.
499 The caller can supply a buffer of characters
500 to be processed before the data in STREAM.
501 MAGIC is the address of the buffer and
502 MAGICCOUNT is how many characters are in it. */
503
504static long
ee2fb9eb 505get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
d132876a
NC
506{
507 int c, i;
c54e2ec1 508 long r = 0;
d132876a
NC
509
510 for (i = 0; i < encoding_bytes; i++)
511 {
512 if (*magiccount)
513 {
514 (*magiccount)--;
515 c = *(*magic)++;
516 }
517 else
518 {
519 if (stream == NULL)
520 return EOF;
b7d4af3a
JW
521
522 /* Only use getc_unlocked if we found a declaration for it.
523 Otherwise, libc is not thread safe by default, and we
524 should not use it. */
525
526#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
cedd9a58
JJ
527 c = getc_unlocked (stream);
528#else
d132876a 529 c = getc (stream);
cedd9a58 530#endif
d132876a
NC
531 if (c == EOF)
532 return EOF;
533 }
534
535 (*address)++;
c54e2ec1 536 r = (r << 8) | (c & 0xff);
d132876a
NC
537 }
538
539 switch (encoding)
540 {
c54e2ec1 541 default:
d132876a
NC
542 break;
543 case 'l':
c54e2ec1 544 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
d132876a
NC
545 break;
546 case 'L':
c54e2ec1
AM
547 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
548 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
d132876a
NC
549 break;
550 }
551
d132876a
NC
552 return r;
553}
7ca166c9
AM
554
555/* Throw away one byte of a (possibly) multi-byte char C, updating
556 address and buffer to suit. */
557
558static void
559unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
560{
561 static char tmp[4];
562
563 if (encoding_bytes > 1)
564 {
565 *address -= encoding_bytes - 1;
566
567 if (*magiccount == 0)
568 {
569 /* If no magic buffer exists, use temp buffer. */
570 switch (encoding)
571 {
572 default:
573 break;
574 case 'b':
575 tmp[0] = c & 0xff;
576 *magiccount = 1;
577 break;
578 case 'l':
579 tmp[0] = (c >> 8) & 0xff;
580 *magiccount = 1;
581 break;
582 case 'B':
583 tmp[0] = (c >> 16) & 0xff;
584 tmp[1] = (c >> 8) & 0xff;
585 tmp[2] = c & 0xff;
586 *magiccount = 3;
587 break;
588 case 'L':
589 tmp[0] = (c >> 8) & 0xff;
590 tmp[1] = (c >> 16) & 0xff;
591 tmp[2] = (c >> 24) & 0xff;
592 *magiccount = 3;
593 break;
594 }
595 *magic = tmp;
596 }
597 else
598 {
599 /* If magic buffer exists, rewind. */
600 *magic -= encoding_bytes - 1;
601 *magiccount += encoding_bytes - 1;
602 }
603 }
604}
b3aa80b4
NC
605
606static void
607print_filename_and_address (const char * filename, file_ptr address)
608{
609 if (print_filenames)
610 printf ("%s: ", filename);
611
612 if (! print_addresses)
613 return;
614
615 switch (address_radix)
616 {
617 case 8:
618 if (sizeof (address) > sizeof (long))
619 {
620#ifndef __MSVCRT__
621 printf ("%7llo ", (unsigned long long) address);
622#else
623 printf ("%7I64o ", (unsigned long long) address);
624#endif
625 }
626 else
627 printf ("%7lo ", (unsigned long) address);
628 break;
629
630 case 10:
631 if (sizeof (address) > sizeof (long))
632 {
633#ifndef __MSVCRT__
634 printf ("%7llu ", (unsigned long long) address);
635#else
636 printf ("%7I64d ", (unsigned long long) address);
637#endif
638 }
639 else
640 printf ("%7ld ", (long) address);
641 break;
642
643 case 16:
644 if (sizeof (address) > sizeof (long))
645 {
646#ifndef __MSVCRT__
647 printf ("%7llx ", (unsigned long long) address);
648#else
649 printf ("%7I64x ", (unsigned long long) address);
650#endif
651 }
652 else
653 printf ("%7lx ", (unsigned long) address);
654 break;
655 }
656}
657
/* Decide whether the BUFLEN bytes at BUFFER start with a multi-byte
   UTF-8 sequence.  Returns the length of the sequence (2, 3 or 4) if
   so, otherwise 0.

   The length is taken from bits 0x20 and 0x10 of the first byte, and
   every following byte of the sequence must have the form 10xxxxxx.
   Note that any first byte of 0xf0 or above is treated as starting a
   four byte sequence.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  const unsigned char lead = buffer[0];
  unsigned int expected;
  unsigned int idx;

  /* ASCII and continuation bytes cannot start a sequence.  */
  if (lead < 0xc0)
    return 0;

  if ((lead & 0x20) == 0)
    expected = 2;
  else if ((lead & 0x10) == 0)
    expected = 3;
  else
    expected = 4;

  /* The whole sequence has to be inside the buffer.  */
  if (buflen < expected)
    return 0;

  for (idx = 1; idx < expected; idx++)
    if ((buffer[idx] & 0xc0) != 0x80)
      return 0;

  return expected;
}
693
694/* Display a UTF-8 encoded character in BUFFER according to the setting
695 of unicode_display. The character is known to be valid.
696 Returns the number of bytes consumed. */
697
795588ae 698static unsigned int
b3aa80b4
NC
699display_utf8_char (const unsigned char * buffer)
700{
795588ae
PS
701 unsigned int j;
702 unsigned int utf8_len;
b3aa80b4
NC
703
704 switch (buffer[0] & 0x30)
705 {
706 case 0x00:
707 case 0x10:
708 utf8_len = 2;
709 break;
710 case 0x20:
711 utf8_len = 3;
712 break;
713 default:
714 utf8_len = 4;
715 }
795588ae 716
b3aa80b4
NC
717 switch (unicode_display)
718 {
719 default:
720 fprintf (stderr, "ICE: unexpected unicode display type\n");
721 break;
722
723 case unicode_escape:
724 case unicode_highlight:
725 if (unicode_display == unicode_highlight && isatty (1))
726 printf ("\x1B[31;47m"); /* Red. */
727
728 switch (utf8_len)
729 {
730 case 2:
731 printf ("\\u%02x%02x",
795588ae 732 ((buffer[0] & 0x1c) >> 2),
b3aa80b4
NC
733 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
734 break;
735
736 case 3:
737 printf ("\\u%02x%02x",
738 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
739 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
740 break;
741
742 case 4:
743 printf ("\\u%02x%02x%02x",
744 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
745 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
746 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
747 break;
748 default:
749 /* URG. */
750 break;
751 }
752
753 if (unicode_display == unicode_highlight && isatty (1))
754 printf ("\033[0m"); /* Default colour. */
755 break;
756
757 case unicode_hex:
758 putchar ('<');
759 printf ("0x");
760 for (j = 0; j < utf8_len; j++)
761 printf ("%02x", buffer [j]);
762 putchar ('>');
763 break;
764
765 case unicode_locale:
766 printf ("%.1s", buffer);
767 break;
768 }
769
770 return utf8_len;
771}
772
773/* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
774 according to the setting of the unicode_display variable. The buffer
775 contains BUFLEN bytes.
776
777 Display the characters as if they started at ADDRESS and are contained in
778 FILENAME. */
779
780static void
781print_unicode_buffer (const char * filename,
782 file_ptr address,
783 const unsigned char * buffer,
784 unsigned long buflen)
785{
786 /* Paranoia checks... */
787 if (filename == NULL
788 || buffer == NULL
789 || unicode_display == unicode_default
790 || encoding != 'S'
791 || encoding_bytes != 1)
792 {
793 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
794 return;
795 }
796
797 if (buflen == 0)
798 return;
799
800 /* We must only display strings that are at least string_min *characters*
801 long. So we scan the buffer in two stages. First we locate the start
802 of a potential string. Then we walk along it until we have found
803 string_min characters. Then we go back to the start point and start
804 displaying characters according to the unicode_display setting. */
805
806 unsigned long start_point = 0;
807 unsigned long i = 0;
808 unsigned int char_len = 1;
809 unsigned int num_found = 0;
810
811 for (i = 0; i < buflen; i += char_len)
812 {
813 int c = buffer[i];
814
815 char_len = 1;
816
817 /* Find the first potential character of a string. */
818 if (! STRING_ISGRAPHIC (c))
819 {
820 num_found = 0;
821 continue;
822 }
823
824 if (c > 126)
825 {
826 if (c < 0xc0)
827 {
828 num_found = 0;
829 continue;
830 }
831
832 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
833 {
834 char_len = 1;
835 num_found = 0;
836 continue;
837 }
838
839 if (unicode_display == unicode_invalid)
840 {
841 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
842 num_found = 0;
843 continue;
844 }
845 }
846
847 if (num_found == 0)
848 /* We have found a potential starting point for a string. */
849 start_point = i;
850
851 ++ num_found;
852
853 if (num_found >= string_min)
854 break;
855 }
856
857 if (num_found < string_min)
858 return;
859
860 print_filename_and_address (filename, address + start_point);
795588ae 861
b3aa80b4
NC
862 /* We have found string_min characters. Display them and any
863 more that follow. */
864 for (i = start_point; i < buflen; i += char_len)
865 {
866 int c = buffer[i];
867
868 char_len = 1;
869
870 if (! STRING_ISGRAPHIC (c))
871 break;
872 else if (c < 127)
873 putchar (c);
874 else if (! is_valid_utf8 (buffer + i, buflen - i))
875 break;
876 else if (unicode_display == unicode_invalid)
877 break;
878 else
879 char_len = display_utf8_char (buffer + i);
880 }
881
882 if (output_separator)
883 fputs (output_separator, stdout);
884 else
885 putchar ('\n');
886
887 /* FIXME: Using tail recursion here is lazy programming... */
888 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
889}
890
/* Return the next byte for the unicode stream scanner.

   If any bytes have been put back then the one most recently stored in
   PUTBACK is returned and *NUM_PUTBACK is decremented.  Otherwise
   *NUM_READ is incremented and a byte is read from STREAM; the result
   is EOF if the stream has no more data.  */

static int
get_unicode_byte (FILE *          stream,
		  unsigned char * putback,
		  unsigned int *  num_putback,
		  unsigned int *  num_read)
{
  if (*num_putback == 0)
    {
      ++*num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
      return getc_unlocked (stream);
#else
      return getc (stream);
#endif
    }

  --*num_putback;
  return putback[*num_putback];
}
911
912/* Helper function for print_unicode_stream. */
913
914static void
915print_unicode_stream_body (const char * filename,
916 file_ptr address,
917 FILE * stream,
918 unsigned char * putback_buf,
795588ae 919 unsigned int num_putback,
b3aa80b4
NC
920 unsigned char * print_buf)
921{
922 /* It would be nice if we could just read the stream into a buffer
923 and then process if with print_unicode_buffer. But the input
924 might be huge or it might time-locked (eg stdin). So instead
925 we go one byte at a time... */
926
927 file_ptr start_point = 0;
795588ae
PS
928 unsigned int num_read = 0;
929 unsigned int num_chars = 0;
930 unsigned int num_print = 0;
a9a09f51 931 int c = 0;
b3aa80b4
NC
932
933 /* Find a series of string_min characters. Put them into print_buf. */
934 do
935 {
936 if (num_chars >= string_min)
937 break;
938
939 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
940 if (c == EOF)
941 break;
942
943 if (! STRING_ISGRAPHIC (c))
944 {
945 num_chars = num_print = 0;
946 continue;
947 }
948
949 if (num_chars == 0)
950 start_point = num_read - 1;
951
952 if (c < 127)
953 {
954 print_buf[num_print] = c;
955 num_chars ++;
956 num_print ++;
957 continue;
958 }
959
960 if (c < 0xc0)
961 {
962 num_chars = num_print = 0;
963 continue;
964 }
965
966 /* We *might* have a UTF-8 sequence. Time to start peeking. */
967 char utf8[4];
968
969 utf8[0] = c;
970 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
971 if (c == EOF)
972 break;
973 utf8[1] = c;
974
975 if ((utf8[1] & 0xc0) != 0x80)
976 {
977 /* Invalid UTF-8. */
978 putback_buf[num_putback++] = utf8[1];
979 num_chars = num_print = 0;
980 continue;
981 }
982 else if ((utf8[0] & 0x20) == 0)
983 {
984 /* A valid 2-byte UTF-8 encoding. */
985 if (unicode_display == unicode_invalid)
986 {
987 putback_buf[num_putback++] = utf8[1];
988 num_chars = num_print = 0;
989 }
990 else
991 {
992 print_buf[num_print ++] = utf8[0];
993 print_buf[num_print ++] = utf8[1];
994 num_chars ++;
995 }
996 continue;
997 }
998
999 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1000 if (c == EOF)
1001 break;
1002 utf8[2] = c;
1003
1004 if ((utf8[2] & 0xc0) != 0x80)
1005 {
1006 /* Invalid UTF-8. */
1007 putback_buf[num_putback++] = utf8[2];
1008 putback_buf[num_putback++] = utf8[1];
1009 num_chars = num_print = 0;
1010 continue;
1011 }
1012 else if ((utf8[0] & 0x10) == 0)
1013 {
1014 /* A valid 3-byte UTF-8 encoding. */
1015 if (unicode_display == unicode_invalid)
1016 {
1017 putback_buf[num_putback++] = utf8[2];
1018 putback_buf[num_putback++] = utf8[1];
1019 num_chars = num_print = 0;
1020 }
1021 else
1022 {
1023 print_buf[num_print ++] = utf8[0];
1024 print_buf[num_print ++] = utf8[1];
1025 print_buf[num_print ++] = utf8[2];
1026 num_chars ++;
1027 }
1028 continue;
1029 }
1030
1031 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1032 if (c == EOF)
1033 break;
1034 utf8[3] = c;
1035
1036 if ((utf8[3] & 0xc0) != 0x80)
1037 {
1038 /* Invalid UTF-8. */
1039 putback_buf[num_putback++] = utf8[3];
1040 putback_buf[num_putback++] = utf8[2];
1041 putback_buf[num_putback++] = utf8[1];
1042 num_chars = num_print = 0;
1043 }
1044 /* We have a valid 4-byte UTF-8 encoding. */
1045 else if (unicode_display == unicode_invalid)
1046 {
1047 putback_buf[num_putback++] = utf8[3];
1048 putback_buf[num_putback++] = utf8[1];
1049 putback_buf[num_putback++] = utf8[2];
1050 num_chars = num_print = 0;
1051 }
1052 else
1053 {
1054 print_buf[num_print ++] = utf8[0];
1055 print_buf[num_print ++] = utf8[1];
1056 print_buf[num_print ++] = utf8[2];
1057 print_buf[num_print ++] = utf8[3];
1058 num_chars ++;
1059 }
1060 }
1061 while (1);
1062
1063 if (num_chars >= string_min)
1064 {
1065 /* We know that we have string_min valid characters in print_buf,
1066 and there may be more to come in the stream. Start displaying
1067 them. */
1068
1069 print_filename_and_address (filename, address + start_point);
1070
795588ae 1071 unsigned int i;
b3aa80b4
NC
1072 for (i = 0; i < num_print;)
1073 {
1074 if (print_buf[i] < 127)
1075 putchar (print_buf[i++]);
1076 else
1077 i += display_utf8_char (print_buf + i);
1078 }
1079
1080 /* OK so now we have to start read unchecked bytes. */
1081
795588ae 1082 /* Find a series of string_min characters. Put them into print_buf. */
b3aa80b4
NC
1083 do
1084 {
1085 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1086 if (c == EOF)
1087 break;
1088
1089 if (! STRING_ISGRAPHIC (c))
1090 break;
1091
1092 if (c < 127)
1093 {
1094 putchar (c);
1095 continue;
1096 }
1097
1098 if (c < 0xc0)
1099 break;
1100
1101 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1102 unsigned char utf8[4];
1103
1104 utf8[0] = c;
1105 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1106 if (c == EOF)
1107 break;
1108 utf8[1] = c;
1109
1110 if ((utf8[1] & 0xc0) != 0x80)
1111 {
1112 /* Invalid UTF-8. */
1113 putback_buf[num_putback++] = utf8[1];
1114 break;
1115 }
1116 else if ((utf8[0] & 0x20) == 0)
1117 {
1118 /* Valid 2-byte UTF-8. */
1119 if (unicode_display == unicode_invalid)
1120 {
1121 putback_buf[num_putback++] = utf8[1];
1122 break;
1123 }
1124 else
1125 {
1126 (void) display_utf8_char (utf8);
1127 continue;
1128 }
1129 }
1130
1131 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1132 if (c == EOF)
1133 break;
1134 utf8[2] = c;
1135
1136 if ((utf8[2] & 0xc0) != 0x80)
1137 {
1138 /* Invalid UTF-8. */
1139 putback_buf[num_putback++] = utf8[2];
1140 putback_buf[num_putback++] = utf8[1];
1141 break;
1142 }
1143 else if ((utf8[0] & 0x10) == 0)
1144 {
1145 /* Valid 3-byte UTF-8. */
1146 if (unicode_display == unicode_invalid)
1147 {
1148 putback_buf[num_putback++] = utf8[2];
1149 putback_buf[num_putback++] = utf8[1];
1150 break;
1151 }
1152 else
1153 {
1154 (void) display_utf8_char (utf8);
1155 continue;
1156 }
1157 }
1158
1159 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1160 if (c == EOF)
1161 break;
1162 utf8[3] = c;
1163
1164 if ((utf8[3] & 0xc0) != 0x80)
1165 {
1166 /* Invalid UTF-8. */
1167 putback_buf[num_putback++] = utf8[3];
1168 putback_buf[num_putback++] = utf8[2];
1169 putback_buf[num_putback++] = utf8[1];
1170 break;
1171 }
1172 else if (unicode_display == unicode_invalid)
1173 {
1174 putback_buf[num_putback++] = utf8[3];
1175 putback_buf[num_putback++] = utf8[2];
1176 putback_buf[num_putback++] = utf8[1];
1177 break;
1178 }
1179 else
1180 /* A valid 4-byte UTF-8 encoding. */
1181 (void) display_utf8_char (utf8);
1182 }
1183 while (1);
1184
1185 if (output_separator)
1186 fputs (output_separator, stdout);
1187 else
1188 putchar ('\n');
1189 }
1190
1191 if (c != EOF)
1192 /* FIXME: Using tail recursion here is lazy, but it works. */
1193 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1194}
1195
1196/* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1197 encountered according to the setting of the unicode_display variable.
1198 The stream is positioned at ADDRESS and is attached to FILENAME. */
1199
1200static void
1201print_unicode_stream (const char * filename,
1202 file_ptr address,
1203 FILE * stream)
1204{
1205 /* Paranoia checks... */
1206 if (filename == NULL
1207 || stream == NULL
1208 || unicode_display == unicode_default
1209 || encoding != 'S'
1210 || encoding_bytes != 1)
1211 {
1212 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1213 return;
1214 }
1215
1216 /* Allocate space for string_min 4-byte utf-8 characters. */
1217 unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1218 /* We should never have to put back more than 4 bytes. */
1219 unsigned char putback_buf[5];
795588ae 1220 unsigned int num_putback = 0;
b3aa80b4
NC
1221
1222 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1223 free (print_buf);
1224}
d132876a 1225\f
252b5132
RH
1226/* Find the strings in file FILENAME, read from STREAM.
1227 Assume that STREAM is positioned so that the next byte read
1228 is at address ADDRESS in the file.
252b5132
RH
1229
1230 If STREAM is NULL, do not read from it.
1231 The caller can supply a buffer of characters
1232 to be processed before the data in STREAM.
1233 MAGIC is the address of the buffer and
1234 MAGICCOUNT is how many characters are in it.
1235 Those characters come at address ADDRESS and the data in STREAM follow. */
1236
1237static void
ee2fb9eb 1238print_strings (const char *filename, FILE *stream, file_ptr address,
b3aa80b4 1239 int magiccount, char *magic)
252b5132 1240{
b3aa80b4
NC
1241 if (unicode_display != unicode_default)
1242 {
1243 if (magic != NULL)
1244 print_unicode_buffer (filename, address,
1245 (const unsigned char *) magic, magiccount);
1246
1247 if (stream != NULL)
1248 print_unicode_stream (filename, address, stream);
1249 return;
1250 }
1251
d132876a 1252 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
252b5132
RH
1253
1254 while (1)
1255 {
ee2fb9eb 1256 file_ptr start;
795588ae 1257 unsigned int i;
d132876a 1258 long c;
252b5132
RH
1259
1260 /* See if the next `string_min' chars are all graphic chars. */
1261 tryline:
252b5132
RH
1262 start = address;
1263 for (i = 0; i < string_min; i++)
1264 {
d132876a
NC
1265 c = get_char (stream, &address, &magiccount, &magic);
1266 if (c == EOF)
68187828
NC
1267 {
1268 free (buf);
1269 return;
1270 }
71f5e3f7 1271
8745eafa 1272 if (! STRING_ISGRAPHIC (c))
71f5e3f7 1273 {
7ca166c9
AM
1274 /* Found a non-graphic. Try again starting with next byte. */
1275 unget_part_char (c, &address, &magiccount, &magic);
71f5e3f7
NC
1276 goto tryline;
1277 }
252b5132
RH
1278 buf[i] = c;
1279 }
1280
1281 /* We found a run of `string_min' graphic characters. Print up
e9f87780 1282 to the next non-graphic character. */
b3aa80b4 1283 print_filename_and_address (filename, start);
252b5132
RH
1284
1285 buf[i] = '\0';
1286 fputs (buf, stdout);
1287
1288 while (1)
1289 {
d132876a
NC
1290 c = get_char (stream, &address, &magiccount, &magic);
1291 if (c == EOF)
1292 break;
8745eafa 1293 if (! STRING_ISGRAPHIC (c))
dcd9adc5 1294 {
7ca166c9 1295 unget_part_char (c, &address, &magiccount, &magic);
dcd9adc5
NC
1296 break;
1297 }
252b5132
RH
1298 putchar (c);
1299 }
1300
55edd97b 1301 if (output_separator)
7ca166c9 1302 fputs (output_separator, stdout);
55edd97b 1303 else
7ca166c9 1304 putchar ('\n');
252b5132 1305 }
68187828 1306 free (buf);
252b5132
RH
1307}
1308\f
/* Print a summary of the command line options to STREAM and then
   terminate the program by calling exit with STATUS.  The address for
   bug reports is only printed when STATUS is zero and REPORT_BUGS_TO
   is a non-empty string.  This function does not return.  */
252b5132 1309static void
2da42df6 1310usage (FILE *stream, int status)
252b5132 1311{
8b53311e
NC
1312 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1313 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
7fac9594
NC
1314 fprintf (stream, _(" The options are:\n"));
1315
/* The two variants below differ only in which of -a / -d carries the
   "[default]" marker, selected by DEFAULT_STRINGS_ALL.  */
1316 if (DEFAULT_STRINGS_ALL)
1317 fprintf (stream, _("\
1318 -a - --all Scan the entire file, not just the data section [default]\n\
1319 -d --data Only scan the data sections in the file\n"));
1320 else
1321 fprintf (stream, _("\
8b53311e 1322 -a - --all Scan the entire file, not just the data section\n\
7fac9594
NC
1323 -d --data Only scan the data sections in the file [default]\n"));
1324
1325 fprintf (stream, _("\
8b53311e 1326 -f --print-file-name Print the name of the file before each string\n\
8fee99c3
NC
1327 -n <number> Locate & print any sequence of at least <number>\n\
1328 --bytes=<number> displayable characters. (The default is 4).\n\
d412a550 1329 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
334ac421 1330 -w --include-all-whitespace Include all whitespace as valid string characters\n\
8b53311e
NC
1331 -o An alias for --radix=o\n\
1332 -T --target=<BFDNAME> Specify the binary file format\n\
8745eafa
NC
1333 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1334 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
b3aa80b4 1335 --unicode={default|show|invalid|hex|escape|highlight}\n\
584294c4 1336 -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
55edd97b 1337 -s --output-separator=<string> String used to separate strings in output.\n\
07012eee 1338 @<file> Read options from <file>\n\
8b53311e 1339 -h --help Display this information\n\
ffbe5983 1340 -v -V --version Print the program's version number\n"));
/* NOTE(review): list_supported_targets is defined elsewhere; presumably
   it writes the list of supported target formats to STREAM - confirm.  */
252b5132 1341 list_supported_targets (program_name, stream);
/* Only mention where to report bugs when an address is configured and
   the usage message was requested rather than triggered by an error
   (STATUS of zero).  */
92f01d61 1342 if (REPORT_BUGS_TO[0] && status == 0)
8ad3436c 1343 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
252b5132
RH
1344 exit (status);
1345}