]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blob - binutils/strings.c
x86: Add NT_X86_SHSTK note
[thirdparty/binutils-gdb.git] / binutils / strings.c
1 /* strings -- print the strings of printable characters in files
2 Copyright (C) 1993-2023 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18 \f
19 /* Usage: strings [options] file...
20
21 Options:
22 --all
23 -a
24 - Scan each file in its entirety.
25
26 --data
27 -d Scan only the initialized data section(s) of object files.
28
29 --print-file-name
30 -f Print the name of the file before each string.
31
32 --bytes=min-len
33 -n min-len
34 -min-len Print graphic char sequences, MIN-LEN or more bytes long,
35 that are followed by a NUL or a non-displayable character.
36 Default is 4.
37
38 --radix={o,x,d}
39 -t {o,x,d} Print the offset within the file before each string,
40 in octal/hex/decimal.
41
   --include-all-whitespace
   -w              By default tab and space are the only whitespace included in
                   graphic char sequences.  This option considers all of
                   isspace() valid.
45
46 -o Like -to. (Some other implementations have -o like -to,
47 others like -td. We chose one arbitrarily.)
48
49 --encoding={s,S,b,l,B,L}
50 -e {s,S,b,l,B,L}
51 Select character encoding: 7-bit-character, 8-bit-character,
52 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
53 littleendian 32-bit.
54
55 --target=BFDNAME
56 -T {bfdname}
57 Specify a non-default object file format.
58
59 --unicode={default|locale|invalid|hex|escape|highlight}
60 -U {d|l|i|x|e|h}
61 Determine how to handle UTF-8 unicode characters. The default
62 is no special treatment. All other versions of this option
63 only apply if the encoding is valid and enabling the option
64 implies --encoding=S.
65 The 'locale' option displays the characters according to the
66 current locale. The 'invalid' option treats them as
67 non-string characters. The 'hex' option displays them as hex
68 byte sequences. The 'escape' option displays them as escape
69 sequences and the 'highlight' option displays them as
70 coloured escape sequences.
71
72 --output-separator=sep_string
73 -s sep_string String used to separate parsed strings in output.
74 Default is newline.
75
76 --help
77 -h Print the usage message on the standard output.
78
79 --version
80 -V
81 -v Print the program version number.
82
83 Written by Richard Stallman <rms@gnu.ai.mit.edu>
84 and David MacKenzie <djm@gnu.ai.mit.edu>. */
85
86 #include "sysdep.h"
87 #include "bfd.h"
88 #include "getopt.h"
89 #include "libiberty.h"
90 #include "safe-ctype.h"
91 #include "bucomm.h"
92
93 #ifndef streq
94 #define streq(a,b) (strcmp ((a),(b)) == 0)
95 #endif
96
/* How UTF-8 encoded characters found in the input are to be treated.
   Selected with the -U/--unicode option.  */
typedef enum unicode_display_type
{
  unicode_default = 0,  /* No special treatment.  */
  unicode_locale,       /* Display according to the current locale.  */
  unicode_escape,       /* Display as escape sequences.  */
  unicode_hex,          /* Display as hex byte sequences.  */
  unicode_highlight,    /* Display as coloured escape sequences.  */
  unicode_invalid       /* Treat as non-string characters.  */
} unicode_display_type;

/* The treatment currently selected.  Any value other than
   unicode_default makes main() force the encoding to 'S'.  */
static unicode_display_type unicode_display = unicode_default;
108
/* Evaluate to non-zero if C should be treated as part of a string.
   C must lie in the range 0..255 and be one of: a tab, a character
   accepted by ISPRINT, any value above 127 when the encoding is 'S',
   or (when include_all_whitespace is set) a character accepted by
   ISSPACE.  Note that C is evaluated more than once, so it must not
   have side effects.  */
#define STRING_ISGRAPHIC(c) \
      (   (c) >= 0 \
       && (c) <= 255 \
       && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \
           || (include_all_whitespace && ISSPACE (c))) \
      )
115
/* Declare errno ourselves when it is not provided as a macro.  */
#ifndef errno
extern int errno;
#endif

/* The BFD section flags that identify an initialized data section.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Radix for printing addresses (must be 8, 10 or 16).  Only consulted
   when print_addresses is set.  */
static int address_radix;

/* Minimum length of sequence of graphic chars to trigger output.  */
static unsigned int string_min;

/* Whether or not we include all whitespace as a graphic char.  */
static bool include_all_whitespace;

/* TRUE means print address within file for each string.  */
static bool print_addresses;

/* TRUE means print filename for each string.  */
static bool print_filenames;

/* TRUE means for object files scan only the data section.  */
static bool datasection_only;

/* The BFD object file format.  Left as NULL unless -T/--target
   supplies a name.  */
static char *target;

/* The character encoding format: one of 's', 'S', 'b', 'l', 'B' or 'L'
   as given to -e/--encoding.  encoding_bytes is the matching number of
   bytes per character (1, 2 or 4), computed in main().  */
static char encoding;
static int encoding_bytes;

/* Output string used to separate parsed strings.  When NULL a newline
   is printed instead.  */
static char *output_separator;
150
/* Long options accepted by getopt_long in main().  Each entry maps a
   long name onto the short option letter that main()'s switch handles;
   the table is terminated by an all-zero entry.  */
static struct option long_options[] =
{
  {"all", no_argument, NULL, 'a'},
  {"bytes", required_argument, NULL, 'n'},
  {"data", no_argument, NULL, 'd'},
  {"encoding", required_argument, NULL, 'e'},
  {"help", no_argument, NULL, 'h'},
  {"include-all-whitespace", no_argument, NULL, 'w'},
  {"output-separator", required_argument, NULL, 's'},
  {"print-file-name", no_argument, NULL, 'f'},
  {"radix", required_argument, NULL, 't'},
  {"target", required_argument, NULL, 'T'},
  {"unicode", required_argument, NULL, 'U'},
  {"version", no_argument, NULL, 'v'},
  {NULL, 0, NULL, 0}
};
167
/* Forward declarations for functions that are used before they are
   defined further down in this file.  */
static bool strings_file (char *);
static void print_strings (const char *, FILE *, file_ptr, int, char *);
static void usage (FILE *, int) ATTRIBUTE_NORETURN;

int main (int, char **);
173
174 static void
175 set_string_min (const char * arg)
176 {
177 char *s;
178 unsigned long l = strtoul (arg, &s, 0);
179
180 if (s != NULL && *s != 0)
181 fatal (_("invalid integer argument %s"), arg);
182
183 string_min = (unsigned int) l;
184
185 if (l != (unsigned long) string_min)
186 fatal (_("minimum string length is too big: %s"), arg);
187
188 if (string_min < 1)
189 fatal (_("minimum string length is too small: %s"), arg);
190
191 /* PR 30595: Look for minimum string lengths that overflow an 'int'. */
192 if (string_min + 1 == 0)
193 fatal (_("minimum string length %s is too big"), arg);
194
195 /* FIXME: Should we warn for unreasonably large minimum
196 string lengths, even if technically they will work ? */
197 }
198
199 int
200 main (int argc, char **argv)
201 {
202 int optc;
203 int exit_status = 0;
204 bool files_given = false;
205 int numeric_opt = 0;
206
207 setlocale (LC_ALL, "");
208 bindtextdomain (PACKAGE, LOCALEDIR);
209 textdomain (PACKAGE);
210
211 program_name = argv[0];
212 xmalloc_set_program_name (program_name);
213 bfd_set_error_program_name (program_name);
214
215 expandargv (&argc, &argv);
216
217 string_min = 4;
218 include_all_whitespace = false;
219 print_addresses = false;
220 print_filenames = false;
221 if (DEFAULT_STRINGS_ALL)
222 datasection_only = false;
223 else
224 datasection_only = true;
225 target = NULL;
226 encoding = 's';
227 output_separator = NULL;
228
229 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
230 long_options, (int *) 0)) != EOF)
231 {
232 switch (optc)
233 {
234 case 'a':
235 datasection_only = false;
236 break;
237
238 case 'd':
239 datasection_only = true;
240 break;
241
242 case 'f':
243 print_filenames = true;
244 break;
245
246 case 'H':
247 case 'h':
248 usage (stdout, 0);
249
250 case 'n':
251 set_string_min (optarg);
252 break;
253
254 case 'w':
255 include_all_whitespace = true;
256 break;
257
258 case 'o':
259 print_addresses = true;
260 address_radix = 8;
261 break;
262
263 case 't':
264 print_addresses = true;
265 if (optarg[1] != '\0')
266 usage (stderr, 1);
267 switch (optarg[0])
268 {
269 case 'o':
270 address_radix = 8;
271 break;
272
273 case 'd':
274 address_radix = 10;
275 break;
276
277 case 'x':
278 address_radix = 16;
279 break;
280
281 default:
282 usage (stderr, 1);
283 }
284 break;
285
286 case 'T':
287 target = optarg;
288 break;
289
290 case 'e':
291 if (optarg[1] != '\0')
292 usage (stderr, 1);
293 encoding = optarg[0];
294 break;
295
296 case 's':
297 output_separator = optarg;
298 break;
299
300 case 'U':
301 if (streq (optarg, "default") || streq (optarg, "d"))
302 unicode_display = unicode_default;
303 else if (streq (optarg, "locale") || streq (optarg, "l"))
304 unicode_display = unicode_locale;
305 else if (streq (optarg, "escape") || streq (optarg, "e"))
306 unicode_display = unicode_escape;
307 else if (streq (optarg, "invalid") || streq (optarg, "i"))
308 unicode_display = unicode_invalid;
309 else if (streq (optarg, "hex") || streq (optarg, "x"))
310 unicode_display = unicode_hex;
311 else if (streq (optarg, "highlight") || streq (optarg, "h"))
312 unicode_display = unicode_highlight;
313 else
314 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
315 break;
316
317 case 'V':
318 case 'v':
319 print_version ("strings");
320 break;
321
322 case '?':
323 usage (stderr, 1);
324
325 default:
326 numeric_opt = optind;
327 break;
328 }
329 }
330
331 if (unicode_display != unicode_default)
332 encoding = 'S';
333
334 if (numeric_opt != 0)
335 set_string_min (argv[numeric_opt - 1] + 1);
336
337 switch (encoding)
338 {
339 case 'S':
340 case 's':
341 encoding_bytes = 1;
342 break;
343 case 'b':
344 case 'l':
345 encoding_bytes = 2;
346 break;
347 case 'B':
348 case 'L':
349 encoding_bytes = 4;
350 break;
351 default:
352 usage (stderr, 1);
353 }
354
355 if (bfd_init () != BFD_INIT_MAGIC)
356 fatal (_("fatal error: libbfd ABI mismatch"));
357 set_default_bfd_target ();
358
359 if (optind >= argc)
360 {
361 datasection_only = false;
362 SET_BINARY (fileno (stdin));
363 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
364 files_given = true;
365 }
366 else
367 {
368 for (; optind < argc; ++optind)
369 {
370 if (streq (argv[optind], "-"))
371 datasection_only = false;
372 else
373 {
374 files_given = true;
375 exit_status |= !strings_file (argv[optind]);
376 }
377 }
378 }
379
380 if (!files_given)
381 usage (stderr, 1);
382
383 return (exit_status);
384 }
385 \f
386 /* Scan section SECT of the file ABFD, whose printable name is
387 FILENAME. If it contains initialized data set GOT_A_SECTION and
388 print the strings in it. */
389
390 static void
391 strings_a_section (bfd *abfd, asection *sect, const char *filename,
392 bool *got_a_section)
393 {
394 bfd_size_type sectsize;
395 bfd_byte *mem;
396
397 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
398 return;
399
400 sectsize = bfd_section_size (sect);
401 if (sectsize == 0)
402 return;
403
404 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
405 {
406 non_fatal (_("%s: Reading section %s failed: %s"),
407 filename, sect->name, bfd_errmsg (bfd_get_error ()));
408 return;
409 }
410
411 *got_a_section = true;
412 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
413 free (mem);
414 }
415
416 /* Scan all of the sections in FILE, and print the strings
417 in the initialized data section(s).
418
419 Return TRUE if successful,
420 FALSE if not (such as if FILE is not an object file). */
421
422 static bool
423 strings_object_file (const char *file)
424 {
425 bfd *abfd;
426 asection *s;
427 bool got_a_section;
428
429 abfd = bfd_openr (file, target);
430
431 if (abfd == NULL)
432 /* Treat the file as a non-object file. */
433 return false;
434
435 /* This call is mainly for its side effect of reading in the sections.
436 We follow the traditional behavior of `strings' in that we don't
437 complain if we don't recognize a file to be an object file. */
438 if (!bfd_check_format (abfd, bfd_object))
439 {
440 bfd_close (abfd);
441 return false;
442 }
443
444 got_a_section = false;
445 for (s = abfd->sections; s != NULL; s = s->next)
446 strings_a_section (abfd, s, file, &got_a_section);
447
448 if (!bfd_close (abfd))
449 {
450 bfd_nonfatal (file);
451 return false;
452 }
453
454 return got_a_section;
455 }
456
457 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
458
459 static bool
460 strings_file (char *file)
461 {
462 struct stat st;
463
464 /* get_file_size does not support non-S_ISREG files. */
465
466 if (stat (file, &st) < 0)
467 {
468 if (errno == ENOENT)
469 non_fatal (_("'%s': No such file"), file);
470 else
471 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
472 file, strerror (errno));
473 return false;
474 }
475 else if (S_ISDIR (st.st_mode))
476 {
477 non_fatal (_("Warning: '%s' is a directory"), file);
478 return false;
479 }
480
481 /* If we weren't told to scan the whole file,
482 try to open it as an object file and only look at
483 initialized data sections. If that fails, fall back to the
484 whole file. */
485 if (!datasection_only || !strings_object_file (file))
486 {
487 FILE *stream;
488
489 stream = fopen (file, FOPEN_RB);
490 if (stream == NULL)
491 {
492 fprintf (stderr, "%s: ", program_name);
493 perror (file);
494 return false;
495 }
496
497 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
498
499 if (fclose (stream) == EOF)
500 {
501 fprintf (stderr, "%s: ", program_name);
502 perror (file);
503 return false;
504 }
505 }
506
507 return true;
508 }
509 \f
510 /* Read the next character, return EOF if none available.
511 Assume that STREAM is positioned so that the next byte read
512 is at address ADDRESS in the file.
513
514 If STREAM is NULL, do not read from it.
515 The caller can supply a buffer of characters
516 to be processed before the data in STREAM.
517 MAGIC is the address of the buffer and
518 MAGICCOUNT is how many characters are in it. */
519
520 static long
521 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
522 {
523 int c, i;
524 long r = 0;
525
526 for (i = 0; i < encoding_bytes; i++)
527 {
528 if (*magiccount)
529 {
530 (*magiccount)--;
531 c = *(*magic)++;
532 }
533 else
534 {
535 if (stream == NULL)
536 return EOF;
537
538 /* Only use getc_unlocked if we found a declaration for it.
539 Otherwise, libc is not thread safe by default, and we
540 should not use it. */
541
542 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
543 c = getc_unlocked (stream);
544 #else
545 c = getc (stream);
546 #endif
547 if (c == EOF)
548 return EOF;
549 }
550
551 (*address)++;
552 r = (r << 8) | (c & 0xff);
553 }
554
555 switch (encoding)
556 {
557 default:
558 break;
559 case 'l':
560 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
561 break;
562 case 'L':
563 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
564 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
565 break;
566 }
567
568 return r;
569 }
570
571 /* Throw away one byte of a (possibly) multi-byte char C, updating
572 address and buffer to suit. */
573
574 static void
575 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
576 {
577 static char tmp[4];
578
579 if (encoding_bytes > 1)
580 {
581 *address -= encoding_bytes - 1;
582
583 if (*magiccount == 0)
584 {
585 /* If no magic buffer exists, use temp buffer. */
586 switch (encoding)
587 {
588 default:
589 break;
590 case 'b':
591 tmp[0] = c & 0xff;
592 *magiccount = 1;
593 break;
594 case 'l':
595 tmp[0] = (c >> 8) & 0xff;
596 *magiccount = 1;
597 break;
598 case 'B':
599 tmp[0] = (c >> 16) & 0xff;
600 tmp[1] = (c >> 8) & 0xff;
601 tmp[2] = c & 0xff;
602 *magiccount = 3;
603 break;
604 case 'L':
605 tmp[0] = (c >> 8) & 0xff;
606 tmp[1] = (c >> 16) & 0xff;
607 tmp[2] = (c >> 24) & 0xff;
608 *magiccount = 3;
609 break;
610 }
611 *magic = tmp;
612 }
613 else
614 {
615 /* If magic buffer exists, rewind. */
616 *magic -= encoding_bytes - 1;
617 *magiccount += encoding_bytes - 1;
618 }
619 }
620 }
621
622 static void
623 print_filename_and_address (const char * filename, file_ptr address)
624 {
625 if (print_filenames)
626 printf ("%s: ", filename);
627
628 if (! print_addresses)
629 return;
630
631 switch (address_radix)
632 {
633 case 8:
634 if (sizeof (address) > sizeof (long))
635 {
636 #ifndef __MSVCRT__
637 printf ("%7llo ", (unsigned long long) address);
638 #else
639 printf ("%7I64o ", (unsigned long long) address);
640 #endif
641 }
642 else
643 printf ("%7lo ", (unsigned long) address);
644 break;
645
646 case 10:
647 if (sizeof (address) > sizeof (long))
648 {
649 #ifndef __MSVCRT__
650 printf ("%7llu ", (unsigned long long) address);
651 #else
652 printf ("%7I64d ", (unsigned long long) address);
653 #endif
654 }
655 else
656 printf ("%7ld ", (long) address);
657 break;
658
659 case 16:
660 if (sizeof (address) > sizeof (long))
661 {
662 #ifndef __MSVCRT__
663 printf ("%7llx ", (unsigned long long) address);
664 #else
665 printf ("%7I64x ", (unsigned long long) address);
666 #endif
667 }
668 else
669 printf ("%7lx ", (unsigned long) address);
670 break;
671 }
672 }
673
674 /* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
675 If the encoding is valid then returns the number of bytes it uses. */
676
/* Decide whether the BUFLEN bytes at BUFFER begin with a multi-byte
   UTF-8 sequence.  The lead byte selects a length of 2, 3 or 4 bytes;
   the sequence is accepted if that many bytes are available and every
   byte after the first has the form 10xxxxxx.

   Returns the length of the sequence, or 0 if there is none.  A lead
   byte below 0xc0 (plain ASCII or a stray continuation byte) always
   gives 0.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  const unsigned char lead = buffer[0];
  unsigned int len;
  unsigned int k;

  if (lead < 0xc0)
    return 0;

  /* Work out how long the sequence claims to be.  */
  if ((lead & 0x20) == 0)
    len = 2;
  else if ((lead & 0x10) == 0)
    len = 3;
  else
    len = 4;

  if (buflen < len)
    return 0;

  /* Every trailing byte must be a continuation byte.  */
  for (k = 1; k < len; k++)
    if ((buffer[k] & 0xc0) != 0x80)
      return 0;

  return len;
}
709
710 /* Display a UTF-8 encoded character in BUFFER according to the setting
711 of unicode_display. The character is known to be valid.
712 Returns the number of bytes consumed. */
713
714 static unsigned int
715 display_utf8_char (const unsigned char * buffer)
716 {
717 unsigned int j;
718 unsigned int utf8_len;
719
720 switch (buffer[0] & 0x30)
721 {
722 case 0x00:
723 case 0x10:
724 utf8_len = 2;
725 break;
726 case 0x20:
727 utf8_len = 3;
728 break;
729 default:
730 utf8_len = 4;
731 }
732
733 switch (unicode_display)
734 {
735 default:
736 fprintf (stderr, "ICE: unexpected unicode display type\n");
737 break;
738
739 case unicode_escape:
740 case unicode_highlight:
741 if (unicode_display == unicode_highlight && isatty (1))
742 printf ("\x1B[31;47m"); /* Red. */
743
744 switch (utf8_len)
745 {
746 case 2:
747 printf ("\\u%02x%02x",
748 ((buffer[0] & 0x1c) >> 2),
749 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
750 break;
751
752 case 3:
753 printf ("\\u%02x%02x",
754 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
755 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
756 break;
757
758 case 4:
759 printf ("\\u%02x%02x%02x",
760 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
761 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
762 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
763 break;
764 default:
765 /* URG. */
766 break;
767 }
768
769 if (unicode_display == unicode_highlight && isatty (1))
770 printf ("\033[0m"); /* Default colour. */
771 break;
772
773 case unicode_hex:
774 putchar ('<');
775 printf ("0x");
776 for (j = 0; j < utf8_len; j++)
777 printf ("%02x", buffer [j]);
778 putchar ('>');
779 break;
780
781 case unicode_locale:
782 printf ("%.1s", buffer);
783 break;
784 }
785
786 return utf8_len;
787 }
788
789 /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
790 according to the setting of the unicode_display variable. The buffer
791 contains BUFLEN bytes.
792
793 Display the characters as if they started at ADDRESS and are contained in
794 FILENAME. */
795
796 static void
797 print_unicode_buffer (const char * filename,
798 file_ptr address,
799 const unsigned char * buffer,
800 unsigned long buflen)
801 {
802 /* Paranoia checks... */
803 if (filename == NULL
804 || buffer == NULL
805 || unicode_display == unicode_default
806 || encoding != 'S'
807 || encoding_bytes != 1)
808 {
809 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
810 return;
811 }
812
813 if (buflen == 0)
814 return;
815
816 /* We must only display strings that are at least string_min *characters*
817 long. So we scan the buffer in two stages. First we locate the start
818 of a potential string. Then we walk along it until we have found
819 string_min characters. Then we go back to the start point and start
820 displaying characters according to the unicode_display setting. */
821
822 unsigned long start_point = 0;
823 unsigned long i = 0;
824 unsigned int char_len = 1;
825 unsigned int num_found = 0;
826
827 for (i = 0; i < buflen; i += char_len)
828 {
829 int c = buffer[i];
830
831 char_len = 1;
832
833 /* Find the first potential character of a string. */
834 if (! STRING_ISGRAPHIC (c))
835 {
836 num_found = 0;
837 continue;
838 }
839
840 if (c > 126)
841 {
842 if (c < 0xc0)
843 {
844 num_found = 0;
845 continue;
846 }
847
848 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
849 {
850 char_len = 1;
851 num_found = 0;
852 continue;
853 }
854
855 if (unicode_display == unicode_invalid)
856 {
857 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
858 num_found = 0;
859 continue;
860 }
861 }
862
863 if (num_found == 0)
864 /* We have found a potential starting point for a string. */
865 start_point = i;
866
867 ++ num_found;
868
869 if (num_found >= string_min)
870 break;
871 }
872
873 if (num_found < string_min)
874 return;
875
876 print_filename_and_address (filename, address + start_point);
877
878 /* We have found string_min characters. Display them and any
879 more that follow. */
880 for (i = start_point; i < buflen; i += char_len)
881 {
882 int c = buffer[i];
883
884 char_len = 1;
885
886 if (! STRING_ISGRAPHIC (c))
887 break;
888 else if (c < 127)
889 putchar (c);
890 else if (! is_valid_utf8 (buffer + i, buflen - i))
891 break;
892 else if (unicode_display == unicode_invalid)
893 break;
894 else
895 char_len = display_utf8_char (buffer + i);
896 }
897
898 if (output_separator)
899 fputs (output_separator, stdout);
900 else
901 putchar ('\n');
902
903 /* FIXME: Using tail recursion here is lazy programming... */
904 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
905 }
906
/* Return the next byte for the unicode stream scanner.

   If any bytes have been pushed back, the one most recently stored in
   PUTBACK (a stack of *NUM_PUTBACK entries) is removed and returned
   and STREAM is not touched.  Otherwise *NUM_READ is incremented and
   the result of reading one byte from STREAM, which may be EOF, is
   returned.  */

static int
get_unicode_byte (FILE *          stream,
                  unsigned char * putback,
                  unsigned int *  num_putback,
                  unsigned int *  num_read)
{
  if (*num_putback == 0)
    {
      ++*num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
      return getc_unlocked (stream);
#else
      return getc (stream);
#endif
    }

  --*num_putback;
  return putback[*num_putback];
}
927
928 /* Helper function for print_unicode_stream. */
929
930 static void
931 print_unicode_stream_body (const char * filename,
932 file_ptr address,
933 FILE * stream,
934 unsigned char * putback_buf,
935 unsigned int num_putback,
936 unsigned char * print_buf)
937 {
938 /* It would be nice if we could just read the stream into a buffer
939 and then process if with print_unicode_buffer. But the input
940 might be huge or it might time-locked (eg stdin). So instead
941 we go one byte at a time... */
942
943 file_ptr start_point = 0;
944 unsigned int num_read = 0;
945 unsigned int num_chars = 0;
946 unsigned int num_print = 0;
947 int c = 0;
948
949 /* Find a series of string_min characters. Put them into print_buf. */
950 do
951 {
952 if (num_chars >= string_min)
953 break;
954
955 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
956 if (c == EOF)
957 break;
958
959 if (! STRING_ISGRAPHIC (c))
960 {
961 num_chars = num_print = 0;
962 continue;
963 }
964
965 if (num_chars == 0)
966 start_point = num_read - 1;
967
968 if (c < 127)
969 {
970 print_buf[num_print] = c;
971 num_chars ++;
972 num_print ++;
973 continue;
974 }
975
976 if (c < 0xc0)
977 {
978 num_chars = num_print = 0;
979 continue;
980 }
981
982 /* We *might* have a UTF-8 sequence. Time to start peeking. */
983 char utf8[4];
984
985 utf8[0] = c;
986 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
987 if (c == EOF)
988 break;
989 utf8[1] = c;
990
991 if ((utf8[1] & 0xc0) != 0x80)
992 {
993 /* Invalid UTF-8. */
994 putback_buf[num_putback++] = utf8[1];
995 num_chars = num_print = 0;
996 continue;
997 }
998 else if ((utf8[0] & 0x20) == 0)
999 {
1000 /* A valid 2-byte UTF-8 encoding. */
1001 if (unicode_display == unicode_invalid)
1002 {
1003 putback_buf[num_putback++] = utf8[1];
1004 num_chars = num_print = 0;
1005 }
1006 else
1007 {
1008 print_buf[num_print ++] = utf8[0];
1009 print_buf[num_print ++] = utf8[1];
1010 num_chars ++;
1011 }
1012 continue;
1013 }
1014
1015 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1016 if (c == EOF)
1017 break;
1018 utf8[2] = c;
1019
1020 if ((utf8[2] & 0xc0) != 0x80)
1021 {
1022 /* Invalid UTF-8. */
1023 putback_buf[num_putback++] = utf8[2];
1024 putback_buf[num_putback++] = utf8[1];
1025 num_chars = num_print = 0;
1026 continue;
1027 }
1028 else if ((utf8[0] & 0x10) == 0)
1029 {
1030 /* A valid 3-byte UTF-8 encoding. */
1031 if (unicode_display == unicode_invalid)
1032 {
1033 putback_buf[num_putback++] = utf8[2];
1034 putback_buf[num_putback++] = utf8[1];
1035 num_chars = num_print = 0;
1036 }
1037 else
1038 {
1039 print_buf[num_print ++] = utf8[0];
1040 print_buf[num_print ++] = utf8[1];
1041 print_buf[num_print ++] = utf8[2];
1042 num_chars ++;
1043 }
1044 continue;
1045 }
1046
1047 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1048 if (c == EOF)
1049 break;
1050 utf8[3] = c;
1051
1052 if ((utf8[3] & 0xc0) != 0x80)
1053 {
1054 /* Invalid UTF-8. */
1055 putback_buf[num_putback++] = utf8[3];
1056 putback_buf[num_putback++] = utf8[2];
1057 putback_buf[num_putback++] = utf8[1];
1058 num_chars = num_print = 0;
1059 }
1060 /* We have a valid 4-byte UTF-8 encoding. */
1061 else if (unicode_display == unicode_invalid)
1062 {
1063 putback_buf[num_putback++] = utf8[3];
1064 putback_buf[num_putback++] = utf8[1];
1065 putback_buf[num_putback++] = utf8[2];
1066 num_chars = num_print = 0;
1067 }
1068 else
1069 {
1070 print_buf[num_print ++] = utf8[0];
1071 print_buf[num_print ++] = utf8[1];
1072 print_buf[num_print ++] = utf8[2];
1073 print_buf[num_print ++] = utf8[3];
1074 num_chars ++;
1075 }
1076 }
1077 while (1);
1078
1079 if (num_chars >= string_min)
1080 {
1081 /* We know that we have string_min valid characters in print_buf,
1082 and there may be more to come in the stream. Start displaying
1083 them. */
1084
1085 print_filename_and_address (filename, address + start_point);
1086
1087 unsigned int i;
1088 for (i = 0; i < num_print;)
1089 {
1090 if (print_buf[i] < 127)
1091 putchar (print_buf[i++]);
1092 else
1093 i += display_utf8_char (print_buf + i);
1094 }
1095
1096 /* OK so now we have to start read unchecked bytes. */
1097
1098 /* Find a series of string_min characters. Put them into print_buf. */
1099 do
1100 {
1101 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1102 if (c == EOF)
1103 break;
1104
1105 if (! STRING_ISGRAPHIC (c))
1106 break;
1107
1108 if (c < 127)
1109 {
1110 putchar (c);
1111 continue;
1112 }
1113
1114 if (c < 0xc0)
1115 break;
1116
1117 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1118 unsigned char utf8[4];
1119
1120 utf8[0] = c;
1121 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1122 if (c == EOF)
1123 break;
1124 utf8[1] = c;
1125
1126 if ((utf8[1] & 0xc0) != 0x80)
1127 {
1128 /* Invalid UTF-8. */
1129 putback_buf[num_putback++] = utf8[1];
1130 break;
1131 }
1132 else if ((utf8[0] & 0x20) == 0)
1133 {
1134 /* Valid 2-byte UTF-8. */
1135 if (unicode_display == unicode_invalid)
1136 {
1137 putback_buf[num_putback++] = utf8[1];
1138 break;
1139 }
1140 else
1141 {
1142 (void) display_utf8_char (utf8);
1143 continue;
1144 }
1145 }
1146
1147 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1148 if (c == EOF)
1149 break;
1150 utf8[2] = c;
1151
1152 if ((utf8[2] & 0xc0) != 0x80)
1153 {
1154 /* Invalid UTF-8. */
1155 putback_buf[num_putback++] = utf8[2];
1156 putback_buf[num_putback++] = utf8[1];
1157 break;
1158 }
1159 else if ((utf8[0] & 0x10) == 0)
1160 {
1161 /* Valid 3-byte UTF-8. */
1162 if (unicode_display == unicode_invalid)
1163 {
1164 putback_buf[num_putback++] = utf8[2];
1165 putback_buf[num_putback++] = utf8[1];
1166 break;
1167 }
1168 else
1169 {
1170 (void) display_utf8_char (utf8);
1171 continue;
1172 }
1173 }
1174
1175 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1176 if (c == EOF)
1177 break;
1178 utf8[3] = c;
1179
1180 if ((utf8[3] & 0xc0) != 0x80)
1181 {
1182 /* Invalid UTF-8. */
1183 putback_buf[num_putback++] = utf8[3];
1184 putback_buf[num_putback++] = utf8[2];
1185 putback_buf[num_putback++] = utf8[1];
1186 break;
1187 }
1188 else if (unicode_display == unicode_invalid)
1189 {
1190 putback_buf[num_putback++] = utf8[3];
1191 putback_buf[num_putback++] = utf8[2];
1192 putback_buf[num_putback++] = utf8[1];
1193 break;
1194 }
1195 else
1196 /* A valid 4-byte UTF-8 encoding. */
1197 (void) display_utf8_char (utf8);
1198 }
1199 while (1);
1200
1201 if (output_separator)
1202 fputs (output_separator, stdout);
1203 else
1204 putchar ('\n');
1205 }
1206
1207 if (c != EOF)
1208 /* FIXME: Using tail recursion here is lazy, but it works. */
1209 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1210 }
1211
1212 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1213 encountered according to the setting of the unicode_display variable.
1214 The stream is positioned at ADDRESS and is attached to FILENAME. */
1215
1216 static void
1217 print_unicode_stream (const char * filename,
1218 file_ptr address,
1219 FILE * stream)
1220 {
1221 /* Paranoia checks... */
1222 if (filename == NULL
1223 || stream == NULL
1224 || unicode_display == unicode_default
1225 || encoding != 'S'
1226 || encoding_bytes != 1)
1227 {
1228 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1229 return;
1230 }
1231
1232 /* Allocate space for string_min 4-byte utf-8 characters. */
1233 size_t amt = string_min;
1234 amt = (4 * amt) + 1;
1235 unsigned char * print_buf = xmalloc (amt);
1236 /* We should never have to put back more than 4 bytes. */
1237 unsigned char putback_buf[5];
1238 unsigned int num_putback = 0;
1239
1240 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1241 free (print_buf);
1242 }
1243 \f
1244 /* Find the strings in file FILENAME, read from STREAM.
1245 Assume that STREAM is positioned so that the next byte read
1246 is at address ADDRESS in the file.
1247
1248 If STREAM is NULL, do not read from it.
1249 The caller can supply a buffer of characters
1250 to be processed before the data in STREAM.
1251 MAGIC is the address of the buffer and
1252 MAGICCOUNT is how many characters are in it.
1253 Those characters come at address ADDRESS and the data in STREAM follow. */
1254
1255 static void
1256 print_strings (const char *filename, FILE *stream, file_ptr address,
1257 int magiccount, char *magic)
1258 {
1259 if (unicode_display != unicode_default)
1260 {
1261 if (magic != NULL)
1262 print_unicode_buffer (filename, address,
1263 (const unsigned char *) magic, magiccount);
1264
1265 if (stream != NULL)
1266 print_unicode_stream (filename, address, stream);
1267 return;
1268 }
1269
1270 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1271
1272 while (1)
1273 {
1274 file_ptr start;
1275 unsigned int i;
1276 long c;
1277
1278 /* See if the next `string_min' chars are all graphic chars. */
1279 tryline:
1280 start = address;
1281 for (i = 0; i < string_min; i++)
1282 {
1283 c = get_char (stream, &address, &magiccount, &magic);
1284 if (c == EOF)
1285 {
1286 free (buf);
1287 return;
1288 }
1289
1290 if (! STRING_ISGRAPHIC (c))
1291 {
1292 /* Found a non-graphic. Try again starting with next byte. */
1293 unget_part_char (c, &address, &magiccount, &magic);
1294 goto tryline;
1295 }
1296 buf[i] = c;
1297 }
1298
1299 /* We found a run of `string_min' graphic characters. Print up
1300 to the next non-graphic character. */
1301 print_filename_and_address (filename, start);
1302
1303 buf[i] = '\0';
1304 fputs (buf, stdout);
1305
1306 while (1)
1307 {
1308 c = get_char (stream, &address, &magiccount, &magic);
1309 if (c == EOF)
1310 break;
1311 if (! STRING_ISGRAPHIC (c))
1312 {
1313 unget_part_char (c, &address, &magiccount, &magic);
1314 break;
1315 }
1316 putchar (c);
1317 }
1318
1319 if (output_separator)
1320 fputs (output_separator, stdout);
1321 else
1322 putchar ('\n');
1323 }
1324 free (buf);
1325 }
1326 \f
1327 static void
1328 usage (FILE *stream, int status)
1329 {
1330 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1331 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1332 fprintf (stream, _(" The options are:\n"));
1333
1334 if (DEFAULT_STRINGS_ALL)
1335 fprintf (stream, _("\
1336 -a - --all Scan the entire file, not just the data section [default]\n\
1337 -d --data Only scan the data sections in the file\n"));
1338 else
1339 fprintf (stream, _("\
1340 -a - --all Scan the entire file, not just the data section\n\
1341 -d --data Only scan the data sections in the file [default]\n"));
1342
1343 fprintf (stream, _("\
1344 -f --print-file-name Print the name of the file before each string\n\
1345 -n <number> Locate & print any sequence of at least <number>\n\
1346 --bytes=<number> displayable characters. (The default is 4).\n\
1347 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
1348 -w --include-all-whitespace Include all whitespace as valid string characters\n\
1349 -o An alias for --radix=o\n\
1350 -T --target=<BFDNAME> Specify the binary file format\n\
1351 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1352 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1353 --unicode={default|show|invalid|hex|escape|highlight}\n\
1354 -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1355 -s --output-separator=<string> String used to separate strings in output.\n\
1356 @<file> Read options from <file>\n\
1357 -h --help Display this information\n\
1358 -v -V --version Print the program's version number\n"));
1359 list_supported_targets (program_name, stream);
1360 if (REPORT_BUGS_TO[0] && status == 0)
1361 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1362 exit (status);
1363 }