]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blob - binutils/strings.c
Prevent an illegal memory access when running the strings program with an excessively...
[thirdparty/binutils-gdb.git] / binutils / strings.c
1 /* strings -- print the strings of printable characters in files
2 Copyright (C) 1993-2023 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18 \f
19 /* Usage: strings [options] file...
20
21 Options:
22 --all
23 -a
24 - Scan each file in its entirety.
25
26 --data
27 -d Scan only the initialized data section(s) of object files.
28
29 --print-file-name
30 -f Print the name of the file before each string.
31
32 --bytes=min-len
33 -n min-len
34 -min-len Print graphic char sequences, MIN-LEN or more bytes long,
35 that are followed by a NUL or a non-displayable character.
36 Default is 4.
37
38 --radix={o,x,d}
39 -t {o,x,d} Print the offset within the file before each string,
40 in octal/hex/decimal.
41
42 --include-all-whitespace
43 -w By default tab and space are the only whitespace included in graphic
44 char sequences. This option considers all of isspace() valid.
45
46 -o Like -to. (Some other implementations have -o like -to,
47 others like -td. We chose one arbitrarily.)
48
49 --encoding={s,S,b,l,B,L}
50 -e {s,S,b,l,B,L}
51 Select character encoding: 7-bit-character, 8-bit-character,
52 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
53 littleendian 32-bit.
54
55 --target=BFDNAME
56 -T {bfdname}
57 Specify a non-default object file format.
58
59 --unicode={default|locale|invalid|hex|escape|highlight}
60 -U {d|l|i|x|e|h}
61 Determine how to handle UTF-8 unicode characters. The default
62 is no special treatment. All other versions of this option
63 only apply if the encoding is valid and enabling the option
64 implies --encoding=S.
65 The 'locale' option displays the characters according to the
66 current locale. The 'invalid' option treats them as
67 non-string characters. The 'hex' option displays them as hex
68 byte sequences. The 'escape' option displays them as escape
69 sequences and the 'highlight' option displays them as
70 coloured escape sequences.
71
72 --output-separator=sep_string
73 -s sep_string String used to separate parsed strings in output.
74 Default is newline.
75
76 --help
77 -h Print the usage message on the standard output.
78
79 --version
80 -V
81 -v Print the program version number.
82
83 Written by Richard Stallman <rms@gnu.ai.mit.edu>
84 and David MacKenzie <djm@gnu.ai.mit.edu>. */
85
86 #include "sysdep.h"
87 #include "bfd.h"
88 #include "getopt.h"
89 #include "libiberty.h"
90 #include "safe-ctype.h"
91 #include "bucomm.h"
92
/* Convenience predicate: evaluates to non-zero when the C strings A and B
   compare equal with strcmp.  Only defined here if nothing else has
   already provided it.  */
#if !defined (streq)
# define streq(a,b) (strcmp ((a),(b)) == 0)
#endif
96
/* The ways in which UTF-8 encoded characters found in the input can be
   handled, as selected by the -U/--unicode option.  */
typedef enum unicode_display_type
{
  unicode_default   = 0,  /* No special treatment.  */
  unicode_locale    = 1,  /* Display according to the current locale.  */
  unicode_escape    = 2,  /* Display as escape sequences.  */
  unicode_hex       = 3,  /* Display as hex byte sequences.  */
  unicode_highlight = 4,  /* Display as coloured escape sequences.  */
  unicode_invalid   = 5   /* Treat as non-string characters.  */
} unicode_display_type;

/* The treatment currently in force.  */
static unicode_display_type unicode_display = unicode_default;
108
/* Evaluates to non-zero when C is a character that may form part of a
   displayed string.  C must lie in the range 0..255 and be one of:
     - a tab,
     - a printable character (ISPRINT),
     - any value above 127 when the encoding is 'S',
     - any ISSPACE character when include_all_whitespace is set.
   Note that C is evaluated several times.  */
#define STRING_ISGRAPHIC(c)					\
  ((c) >= 0 && (c) <= 255					\
   && ((c) == '\t'						\
       || ISPRINT (c)						\
       || (encoding == 'S' && (c) > 127)			\
       || (include_all_whitespace && ISSPACE (c))))
115
#if !defined (errno)
extern int errno;
#endif

/* A section counts as initialized data when all of these BFD section
   flags are set on it.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Base used when printing string addresses: 8, 10 or 16.  */
static int address_radix;

/* Shortest run of graphic characters that is reported as a string.  */
static unsigned int string_min;

/* When true, every isspace() character counts as a graphic character.  */
static bool include_all_whitespace;

/* When true, each string is preceded by its address within the file.  */
static bool print_addresses;

/* When true, each string is preceded by the name of its file.  */
static bool print_filenames;

/* When true, only the data sections of object files are scanned.  */
static bool datasection_only;

/* Name of the BFD object file format to use, or NULL for the default.  */
static char *target;

/* Selected character encoding letter, and the number of bytes that one
   character occupies in that encoding.  */
static char encoding;
static int encoding_bytes;

/* Text emitted after each string; NULL means a newline.  */
static char *output_separator;
150
/* Long command line options, each mapped onto the short option letter
   that main's option switch handles.  The all-zero entry ends the table.  */
static struct option long_options[] =
{
  { "all",                    no_argument,       NULL, 'a' },
  { "bytes",                  required_argument, NULL, 'n' },
  { "data",                   no_argument,       NULL, 'd' },
  { "encoding",               required_argument, NULL, 'e' },
  { "help",                   no_argument,       NULL, 'h' },
  { "include-all-whitespace", no_argument,       NULL, 'w' },
  { "output-separator",       required_argument, NULL, 's' },
  { "print-file-name",        no_argument,       NULL, 'f' },
  { "radix",                  required_argument, NULL, 't' },
  { "target",                 required_argument, NULL, 'T' },
  { "unicode",                required_argument, NULL, 'U' },
  { "version",                no_argument,       NULL, 'v' },
  { NULL,                     0,                 NULL, 0   }
};
167
168 static bool strings_file (char *);
169 static void print_strings (const char *, FILE *, file_ptr, int, char *);
170 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
171 \f
172 int main (int, char **);
173
174 int
175 main (int argc, char **argv)
176 {
177 int optc;
178 int exit_status = 0;
179 bool files_given = false;
180 char *s;
181 int numeric_opt = 0;
182
183 setlocale (LC_ALL, "");
184 bindtextdomain (PACKAGE, LOCALEDIR);
185 textdomain (PACKAGE);
186
187 program_name = argv[0];
188 xmalloc_set_program_name (program_name);
189 bfd_set_error_program_name (program_name);
190
191 expandargv (&argc, &argv);
192
193 string_min = 4;
194 include_all_whitespace = false;
195 print_addresses = false;
196 print_filenames = false;
197 if (DEFAULT_STRINGS_ALL)
198 datasection_only = false;
199 else
200 datasection_only = true;
201 target = NULL;
202 encoding = 's';
203 output_separator = NULL;
204
205 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
206 long_options, (int *) 0)) != EOF)
207 {
208 switch (optc)
209 {
210 case 'a':
211 datasection_only = false;
212 break;
213
214 case 'd':
215 datasection_only = true;
216 break;
217
218 case 'f':
219 print_filenames = true;
220 break;
221
222 case 'H':
223 case 'h':
224 usage (stdout, 0);
225
226 case 'n':
227 string_min = (int) strtoul (optarg, &s, 0);
228 if (s != NULL && *s != 0)
229 fatal (_("invalid integer argument %s"), optarg);
230 break;
231
232 case 'w':
233 include_all_whitespace = true;
234 break;
235
236 case 'o':
237 print_addresses = true;
238 address_radix = 8;
239 break;
240
241 case 't':
242 print_addresses = true;
243 if (optarg[1] != '\0')
244 usage (stderr, 1);
245 switch (optarg[0])
246 {
247 case 'o':
248 address_radix = 8;
249 break;
250
251 case 'd':
252 address_radix = 10;
253 break;
254
255 case 'x':
256 address_radix = 16;
257 break;
258
259 default:
260 usage (stderr, 1);
261 }
262 break;
263
264 case 'T':
265 target = optarg;
266 break;
267
268 case 'e':
269 if (optarg[1] != '\0')
270 usage (stderr, 1);
271 encoding = optarg[0];
272 break;
273
274 case 's':
275 output_separator = optarg;
276 break;
277
278 case 'U':
279 if (streq (optarg, "default") || streq (optarg, "d"))
280 unicode_display = unicode_default;
281 else if (streq (optarg, "locale") || streq (optarg, "l"))
282 unicode_display = unicode_locale;
283 else if (streq (optarg, "escape") || streq (optarg, "e"))
284 unicode_display = unicode_escape;
285 else if (streq (optarg, "invalid") || streq (optarg, "i"))
286 unicode_display = unicode_invalid;
287 else if (streq (optarg, "hex") || streq (optarg, "x"))
288 unicode_display = unicode_hex;
289 else if (streq (optarg, "highlight") || streq (optarg, "h"))
290 unicode_display = unicode_highlight;
291 else
292 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
293 break;
294
295 case 'V':
296 case 'v':
297 print_version ("strings");
298 break;
299
300 case '?':
301 usage (stderr, 1);
302
303 default:
304 numeric_opt = optind;
305 break;
306 }
307 }
308
309 if (unicode_display != unicode_default)
310 encoding = 'S';
311
312 if (numeric_opt != 0)
313 {
314 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
315 if (s != NULL && *s != 0)
316 fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
317 }
318
319 if (string_min < 1)
320 fatal (_("invalid minimum string length %d"), string_min);
321 /* PR 30595: Look for excessive minimum string lengths.
322 The "(4 * string_min) + 1" is because this is the value
323 used to allocate space in print_unicode_stream(). */
324 else if (string_min == -1U || ((4 * string_min) + 1) == 0)
325 fatal (_("minimum string length %#x is too big"), string_min);
326
327 switch (encoding)
328 {
329 case 'S':
330 case 's':
331 encoding_bytes = 1;
332 break;
333 case 'b':
334 case 'l':
335 encoding_bytes = 2;
336 break;
337 case 'B':
338 case 'L':
339 encoding_bytes = 4;
340 break;
341 default:
342 usage (stderr, 1);
343 }
344
345 if (bfd_init () != BFD_INIT_MAGIC)
346 fatal (_("fatal error: libbfd ABI mismatch"));
347 set_default_bfd_target ();
348
349 if (optind >= argc)
350 {
351 datasection_only = false;
352 SET_BINARY (fileno (stdin));
353 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
354 files_given = true;
355 }
356 else
357 {
358 for (; optind < argc; ++optind)
359 {
360 if (streq (argv[optind], "-"))
361 datasection_only = false;
362 else
363 {
364 files_given = true;
365 exit_status |= !strings_file (argv[optind]);
366 }
367 }
368 }
369
370 if (!files_given)
371 usage (stderr, 1);
372
373 return (exit_status);
374 }
375 \f
376 /* Scan section SECT of the file ABFD, whose printable name is
377 FILENAME. If it contains initialized data set GOT_A_SECTION and
378 print the strings in it. */
379
380 static void
381 strings_a_section (bfd *abfd, asection *sect, const char *filename,
382 bool *got_a_section)
383 {
384 bfd_size_type sectsize;
385 bfd_byte *mem;
386
387 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
388 return;
389
390 sectsize = bfd_section_size (sect);
391 if (sectsize == 0)
392 return;
393
394 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
395 {
396 non_fatal (_("%s: Reading section %s failed: %s"),
397 filename, sect->name, bfd_errmsg (bfd_get_error ()));
398 return;
399 }
400
401 *got_a_section = true;
402 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
403 free (mem);
404 }
405
406 /* Scan all of the sections in FILE, and print the strings
407 in the initialized data section(s).
408
409 Return TRUE if successful,
410 FALSE if not (such as if FILE is not an object file). */
411
412 static bool
413 strings_object_file (const char *file)
414 {
415 bfd *abfd;
416 asection *s;
417 bool got_a_section;
418
419 abfd = bfd_openr (file, target);
420
421 if (abfd == NULL)
422 /* Treat the file as a non-object file. */
423 return false;
424
425 /* This call is mainly for its side effect of reading in the sections.
426 We follow the traditional behavior of `strings' in that we don't
427 complain if we don't recognize a file to be an object file. */
428 if (!bfd_check_format (abfd, bfd_object))
429 {
430 bfd_close (abfd);
431 return false;
432 }
433
434 got_a_section = false;
435 for (s = abfd->sections; s != NULL; s = s->next)
436 strings_a_section (abfd, s, file, &got_a_section);
437
438 if (!bfd_close (abfd))
439 {
440 bfd_nonfatal (file);
441 return false;
442 }
443
444 return got_a_section;
445 }
446
447 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
448
449 static bool
450 strings_file (char *file)
451 {
452 struct stat st;
453
454 /* get_file_size does not support non-S_ISREG files. */
455
456 if (stat (file, &st) < 0)
457 {
458 if (errno == ENOENT)
459 non_fatal (_("'%s': No such file"), file);
460 else
461 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
462 file, strerror (errno));
463 return false;
464 }
465 else if (S_ISDIR (st.st_mode))
466 {
467 non_fatal (_("Warning: '%s' is a directory"), file);
468 return false;
469 }
470
471 /* If we weren't told to scan the whole file,
472 try to open it as an object file and only look at
473 initialized data sections. If that fails, fall back to the
474 whole file. */
475 if (!datasection_only || !strings_object_file (file))
476 {
477 FILE *stream;
478
479 stream = fopen (file, FOPEN_RB);
480 if (stream == NULL)
481 {
482 fprintf (stderr, "%s: ", program_name);
483 perror (file);
484 return false;
485 }
486
487 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
488
489 if (fclose (stream) == EOF)
490 {
491 fprintf (stderr, "%s: ", program_name);
492 perror (file);
493 return false;
494 }
495 }
496
497 return true;
498 }
499 \f
500 /* Read the next character, return EOF if none available.
501 Assume that STREAM is positioned so that the next byte read
502 is at address ADDRESS in the file.
503
504 If STREAM is NULL, do not read from it.
505 The caller can supply a buffer of characters
506 to be processed before the data in STREAM.
507 MAGIC is the address of the buffer and
508 MAGICCOUNT is how many characters are in it. */
509
510 static long
511 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
512 {
513 int c, i;
514 long r = 0;
515
516 for (i = 0; i < encoding_bytes; i++)
517 {
518 if (*magiccount)
519 {
520 (*magiccount)--;
521 c = *(*magic)++;
522 }
523 else
524 {
525 if (stream == NULL)
526 return EOF;
527
528 /* Only use getc_unlocked if we found a declaration for it.
529 Otherwise, libc is not thread safe by default, and we
530 should not use it. */
531
532 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
533 c = getc_unlocked (stream);
534 #else
535 c = getc (stream);
536 #endif
537 if (c == EOF)
538 return EOF;
539 }
540
541 (*address)++;
542 r = (r << 8) | (c & 0xff);
543 }
544
545 switch (encoding)
546 {
547 default:
548 break;
549 case 'l':
550 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
551 break;
552 case 'L':
553 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
554 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
555 break;
556 }
557
558 return r;
559 }
560
561 /* Throw away one byte of a (possibly) multi-byte char C, updating
562 address and buffer to suit. */
563
564 static void
565 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
566 {
567 static char tmp[4];
568
569 if (encoding_bytes > 1)
570 {
571 *address -= encoding_bytes - 1;
572
573 if (*magiccount == 0)
574 {
575 /* If no magic buffer exists, use temp buffer. */
576 switch (encoding)
577 {
578 default:
579 break;
580 case 'b':
581 tmp[0] = c & 0xff;
582 *magiccount = 1;
583 break;
584 case 'l':
585 tmp[0] = (c >> 8) & 0xff;
586 *magiccount = 1;
587 break;
588 case 'B':
589 tmp[0] = (c >> 16) & 0xff;
590 tmp[1] = (c >> 8) & 0xff;
591 tmp[2] = c & 0xff;
592 *magiccount = 3;
593 break;
594 case 'L':
595 tmp[0] = (c >> 8) & 0xff;
596 tmp[1] = (c >> 16) & 0xff;
597 tmp[2] = (c >> 24) & 0xff;
598 *magiccount = 3;
599 break;
600 }
601 *magic = tmp;
602 }
603 else
604 {
605 /* If magic buffer exists, rewind. */
606 *magic -= encoding_bytes - 1;
607 *magiccount += encoding_bytes - 1;
608 }
609 }
610 }
611
612 static void
613 print_filename_and_address (const char * filename, file_ptr address)
614 {
615 if (print_filenames)
616 printf ("%s: ", filename);
617
618 if (! print_addresses)
619 return;
620
621 switch (address_radix)
622 {
623 case 8:
624 if (sizeof (address) > sizeof (long))
625 {
626 #ifndef __MSVCRT__
627 printf ("%7llo ", (unsigned long long) address);
628 #else
629 printf ("%7I64o ", (unsigned long long) address);
630 #endif
631 }
632 else
633 printf ("%7lo ", (unsigned long) address);
634 break;
635
636 case 10:
637 if (sizeof (address) > sizeof (long))
638 {
639 #ifndef __MSVCRT__
640 printf ("%7llu ", (unsigned long long) address);
641 #else
642 printf ("%7I64d ", (unsigned long long) address);
643 #endif
644 }
645 else
646 printf ("%7ld ", (long) address);
647 break;
648
649 case 16:
650 if (sizeof (address) > sizeof (long))
651 {
652 #ifndef __MSVCRT__
653 printf ("%7llx ", (unsigned long long) address);
654 #else
655 printf ("%7I64x ", (unsigned long long) address);
656 #endif
657 }
658 else
659 printf ("%7lx ", (unsigned long) address);
660 break;
661 }
662 }
663
/* Decide whether the BUFLEN bytes starting at BUFFER begin with a UTF-8
   multi-byte sequence.  Returns the length of the sequence (2, 3 or 4)
   if they do, and zero if they do not.

   The first byte must be at least 0xc0 and selects the length of the
   sequence; every following byte of the sequence must lie inside the
   buffer and have the form 10xxxxxx.  BUFFER[0] is read without
   consulting BUFLEN.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  unsigned int expected;
  unsigned int k;

  if (buffer[0] < 0xc0)
    return 0;

  if ((buffer[0] & 0x20) == 0)
    expected = 2;
  else if ((buffer[0] & 0x10) == 0)
    expected = 3;
  else
    expected = 4;

  for (k = 1; k < expected; k++)
    {
      /* The continuation byte must exist...  */
      if (buflen <= k)
	return 0;

      /* ... and carry the continuation marker in its top two bits.  */
      if ((buffer[k] & 0xc0) != 0x80)
	return 0;
    }

  return expected;
}
699
700 /* Display a UTF-8 encoded character in BUFFER according to the setting
701 of unicode_display. The character is known to be valid.
702 Returns the number of bytes consumed. */
703
704 static unsigned int
705 display_utf8_char (const unsigned char * buffer)
706 {
707 unsigned int j;
708 unsigned int utf8_len;
709
710 switch (buffer[0] & 0x30)
711 {
712 case 0x00:
713 case 0x10:
714 utf8_len = 2;
715 break;
716 case 0x20:
717 utf8_len = 3;
718 break;
719 default:
720 utf8_len = 4;
721 }
722
723 switch (unicode_display)
724 {
725 default:
726 fprintf (stderr, "ICE: unexpected unicode display type\n");
727 break;
728
729 case unicode_escape:
730 case unicode_highlight:
731 if (unicode_display == unicode_highlight && isatty (1))
732 printf ("\x1B[31;47m"); /* Red. */
733
734 switch (utf8_len)
735 {
736 case 2:
737 printf ("\\u%02x%02x",
738 ((buffer[0] & 0x1c) >> 2),
739 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
740 break;
741
742 case 3:
743 printf ("\\u%02x%02x",
744 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
745 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
746 break;
747
748 case 4:
749 printf ("\\u%02x%02x%02x",
750 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
751 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
752 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
753 break;
754 default:
755 /* URG. */
756 break;
757 }
758
759 if (unicode_display == unicode_highlight && isatty (1))
760 printf ("\033[0m"); /* Default colour. */
761 break;
762
763 case unicode_hex:
764 putchar ('<');
765 printf ("0x");
766 for (j = 0; j < utf8_len; j++)
767 printf ("%02x", buffer [j]);
768 putchar ('>');
769 break;
770
771 case unicode_locale:
772 printf ("%.1s", buffer);
773 break;
774 }
775
776 return utf8_len;
777 }
778
779 /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
780 according to the setting of the unicode_display variable. The buffer
781 contains BUFLEN bytes.
782
783 Display the characters as if they started at ADDRESS and are contained in
784 FILENAME. */
785
786 static void
787 print_unicode_buffer (const char * filename,
788 file_ptr address,
789 const unsigned char * buffer,
790 unsigned long buflen)
791 {
792 /* Paranoia checks... */
793 if (filename == NULL
794 || buffer == NULL
795 || unicode_display == unicode_default
796 || encoding != 'S'
797 || encoding_bytes != 1)
798 {
799 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
800 return;
801 }
802
803 if (buflen == 0)
804 return;
805
806 /* We must only display strings that are at least string_min *characters*
807 long. So we scan the buffer in two stages. First we locate the start
808 of a potential string. Then we walk along it until we have found
809 string_min characters. Then we go back to the start point and start
810 displaying characters according to the unicode_display setting. */
811
812 unsigned long start_point = 0;
813 unsigned long i = 0;
814 unsigned int char_len = 1;
815 unsigned int num_found = 0;
816
817 for (i = 0; i < buflen; i += char_len)
818 {
819 int c = buffer[i];
820
821 char_len = 1;
822
823 /* Find the first potential character of a string. */
824 if (! STRING_ISGRAPHIC (c))
825 {
826 num_found = 0;
827 continue;
828 }
829
830 if (c > 126)
831 {
832 if (c < 0xc0)
833 {
834 num_found = 0;
835 continue;
836 }
837
838 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
839 {
840 char_len = 1;
841 num_found = 0;
842 continue;
843 }
844
845 if (unicode_display == unicode_invalid)
846 {
847 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
848 num_found = 0;
849 continue;
850 }
851 }
852
853 if (num_found == 0)
854 /* We have found a potential starting point for a string. */
855 start_point = i;
856
857 ++ num_found;
858
859 if (num_found >= string_min)
860 break;
861 }
862
863 if (num_found < string_min)
864 return;
865
866 print_filename_and_address (filename, address + start_point);
867
868 /* We have found string_min characters. Display them and any
869 more that follow. */
870 for (i = start_point; i < buflen; i += char_len)
871 {
872 int c = buffer[i];
873
874 char_len = 1;
875
876 if (! STRING_ISGRAPHIC (c))
877 break;
878 else if (c < 127)
879 putchar (c);
880 else if (! is_valid_utf8 (buffer + i, buflen - i))
881 break;
882 else if (unicode_display == unicode_invalid)
883 break;
884 else
885 char_len = display_utf8_char (buffer + i);
886 }
887
888 if (output_separator)
889 fputs (output_separator, stdout);
890 else
891 putchar ('\n');
892
893 /* FIXME: Using tail recursion here is lazy programming... */
894 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
895 }
896
/* Return the next input byte for the unicode stream scanner.

   If any bytes have been put back, the most recently put back one is
   returned: PUTBACK is used as a stack holding *NUM_PUTBACK entries.
   Otherwise a byte is read from STREAM and *NUM_READ is incremented;
   the increment happens even when the read yields EOF.  */

static int
get_unicode_byte (FILE *          stream,
		  unsigned char * putback,
		  unsigned int *  num_putback,
		  unsigned int *  num_read)
{
  if (* num_putback == 0)
    {
      ++ * num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
      return getc_unlocked (stream);
#else
      return getc (stream);
#endif
    }

  -- * num_putback;
  return putback [* num_putback];
}
917
918 /* Helper function for print_unicode_stream. */
919
920 static void
921 print_unicode_stream_body (const char * filename,
922 file_ptr address,
923 FILE * stream,
924 unsigned char * putback_buf,
925 unsigned int num_putback,
926 unsigned char * print_buf)
927 {
928 /* It would be nice if we could just read the stream into a buffer
929 and then process if with print_unicode_buffer. But the input
930 might be huge or it might time-locked (eg stdin). So instead
931 we go one byte at a time... */
932
933 file_ptr start_point = 0;
934 unsigned int num_read = 0;
935 unsigned int num_chars = 0;
936 unsigned int num_print = 0;
937 int c = 0;
938
939 /* Find a series of string_min characters. Put them into print_buf. */
940 do
941 {
942 if (num_chars >= string_min)
943 break;
944
945 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
946 if (c == EOF)
947 break;
948
949 if (! STRING_ISGRAPHIC (c))
950 {
951 num_chars = num_print = 0;
952 continue;
953 }
954
955 if (num_chars == 0)
956 start_point = num_read - 1;
957
958 if (c < 127)
959 {
960 print_buf[num_print] = c;
961 num_chars ++;
962 num_print ++;
963 continue;
964 }
965
966 if (c < 0xc0)
967 {
968 num_chars = num_print = 0;
969 continue;
970 }
971
972 /* We *might* have a UTF-8 sequence. Time to start peeking. */
973 char utf8[4];
974
975 utf8[0] = c;
976 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
977 if (c == EOF)
978 break;
979 utf8[1] = c;
980
981 if ((utf8[1] & 0xc0) != 0x80)
982 {
983 /* Invalid UTF-8. */
984 putback_buf[num_putback++] = utf8[1];
985 num_chars = num_print = 0;
986 continue;
987 }
988 else if ((utf8[0] & 0x20) == 0)
989 {
990 /* A valid 2-byte UTF-8 encoding. */
991 if (unicode_display == unicode_invalid)
992 {
993 putback_buf[num_putback++] = utf8[1];
994 num_chars = num_print = 0;
995 }
996 else
997 {
998 print_buf[num_print ++] = utf8[0];
999 print_buf[num_print ++] = utf8[1];
1000 num_chars ++;
1001 }
1002 continue;
1003 }
1004
1005 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1006 if (c == EOF)
1007 break;
1008 utf8[2] = c;
1009
1010 if ((utf8[2] & 0xc0) != 0x80)
1011 {
1012 /* Invalid UTF-8. */
1013 putback_buf[num_putback++] = utf8[2];
1014 putback_buf[num_putback++] = utf8[1];
1015 num_chars = num_print = 0;
1016 continue;
1017 }
1018 else if ((utf8[0] & 0x10) == 0)
1019 {
1020 /* A valid 3-byte UTF-8 encoding. */
1021 if (unicode_display == unicode_invalid)
1022 {
1023 putback_buf[num_putback++] = utf8[2];
1024 putback_buf[num_putback++] = utf8[1];
1025 num_chars = num_print = 0;
1026 }
1027 else
1028 {
1029 print_buf[num_print ++] = utf8[0];
1030 print_buf[num_print ++] = utf8[1];
1031 print_buf[num_print ++] = utf8[2];
1032 num_chars ++;
1033 }
1034 continue;
1035 }
1036
1037 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1038 if (c == EOF)
1039 break;
1040 utf8[3] = c;
1041
1042 if ((utf8[3] & 0xc0) != 0x80)
1043 {
1044 /* Invalid UTF-8. */
1045 putback_buf[num_putback++] = utf8[3];
1046 putback_buf[num_putback++] = utf8[2];
1047 putback_buf[num_putback++] = utf8[1];
1048 num_chars = num_print = 0;
1049 }
1050 /* We have a valid 4-byte UTF-8 encoding. */
1051 else if (unicode_display == unicode_invalid)
1052 {
1053 putback_buf[num_putback++] = utf8[3];
1054 putback_buf[num_putback++] = utf8[1];
1055 putback_buf[num_putback++] = utf8[2];
1056 num_chars = num_print = 0;
1057 }
1058 else
1059 {
1060 print_buf[num_print ++] = utf8[0];
1061 print_buf[num_print ++] = utf8[1];
1062 print_buf[num_print ++] = utf8[2];
1063 print_buf[num_print ++] = utf8[3];
1064 num_chars ++;
1065 }
1066 }
1067 while (1);
1068
1069 if (num_chars >= string_min)
1070 {
1071 /* We know that we have string_min valid characters in print_buf,
1072 and there may be more to come in the stream. Start displaying
1073 them. */
1074
1075 print_filename_and_address (filename, address + start_point);
1076
1077 unsigned int i;
1078 for (i = 0; i < num_print;)
1079 {
1080 if (print_buf[i] < 127)
1081 putchar (print_buf[i++]);
1082 else
1083 i += display_utf8_char (print_buf + i);
1084 }
1085
1086 /* OK so now we have to start read unchecked bytes. */
1087
1088 /* Find a series of string_min characters. Put them into print_buf. */
1089 do
1090 {
1091 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1092 if (c == EOF)
1093 break;
1094
1095 if (! STRING_ISGRAPHIC (c))
1096 break;
1097
1098 if (c < 127)
1099 {
1100 putchar (c);
1101 continue;
1102 }
1103
1104 if (c < 0xc0)
1105 break;
1106
1107 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1108 unsigned char utf8[4];
1109
1110 utf8[0] = c;
1111 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1112 if (c == EOF)
1113 break;
1114 utf8[1] = c;
1115
1116 if ((utf8[1] & 0xc0) != 0x80)
1117 {
1118 /* Invalid UTF-8. */
1119 putback_buf[num_putback++] = utf8[1];
1120 break;
1121 }
1122 else if ((utf8[0] & 0x20) == 0)
1123 {
1124 /* Valid 2-byte UTF-8. */
1125 if (unicode_display == unicode_invalid)
1126 {
1127 putback_buf[num_putback++] = utf8[1];
1128 break;
1129 }
1130 else
1131 {
1132 (void) display_utf8_char (utf8);
1133 continue;
1134 }
1135 }
1136
1137 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1138 if (c == EOF)
1139 break;
1140 utf8[2] = c;
1141
1142 if ((utf8[2] & 0xc0) != 0x80)
1143 {
1144 /* Invalid UTF-8. */
1145 putback_buf[num_putback++] = utf8[2];
1146 putback_buf[num_putback++] = utf8[1];
1147 break;
1148 }
1149 else if ((utf8[0] & 0x10) == 0)
1150 {
1151 /* Valid 3-byte UTF-8. */
1152 if (unicode_display == unicode_invalid)
1153 {
1154 putback_buf[num_putback++] = utf8[2];
1155 putback_buf[num_putback++] = utf8[1];
1156 break;
1157 }
1158 else
1159 {
1160 (void) display_utf8_char (utf8);
1161 continue;
1162 }
1163 }
1164
1165 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1166 if (c == EOF)
1167 break;
1168 utf8[3] = c;
1169
1170 if ((utf8[3] & 0xc0) != 0x80)
1171 {
1172 /* Invalid UTF-8. */
1173 putback_buf[num_putback++] = utf8[3];
1174 putback_buf[num_putback++] = utf8[2];
1175 putback_buf[num_putback++] = utf8[1];
1176 break;
1177 }
1178 else if (unicode_display == unicode_invalid)
1179 {
1180 putback_buf[num_putback++] = utf8[3];
1181 putback_buf[num_putback++] = utf8[2];
1182 putback_buf[num_putback++] = utf8[1];
1183 break;
1184 }
1185 else
1186 /* A valid 4-byte UTF-8 encoding. */
1187 (void) display_utf8_char (utf8);
1188 }
1189 while (1);
1190
1191 if (output_separator)
1192 fputs (output_separator, stdout);
1193 else
1194 putchar ('\n');
1195 }
1196
1197 if (c != EOF)
1198 /* FIXME: Using tail recursion here is lazy, but it works. */
1199 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1200 }
1201
1202 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1203 encountered according to the setting of the unicode_display variable.
1204 The stream is positioned at ADDRESS and is attached to FILENAME. */
1205
1206 static void
1207 print_unicode_stream (const char * filename,
1208 file_ptr address,
1209 FILE * stream)
1210 {
1211 /* Paranoia checks... */
1212 if (filename == NULL
1213 || stream == NULL
1214 || unicode_display == unicode_default
1215 || encoding != 'S'
1216 || encoding_bytes != 1)
1217 {
1218 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1219 return;
1220 }
1221
1222 /* Allocate space for string_min 4-byte utf-8 characters. */
1223 unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1224 /* We should never have to put back more than 4 bytes. */
1225 unsigned char putback_buf[5];
1226 unsigned int num_putback = 0;
1227
1228 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1229 free (print_buf);
1230 }
1231 \f
1232 /* Find the strings in file FILENAME, read from STREAM.
1233 Assume that STREAM is positioned so that the next byte read
1234 is at address ADDRESS in the file.
1235
1236 If STREAM is NULL, do not read from it.
1237 The caller can supply a buffer of characters
1238 to be processed before the data in STREAM.
1239 MAGIC is the address of the buffer and
1240 MAGICCOUNT is how many characters are in it.
1241 Those characters come at address ADDRESS and the data in STREAM follow. */
1242
1243 static void
1244 print_strings (const char *filename, FILE *stream, file_ptr address,
1245 int magiccount, char *magic)
1246 {
1247 if (unicode_display != unicode_default)
1248 {
1249 if (magic != NULL)
1250 print_unicode_buffer (filename, address,
1251 (const unsigned char *) magic, magiccount);
1252
1253 if (stream != NULL)
1254 print_unicode_stream (filename, address, stream);
1255 return;
1256 }
1257
1258 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1259
1260 while (1)
1261 {
1262 file_ptr start;
1263 unsigned int i;
1264 long c;
1265
1266 /* See if the next `string_min' chars are all graphic chars. */
1267 tryline:
1268 start = address;
1269 for (i = 0; i < string_min; i++)
1270 {
1271 c = get_char (stream, &address, &magiccount, &magic);
1272 if (c == EOF)
1273 {
1274 free (buf);
1275 return;
1276 }
1277
1278 if (! STRING_ISGRAPHIC (c))
1279 {
1280 /* Found a non-graphic. Try again starting with next byte. */
1281 unget_part_char (c, &address, &magiccount, &magic);
1282 goto tryline;
1283 }
1284 buf[i] = c;
1285 }
1286
1287 /* We found a run of `string_min' graphic characters. Print up
1288 to the next non-graphic character. */
1289 print_filename_and_address (filename, start);
1290
1291 buf[i] = '\0';
1292 fputs (buf, stdout);
1293
1294 while (1)
1295 {
1296 c = get_char (stream, &address, &magiccount, &magic);
1297 if (c == EOF)
1298 break;
1299 if (! STRING_ISGRAPHIC (c))
1300 {
1301 unget_part_char (c, &address, &magiccount, &magic);
1302 break;
1303 }
1304 putchar (c);
1305 }
1306
1307 if (output_separator)
1308 fputs (output_separator, stdout);
1309 else
1310 putchar ('\n');
1311 }
1312 free (buf);
1313 }
1314 \f
1315 static void
1316 usage (FILE *stream, int status)
1317 {
1318 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1319 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1320 fprintf (stream, _(" The options are:\n"));
1321
1322 if (DEFAULT_STRINGS_ALL)
1323 fprintf (stream, _("\
1324 -a - --all Scan the entire file, not just the data section [default]\n\
1325 -d --data Only scan the data sections in the file\n"));
1326 else
1327 fprintf (stream, _("\
1328 -a - --all Scan the entire file, not just the data section\n\
1329 -d --data Only scan the data sections in the file [default]\n"));
1330
1331 fprintf (stream, _("\
1332 -f --print-file-name Print the name of the file before each string\n\
1333 -n <number> Locate & print any sequence of at least <number>\n\
1334 --bytes=<number> displayable characters. (The default is 4).\n\
1335 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
1336 -w --include-all-whitespace Include all whitespace as valid string characters\n\
1337 -o An alias for --radix=o\n\
1338 -T --target=<BFDNAME> Specify the binary file format\n\
1339 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1340 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1341 --unicode={default|show|invalid|hex|escape|highlight}\n\
1342 -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1343 -s --output-separator=<string> String used to separate strings in output.\n\
1344 @<file> Read options from <file>\n\
1345 -h --help Display this information\n\
1346 -v -V --version Print the program's version number\n"));
1347 list_supported_targets (program_name, stream);
1348 if (REPORT_BUGS_TO[0] && status == 0)
1349 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1350 exit (status);
1351 }