]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blob - binutils/strings.c
Update year range in copyright notice of binutils files
[thirdparty/binutils-gdb.git] / binutils / strings.c
1 /* strings -- print the strings of printable characters in files
2 Copyright (C) 1993-2023 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17 02110-1301, USA. */
18 \f
19 /* Usage: strings [options] file...
20
21 Options:
22 --all
23 -a
24 - Scan each file in its entirety.
25
26 --data
27 -d Scan only the initialized data section(s) of object files.
28
29 --print-file-name
30 -f Print the name of the file before each string.
31
32 --bytes=min-len
33 -n min-len
34 -min-len Print graphic char sequences, MIN-LEN or more bytes long,
35 that are followed by a NUL or a non-displayable character.
36 Default is 4.
37
38 --radix={o,x,d}
39 -t {o,x,d} Print the offset within the file before each string,
40 in octal/hex/decimal.
41
42 --include-all-whitespace
43 -w By default tab and space are the only whitespace included in graphic
44 char sequences. This option considers all of isspace() valid.
45
46 -o Like -to. (Some other implementations have -o like -to,
47 others like -td. We chose one arbitrarily.)
48
49 --encoding={s,S,b,l,B,L}
50 -e {s,S,b,l,B,L}
51 Select character encoding: 7-bit-character, 8-bit-character,
52 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
53 littleendian 32-bit.
54
55 --target=BFDNAME
56 -T {bfdname}
57 Specify a non-default object file format.
58
59 --unicode={default|locale|invalid|hex|escape|highlight}
60 -U {d|l|i|x|e|h}
61 Determine how to handle UTF-8 unicode characters. The default
62 is no special treatment. All other versions of this option
63 only apply if the encoding is valid and enabling the option
64 implies --encoding=S.
65 The 'locale' option displays the characters according to the
66 current locale. The 'invalid' option treats them as
67 non-string characters. The 'hex' option displays them as hex
68 byte sequences. The 'escape' option displays them as escape
69 sequences and the 'highlight' option displays them as
70 coloured escape sequences.
71
72 --output-separator=sep_string
73 -s sep_string String used to separate parsed strings in output.
74 Default is newline.
75
76 --help
77 -h Print the usage message on the standard output.
78
79 --version
80 -V
81 -v Print the program version number.
82
83 Written by Richard Stallman <rms@gnu.ai.mit.edu>
84 and David MacKenzie <djm@gnu.ai.mit.edu>. */
85
86 #include "sysdep.h"
87 #include "bfd.h"
88 #include "getopt.h"
89 #include "libiberty.h"
90 #include "safe-ctype.h"
91 #include "bucomm.h"
92
/* String equality predicate: non-zero when A and B compare equal with
   strcmp.  Only defined here when nothing earlier has defined it.  */
#ifndef streq
#define streq(a,b) (strcmp ((a),(b)) == 0)
#endif
96
/* The ways in which UTF-8 encoded characters can be treated.  The value
   is chosen by the -U/--unicode option parsed in main.  */
typedef enum unicode_display_type
{
  unicode_default = 0,  /* No special treatment of UTF-8 sequences.  */
  unicode_locale,       /* Display according to the current locale.  */
  unicode_escape,       /* Display as \u escape sequences.  */
  unicode_hex,          /* Display as <0x...> hex byte sequences.  */
  unicode_highlight,    /* As unicode_escape, coloured when stdout is a tty.  */
  unicode_invalid       /* Treat valid UTF-8 sequences as non-string bytes.  */
} unicode_display_type;

/* The treatment currently selected.  Any value other than unicode_default
   causes main to force the encoding to 'S'.  */
static unicode_display_type unicode_display = unicode_default;
108
/* Non-zero if C may form part of a displayed string.  C must lie in the
   range 0..255 and be one of: a tab, a character accepted by ISPRINT,
   any value above 127 when the encoding is 'S', or any character
   accepted by ISSPACE when include_all_whitespace is set.

   The macro reads the file-scope variables `encoding' and
   `include_all_whitespace', and it evaluates C several times, so the
   argument must not have side effects.  */
#define STRING_ISGRAPHIC(c) \
      (   (c) >= 0 \
       && (c) <= 255 \
       && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \
           || (include_all_whitespace && ISSPACE (c))) \
      )
115
/* Declare errno ourselves when it is not provided as a macro.  */
#ifndef errno
extern int errno;
#endif

/* The BFD section flags that identify an initialized data section.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Radix for printing addresses (must be 8, 10 or 16).
   Set by -o and -t; consulted only when print_addresses is true.  */
static int address_radix;

/* Minimum length of sequence of graphic chars to trigger output.
   Defaults to 4 in main; changed by -n, --bytes and -<number>.  */
static unsigned int string_min;

/* Whether or not we include all whitespace as a graphic char.  */
static bool include_all_whitespace;

/* TRUE means print address within file for each string.  */
static bool print_addresses;

/* TRUE means print filename for each string.  */
static bool print_filenames;

/* TRUE means for object files scan only the data section.  */
static bool datasection_only;

/* The BFD object file format.  NULL unless -T/--target was given.  */
static char *target;

/* The character encoding format: one of 's', 'S', 'b', 'l', 'B' or 'L'.
   encoding_bytes is the width in bytes of one character in that
   encoding (1, 2 or 4), derived from `encoding' in main.  */
static char encoding;
static int encoding_bytes;

/* Output string used to separate parsed strings.  NULL means that a
   newline is used.  */
static char *output_separator;
150
/* Long option table handed to getopt_long.  Every long option maps onto
   the short option letter that main's switch statement handles.  The
   all-zero entry terminates the table.  */
static struct option long_options[] =
{
  {"all", no_argument, NULL, 'a'},
  {"bytes", required_argument, NULL, 'n'},
  {"data", no_argument, NULL, 'd'},
  {"encoding", required_argument, NULL, 'e'},
  {"help", no_argument, NULL, 'h'},
  {"include-all-whitespace", no_argument, NULL, 'w'},
  {"output-separator", required_argument, NULL, 's'},
  {"print-file-name", no_argument, NULL, 'f'},
  {"radix", required_argument, NULL, 't'},
  {"target", required_argument, NULL, 'T'},
  {"unicode", required_argument, NULL, 'U'},
  {"version", no_argument, NULL, 'v'},
  {NULL, 0, NULL, 0}
};
167
/* Forward declarations of functions that are used before they are
   defined.  usage never returns: it finishes by calling exit.  */
static bool strings_file (char *);
static void print_strings (const char *, FILE *, file_ptr, int, char *);
static void usage (FILE *, int) ATTRIBUTE_NORETURN;
171 \f
172 int main (int, char **);
173
174 int
175 main (int argc, char **argv)
176 {
177 int optc;
178 int exit_status = 0;
179 bool files_given = false;
180 char *s;
181 int numeric_opt = 0;
182
183 setlocale (LC_ALL, "");
184 bindtextdomain (PACKAGE, LOCALEDIR);
185 textdomain (PACKAGE);
186
187 program_name = argv[0];
188 xmalloc_set_program_name (program_name);
189 bfd_set_error_program_name (program_name);
190
191 expandargv (&argc, &argv);
192
193 string_min = 4;
194 include_all_whitespace = false;
195 print_addresses = false;
196 print_filenames = false;
197 if (DEFAULT_STRINGS_ALL)
198 datasection_only = false;
199 else
200 datasection_only = true;
201 target = NULL;
202 encoding = 's';
203 output_separator = NULL;
204
205 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
206 long_options, (int *) 0)) != EOF)
207 {
208 switch (optc)
209 {
210 case 'a':
211 datasection_only = false;
212 break;
213
214 case 'd':
215 datasection_only = true;
216 break;
217
218 case 'f':
219 print_filenames = true;
220 break;
221
222 case 'H':
223 case 'h':
224 usage (stdout, 0);
225
226 case 'n':
227 string_min = (int) strtoul (optarg, &s, 0);
228 if (s != NULL && *s != 0)
229 fatal (_("invalid integer argument %s"), optarg);
230 break;
231
232 case 'w':
233 include_all_whitespace = true;
234 break;
235
236 case 'o':
237 print_addresses = true;
238 address_radix = 8;
239 break;
240
241 case 't':
242 print_addresses = true;
243 if (optarg[1] != '\0')
244 usage (stderr, 1);
245 switch (optarg[0])
246 {
247 case 'o':
248 address_radix = 8;
249 break;
250
251 case 'd':
252 address_radix = 10;
253 break;
254
255 case 'x':
256 address_radix = 16;
257 break;
258
259 default:
260 usage (stderr, 1);
261 }
262 break;
263
264 case 'T':
265 target = optarg;
266 break;
267
268 case 'e':
269 if (optarg[1] != '\0')
270 usage (stderr, 1);
271 encoding = optarg[0];
272 break;
273
274 case 's':
275 output_separator = optarg;
276 break;
277
278 case 'U':
279 if (streq (optarg, "default") || streq (optarg, "d"))
280 unicode_display = unicode_default;
281 else if (streq (optarg, "locale") || streq (optarg, "l"))
282 unicode_display = unicode_locale;
283 else if (streq (optarg, "escape") || streq (optarg, "e"))
284 unicode_display = unicode_escape;
285 else if (streq (optarg, "invalid") || streq (optarg, "i"))
286 unicode_display = unicode_invalid;
287 else if (streq (optarg, "hex") || streq (optarg, "x"))
288 unicode_display = unicode_hex;
289 else if (streq (optarg, "highlight") || streq (optarg, "h"))
290 unicode_display = unicode_highlight;
291 else
292 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
293 break;
294
295 case 'V':
296 case 'v':
297 print_version ("strings");
298 break;
299
300 case '?':
301 usage (stderr, 1);
302
303 default:
304 numeric_opt = optind;
305 break;
306 }
307 }
308
309 if (unicode_display != unicode_default)
310 encoding = 'S';
311
312 if (numeric_opt != 0)
313 {
314 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
315 if (s != NULL && *s != 0)
316 fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
317 }
318 if (string_min < 1)
319 fatal (_("invalid minimum string length %d"), string_min);
320
321 switch (encoding)
322 {
323 case 'S':
324 case 's':
325 encoding_bytes = 1;
326 break;
327 case 'b':
328 case 'l':
329 encoding_bytes = 2;
330 break;
331 case 'B':
332 case 'L':
333 encoding_bytes = 4;
334 break;
335 default:
336 usage (stderr, 1);
337 }
338
339 if (bfd_init () != BFD_INIT_MAGIC)
340 fatal (_("fatal error: libbfd ABI mismatch"));
341 set_default_bfd_target ();
342
343 if (optind >= argc)
344 {
345 datasection_only = false;
346 SET_BINARY (fileno (stdin));
347 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
348 files_given = true;
349 }
350 else
351 {
352 for (; optind < argc; ++optind)
353 {
354 if (streq (argv[optind], "-"))
355 datasection_only = false;
356 else
357 {
358 files_given = true;
359 exit_status |= !strings_file (argv[optind]);
360 }
361 }
362 }
363
364 if (!files_given)
365 usage (stderr, 1);
366
367 return (exit_status);
368 }
369 \f
370 /* Scan section SECT of the file ABFD, whose printable name is
371 FILENAME. If it contains initialized data set GOT_A_SECTION and
372 print the strings in it. */
373
374 static void
375 strings_a_section (bfd *abfd, asection *sect, const char *filename,
376 bool *got_a_section)
377 {
378 bfd_size_type sectsize;
379 bfd_byte *mem;
380
381 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
382 return;
383
384 sectsize = bfd_section_size (sect);
385 if (sectsize == 0)
386 return;
387
388 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
389 {
390 non_fatal (_("%s: Reading section %s failed: %s"),
391 filename, sect->name, bfd_errmsg (bfd_get_error ()));
392 return;
393 }
394
395 *got_a_section = true;
396 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
397 free (mem);
398 }
399
400 /* Scan all of the sections in FILE, and print the strings
401 in the initialized data section(s).
402
403 Return TRUE if successful,
404 FALSE if not (such as if FILE is not an object file). */
405
406 static bool
407 strings_object_file (const char *file)
408 {
409 bfd *abfd;
410 asection *s;
411 bool got_a_section;
412
413 abfd = bfd_openr (file, target);
414
415 if (abfd == NULL)
416 /* Treat the file as a non-object file. */
417 return false;
418
419 /* This call is mainly for its side effect of reading in the sections.
420 We follow the traditional behavior of `strings' in that we don't
421 complain if we don't recognize a file to be an object file. */
422 if (!bfd_check_format (abfd, bfd_object))
423 {
424 bfd_close (abfd);
425 return false;
426 }
427
428 got_a_section = false;
429 for (s = abfd->sections; s != NULL; s = s->next)
430 strings_a_section (abfd, s, file, &got_a_section);
431
432 if (!bfd_close (abfd))
433 {
434 bfd_nonfatal (file);
435 return false;
436 }
437
438 return got_a_section;
439 }
440
441 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
442
443 static bool
444 strings_file (char *file)
445 {
446 struct stat st;
447
448 /* get_file_size does not support non-S_ISREG files. */
449
450 if (stat (file, &st) < 0)
451 {
452 if (errno == ENOENT)
453 non_fatal (_("'%s': No such file"), file);
454 else
455 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
456 file, strerror (errno));
457 return false;
458 }
459 else if (S_ISDIR (st.st_mode))
460 {
461 non_fatal (_("Warning: '%s' is a directory"), file);
462 return false;
463 }
464
465 /* If we weren't told to scan the whole file,
466 try to open it as an object file and only look at
467 initialized data sections. If that fails, fall back to the
468 whole file. */
469 if (!datasection_only || !strings_object_file (file))
470 {
471 FILE *stream;
472
473 stream = fopen (file, FOPEN_RB);
474 if (stream == NULL)
475 {
476 fprintf (stderr, "%s: ", program_name);
477 perror (file);
478 return false;
479 }
480
481 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
482
483 if (fclose (stream) == EOF)
484 {
485 fprintf (stderr, "%s: ", program_name);
486 perror (file);
487 return false;
488 }
489 }
490
491 return true;
492 }
493 \f
494 /* Read the next character, return EOF if none available.
495 Assume that STREAM is positioned so that the next byte read
496 is at address ADDRESS in the file.
497
498 If STREAM is NULL, do not read from it.
499 The caller can supply a buffer of characters
500 to be processed before the data in STREAM.
501 MAGIC is the address of the buffer and
502 MAGICCOUNT is how many characters are in it. */
503
504 static long
505 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
506 {
507 int c, i;
508 long r = 0;
509
510 for (i = 0; i < encoding_bytes; i++)
511 {
512 if (*magiccount)
513 {
514 (*magiccount)--;
515 c = *(*magic)++;
516 }
517 else
518 {
519 if (stream == NULL)
520 return EOF;
521
522 /* Only use getc_unlocked if we found a declaration for it.
523 Otherwise, libc is not thread safe by default, and we
524 should not use it. */
525
526 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
527 c = getc_unlocked (stream);
528 #else
529 c = getc (stream);
530 #endif
531 if (c == EOF)
532 return EOF;
533 }
534
535 (*address)++;
536 r = (r << 8) | (c & 0xff);
537 }
538
539 switch (encoding)
540 {
541 default:
542 break;
543 case 'l':
544 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
545 break;
546 case 'L':
547 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
548 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
549 break;
550 }
551
552 return r;
553 }
554
555 /* Throw away one byte of a (possibly) multi-byte char C, updating
556 address and buffer to suit. */
557
558 static void
559 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
560 {
561 static char tmp[4];
562
563 if (encoding_bytes > 1)
564 {
565 *address -= encoding_bytes - 1;
566
567 if (*magiccount == 0)
568 {
569 /* If no magic buffer exists, use temp buffer. */
570 switch (encoding)
571 {
572 default:
573 break;
574 case 'b':
575 tmp[0] = c & 0xff;
576 *magiccount = 1;
577 break;
578 case 'l':
579 tmp[0] = (c >> 8) & 0xff;
580 *magiccount = 1;
581 break;
582 case 'B':
583 tmp[0] = (c >> 16) & 0xff;
584 tmp[1] = (c >> 8) & 0xff;
585 tmp[2] = c & 0xff;
586 *magiccount = 3;
587 break;
588 case 'L':
589 tmp[0] = (c >> 8) & 0xff;
590 tmp[1] = (c >> 16) & 0xff;
591 tmp[2] = (c >> 24) & 0xff;
592 *magiccount = 3;
593 break;
594 }
595 *magic = tmp;
596 }
597 else
598 {
599 /* If magic buffer exists, rewind. */
600 *magic -= encoding_bytes - 1;
601 *magiccount += encoding_bytes - 1;
602 }
603 }
604 }
605
606 static void
607 print_filename_and_address (const char * filename, file_ptr address)
608 {
609 if (print_filenames)
610 printf ("%s: ", filename);
611
612 if (! print_addresses)
613 return;
614
615 switch (address_radix)
616 {
617 case 8:
618 if (sizeof (address) > sizeof (long))
619 {
620 #ifndef __MSVCRT__
621 printf ("%7llo ", (unsigned long long) address);
622 #else
623 printf ("%7I64o ", (unsigned long long) address);
624 #endif
625 }
626 else
627 printf ("%7lo ", (unsigned long) address);
628 break;
629
630 case 10:
631 if (sizeof (address) > sizeof (long))
632 {
633 #ifndef __MSVCRT__
634 printf ("%7llu ", (unsigned long long) address);
635 #else
636 printf ("%7I64d ", (unsigned long long) address);
637 #endif
638 }
639 else
640 printf ("%7ld ", (long) address);
641 break;
642
643 case 16:
644 if (sizeof (address) > sizeof (long))
645 {
646 #ifndef __MSVCRT__
647 printf ("%7llx ", (unsigned long long) address);
648 #else
649 printf ("%7I64x ", (unsigned long long) address);
650 #endif
651 }
652 else
653 printf ("%7lx ", (unsigned long) address);
654 break;
655 }
656 }
657
/* Decide whether the bytes starting at BUFFER, of which BUFLEN are
   available, begin with a multi-byte UTF-8 sequence.  Returns the
   length of the sequence in bytes (2, 3 or 4) if so and 0 otherwise.

   The first byte selects the expected length; each following byte
   must be present and must have the form 10xxxxxx.  The first byte is
   read without consulting BUFLEN.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  const unsigned char lead = buffer[0];
  unsigned int expected;
  unsigned int idx;

  /* Plain ASCII and stray continuation bytes do not start a sequence.  */
  if (lead < 0xc0)
    return 0;

  if ((lead & 0x20) == 0)
    expected = 2;
  else if ((lead & 0x10) == 0)
    expected = 3;
  else
    expected = 4;

  for (idx = 1; idx < expected; idx++)
    {
      if (buflen < idx + 1)
        return 0;

      if ((buffer[idx] & 0xc0) != 0x80)
        return 0;
    }

  return expected;
}
693
694 /* Display a UTF-8 encoded character in BUFFER according to the setting
695 of unicode_display. The character is known to be valid.
696 Returns the number of bytes consumed. */
697
698 static unsigned int
699 display_utf8_char (const unsigned char * buffer)
700 {
701 unsigned int j;
702 unsigned int utf8_len;
703
704 switch (buffer[0] & 0x30)
705 {
706 case 0x00:
707 case 0x10:
708 utf8_len = 2;
709 break;
710 case 0x20:
711 utf8_len = 3;
712 break;
713 default:
714 utf8_len = 4;
715 }
716
717 switch (unicode_display)
718 {
719 default:
720 fprintf (stderr, "ICE: unexpected unicode display type\n");
721 break;
722
723 case unicode_escape:
724 case unicode_highlight:
725 if (unicode_display == unicode_highlight && isatty (1))
726 printf ("\x1B[31;47m"); /* Red. */
727
728 switch (utf8_len)
729 {
730 case 2:
731 printf ("\\u%02x%02x",
732 ((buffer[0] & 0x1c) >> 2),
733 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
734 break;
735
736 case 3:
737 printf ("\\u%02x%02x",
738 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
739 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
740 break;
741
742 case 4:
743 printf ("\\u%02x%02x%02x",
744 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
745 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
746 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
747 break;
748 default:
749 /* URG. */
750 break;
751 }
752
753 if (unicode_display == unicode_highlight && isatty (1))
754 printf ("\033[0m"); /* Default colour. */
755 break;
756
757 case unicode_hex:
758 putchar ('<');
759 printf ("0x");
760 for (j = 0; j < utf8_len; j++)
761 printf ("%02x", buffer [j]);
762 putchar ('>');
763 break;
764
765 case unicode_locale:
766 printf ("%.1s", buffer);
767 break;
768 }
769
770 return utf8_len;
771 }
772
773 /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
774 according to the setting of the unicode_display variable. The buffer
775 contains BUFLEN bytes.
776
777 Display the characters as if they started at ADDRESS and are contained in
778 FILENAME. */
779
780 static void
781 print_unicode_buffer (const char * filename,
782 file_ptr address,
783 const unsigned char * buffer,
784 unsigned long buflen)
785 {
786 /* Paranoia checks... */
787 if (filename == NULL
788 || buffer == NULL
789 || unicode_display == unicode_default
790 || encoding != 'S'
791 || encoding_bytes != 1)
792 {
793 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
794 return;
795 }
796
797 if (buflen == 0)
798 return;
799
800 /* We must only display strings that are at least string_min *characters*
801 long. So we scan the buffer in two stages. First we locate the start
802 of a potential string. Then we walk along it until we have found
803 string_min characters. Then we go back to the start point and start
804 displaying characters according to the unicode_display setting. */
805
806 unsigned long start_point = 0;
807 unsigned long i = 0;
808 unsigned int char_len = 1;
809 unsigned int num_found = 0;
810
811 for (i = 0; i < buflen; i += char_len)
812 {
813 int c = buffer[i];
814
815 char_len = 1;
816
817 /* Find the first potential character of a string. */
818 if (! STRING_ISGRAPHIC (c))
819 {
820 num_found = 0;
821 continue;
822 }
823
824 if (c > 126)
825 {
826 if (c < 0xc0)
827 {
828 num_found = 0;
829 continue;
830 }
831
832 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
833 {
834 char_len = 1;
835 num_found = 0;
836 continue;
837 }
838
839 if (unicode_display == unicode_invalid)
840 {
841 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
842 num_found = 0;
843 continue;
844 }
845 }
846
847 if (num_found == 0)
848 /* We have found a potential starting point for a string. */
849 start_point = i;
850
851 ++ num_found;
852
853 if (num_found >= string_min)
854 break;
855 }
856
857 if (num_found < string_min)
858 return;
859
860 print_filename_and_address (filename, address + start_point);
861
862 /* We have found string_min characters. Display them and any
863 more that follow. */
864 for (i = start_point; i < buflen; i += char_len)
865 {
866 int c = buffer[i];
867
868 char_len = 1;
869
870 if (! STRING_ISGRAPHIC (c))
871 break;
872 else if (c < 127)
873 putchar (c);
874 else if (! is_valid_utf8 (buffer + i, buflen - i))
875 break;
876 else if (unicode_display == unicode_invalid)
877 break;
878 else
879 char_len = display_utf8_char (buffer + i);
880 }
881
882 if (output_separator)
883 fputs (output_separator, stdout);
884 else
885 putchar ('\n');
886
887 /* FIXME: Using tail recursion here is lazy programming... */
888 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
889 }
890
/* Return the next input byte for the unicode stream scanner.

   Bytes that were put back are returned first: PUTBACK is used as a
   stack holding *NUM_PUTBACK bytes, and the most recently stored byte
   is the first to be returned.  Only when the stack is empty is a byte
   read from STREAM, in which case *NUM_READ is incremented and the
   result may be EOF.  */

static int
get_unicode_byte (FILE *          stream,
                  unsigned char * putback,
                  unsigned int *  num_putback,
                  unsigned int *  num_read)
{
  int byte;

  if (*num_putback != 0)
    {
      --*num_putback;
      return putback[*num_putback];
    }

  ++*num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
  byte = getc_unlocked (stream);
#else
  byte = getc (stream);
#endif

  return byte;
}
911
912 /* Helper function for print_unicode_stream. */
913
914 static void
915 print_unicode_stream_body (const char * filename,
916 file_ptr address,
917 FILE * stream,
918 unsigned char * putback_buf,
919 unsigned int num_putback,
920 unsigned char * print_buf)
921 {
922 /* It would be nice if we could just read the stream into a buffer
923 and then process if with print_unicode_buffer. But the input
924 might be huge or it might time-locked (eg stdin). So instead
925 we go one byte at a time... */
926
927 file_ptr start_point = 0;
928 unsigned int num_read = 0;
929 unsigned int num_chars = 0;
930 unsigned int num_print = 0;
931 int c = 0;
932
933 /* Find a series of string_min characters. Put them into print_buf. */
934 do
935 {
936 if (num_chars >= string_min)
937 break;
938
939 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
940 if (c == EOF)
941 break;
942
943 if (! STRING_ISGRAPHIC (c))
944 {
945 num_chars = num_print = 0;
946 continue;
947 }
948
949 if (num_chars == 0)
950 start_point = num_read - 1;
951
952 if (c < 127)
953 {
954 print_buf[num_print] = c;
955 num_chars ++;
956 num_print ++;
957 continue;
958 }
959
960 if (c < 0xc0)
961 {
962 num_chars = num_print = 0;
963 continue;
964 }
965
966 /* We *might* have a UTF-8 sequence. Time to start peeking. */
967 char utf8[4];
968
969 utf8[0] = c;
970 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
971 if (c == EOF)
972 break;
973 utf8[1] = c;
974
975 if ((utf8[1] & 0xc0) != 0x80)
976 {
977 /* Invalid UTF-8. */
978 putback_buf[num_putback++] = utf8[1];
979 num_chars = num_print = 0;
980 continue;
981 }
982 else if ((utf8[0] & 0x20) == 0)
983 {
984 /* A valid 2-byte UTF-8 encoding. */
985 if (unicode_display == unicode_invalid)
986 {
987 putback_buf[num_putback++] = utf8[1];
988 num_chars = num_print = 0;
989 }
990 else
991 {
992 print_buf[num_print ++] = utf8[0];
993 print_buf[num_print ++] = utf8[1];
994 num_chars ++;
995 }
996 continue;
997 }
998
999 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1000 if (c == EOF)
1001 break;
1002 utf8[2] = c;
1003
1004 if ((utf8[2] & 0xc0) != 0x80)
1005 {
1006 /* Invalid UTF-8. */
1007 putback_buf[num_putback++] = utf8[2];
1008 putback_buf[num_putback++] = utf8[1];
1009 num_chars = num_print = 0;
1010 continue;
1011 }
1012 else if ((utf8[0] & 0x10) == 0)
1013 {
1014 /* A valid 3-byte UTF-8 encoding. */
1015 if (unicode_display == unicode_invalid)
1016 {
1017 putback_buf[num_putback++] = utf8[2];
1018 putback_buf[num_putback++] = utf8[1];
1019 num_chars = num_print = 0;
1020 }
1021 else
1022 {
1023 print_buf[num_print ++] = utf8[0];
1024 print_buf[num_print ++] = utf8[1];
1025 print_buf[num_print ++] = utf8[2];
1026 num_chars ++;
1027 }
1028 continue;
1029 }
1030
1031 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1032 if (c == EOF)
1033 break;
1034 utf8[3] = c;
1035
1036 if ((utf8[3] & 0xc0) != 0x80)
1037 {
1038 /* Invalid UTF-8. */
1039 putback_buf[num_putback++] = utf8[3];
1040 putback_buf[num_putback++] = utf8[2];
1041 putback_buf[num_putback++] = utf8[1];
1042 num_chars = num_print = 0;
1043 }
1044 /* We have a valid 4-byte UTF-8 encoding. */
1045 else if (unicode_display == unicode_invalid)
1046 {
1047 putback_buf[num_putback++] = utf8[3];
1048 putback_buf[num_putback++] = utf8[1];
1049 putback_buf[num_putback++] = utf8[2];
1050 num_chars = num_print = 0;
1051 }
1052 else
1053 {
1054 print_buf[num_print ++] = utf8[0];
1055 print_buf[num_print ++] = utf8[1];
1056 print_buf[num_print ++] = utf8[2];
1057 print_buf[num_print ++] = utf8[3];
1058 num_chars ++;
1059 }
1060 }
1061 while (1);
1062
1063 if (num_chars >= string_min)
1064 {
1065 /* We know that we have string_min valid characters in print_buf,
1066 and there may be more to come in the stream. Start displaying
1067 them. */
1068
1069 print_filename_and_address (filename, address + start_point);
1070
1071 unsigned int i;
1072 for (i = 0; i < num_print;)
1073 {
1074 if (print_buf[i] < 127)
1075 putchar (print_buf[i++]);
1076 else
1077 i += display_utf8_char (print_buf + i);
1078 }
1079
1080 /* OK so now we have to start read unchecked bytes. */
1081
1082 /* Find a series of string_min characters. Put them into print_buf. */
1083 do
1084 {
1085 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1086 if (c == EOF)
1087 break;
1088
1089 if (! STRING_ISGRAPHIC (c))
1090 break;
1091
1092 if (c < 127)
1093 {
1094 putchar (c);
1095 continue;
1096 }
1097
1098 if (c < 0xc0)
1099 break;
1100
1101 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1102 unsigned char utf8[4];
1103
1104 utf8[0] = c;
1105 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1106 if (c == EOF)
1107 break;
1108 utf8[1] = c;
1109
1110 if ((utf8[1] & 0xc0) != 0x80)
1111 {
1112 /* Invalid UTF-8. */
1113 putback_buf[num_putback++] = utf8[1];
1114 break;
1115 }
1116 else if ((utf8[0] & 0x20) == 0)
1117 {
1118 /* Valid 2-byte UTF-8. */
1119 if (unicode_display == unicode_invalid)
1120 {
1121 putback_buf[num_putback++] = utf8[1];
1122 break;
1123 }
1124 else
1125 {
1126 (void) display_utf8_char (utf8);
1127 continue;
1128 }
1129 }
1130
1131 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1132 if (c == EOF)
1133 break;
1134 utf8[2] = c;
1135
1136 if ((utf8[2] & 0xc0) != 0x80)
1137 {
1138 /* Invalid UTF-8. */
1139 putback_buf[num_putback++] = utf8[2];
1140 putback_buf[num_putback++] = utf8[1];
1141 break;
1142 }
1143 else if ((utf8[0] & 0x10) == 0)
1144 {
1145 /* Valid 3-byte UTF-8. */
1146 if (unicode_display == unicode_invalid)
1147 {
1148 putback_buf[num_putback++] = utf8[2];
1149 putback_buf[num_putback++] = utf8[1];
1150 break;
1151 }
1152 else
1153 {
1154 (void) display_utf8_char (utf8);
1155 continue;
1156 }
1157 }
1158
1159 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1160 if (c == EOF)
1161 break;
1162 utf8[3] = c;
1163
1164 if ((utf8[3] & 0xc0) != 0x80)
1165 {
1166 /* Invalid UTF-8. */
1167 putback_buf[num_putback++] = utf8[3];
1168 putback_buf[num_putback++] = utf8[2];
1169 putback_buf[num_putback++] = utf8[1];
1170 break;
1171 }
1172 else if (unicode_display == unicode_invalid)
1173 {
1174 putback_buf[num_putback++] = utf8[3];
1175 putback_buf[num_putback++] = utf8[2];
1176 putback_buf[num_putback++] = utf8[1];
1177 break;
1178 }
1179 else
1180 /* A valid 4-byte UTF-8 encoding. */
1181 (void) display_utf8_char (utf8);
1182 }
1183 while (1);
1184
1185 if (output_separator)
1186 fputs (output_separator, stdout);
1187 else
1188 putchar ('\n');
1189 }
1190
1191 if (c != EOF)
1192 /* FIXME: Using tail recursion here is lazy, but it works. */
1193 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1194 }
1195
1196 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1197 encountered according to the setting of the unicode_display variable.
1198 The stream is positioned at ADDRESS and is attached to FILENAME. */
1199
1200 static void
1201 print_unicode_stream (const char * filename,
1202 file_ptr address,
1203 FILE * stream)
1204 {
1205 /* Paranoia checks... */
1206 if (filename == NULL
1207 || stream == NULL
1208 || unicode_display == unicode_default
1209 || encoding != 'S'
1210 || encoding_bytes != 1)
1211 {
1212 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1213 return;
1214 }
1215
1216 /* Allocate space for string_min 4-byte utf-8 characters. */
1217 unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1218 /* We should never have to put back more than 4 bytes. */
1219 unsigned char putback_buf[5];
1220 unsigned int num_putback = 0;
1221
1222 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1223 free (print_buf);
1224 }
1225 \f
1226 /* Find the strings in file FILENAME, read from STREAM.
1227 Assume that STREAM is positioned so that the next byte read
1228 is at address ADDRESS in the file.
1229
1230 If STREAM is NULL, do not read from it.
1231 The caller can supply a buffer of characters
1232 to be processed before the data in STREAM.
1233 MAGIC is the address of the buffer and
1234 MAGICCOUNT is how many characters are in it.
1235 Those characters come at address ADDRESS and the data in STREAM follow. */
1236
1237 static void
1238 print_strings (const char *filename, FILE *stream, file_ptr address,
1239 int magiccount, char *magic)
1240 {
1241 if (unicode_display != unicode_default)
1242 {
1243 if (magic != NULL)
1244 print_unicode_buffer (filename, address,
1245 (const unsigned char *) magic, magiccount);
1246
1247 if (stream != NULL)
1248 print_unicode_stream (filename, address, stream);
1249 return;
1250 }
1251
1252 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1253
1254 while (1)
1255 {
1256 file_ptr start;
1257 unsigned int i;
1258 long c;
1259
1260 /* See if the next `string_min' chars are all graphic chars. */
1261 tryline:
1262 start = address;
1263 for (i = 0; i < string_min; i++)
1264 {
1265 c = get_char (stream, &address, &magiccount, &magic);
1266 if (c == EOF)
1267 {
1268 free (buf);
1269 return;
1270 }
1271
1272 if (! STRING_ISGRAPHIC (c))
1273 {
1274 /* Found a non-graphic. Try again starting with next byte. */
1275 unget_part_char (c, &address, &magiccount, &magic);
1276 goto tryline;
1277 }
1278 buf[i] = c;
1279 }
1280
1281 /* We found a run of `string_min' graphic characters. Print up
1282 to the next non-graphic character. */
1283 print_filename_and_address (filename, start);
1284
1285 buf[i] = '\0';
1286 fputs (buf, stdout);
1287
1288 while (1)
1289 {
1290 c = get_char (stream, &address, &magiccount, &magic);
1291 if (c == EOF)
1292 break;
1293 if (! STRING_ISGRAPHIC (c))
1294 {
1295 unget_part_char (c, &address, &magiccount, &magic);
1296 break;
1297 }
1298 putchar (c);
1299 }
1300
1301 if (output_separator)
1302 fputs (output_separator, stdout);
1303 else
1304 putchar ('\n');
1305 }
1306 free (buf);
1307 }
1308 \f
1309 static void
1310 usage (FILE *stream, int status)
1311 {
1312 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1313 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1314 fprintf (stream, _(" The options are:\n"));
1315
1316 if (DEFAULT_STRINGS_ALL)
1317 fprintf (stream, _("\
1318 -a - --all Scan the entire file, not just the data section [default]\n\
1319 -d --data Only scan the data sections in the file\n"));
1320 else
1321 fprintf (stream, _("\
1322 -a - --all Scan the entire file, not just the data section\n\
1323 -d --data Only scan the data sections in the file [default]\n"));
1324
1325 fprintf (stream, _("\
1326 -f --print-file-name Print the name of the file before each string\n\
1327 -n <number> Locate & print any sequence of at least <number>\n\
1328 --bytes=<number> displayable characters. (The default is 4).\n\
1329 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
1330 -w --include-all-whitespace Include all whitespace as valid string characters\n\
1331 -o An alias for --radix=o\n\
1332 -T --target=<BFDNAME> Specify the binary file format\n\
1333 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1334 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1335 --unicode={default|show|invalid|hex|escape|highlight}\n\
1336 -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1337 -s --output-separator=<string> String used to separate strings in output.\n\
1338 @<file> Read options from <file>\n\
1339 -h --help Display this information\n\
1340 -v -V --version Print the program's version number\n"));
1341 list_supported_targets (program_name, stream);
1342 if (REPORT_BUGS_TO[0] && status == 0)
1343 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1344 exit (status);
1345 }