binutils/strings.c

   1 /* strings -- print the strings of printable characters in files
   2    Copyright (C) 1993-2023 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  17    02110-1301, USA.  */
  18 \f
  19 /* Usage: strings [options] file...
  20
  21    Options:
  22    --all
  23    -a
  24    -            Scan each file in its entirety.
  25
  26    --data
  27    -d           Scan only the initialized data section(s) of object files.
  28
  29    --print-file-name
  30    -f           Print the name of the file before each string.
  31
  32    --bytes=min-len
  33    -n min-len
  34    -min-len     Print graphic char sequences, MIN-LEN or more bytes long,
  35                 that are followed by a NUL or a non-displayable character.
  36                 Default is 4.
  37
  38    --radix={o,x,d}
  39    -t {o,x,d}   Print the offset within the file before each string,
  40                 in octal/hex/decimal.
  41
  42   --include-all-whitespace
  43   -w            By default tab and space are the only whitespace included in graphic
  44                 char sequences.  This option considers all of isspace() valid.
  45
  46    -o           Like -to.  (Some other implementations have -o like -to,
  47                 others like -td.  We chose one arbitrarily.)
  48
  49    --encoding={s,S,b,l,B,L}
  50    -e {s,S,b,l,B,L}
  51                 Select character encoding: 7-bit-character, 8-bit-character,
  52                 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
  53                 littleendian 32-bit.
  54
  55    --target=BFDNAME
  56    -T {bfdname}
  57                 Specify a non-default object file format.
  58
  59   --unicode={default|locale|invalid|hex|escape|highlight}
  60   -U {d|l|i|x|e|h}
  61                 Determine how to handle UTF-8 unicode characters.  The default
  62                 is no special treatment.  All other versions of this option
  63                 only apply if the encoding is valid and enabling the option
  64                 implies --encoding=S.
  65                 The 'locale' option displays the characters according to the
  66                 current locale.  The 'invalid' option treats them as
  67                 non-string characters.  The 'hex' option displays them as hex
  68                 byte sequences.  The 'escape' option displays them as escape
  69                 sequences and the 'highlight' option displays them as
  70                 coloured escape sequences.
  71
  72   --output-separator=sep_string
  73   -s sep_string String used to separate parsed strings in output.
  74                 Default is newline.
  75
  76    --help
  77    -h           Print the usage message on the standard output.
  78
  79    --version
  80    -V
  81    -v           Print the program version number.
  82
  83    Written by Richard Stallman <rms@gnu.ai.mit.edu>
  84    and David MacKenzie <djm@gnu.ai.mit.edu>.  */
  85
  86 #include "sysdep.h"
  87 #include "bfd.h"
  88 #include "getopt.h"
  89 #include "libiberty.h"
  90 #include "safe-ctype.h"
  91 #include "bucomm.h"
  92
  /* Non-zero when the NUL-terminated strings A and B are equal.  Only
     defined here if nothing included above has already provided it.  */
  #if !defined (streq)
  # define streq(a, b) (!strcmp ((a), (b)))
  #endif
  96
  /* The ways in which UTF-8 multibyte sequences can be reported, as chosen
     with -U/--unicode.  The numeric values are spelled out but are the same
     ones the compiler would assign implicitly.  */
  typedef enum unicode_display_type
  {
    unicode_default   = 0,  /* No special treatment.  */
    unicode_locale    = 1,  /* Show the character using the current locale.  */
    unicode_escape    = 2,  /* Show as an escape sequence.  */
    unicode_hex       = 3,  /* Show as a hex byte sequence.  */
    unicode_highlight = 4,  /* Show as a coloured escape sequence.  */
    unicode_invalid   = 5   /* Treat as non-string bytes.  */
  } unicode_display_type;

  /* The display mode currently in force.  */
  static unicode_display_type unicode_display = unicode_default;
 108
 /* Non-zero when C, a value in the range 0..255, may be part of a printed
    string: a tab, anything ISPRINT accepts, any byte above 127 when the
    encoding is 'S', or any ISSPACE character when include_all_whitespace
    is set.  C is evaluated several times, so it must be free of side
    effects.  */
 #define STRING_ISGRAPHIC(c)                                    \
   ((c) >= 0 && (c) <= 255                                      \
    && ((c) == '\t'                                             \
        || ISPRINT (c)                                          \
        || (encoding == 'S' && (c) > 127)                       \
        || (include_all_whitespace && ISSPACE (c))))
 115
 /* Fallback declaration for hosts whose headers do not provide errno as
    a macro.  */
 #if !defined (errno)
 extern int errno;
 #endif

 /* A section is treated as initialized data only when every one of these
    BFD section flags is set.  */
 #define DATA_FLAGS (SEC_HAS_CONTENTS | SEC_LOAD | SEC_ALLOC)
 122
 /* Option state.  All of these are assigned by main while it parses the
    command line and are read by the scanning and printing routines.  */

 /* Base in which string offsets are printed: 8, 10 or 16.  */
 static int address_radix;

 /* Shortest run of graphic characters that is reported.  */
 static unsigned int string_min;

 /* When true every ISSPACE character counts as graphic, not just tab
    and space.  */
 static bool include_all_whitespace;

 /* When true each string is preceded by its offset within the file.  */
 static bool print_addresses;

 /* When true each string is preceded by the name of its file.  */
 static bool print_filenames;

 /* When true only the data sections of object files are scanned.  */
 static bool datasection_only;

 /* Name of the BFD object file format to use.  */
 static char *target;

 /* Selected character encoding letter and the number of bytes that
    make up one character in that encoding.  */
 static char encoding;
 static int encoding_bytes;

 /* Text written after each string in place of a newline, when set.  */
 static char *output_separator;
 150
 /* Long option table handed to getopt_long.  Every entry maps onto the
    short option letter that main's switch handles; the all-zero entry
    terminates the table.  */
 static struct option long_options[] =
 {
   { .name = "all",                    .has_arg = no_argument,
     .flag = NULL, .val = 'a' },
   { .name = "bytes",                  .has_arg = required_argument,
     .flag = NULL, .val = 'n' },
   { .name = "data",                   .has_arg = no_argument,
     .flag = NULL, .val = 'd' },
   { .name = "encoding",               .has_arg = required_argument,
     .flag = NULL, .val = 'e' },
   { .name = "help",                   .has_arg = no_argument,
     .flag = NULL, .val = 'h' },
   { .name = "include-all-whitespace", .has_arg = no_argument,
     .flag = NULL, .val = 'w' },
   { .name = "output-separator",       .has_arg = required_argument,
     .flag = NULL, .val = 's' },
   { .name = "print-file-name",        .has_arg = no_argument,
     .flag = NULL, .val = 'f' },
   { .name = "radix",                  .has_arg = required_argument,
     .flag = NULL, .val = 't' },
   { .name = "target",                 .has_arg = required_argument,
     .flag = NULL, .val = 'T' },
   { .name = "unicode",                .has_arg = required_argument,
     .flag = NULL, .val = 'U' },
   { .name = "version",                .has_arg = no_argument,
     .flag = NULL, .val = 'v' },
   { .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 }
 };
 167
 168 static bool strings_file (char *);
 169 static void print_strings (const char *, FILE *, file_ptr, int, char *);
 170 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
 171 \f
 172 int main (int, char **);
 173
 174 int
 175 main (int argc, char **argv)
 176 {
 177   int optc;
 178   int exit_status = 0;
 179   bool files_given = false;
 180   char *s;
 181   int numeric_opt = 0;
 182
 183   setlocale (LC_ALL, "");
 184   bindtextdomain (PACKAGE, LOCALEDIR);
 185   textdomain (PACKAGE);
 186
 187   program_name = argv[0];
 188   xmalloc_set_program_name (program_name);
 189   bfd_set_error_program_name (program_name);
 190
 191   expandargv (&argc, &argv);
 192
 193   string_min = 4;
 194   include_all_whitespace = false;
 195   print_addresses = false;
 196   print_filenames = false;
 197   if (DEFAULT_STRINGS_ALL)
 198     datasection_only = false;
 199   else
 200     datasection_only = true;
 201   target = NULL;
 202   encoding = 's';
 203   output_separator = NULL;
 204
 205   while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
 206                               long_options, (int *) 0)) != EOF)
 207     {
 208       switch (optc)
 209         {
 210         case 'a':
 211           datasection_only = false;
 212           break;
 213
 214         case 'd':
 215           datasection_only = true;
 216           break;
 217
 218         case 'f':
 219           print_filenames = true;
 220           break;
 221
 222         case 'H':
 223         case 'h':
 224           usage (stdout, 0);
 225
 226         case 'n':
 227           string_min = (int) strtoul (optarg, &s, 0);
 228           if (s != NULL && *s != 0)
 229             fatal (_("invalid integer argument %s"), optarg);
 230           break;
 231
 232         case 'w':
 233           include_all_whitespace = true;
 234           break;
 235
 236         case 'o':
 237           print_addresses = true;
 238           address_radix = 8;
 239           break;
 240
 241         case 't':
 242           print_addresses = true;
 243           if (optarg[1] != '\0')
 244             usage (stderr, 1);
 245           switch (optarg[0])
 246             {
 247             case 'o':
 248               address_radix = 8;
 249               break;
 250
 251             case 'd':
 252               address_radix = 10;
 253               break;
 254
 255             case 'x':
 256               address_radix = 16;
 257               break;
 258
 259             default:
 260               usage (stderr, 1);
 261             }
 262           break;
 263
 264         case 'T':
 265           target = optarg;
 266           break;
 267
 268         case 'e':
 269           if (optarg[1] != '\0')
 270             usage (stderr, 1);
 271           encoding = optarg[0];
 272           break;
 273
 274         case 's':
 275           output_separator = optarg;
 276           break;
 277
 278         case 'U':
 279           if (streq (optarg, "default") || streq (optarg, "d"))
 280             unicode_display = unicode_default;
 281           else if (streq (optarg, "locale") || streq (optarg, "l"))
 282             unicode_display = unicode_locale;
 283           else if (streq (optarg, "escape") || streq (optarg, "e"))
 284             unicode_display = unicode_escape;
 285           else if (streq (optarg, "invalid") || streq (optarg, "i"))
 286             unicode_display = unicode_invalid;
 287           else if (streq (optarg, "hex") || streq (optarg, "x"))
 288             unicode_display = unicode_hex;
 289           else if (streq (optarg, "highlight") || streq (optarg, "h"))
 290             unicode_display = unicode_highlight;
 291           else
 292             fatal (_("invalid argument to -U/--unicode: %s"), optarg);
 293           break;
 294
 295         case 'V':
 296         case 'v':
 297           print_version ("strings");
 298           break;
 299
 300         case '?':
 301           usage (stderr, 1);
 302
 303         default:
 304           numeric_opt = optind;
 305           break;
 306         }
 307     }
 308
 309   if (unicode_display != unicode_default)
 310     encoding = 'S';
 311
 312   if (numeric_opt != 0)
 313     {
 314       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
 315       if (s != NULL && *s != 0)
 316         fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
 317     }
 318
 319   if (string_min < 1)
 320     fatal (_("invalid minimum string length %d"), string_min);
 321   /* PR 30595: Look for excessive minimum string lengths.
 322      The "(4 * string_min) + 1" is because this is the value
 323      used to allocate space in print_unicode_stream().  */
 324   else if (string_min == -1U || ((4 * string_min) + 1) == 0)
 325     fatal (_("minimum string length %#x is too big"), string_min);
 326
 327   switch (encoding)
 328     {
 329     case 'S':
 330     case 's':
 331       encoding_bytes = 1;
 332       break;
 333     case 'b':
 334     case 'l':
 335       encoding_bytes = 2;
 336       break;
 337     case 'B':
 338     case 'L':
 339       encoding_bytes = 4;
 340       break;
 341     default:
 342       usage (stderr, 1);
 343     }
 344
 345   if (bfd_init () != BFD_INIT_MAGIC)
 346     fatal (_("fatal error: libbfd ABI mismatch"));
 347   set_default_bfd_target ();
 348
 349   if (optind >= argc)
 350     {
 351       datasection_only = false;
 352       SET_BINARY (fileno (stdin));
 353       print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
 354       files_given = true;
 355     }
 356   else
 357     {
 358       for (; optind < argc; ++optind)
 359         {
 360           if (streq (argv[optind], "-"))
 361             datasection_only = false;
 362           else
 363             {
 364               files_given = true;
 365               exit_status |= !strings_file (argv[optind]);
 366             }
 367         }
 368     }
 369
 370   if (!files_given)
 371     usage (stderr, 1);
 372
 373   return (exit_status);
 374 }
 375 \f
 376 /* Scan section SECT of the file ABFD, whose printable name is
 377    FILENAME.  If it contains initialized data set GOT_A_SECTION and
 378    print the strings in it.  */
 379
 380 static void
 381 strings_a_section (bfd *abfd, asection *sect, const char *filename,
 382                    bool *got_a_section)
 383 {
 384   bfd_size_type sectsize;
 385   bfd_byte *mem;
 386
 387   if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
 388     return;
 389
 390   sectsize = bfd_section_size (sect);
 391   if (sectsize == 0)
 392     return;
 393
 394   if (!bfd_malloc_and_get_section (abfd, sect, &mem))
 395     {
 396       non_fatal (_("%s: Reading section %s failed: %s"),
 397                  filename, sect->name, bfd_errmsg (bfd_get_error ()));
 398       return;
 399     }
 400
 401   *got_a_section = true;
 402   print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
 403   free (mem);
 404 }
 405
 406 /* Scan all of the sections in FILE, and print the strings
 407    in the initialized data section(s).
 408
 409    Return TRUE if successful,
 410    FALSE if not (such as if FILE is not an object file).  */
 411
 412 static bool
 413 strings_object_file (const char *file)
 414 {
 415   bfd *abfd;
 416   asection *s;
 417   bool got_a_section;
 418
 419   abfd = bfd_openr (file, target);
 420
 421   if (abfd == NULL)
 422     /* Treat the file as a non-object file.  */
 423     return false;
 424
 425   /* This call is mainly for its side effect of reading in the sections.
 426      We follow the traditional behavior of `strings' in that we don't
 427      complain if we don't recognize a file to be an object file.  */
 428   if (!bfd_check_format (abfd, bfd_object))
 429     {
 430       bfd_close (abfd);
 431       return false;
 432     }
 433
 434   got_a_section = false;
 435   for (s = abfd->sections; s != NULL; s = s->next)
 436     strings_a_section (abfd, s, file, &got_a_section);
 437
 438   if (!bfd_close (abfd))
 439     {
 440       bfd_nonfatal (file);
 441       return false;
 442     }
 443
 444   return got_a_section;
 445 }
 446
 447 /* Print the strings in FILE.  Return TRUE if ok, FALSE if an error occurs.  */
 448
 449 static bool
 450 strings_file (char *file)
 451 {
 452   struct stat st;
 453
 454   /* get_file_size does not support non-S_ISREG files.  */
 455
 456   if (stat (file, &st) < 0)
 457     {
 458       if (errno == ENOENT)
 459         non_fatal (_("'%s': No such file"), file);
 460       else
 461         non_fatal (_("Warning: could not locate '%s'.  reason: %s"),
 462                    file, strerror (errno));
 463       return false;
 464     }
 465   else if (S_ISDIR (st.st_mode))
 466     {
 467       non_fatal (_("Warning: '%s' is a directory"), file);
 468       return false;
 469     }
 470
 471   /* If we weren't told to scan the whole file,
 472      try to open it as an object file and only look at
 473      initialized data sections.  If that fails, fall back to the
 474      whole file.  */
 475   if (!datasection_only || !strings_object_file (file))
 476     {
 477       FILE *stream;
 478
 479       stream = fopen (file, FOPEN_RB);
 480       if (stream == NULL)
 481         {
 482           fprintf (stderr, "%s: ", program_name);
 483           perror (file);
 484           return false;
 485         }
 486
 487       print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
 488
 489       if (fclose (stream) == EOF)
 490         {
 491           fprintf (stderr, "%s: ", program_name);
 492           perror (file);
 493           return false;
 494         }
 495     }
 496
 497   return true;
 498 }
 499 \f
 500 /* Read the next character, return EOF if none available.
 501    Assume that STREAM is positioned so that the next byte read
 502    is at address ADDRESS in the file.
 503
 504    If STREAM is NULL, do not read from it.
 505    The caller can supply a buffer of characters
 506    to be processed before the data in STREAM.
 507    MAGIC is the address of the buffer and
 508    MAGICCOUNT is how many characters are in it.  */
 509
 510 static long
 511 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
 512 {
 513   int c, i;
 514   long r = 0;
 515
 516   for (i = 0; i < encoding_bytes; i++)
 517     {
 518       if (*magiccount)
 519         {
 520           (*magiccount)--;
 521           c = *(*magic)++;
 522         }
 523       else
 524         {
 525           if (stream == NULL)
 526             return EOF;
 527
 528           /* Only use getc_unlocked if we found a declaration for it.
 529              Otherwise, libc is not thread safe by default, and we
 530              should not use it.  */
 531
 532 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 533           c = getc_unlocked (stream);
 534 #else
 535           c = getc (stream);
 536 #endif
 537           if (c == EOF)
 538             return EOF;
 539         }
 540
 541       (*address)++;
 542       r = (r << 8) | (c & 0xff);
 543     }
 544
 545   switch (encoding)
 546     {
 547     default:
 548       break;
 549     case 'l':
 550       r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
 551       break;
 552     case 'L':
 553       r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
 554            | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
 555       break;
 556     }
 557
 558   return r;
 559 }
 560
 561 /* Throw away one byte of a (possibly) multi-byte char C, updating
 562    address and buffer to suit.  */
 563
 564 static void
 565 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
 566 {
 567   static char tmp[4];
 568
 569   if (encoding_bytes > 1)
 570     {
 571       *address -= encoding_bytes - 1;
 572
 573       if (*magiccount == 0)
 574         {
 575           /* If no magic buffer exists, use temp buffer.  */
 576           switch (encoding)
 577             {
 578             default:
 579               break;
 580             case 'b':
 581               tmp[0] = c & 0xff;
 582               *magiccount = 1;
 583               break;
 584             case 'l':
 585               tmp[0] = (c >> 8) & 0xff;
 586               *magiccount = 1;
 587               break;
 588             case 'B':
 589               tmp[0] = (c >> 16) & 0xff;
 590               tmp[1] = (c >> 8) & 0xff;
 591               tmp[2] = c & 0xff;
 592               *magiccount = 3;
 593               break;
 594             case 'L':
 595               tmp[0] = (c >> 8) & 0xff;
 596               tmp[1] = (c >> 16) & 0xff;
 597               tmp[2] = (c >> 24) & 0xff;
 598               *magiccount = 3;
 599               break;
 600             }
 601           *magic = tmp;
 602         }
 603       else
 604         {
 605           /* If magic buffer exists, rewind.  */
 606           *magic -= encoding_bytes - 1;
 607           *magiccount += encoding_bytes - 1;
 608         }
 609     }
 610 }
 611
 612 static void
 613 print_filename_and_address (const char * filename, file_ptr address)
 614 {
 615   if (print_filenames)
 616     printf ("%s: ", filename);
 617
 618   if (! print_addresses)
 619     return;
 620
 621   switch (address_radix)
 622     {
 623     case 8:
 624       if (sizeof (address) > sizeof (long))
 625         {
 626 #ifndef __MSVCRT__
 627           printf ("%7llo ", (unsigned long long) address);
 628 #else
 629           printf ("%7I64o ", (unsigned long long) address);
 630 #endif
 631         }
 632       else
 633         printf ("%7lo ", (unsigned long) address);
 634       break;
 635
 636     case 10:
 637       if (sizeof (address) > sizeof (long))
 638         {
 639 #ifndef __MSVCRT__
 640           printf ("%7llu ", (unsigned long long) address);
 641 #else
 642           printf ("%7I64d ", (unsigned long long) address);
 643 #endif
 644         }
 645       else
 646         printf ("%7ld ", (long) address);
 647       break;
 648
 649     case 16:
 650       if (sizeof (address) > sizeof (long))
 651         {
 652 #ifndef __MSVCRT__
 653           printf ("%7llx ", (unsigned long long) address);
 654 #else
 655           printf ("%7I64x ", (unsigned long long) address);
 656 #endif
 657         }
 658       else
 659         printf ("%7lx ", (unsigned long) address);
 660       break;
 661     }
 662 }
 663
 /* Return non-zero if the bytes starting at BUFFER form a valid UTF-8
    encoding.  If the encoding is valid then returns the number of bytes
    it uses (2, 3 or 4).

    BUFLEN is the number of bytes available at BUFFER.  Zero is returned
    when BUFFER is NULL, when BUFLEN is zero, when the first byte is not
    a multi-byte lead byte (anything below 0xc0), when fewer bytes are
    available than the lead byte calls for, or when any following byte
    is not a continuation byte of the form 10xxxxxx.

    Note: the check is deliberately loose.  Only the 0x20 and 0x10 bits
    of the lead byte are used to choose the length, so every lead byte
    from 0xf0 upwards is treated as starting a four byte sequence, and
    overlong encodings are not rejected.  */

 static unsigned int
 is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
 {
   unsigned int len;
   unsigned int k;

   /* Do not look at buffer[0] unless it is known to exist.  */
   if (buffer == NULL || buflen == 0)
     return 0;

   if (buffer[0] < 0xc0)
     return 0;

   if ((buffer[0] & 0x20) == 0)
     len = 2;
   else if ((buffer[0] & 0x10) == 0)
     len = 3;
   else
     len = 4;

   if (buflen < len)
     return 0;

   for (k = 1; k < len; k++)
     if ((buffer[k] & 0xc0) != 0x80)
       return 0;

   return len;
 }
 699
 700 /* Display a UTF-8 encoded character in BUFFER according to the setting
 701    of unicode_display.  The character is known to be valid.
 702    Returns the number of bytes consumed.  */
 703
 704 static unsigned int
 705 display_utf8_char (const unsigned char * buffer)
 706 {
 707   unsigned int j;
 708   unsigned int utf8_len;
 709
 710   switch (buffer[0] & 0x30)
 711     {
 712     case 0x00:
 713     case 0x10:
 714       utf8_len = 2;
 715       break;
 716     case 0x20:
 717       utf8_len = 3;
 718       break;
 719     default:
 720       utf8_len = 4;
 721     }
 722
 723   switch (unicode_display)
 724     {
 725     default:
 726       fprintf (stderr, "ICE: unexpected unicode display type\n");
 727       break;
 728
 729     case unicode_escape:
 730     case unicode_highlight:
 731       if (unicode_display == unicode_highlight && isatty (1))
 732         printf ("\x1B[31;47m"); /* Red.  */
 733
 734       switch (utf8_len)
 735         {
 736         case 2:
 737           printf ("\\u%02x%02x",
 738                   ((buffer[0] & 0x1c) >> 2),
 739                   ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
 740           break;
 741
 742         case 3:
 743           printf ("\\u%02x%02x",
 744                   ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
 745                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
 746           break;
 747
 748         case 4:
 749           printf ("\\u%02x%02x%02x",
 750                   ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
 751                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
 752                   ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
 753           break;
 754         default:
 755           /* URG.  */
 756           break;
 757         }
 758
 759       if (unicode_display == unicode_highlight && isatty (1))
 760         printf ("\033[0m"); /* Default colour.  */
 761       break;
 762
 763     case unicode_hex:
 764       putchar ('<');
 765       printf ("0x");
 766       for (j = 0; j < utf8_len; j++)
 767         printf ("%02x", buffer [j]);
 768       putchar ('>');
 769       break;
 770
 771     case unicode_locale:
 772       printf ("%.1s", buffer);
 773       break;
 774     }
 775
 776   return utf8_len;
 777 }
 778
 779 /* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
 780    according to the setting of the unicode_display variable.  The buffer
 781    contains BUFLEN bytes.
 782
 783    Display the characters as if they started at ADDRESS and are contained in
 784    FILENAME.  */
 785
 786 static void
 787 print_unicode_buffer (const char *            filename,
 788                       file_ptr                address,
 789                       const unsigned char *   buffer,
 790                       unsigned long           buflen)
 791 {
 792   /* Paranoia checks...  */
 793   if (filename == NULL
 794       || buffer == NULL
 795       || unicode_display == unicode_default
 796       || encoding != 'S'
 797       || encoding_bytes != 1)
 798     {
 799       fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
 800       return;
 801     }
 802
 803   if (buflen == 0)
 804     return;
 805
 806   /* We must only display strings that are at least string_min *characters*
 807      long.  So we scan the buffer in two stages.  First we locate the start
 808      of a potential string.  Then we walk along it until we have found
 809      string_min characters.  Then we go back to the start point and start
 810      displaying characters according to the unicode_display setting.  */
 811
 812   unsigned long start_point = 0;
 813   unsigned long i = 0;
 814   unsigned int char_len = 1;
 815   unsigned int num_found = 0;
 816
 817   for (i = 0; i < buflen; i += char_len)
 818     {
 819       int c = buffer[i];
 820
 821       char_len = 1;
 822
 823       /* Find the first potential character of a string.  */
 824       if (! STRING_ISGRAPHIC (c))
 825         {
 826           num_found = 0;
 827           continue;
 828         }
 829
 830       if (c > 126)
 831         {
 832           if (c < 0xc0)
 833             {
 834               num_found = 0;
 835               continue;
 836             }
 837
 838           if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
 839             {
 840               char_len = 1;
 841               num_found = 0;
 842               continue;
 843             }
 844
 845           if (unicode_display == unicode_invalid)
 846             {
 847               /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
 848               num_found = 0;
 849               continue;
 850             }
 851         }
 852
 853       if (num_found == 0)
 854         /* We have found a potential starting point for a string.  */
 855         start_point = i;
 856
 857       ++ num_found;
 858
 859       if (num_found >= string_min)
 860         break;
 861     }
 862
 863   if (num_found < string_min)
 864     return;
 865
 866   print_filename_and_address (filename, address + start_point);
 867
 868   /* We have found string_min characters.  Display them and any
 869      more that follow.  */
 870   for (i = start_point; i < buflen; i += char_len)
 871     {
 872       int c = buffer[i];
 873
 874       char_len = 1;
 875
 876       if (! STRING_ISGRAPHIC (c))
 877         break;
 878       else if (c < 127)
 879         putchar (c);
 880       else if (! is_valid_utf8 (buffer + i, buflen - i))
 881         break;
 882       else if (unicode_display == unicode_invalid)
 883         break;
 884       else
 885         char_len = display_utf8_char (buffer + i);
 886     }
 887
 888   if (output_separator)
 889     fputs (output_separator, stdout);
 890   else
 891     putchar ('\n');
 892
 893   /* FIXME: Using tail recursion here is lazy programming...  */
 894   print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
 895 }
 896
 /* Fetch the next input byte for the unicode stream scanner.

    Bytes held in PUTBACK are handed out first, most recently stored
    first: *NUM_PUTBACK is the number of bytes held and is decremented
    for each one returned.  Once PUTBACK is empty, *NUM_READ is
    incremented and the result of reading one byte from STREAM is
    returned, which is EOF at the end of the input.  */

 static int
 get_unicode_byte (FILE *          stream,
                   unsigned char * putback,
                   unsigned int *  num_putback,
                   unsigned int *  num_read)
 {
   int byte;

   if (* num_putback != 0)
     {
       unsigned int top = * num_putback - 1;

       * num_putback = top;
       return putback [top];
     }

   /* Counted before the read, so a read that returns EOF is included.  */
   ++ (* num_read);

 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
   byte = getc_unlocked (stream);
 #else
   byte = getc (stream);
 #endif

   return byte;
 }
 917
 918 /* Helper function for print_unicode_stream.  */
 919
 920 static void
 921 print_unicode_stream_body (const char *     filename,
 922                            file_ptr         address,
 923                            FILE *           stream,
 924                            unsigned char *  putback_buf,
 925                            unsigned int     num_putback,
 926                            unsigned char *  print_buf)
 927 {
 928   /* It would be nice if we could just read the stream into a buffer
 929      and then process if with print_unicode_buffer.  But the input
 930      might be huge or it might time-locked (eg stdin).  So instead
 931      we go one byte at a time...  */
 932
 933   file_ptr start_point = 0;
 934   unsigned int num_read = 0;
 935   unsigned int num_chars = 0;
 936   unsigned int num_print = 0;
 937   int c = 0;
 938
 939   /* Find a series of string_min characters.  Put them into print_buf.  */
 940   do
 941     {
 942       if (num_chars >= string_min)
 943         break;
 944
 945       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 946       if (c == EOF)
 947         break;
 948
 949       if (! STRING_ISGRAPHIC (c))
 950         {
 951           num_chars = num_print = 0;
 952           continue;
 953         }
 954
 955       if (num_chars == 0)
 956         start_point = num_read - 1;
 957
 958       if (c < 127)
 959         {
 960           print_buf[num_print] = c;
 961           num_chars ++;
 962           num_print ++;
 963           continue;
 964         }
 965
 966       if (c < 0xc0)
 967         {
 968           num_chars = num_print = 0;
 969           continue;
 970         }
 971
 972       /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
 973       char utf8[4];
 974
 975       utf8[0] = c;
 976       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 977       if (c == EOF)
 978         break;
 979       utf8[1] = c;
 980
 981       if ((utf8[1] & 0xc0) != 0x80)
 982         {
 983           /* Invalid UTF-8.  */
 984           putback_buf[num_putback++] = utf8[1];
 985           num_chars = num_print = 0;
 986           continue;
 987         }
 988       else if ((utf8[0] & 0x20) == 0)
 989         {
 990           /* A valid 2-byte UTF-8 encoding.  */
 991           if (unicode_display == unicode_invalid)
 992             {
 993               putback_buf[num_putback++] = utf8[1];
 994               num_chars = num_print = 0;
 995             }
 996           else
 997             {
 998               print_buf[num_print ++] = utf8[0];
 999               print_buf[num_print ++] = utf8[1];
1000               num_chars ++;
1001             }
1002           continue;
1003         }
1004
1005       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1006       if (c == EOF)
1007         break;
1008       utf8[2] = c;
1009
1010       if ((utf8[2] & 0xc0) != 0x80)
1011         {
1012           /* Invalid UTF-8.  */
1013           putback_buf[num_putback++] = utf8[2];
1014           putback_buf[num_putback++] = utf8[1];
1015           num_chars = num_print = 0;
1016           continue;
1017         }
1018       else if ((utf8[0] & 0x10) == 0)
1019         {
1020           /* A valid 3-byte UTF-8 encoding.  */
1021           if (unicode_display == unicode_invalid)
1022             {
1023               putback_buf[num_putback++] = utf8[2];
1024               putback_buf[num_putback++] = utf8[1];
1025               num_chars = num_print = 0;
1026             }
1027           else
1028             {
1029               print_buf[num_print ++] = utf8[0];
1030               print_buf[num_print ++] = utf8[1];
1031               print_buf[num_print ++] = utf8[2];
1032               num_chars ++;
1033             }
1034           continue;
1035         }
1036
1037       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1038       if (c == EOF)
1039         break;
1040       utf8[3] = c;
1041
1042       if ((utf8[3] & 0xc0) != 0x80)
1043         {
1044           /* Invalid UTF-8.  */
1045           putback_buf[num_putback++] = utf8[3];
1046           putback_buf[num_putback++] = utf8[2];
1047           putback_buf[num_putback++] = utf8[1];
1048           num_chars = num_print = 0;
1049         }
1050       /* We have a valid 4-byte UTF-8 encoding.  */
1051       else if (unicode_display == unicode_invalid)
1052         {
1053           putback_buf[num_putback++] = utf8[3];
1054           putback_buf[num_putback++] = utf8[1];
1055           putback_buf[num_putback++] = utf8[2];
1056           num_chars = num_print = 0;
1057         }
1058       else
1059         {
1060           print_buf[num_print ++] = utf8[0];
1061           print_buf[num_print ++] = utf8[1];
1062           print_buf[num_print ++] = utf8[2];
1063           print_buf[num_print ++] = utf8[3];
1064           num_chars ++;
1065         }
1066     }
1067   while (1);
1068
1069   if (num_chars >= string_min)
1070     {
1071       /* We know that we have string_min valid characters in print_buf,
1072          and there may be more to come in the stream.  Start displaying
1073          them.  */
1074
1075       print_filename_and_address (filename, address + start_point);
1076
1077       unsigned int i;
1078       for (i = 0; i < num_print;)
1079         {
1080           if (print_buf[i] < 127)
1081             putchar (print_buf[i++]);
1082           else
1083             i += display_utf8_char (print_buf + i);
1084         }
1085
1086       /* OK so now we have to start read unchecked bytes.  */
1087
1088       /* Find a series of string_min characters.  Put them into print_buf.  */
1089       do
1090         {
1091           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1092           if (c == EOF)
1093             break;
1094
1095           if (! STRING_ISGRAPHIC (c))
1096             break;
1097
1098           if (c < 127)
1099             {
1100               putchar (c);
1101               continue;
1102             }
1103
1104           if (c < 0xc0)
1105             break;
1106
1107           /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
1108           unsigned char utf8[4];
1109
1110           utf8[0] = c;
1111           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1112           if (c == EOF)
1113             break;
1114           utf8[1] = c;
1115
1116           if ((utf8[1] & 0xc0) != 0x80)
1117             {
1118               /* Invalid UTF-8.  */
1119               putback_buf[num_putback++] = utf8[1];
1120               break;
1121             }
1122           else if ((utf8[0] & 0x20) == 0)
1123             {
1124               /* Valid 2-byte UTF-8.  */
1125               if (unicode_display == unicode_invalid)
1126                 {
1127                   putback_buf[num_putback++] = utf8[1];
1128                   break;
1129                 }
1130               else
1131                 {
1132                   (void) display_utf8_char (utf8);
1133                   continue;
1134                 }
1135             }
1136
1137           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1138           if (c == EOF)
1139             break;
1140           utf8[2] = c;
1141
1142           if ((utf8[2] & 0xc0) != 0x80)
1143             {
1144               /* Invalid UTF-8.  */
1145               putback_buf[num_putback++] = utf8[2];
1146               putback_buf[num_putback++] = utf8[1];
1147               break;
1148             }
1149           else if ((utf8[0] & 0x10) == 0)
1150             {
1151               /* Valid 3-byte UTF-8.  */
1152               if (unicode_display == unicode_invalid)
1153                 {
1154                   putback_buf[num_putback++] = utf8[2];
1155                   putback_buf[num_putback++] = utf8[1];
1156                   break;
1157                 }
1158               else
1159                 {
1160                   (void) display_utf8_char (utf8);
1161                   continue;
1162                 }
1163             }
1164
1165           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1166           if (c == EOF)
1167             break;
1168           utf8[3] = c;
1169
1170           if ((utf8[3] & 0xc0) != 0x80)
1171             {
1172               /* Invalid UTF-8.  */
1173               putback_buf[num_putback++] = utf8[3];
1174               putback_buf[num_putback++] = utf8[2];
1175               putback_buf[num_putback++] = utf8[1];
1176               break;
1177             }
1178           else if (unicode_display == unicode_invalid)
1179             {
1180               putback_buf[num_putback++] = utf8[3];
1181               putback_buf[num_putback++] = utf8[2];
1182               putback_buf[num_putback++] = utf8[1];
1183               break;
1184             }
1185           else
1186             /* A valid 4-byte UTF-8 encoding.  */
1187             (void) display_utf8_char (utf8);
1188         }
1189       while (1);
1190
1191       if (output_separator)
1192         fputs (output_separator, stdout);
1193       else
1194         putchar ('\n');
1195     }
1196
1197   if (c != EOF)
1198     /* FIXME: Using tail recursion here is lazy, but it works.  */
1199     print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1200 }
1201
1202 /* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
1203    encountered according to the setting of the unicode_display variable.
1204    The stream is positioned at ADDRESS and is attached to FILENAME.  */
1205
1206 static void
1207 print_unicode_stream (const char * filename,
1208                       file_ptr     address,
1209                       FILE *       stream)
1210 {
1211   /* Paranoia checks...  */
1212   if (filename == NULL
1213       || stream == NULL
1214       || unicode_display == unicode_default
1215       || encoding != 'S'
1216       || encoding_bytes != 1)
1217     {
1218       fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1219       return;
1220     }
1221
1222   /* Allocate space for string_min 4-byte utf-8 characters.  */
1223   unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1224   /* We should never have to put back more than 4 bytes.  */
1225   unsigned char putback_buf[5];
1226   unsigned int num_putback = 0;
1227
1228   print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1229   free (print_buf);
1230 }
1231 \f
1232 /* Find the strings in file FILENAME, read from STREAM.
1233    Assume that STREAM is positioned so that the next byte read
1234    is at address ADDRESS in the file.
1235
1236    If STREAM is NULL, do not read from it.
1237    The caller can supply a buffer of characters
1238    to be processed before the data in STREAM.
1239    MAGIC is the address of the buffer and
1240    MAGICCOUNT is how many characters are in it.
1241    Those characters come at address ADDRESS and the data in STREAM follow.  */
1242
1243 static void
1244 print_strings (const char *filename, FILE *stream, file_ptr address,
1245                int magiccount, char *magic)
1246 {
1247   if (unicode_display != unicode_default)
1248     {
1249       if (magic != NULL)
1250         print_unicode_buffer (filename, address,
1251                               (const unsigned char *) magic, magiccount);
1252
1253       if (stream != NULL)
1254         print_unicode_stream (filename, address, stream);
1255       return;
1256     }
1257
1258   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1259
1260   while (1)
1261     {
1262       file_ptr start;
1263       unsigned int i;
1264       long c;
1265
1266       /* See if the next `string_min' chars are all graphic chars.  */
1267     tryline:
1268       start = address;
1269       for (i = 0; i < string_min; i++)
1270         {
1271           c = get_char (stream, &address, &magiccount, &magic);
1272           if (c == EOF)
1273             {
1274               free (buf);
1275               return;
1276             }
1277
1278           if (! STRING_ISGRAPHIC (c))
1279             {
1280               /* Found a non-graphic.  Try again starting with next byte.  */
1281               unget_part_char (c, &address, &magiccount, &magic);
1282               goto tryline;
1283             }
1284           buf[i] = c;
1285         }
1286
1287       /* We found a run of `string_min' graphic characters.  Print up
1288          to the next non-graphic character.  */
1289       print_filename_and_address (filename, start);
1290
1291       buf[i] = '\0';
1292       fputs (buf, stdout);
1293
1294       while (1)
1295         {
1296           c = get_char (stream, &address, &magiccount, &magic);
1297           if (c == EOF)
1298             break;
1299           if (! STRING_ISGRAPHIC (c))
1300             {
1301               unget_part_char (c, &address, &magiccount, &magic);
1302               break;
1303             }
1304           putchar (c);
1305         }
1306
1307       if (output_separator)
1308         fputs (output_separator, stdout);
1309       else
1310         putchar ('\n');
1311     }
1312   free (buf);
1313 }
1314 \f
1315 static void
1316 usage (FILE *stream, int status)
1317 {
1318   fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1319   fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1320   fprintf (stream, _(" The options are:\n"));
1321
1322   if (DEFAULT_STRINGS_ALL)
1323     fprintf (stream, _("\
1324   -a - --all                Scan the entire file, not just the data section [default]\n\
1325   -d --data                 Only scan the data sections in the file\n"));
1326   else
1327     fprintf (stream, _("\
1328   -a - --all                Scan the entire file, not just the data section\n\
1329   -d --data                 Only scan the data sections in the file [default]\n"));
1330
1331   fprintf (stream, _("\
1332   -f --print-file-name      Print the name of the file before each string\n\
1333   -n <number>               Locate & print any sequence of at least <number>\n\
1334     --bytes=<number>         displayable characters.  (The default is 4).\n\
1335   -t --radix={o,d,x}        Print the location of the string in base 8, 10 or 16\n\
1336   -w --include-all-whitespace Include all whitespace as valid string characters\n\
1337   -o                        An alias for --radix=o\n\
1338   -T --target=<BFDNAME>     Specify the binary file format\n\
1339   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1340                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1341   --unicode={default|show|invalid|hex|escape|highlight}\n\
1342   -U {d|s|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
1343   -s --output-separator=<string> String used to separate strings in output.\n\
1344   @<file>                   Read options from <file>\n\
1345   -h --help                 Display this information\n\
1346   -v -V --version           Print the program's version number\n"));
1347   list_supported_targets (program_name, stream);
1348   if (REPORT_BUGS_TO[0] && status == 0)
1349     fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1350   exit (status);
1351 }