binutils/strings.c

   1 /* strings -- print the strings of printable characters in files
   2    Copyright (C) 1993-2023 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 3, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
  17    02110-1301, USA.  */
  18 \f
  19 /* Usage: strings [options] file...
  20
  21    Options:
  22    --all
  23    -a
  24    -            Scan each file in its entirety.
  25
  26    --data
  27    -d           Scan only the initialized data section(s) of object files.
  28
  29    --print-file-name
  30    -f           Print the name of the file before each string.
  31
  32    --bytes=min-len
  33    -n min-len
  34    -min-len     Print graphic char sequences, MIN-LEN or more bytes long,
  35                 that are followed by a NUL or a non-displayable character.
  36                 Default is 4.
  37
  38    --radix={o,x,d}
  39    -t {o,x,d}   Print the offset within the file before each string,
  40                 in octal/hex/decimal.
  41
  42   --include-all-whitespace
  43   -w            By default tab and space are the only whitespace included in graphic
  44                 char sequences.  This option considers all of isspace() valid.
  45
  46    -o           Like -to.  (Some other implementations have -o like -to,
  47                 others like -td.  We chose one arbitrarily.)
  48
  49    --encoding={s,S,b,l,B,L}
  50    -e {s,S,b,l,B,L}
  51                 Select character encoding: 7-bit-character, 8-bit-character,
  52                 bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
  53                 littleendian 32-bit.
  54
  55    --target=BFDNAME
  56    -T {bfdname}
  57                 Specify a non-default object file format.
  58
  59   --unicode={default|locale|invalid|hex|escape|highlight}
  60   -U {d|l|i|x|e|h}
  61                 Determine how to handle UTF-8 unicode characters.  The default
  62                 is no special treatment.  All other versions of this option
  63                 only apply if the encoding is valid and enabling the option
  64                 implies --encoding=S.
  65                 The 'locale' option displays the characters according to the
  66                 current locale.  The 'invalid' option treats them as
  67                 non-string characters.  The 'hex' option displays them as hex
  68                 byte sequences.  The 'escape' option displays them as escape
  69                 sequences and the 'highlight' option displays them as
  70                 coloured escape sequences.
  71
  72   --output-separator=sep_string
  73   -s sep_string String used to separate parsed strings in output.
  74                 Default is newline.
  75
  76    --help
  77    -h           Print the usage message on the standard output.
  78
  79    --version
  80    -V
  81    -v           Print the program version number.
  82
  83    Written by Richard Stallman <rms@gnu.ai.mit.edu>
  84    and David MacKenzie <djm@gnu.ai.mit.edu>.  */
  85
  86 #include "sysdep.h"
  87 #include "bfd.h"
  88 #include "getopt.h"
  89 #include "libiberty.h"
  90 #include "safe-ctype.h"
  91 #include "bucomm.h"
  92
  93 #ifndef streq
  94 #define streq(a,b) (strcmp ((a),(b)) == 0)
  95 #endif
  96
  97 typedef enum unicode_display_type
  98 {
  99   unicode_default = 0,
 100   unicode_locale,
 101   unicode_escape,
 102   unicode_hex,
 103   unicode_highlight,
 104   unicode_invalid
 105 } unicode_display_type;
 106
 107 static unicode_display_type unicode_display = unicode_default;
 108
 109 #define STRING_ISGRAPHIC(c) \
 110       (   (c) >= 0 \
 111        && (c) <= 255 \
 112        && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \
 113            || (include_all_whitespace && ISSPACE (c))) \
 114       )
 115
 116 #ifndef errno
 117 extern int errno;
 118 #endif
 119
 120 /* The BFD section flags that identify an initialized data section.  */
 121 #define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)
 122
 123 /* Radix for printing addresses (must be 8, 10 or 16).  */
 124 static int address_radix;
 125
 126 /* Minimum length of sequence of graphic chars to trigger output.  */
 127 static unsigned int string_min;
 128
 129 /* Whether or not we include all whitespace as a graphic char.   */
 130 static bool include_all_whitespace;
 131
 132 /* TRUE means print address within file for each string.  */
 133 static bool print_addresses;
 134
 135 /* TRUE means print filename for each string.  */
 136 static bool print_filenames;
 137
 138 /* TRUE means for object files scan only the data section.  */
 139 static bool datasection_only;
 140
 141 /* The BFD object file format.  */
 142 static char *target;
 143
 144 /* The character encoding format.  */
 145 static char encoding;
 146 static int encoding_bytes;
 147
 148 /* Output string used to separate parsed strings  */
 149 static char *output_separator;
 150
 151 static struct option long_options[] =
 152 {
 153   {"all", no_argument, NULL, 'a'},
 154   {"bytes", required_argument, NULL, 'n'},
 155   {"data", no_argument, NULL, 'd'},
 156   {"encoding", required_argument, NULL, 'e'},
 157   {"help", no_argument, NULL, 'h'},
 158   {"include-all-whitespace", no_argument, NULL, 'w'},
 159   {"output-separator", required_argument, NULL, 's'},
 160   {"print-file-name", no_argument, NULL, 'f'},
 161   {"radix", required_argument, NULL, 't'},
 162   {"target", required_argument, NULL, 'T'},
 163   {"unicode", required_argument, NULL, 'U'},
 164   {"version", no_argument, NULL, 'v'},
 165   {NULL, 0, NULL, 0}
 166 };
 167
 168 static bool strings_file (char *);
 169 static void print_strings (const char *, FILE *, file_ptr, int, char *);
 170 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
 171 \f
 172 int main (int, char **);
 173
 174 static void
 175 set_string_min (const char * arg)
 176 {
 177   char *s;
 178   unsigned long l = strtoul (arg, &s, 0);
 179
 180   if (s != NULL && *s != 0)
 181     fatal (_("invalid integer argument %s"), arg);
 182
 183   string_min = (unsigned int) l;
 184
 185   if (l != (unsigned long) string_min)
 186     fatal (_("minimum string length is too big: %s"), arg);
 187
 188   if (string_min < 1)
 189     fatal (_("minimum string length is too small: %s"), arg);
 190
 191   /* PR 30595: Look for minimum string lengths that overflow an 'int'.  */
 192   if (string_min + 1 == 0)
 193     fatal (_("minimum string length %s is too big"), arg);
 194
 195   /* FIXME: Should we warn for unreasonably large minimum
 196      string lengths, even if technically they will work ?  */
 197 }
 198
 199 int
 200 main (int argc, char **argv)
 201 {
 202   int optc;
 203   int exit_status = 0;
 204   bool files_given = false;
 205   int numeric_opt = 0;
 206
 207   setlocale (LC_ALL, "");
 208   bindtextdomain (PACKAGE, LOCALEDIR);
 209   textdomain (PACKAGE);
 210
 211   program_name = argv[0];
 212   xmalloc_set_program_name (program_name);
 213   bfd_set_error_program_name (program_name);
 214
 215   expandargv (&argc, &argv);
 216
 217   string_min = 4;
 218   include_all_whitespace = false;
 219   print_addresses = false;
 220   print_filenames = false;
 221   if (DEFAULT_STRINGS_ALL)
 222     datasection_only = false;
 223   else
 224     datasection_only = true;
 225   target = NULL;
 226   encoding = 's';
 227   output_separator = NULL;
 228
 229   while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
 230                               long_options, (int *) 0)) != EOF)
 231     {
 232       switch (optc)
 233         {
 234         case 'a':
 235           datasection_only = false;
 236           break;
 237
 238         case 'd':
 239           datasection_only = true;
 240           break;
 241
 242         case 'f':
 243           print_filenames = true;
 244           break;
 245
 246         case 'H':
 247         case 'h':
 248           usage (stdout, 0);
 249
 250         case 'n':
 251           set_string_min (optarg);
 252           break;
 253
 254         case 'w':
 255           include_all_whitespace = true;
 256           break;
 257
 258         case 'o':
 259           print_addresses = true;
 260           address_radix = 8;
 261           break;
 262
 263         case 't':
 264           print_addresses = true;
 265           if (optarg[1] != '\0')
 266             usage (stderr, 1);
 267           switch (optarg[0])
 268             {
 269             case 'o':
 270               address_radix = 8;
 271               break;
 272
 273             case 'd':
 274               address_radix = 10;
 275               break;
 276
 277             case 'x':
 278               address_radix = 16;
 279               break;
 280
 281             default:
 282               usage (stderr, 1);
 283             }
 284           break;
 285
 286         case 'T':
 287           target = optarg;
 288           break;
 289
 290         case 'e':
 291           if (optarg[1] != '\0')
 292             usage (stderr, 1);
 293           encoding = optarg[0];
 294           break;
 295
 296         case 's':
 297           output_separator = optarg;
 298           break;
 299
 300         case 'U':
 301           if (streq (optarg, "default") || streq (optarg, "d"))
 302             unicode_display = unicode_default;
 303           else if (streq (optarg, "locale") || streq (optarg, "l"))
 304             unicode_display = unicode_locale;
 305           else if (streq (optarg, "escape") || streq (optarg, "e"))
 306             unicode_display = unicode_escape;
 307           else if (streq (optarg, "invalid") || streq (optarg, "i"))
 308             unicode_display = unicode_invalid;
 309           else if (streq (optarg, "hex") || streq (optarg, "x"))
 310             unicode_display = unicode_hex;
 311           else if (streq (optarg, "highlight") || streq (optarg, "h"))
 312             unicode_display = unicode_highlight;
 313           else
 314             fatal (_("invalid argument to -U/--unicode: %s"), optarg);
 315           break;
 316
 317         case 'V':
 318         case 'v':
 319           print_version ("strings");
 320           break;
 321
 322         case '?':
 323           usage (stderr, 1);
 324
 325         default:
 326           numeric_opt = optind;
 327           break;
 328         }
 329     }
 330
 331   if (unicode_display != unicode_default)
 332     encoding = 'S';
 333
 334   if (numeric_opt != 0)
 335     set_string_min (argv[numeric_opt - 1] + 1);
 336
 337   switch (encoding)
 338     {
 339     case 'S':
 340     case 's':
 341       encoding_bytes = 1;
 342       break;
 343     case 'b':
 344     case 'l':
 345       encoding_bytes = 2;
 346       break;
 347     case 'B':
 348     case 'L':
 349       encoding_bytes = 4;
 350       break;
 351     default:
 352       usage (stderr, 1);
 353     }
 354
 355   if (bfd_init () != BFD_INIT_MAGIC)
 356     fatal (_("fatal error: libbfd ABI mismatch"));
 357   set_default_bfd_target ();
 358
 359   if (optind >= argc)
 360     {
 361       datasection_only = false;
 362       SET_BINARY (fileno (stdin));
 363       print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
 364       files_given = true;
 365     }
 366   else
 367     {
 368       for (; optind < argc; ++optind)
 369         {
 370           if (streq (argv[optind], "-"))
 371             datasection_only = false;
 372           else
 373             {
 374               files_given = true;
 375               exit_status |= !strings_file (argv[optind]);
 376             }
 377         }
 378     }
 379
 380   if (!files_given)
 381     usage (stderr, 1);
 382
 383   return (exit_status);
 384 }
 385 \f
 386 /* Scan section SECT of the file ABFD, whose printable name is
 387    FILENAME.  If it contains initialized data set GOT_A_SECTION and
 388    print the strings in it.  */
 389
 390 static void
 391 strings_a_section (bfd *abfd, asection *sect, const char *filename,
 392                    bool *got_a_section)
 393 {
 394   bfd_size_type sectsize;
 395   bfd_byte *mem;
 396
 397   if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
 398     return;
 399
 400   sectsize = bfd_section_size (sect);
 401   if (sectsize == 0)
 402     return;
 403
 404   if (!bfd_malloc_and_get_section (abfd, sect, &mem))
 405     {
 406       non_fatal (_("%s: Reading section %s failed: %s"),
 407                  filename, sect->name, bfd_errmsg (bfd_get_error ()));
 408       return;
 409     }
 410
 411   *got_a_section = true;
 412   print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
 413   free (mem);
 414 }
 415
 416 /* Scan all of the sections in FILE, and print the strings
 417    in the initialized data section(s).
 418
 419    Return TRUE if successful,
 420    FALSE if not (such as if FILE is not an object file).  */
 421
 422 static bool
 423 strings_object_file (const char *file)
 424 {
 425   bfd *abfd;
 426   asection *s;
 427   bool got_a_section;
 428
 429   abfd = bfd_openr (file, target);
 430
 431   if (abfd == NULL)
 432     /* Treat the file as a non-object file.  */
 433     return false;
 434
 435   /* This call is mainly for its side effect of reading in the sections.
 436      We follow the traditional behavior of `strings' in that we don't
 437      complain if we don't recognize a file to be an object file.  */
 438   if (!bfd_check_format (abfd, bfd_object))
 439     {
 440       bfd_close (abfd);
 441       return false;
 442     }
 443
 444   got_a_section = false;
 445   for (s = abfd->sections; s != NULL; s = s->next)
 446     strings_a_section (abfd, s, file, &got_a_section);
 447
 448   if (!bfd_close (abfd))
 449     {
 450       bfd_nonfatal (file);
 451       return false;
 452     }
 453
 454   return got_a_section;
 455 }
 456
 457 /* Print the strings in FILE.  Return TRUE if ok, FALSE if an error occurs.  */
 458
 459 static bool
 460 strings_file (char *file)
 461 {
 462   struct stat st;
 463
 464   /* get_file_size does not support non-S_ISREG files.  */
 465
 466   if (stat (file, &st) < 0)
 467     {
 468       if (errno == ENOENT)
 469         non_fatal (_("'%s': No such file"), file);
 470       else
 471         non_fatal (_("Warning: could not locate '%s'.  reason: %s"),
 472                    file, strerror (errno));
 473       return false;
 474     }
 475   else if (S_ISDIR (st.st_mode))
 476     {
 477       non_fatal (_("Warning: '%s' is a directory"), file);
 478       return false;
 479     }
 480
 481   /* If we weren't told to scan the whole file,
 482      try to open it as an object file and only look at
 483      initialized data sections.  If that fails, fall back to the
 484      whole file.  */
 485   if (!datasection_only || !strings_object_file (file))
 486     {
 487       FILE *stream;
 488
 489       stream = fopen (file, FOPEN_RB);
 490       if (stream == NULL)
 491         {
 492           fprintf (stderr, "%s: ", program_name);
 493           perror (file);
 494           return false;
 495         }
 496
 497       print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
 498
 499       if (fclose (stream) == EOF)
 500         {
 501           fprintf (stderr, "%s: ", program_name);
 502           perror (file);
 503           return false;
 504         }
 505     }
 506
 507   return true;
 508 }
 509 \f
 510 /* Read the next character, return EOF if none available.
 511    Assume that STREAM is positioned so that the next byte read
 512    is at address ADDRESS in the file.
 513
 514    If STREAM is NULL, do not read from it.
 515    The caller can supply a buffer of characters
 516    to be processed before the data in STREAM.
 517    MAGIC is the address of the buffer and
 518    MAGICCOUNT is how many characters are in it.  */
 519
 520 static long
 521 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
 522 {
 523   int c, i;
 524   long r = 0;
 525
 526   for (i = 0; i < encoding_bytes; i++)
 527     {
 528       if (*magiccount)
 529         {
 530           (*magiccount)--;
 531           c = *(*magic)++;
 532         }
 533       else
 534         {
 535           if (stream == NULL)
 536             return EOF;
 537
 538           /* Only use getc_unlocked if we found a declaration for it.
 539              Otherwise, libc is not thread safe by default, and we
 540              should not use it.  */
 541
 542 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 543           c = getc_unlocked (stream);
 544 #else
 545           c = getc (stream);
 546 #endif
 547           if (c == EOF)
 548             return EOF;
 549         }
 550
 551       (*address)++;
 552       r = (r << 8) | (c & 0xff);
 553     }
 554
 555   switch (encoding)
 556     {
 557     default:
 558       break;
 559     case 'l':
 560       r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
 561       break;
 562     case 'L':
 563       r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
 564            | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
 565       break;
 566     }
 567
 568   return r;
 569 }
 570
 571 /* Throw away one byte of a (possibly) multi-byte char C, updating
 572    address and buffer to suit.  */
 573
 574 static void
 575 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
 576 {
 577   static char tmp[4];
 578
 579   if (encoding_bytes > 1)
 580     {
 581       *address -= encoding_bytes - 1;
 582
 583       if (*magiccount == 0)
 584         {
 585           /* If no magic buffer exists, use temp buffer.  */
 586           switch (encoding)
 587             {
 588             default:
 589               break;
 590             case 'b':
 591               tmp[0] = c & 0xff;
 592               *magiccount = 1;
 593               break;
 594             case 'l':
 595               tmp[0] = (c >> 8) & 0xff;
 596               *magiccount = 1;
 597               break;
 598             case 'B':
 599               tmp[0] = (c >> 16) & 0xff;
 600               tmp[1] = (c >> 8) & 0xff;
 601               tmp[2] = c & 0xff;
 602               *magiccount = 3;
 603               break;
 604             case 'L':
 605               tmp[0] = (c >> 8) & 0xff;
 606               tmp[1] = (c >> 16) & 0xff;
 607               tmp[2] = (c >> 24) & 0xff;
 608               *magiccount = 3;
 609               break;
 610             }
 611           *magic = tmp;
 612         }
 613       else
 614         {
 615           /* If magic buffer exists, rewind.  */
 616           *magic -= encoding_bytes - 1;
 617           *magiccount += encoding_bytes - 1;
 618         }
 619     }
 620 }
 621
 622 static void
 623 print_filename_and_address (const char * filename, file_ptr address)
 624 {
 625   if (print_filenames)
 626     printf ("%s: ", filename);
 627
 628   if (! print_addresses)
 629     return;
 630
 631   switch (address_radix)
 632     {
 633     case 8:
 634       if (sizeof (address) > sizeof (long))
 635         {
 636 #ifndef __MSVCRT__
 637           printf ("%7llo ", (unsigned long long) address);
 638 #else
 639           printf ("%7I64o ", (unsigned long long) address);
 640 #endif
 641         }
 642       else
 643         printf ("%7lo ", (unsigned long) address);
 644       break;
 645
 646     case 10:
 647       if (sizeof (address) > sizeof (long))
 648         {
 649 #ifndef __MSVCRT__
 650           printf ("%7llu ", (unsigned long long) address);
 651 #else
 652           printf ("%7I64d ", (unsigned long long) address);
 653 #endif
 654         }
 655       else
 656         printf ("%7ld ", (long) address);
 657       break;
 658
 659     case 16:
 660       if (sizeof (address) > sizeof (long))
 661         {
 662 #ifndef __MSVCRT__
 663           printf ("%7llx ", (unsigned long long) address);
 664 #else
 665           printf ("%7I64x ", (unsigned long long) address);
 666 #endif
 667         }
 668       else
 669         printf ("%7lx ", (unsigned long) address);
 670       break;
 671     }
 672 }
 673
 674 /* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
 675    If the encoding is valid then returns the number of bytes it uses.  */
 676
 677 static unsigned int
 678 is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
 679 {
 680   if (buffer[0] < 0xc0)
 681     return 0;
 682
 683   if (buflen < 2)
 684     return 0;
 685
 686   if ((buffer[1] & 0xc0) != 0x80)
 687     return 0;
 688
 689   if ((buffer[0] & 0x20) == 0)
 690     return 2;
 691
 692   if (buflen < 3)
 693     return 0;
 694
 695   if ((buffer[2] & 0xc0) != 0x80)
 696     return 0;
 697
 698   if ((buffer[0] & 0x10) == 0)
 699     return 3;
 700
 701   if (buflen < 4)
 702     return 0;
 703
 704   if ((buffer[3] & 0xc0) != 0x80)
 705     return 0;
 706
 707   return 4;
 708 }
 709
 710 /* Display a UTF-8 encoded character in BUFFER according to the setting
 711    of unicode_display.  The character is known to be valid.
 712    Returns the number of bytes consumed.  */
 713
 714 static unsigned int
 715 display_utf8_char (const unsigned char * buffer)
 716 {
 717   unsigned int j;
 718   unsigned int utf8_len;
 719
 720   switch (buffer[0] & 0x30)
 721     {
 722     case 0x00:
 723     case 0x10:
 724       utf8_len = 2;
 725       break;
 726     case 0x20:
 727       utf8_len = 3;
 728       break;
 729     default:
 730       utf8_len = 4;
 731     }
 732
 733   switch (unicode_display)
 734     {
 735     default:
 736       fprintf (stderr, "ICE: unexpected unicode display type\n");
 737       break;
 738
 739     case unicode_escape:
 740     case unicode_highlight:
 741       if (unicode_display == unicode_highlight && isatty (1))
 742         printf ("\x1B[31;47m"); /* Red.  */
 743
 744       switch (utf8_len)
 745         {
 746         case 2:
 747           printf ("\\u%02x%02x",
 748                   ((buffer[0] & 0x1c) >> 2),
 749                   ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
 750           break;
 751
 752         case 3:
 753           printf ("\\u%02x%02x",
 754                   ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
 755                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
 756           break;
 757
 758         case 4:
 759           printf ("\\u%02x%02x%02x",
 760                   ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
 761                   ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
 762                   ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
 763           break;
 764         default:
 765           /* URG.  */
 766           break;
 767         }
 768
 769       if (unicode_display == unicode_highlight && isatty (1))
 770         printf ("\033[0m"); /* Default colour.  */
 771       break;
 772
 773     case unicode_hex:
 774       putchar ('<');
 775       printf ("0x");
 776       for (j = 0; j < utf8_len; j++)
 777         printf ("%02x", buffer [j]);
 778       putchar ('>');
 779       break;
 780
 781     case unicode_locale:
 782       printf ("%.1s", buffer);
 783       break;
 784     }
 785
 786   return utf8_len;
 787 }
 788
 789 /* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
 790    according to the setting of the unicode_display variable.  The buffer
 791    contains BUFLEN bytes.
 792
 793    Display the characters as if they started at ADDRESS and are contained in
 794    FILENAME.  */
 795
 796 static void
 797 print_unicode_buffer (const char *            filename,
 798                       file_ptr                address,
 799                       const unsigned char *   buffer,
 800                       unsigned long           buflen)
 801 {
 802   /* Paranoia checks...  */
 803   if (filename == NULL
 804       || buffer == NULL
 805       || unicode_display == unicode_default
 806       || encoding != 'S'
 807       || encoding_bytes != 1)
 808     {
 809       fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
 810       return;
 811     }
 812
 813   if (buflen == 0)
 814     return;
 815
 816   /* We must only display strings that are at least string_min *characters*
 817      long.  So we scan the buffer in two stages.  First we locate the start
 818      of a potential string.  Then we walk along it until we have found
 819      string_min characters.  Then we go back to the start point and start
 820      displaying characters according to the unicode_display setting.  */
 821
 822   unsigned long start_point = 0;
 823   unsigned long i = 0;
 824   unsigned int char_len = 1;
 825   unsigned int num_found = 0;
 826
 827   for (i = 0; i < buflen; i += char_len)
 828     {
 829       int c = buffer[i];
 830
 831       char_len = 1;
 832
 833       /* Find the first potential character of a string.  */
 834       if (! STRING_ISGRAPHIC (c))
 835         {
 836           num_found = 0;
 837           continue;
 838         }
 839
 840       if (c > 126)
 841         {
 842           if (c < 0xc0)
 843             {
 844               num_found = 0;
 845               continue;
 846             }
 847
 848           if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
 849             {
 850               char_len = 1;
 851               num_found = 0;
 852               continue;
 853             }
 854
 855           if (unicode_display == unicode_invalid)
 856             {
 857               /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
 858               num_found = 0;
 859               continue;
 860             }
 861         }
 862
 863       if (num_found == 0)
 864         /* We have found a potential starting point for a string.  */
 865         start_point = i;
 866
 867       ++ num_found;
 868
 869       if (num_found >= string_min)
 870         break;
 871     }
 872
 873   if (num_found < string_min)
 874     return;
 875
 876   print_filename_and_address (filename, address + start_point);
 877
 878   /* We have found string_min characters.  Display them and any
 879      more that follow.  */
 880   for (i = start_point; i < buflen; i += char_len)
 881     {
 882       int c = buffer[i];
 883
 884       char_len = 1;
 885
 886       if (! STRING_ISGRAPHIC (c))
 887         break;
 888       else if (c < 127)
 889         putchar (c);
 890       else if (! is_valid_utf8 (buffer + i, buflen - i))
 891         break;
 892       else if (unicode_display == unicode_invalid)
 893         break;
 894       else
 895         char_len = display_utf8_char (buffer + i);
 896     }
 897
 898   if (output_separator)
 899     fputs (output_separator, stdout);
 900   else
 901     putchar ('\n');
 902
 903   /* FIXME: Using tail recursion here is lazy programming...  */
 904   print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
 905 }
 906
 907 static int
 908 get_unicode_byte (FILE *          stream,
 909                   unsigned char * putback,
 910                   unsigned int *  num_putback,
 911                   unsigned int *  num_read)
 912 {
 913   if (* num_putback > 0)
 914     {
 915       * num_putback = * num_putback - 1;
 916       return putback [* num_putback];
 917     }
 918
 919   * num_read = * num_read + 1;
 920
 921 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
 922   return getc_unlocked (stream);
 923 #else
 924   return getc (stream);
 925 #endif
 926 }
 927
 928 /* Helper function for print_unicode_stream.  */
 929
 930 static void
 931 print_unicode_stream_body (const char *     filename,
 932                            file_ptr         address,
 933                            FILE *           stream,
 934                            unsigned char *  putback_buf,
 935                            unsigned int     num_putback,
 936                            unsigned char *  print_buf)
 937 {
 938   /* It would be nice if we could just read the stream into a buffer
  939      and then process it with print_unicode_buffer.  But the input
  940      might be huge or it might be time-locked (eg stdin).  So instead
 941      we go one byte at a time...  */
 942
 943   file_ptr start_point = 0;
 944   unsigned int num_read = 0;
 945   unsigned int num_chars = 0;
 946   unsigned int num_print = 0;
 947   int c = 0;
 948
 949   /* Find a series of string_min characters.  Put them into print_buf.  */
 950   do
 951     {
 952       if (num_chars >= string_min)
 953         break;
 954
 955       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 956       if (c == EOF)
 957         break;
 958
 959       if (! STRING_ISGRAPHIC (c))
 960         {
 961           num_chars = num_print = 0;
 962           continue;
 963         }
 964
 965       if (num_chars == 0)
 966         start_point = num_read - 1;
 967
 968       if (c < 127)
 969         {
 970           print_buf[num_print] = c;
 971           num_chars ++;
 972           num_print ++;
 973           continue;
 974         }
 975
 976       if (c < 0xc0)
 977         {
 978           num_chars = num_print = 0;
 979           continue;
 980         }
 981
 982       /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
 983       char utf8[4];
 984
 985       utf8[0] = c;
 986       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
 987       if (c == EOF)
 988         break;
 989       utf8[1] = c;
 990
 991       if ((utf8[1] & 0xc0) != 0x80)
 992         {
 993           /* Invalid UTF-8.  */
 994           putback_buf[num_putback++] = utf8[1];
 995           num_chars = num_print = 0;
 996           continue;
 997         }
 998       else if ((utf8[0] & 0x20) == 0)
 999         {
1000           /* A valid 2-byte UTF-8 encoding.  */
1001           if (unicode_display == unicode_invalid)
1002             {
1003               putback_buf[num_putback++] = utf8[1];
1004               num_chars = num_print = 0;
1005             }
1006           else
1007             {
1008               print_buf[num_print ++] = utf8[0];
1009               print_buf[num_print ++] = utf8[1];
1010               num_chars ++;
1011             }
1012           continue;
1013         }
1014
1015       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1016       if (c == EOF)
1017         break;
1018       utf8[2] = c;
1019
1020       if ((utf8[2] & 0xc0) != 0x80)
1021         {
1022           /* Invalid UTF-8.  */
1023           putback_buf[num_putback++] = utf8[2];
1024           putback_buf[num_putback++] = utf8[1];
1025           num_chars = num_print = 0;
1026           continue;
1027         }
1028       else if ((utf8[0] & 0x10) == 0)
1029         {
1030           /* A valid 3-byte UTF-8 encoding.  */
1031           if (unicode_display == unicode_invalid)
1032             {
1033               putback_buf[num_putback++] = utf8[2];
1034               putback_buf[num_putback++] = utf8[1];
1035               num_chars = num_print = 0;
1036             }
1037           else
1038             {
1039               print_buf[num_print ++] = utf8[0];
1040               print_buf[num_print ++] = utf8[1];
1041               print_buf[num_print ++] = utf8[2];
1042               num_chars ++;
1043             }
1044           continue;
1045         }
1046
1047       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1048       if (c == EOF)
1049         break;
1050       utf8[3] = c;
1051
1052       if ((utf8[3] & 0xc0) != 0x80)
1053         {
1054           /* Invalid UTF-8.  */
1055           putback_buf[num_putback++] = utf8[3];
1056           putback_buf[num_putback++] = utf8[2];
1057           putback_buf[num_putback++] = utf8[1];
1058           num_chars = num_print = 0;
1059         }
1060       /* We have a valid 4-byte UTF-8 encoding.  */
1061       else if (unicode_display == unicode_invalid)
1062         {
1063           putback_buf[num_putback++] = utf8[3];
1064           putback_buf[num_putback++] = utf8[1];
1065           putback_buf[num_putback++] = utf8[2];
1066           num_chars = num_print = 0;
1067         }
1068       else
1069         {
1070           print_buf[num_print ++] = utf8[0];
1071           print_buf[num_print ++] = utf8[1];
1072           print_buf[num_print ++] = utf8[2];
1073           print_buf[num_print ++] = utf8[3];
1074           num_chars ++;
1075         }
1076     }
1077   while (1);
1078
1079   if (num_chars >= string_min)
1080     {
1081       /* We know that we have string_min valid characters in print_buf,
1082          and there may be more to come in the stream.  Start displaying
1083          them.  */
1084
1085       print_filename_and_address (filename, address + start_point);
1086
1087       unsigned int i;
1088       for (i = 0; i < num_print;)
1089         {
1090           if (print_buf[i] < 127)
1091             putchar (print_buf[i++]);
1092           else
1093             i += display_utf8_char (print_buf + i);
1094         }
1095
1096       /* OK so now we have to start read unchecked bytes.  */
1097
1098       /* Find a series of string_min characters.  Put them into print_buf.  */
1099       do
1100         {
1101           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1102           if (c == EOF)
1103             break;
1104
1105           if (! STRING_ISGRAPHIC (c))
1106             break;
1107
1108           if (c < 127)
1109             {
1110               putchar (c);
1111               continue;
1112             }
1113
1114           if (c < 0xc0)
1115             break;
1116
1117           /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
1118           unsigned char utf8[4];
1119
1120           utf8[0] = c;
1121           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1122           if (c == EOF)
1123             break;
1124           utf8[1] = c;
1125
1126           if ((utf8[1] & 0xc0) != 0x80)
1127             {
1128               /* Invalid UTF-8.  */
1129               putback_buf[num_putback++] = utf8[1];
1130               break;
1131             }
1132           else if ((utf8[0] & 0x20) == 0)
1133             {
1134               /* Valid 2-byte UTF-8.  */
1135               if (unicode_display == unicode_invalid)
1136                 {
1137                   putback_buf[num_putback++] = utf8[1];
1138                   break;
1139                 }
1140               else
1141                 {
1142                   (void) display_utf8_char (utf8);
1143                   continue;
1144                 }
1145             }
1146
1147           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1148           if (c == EOF)
1149             break;
1150           utf8[2] = c;
1151
1152           if ((utf8[2] & 0xc0) != 0x80)
1153             {
1154               /* Invalid UTF-8.  */
1155               putback_buf[num_putback++] = utf8[2];
1156               putback_buf[num_putback++] = utf8[1];
1157               break;
1158             }
1159           else if ((utf8[0] & 0x10) == 0)
1160             {
1161               /* Valid 3-byte UTF-8.  */
1162               if (unicode_display == unicode_invalid)
1163                 {
1164                   putback_buf[num_putback++] = utf8[2];
1165                   putback_buf[num_putback++] = utf8[1];
1166                   break;
1167                 }
1168               else
1169                 {
1170                   (void) display_utf8_char (utf8);
1171                   continue;
1172                 }
1173             }
1174
1175           c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1176           if (c == EOF)
1177             break;
1178           utf8[3] = c;
1179
1180           if ((utf8[3] & 0xc0) != 0x80)
1181             {
1182               /* Invalid UTF-8.  */
1183               putback_buf[num_putback++] = utf8[3];
1184               putback_buf[num_putback++] = utf8[2];
1185               putback_buf[num_putback++] = utf8[1];
1186               break;
1187             }
1188           else if (unicode_display == unicode_invalid)
1189             {
1190               putback_buf[num_putback++] = utf8[3];
1191               putback_buf[num_putback++] = utf8[2];
1192               putback_buf[num_putback++] = utf8[1];
1193               break;
1194             }
1195           else
1196             /* A valid 4-byte UTF-8 encoding.  */
1197             (void) display_utf8_char (utf8);
1198         }
1199       while (1);
1200
1201       if (output_separator)
1202         fputs (output_separator, stdout);
1203       else
1204         putchar ('\n');
1205     }
1206
1207   if (c != EOF)
1208     /* FIXME: Using tail recursion here is lazy, but it works.  */
1209     print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1210 }
1211
1212 /* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
1213    encountered according to the setting of the unicode_display variable.
1214    The stream is positioned at ADDRESS and is attached to FILENAME.  */
1215
1216 static void
1217 print_unicode_stream (const char * filename,
1218                       file_ptr     address,
1219                       FILE *       stream)
1220 {
1221   /* Paranoia checks...  */
1222   if (filename == NULL
1223       || stream == NULL
1224       || unicode_display == unicode_default
1225       || encoding != 'S'
1226       || encoding_bytes != 1)
1227     {
1228       fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1229       return;
1230     }
1231
1232   /* Allocate space for string_min 4-byte utf-8 characters.  */
1233   size_t amt = string_min;
1234   amt = (4 * amt) + 1;
1235   unsigned char * print_buf = xmalloc (amt);
1236   /* We should never have to put back more than 4 bytes.  */
1237   unsigned char putback_buf[5];
1238   unsigned int num_putback = 0;
1239
1240   print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1241   free (print_buf);
1242 }
1243 \f
1244 /* Find the strings in file FILENAME, read from STREAM.
1245    Assume that STREAM is positioned so that the next byte read
1246    is at address ADDRESS in the file.
1247
1248    If STREAM is NULL, do not read from it.
1249    The caller can supply a buffer of characters
1250    to be processed before the data in STREAM.
1251    MAGIC is the address of the buffer and
1252    MAGICCOUNT is how many characters are in it.
1253    Those characters come at address ADDRESS and the data in STREAM follow.  */
1254
1255 static void
1256 print_strings (const char *filename, FILE *stream, file_ptr address,
1257                int magiccount, char *magic)
1258 {
1259   if (unicode_display != unicode_default)
1260     {
1261       if (magic != NULL)
1262         print_unicode_buffer (filename, address,
1263                               (const unsigned char *) magic, magiccount);
1264
1265       if (stream != NULL)
1266         print_unicode_stream (filename, address, stream);
1267       return;
1268     }
1269
1270   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1271
1272   while (1)
1273     {
1274       file_ptr start;
1275       unsigned int i;
1276       long c;
1277
1278       /* See if the next `string_min' chars are all graphic chars.  */
1279     tryline:
1280       start = address;
1281       for (i = 0; i < string_min; i++)
1282         {
1283           c = get_char (stream, &address, &magiccount, &magic);
1284           if (c == EOF)
1285             {
1286               free (buf);
1287               return;
1288             }
1289
1290           if (! STRING_ISGRAPHIC (c))
1291             {
1292               /* Found a non-graphic.  Try again starting with next byte.  */
1293               unget_part_char (c, &address, &magiccount, &magic);
1294               goto tryline;
1295             }
1296           buf[i] = c;
1297         }
1298
1299       /* We found a run of `string_min' graphic characters.  Print up
1300          to the next non-graphic character.  */
1301       print_filename_and_address (filename, start);
1302
1303       buf[i] = '\0';
1304       fputs (buf, stdout);
1305
1306       while (1)
1307         {
1308           c = get_char (stream, &address, &magiccount, &magic);
1309           if (c == EOF)
1310             break;
1311           if (! STRING_ISGRAPHIC (c))
1312             {
1313               unget_part_char (c, &address, &magiccount, &magic);
1314               break;
1315             }
1316           putchar (c);
1317         }
1318
1319       if (output_separator)
1320         fputs (output_separator, stdout);
1321       else
1322         putchar ('\n');
1323     }
1324   free (buf);
1325 }
1326 \f
1327 static void
1328 usage (FILE *stream, int status)
1329 {
1330   fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1331   fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1332   fprintf (stream, _(" The options are:\n"));
1333
1334   if (DEFAULT_STRINGS_ALL)
1335     fprintf (stream, _("\
1336   -a - --all                Scan the entire file, not just the data section [default]\n\
1337   -d --data                 Only scan the data sections in the file\n"));
1338   else
1339     fprintf (stream, _("\
1340   -a - --all                Scan the entire file, not just the data section\n\
1341   -d --data                 Only scan the data sections in the file [default]\n"));
1342
1343   fprintf (stream, _("\
1344   -f --print-file-name      Print the name of the file before each string\n\
1345   -n <number>               Locate & print any sequence of at least <number>\n\
1346     --bytes=<number>         displayable characters.  (The default is 4).\n\
1347   -t --radix={o,d,x}        Print the location of the string in base 8, 10 or 16\n\
1348   -w --include-all-whitespace Include all whitespace as valid string characters\n\
1349   -o                        An alias for --radix=o\n\
1350   -T --target=<BFDNAME>     Specify the binary file format\n\
1351   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1352                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1353   --unicode={default|show|invalid|hex|escape|highlight}\n\
1354   -U {d|s|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
1355   -s --output-separator=<string> String used to separate strings in output.\n\
1356   @<file>                   Read options from <file>\n\
1357   -h --help                 Display this information\n\
1358   -v -V --version           Print the program's version number\n"));
1359   list_supported_targets (program_name, stream);
1360   if (REPORT_BUGS_TO[0] && status == 0)
1361     fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1362   exit (status);
1363 }