gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright (C) 1987-2021 Free Software Foundation, Inc.
   3
   4    This file is part of GAS, the GNU Assembler.
   5
   6    GAS is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    GAS is distributed in the hope that it will be useful, but WITHOUT
  12    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  14    License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with GAS; see the file COPYING.  If not, write to the Free
  18    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  19    02110-1301, USA.  */
  20
  21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  22 /* App, the assembler pre-processor.  This pre-processor strips out
  23    excess spaces, turns single-quoted characters into a decimal
  24    constant, and turns the # in # <number> <filename> <garbage> into a
  25    .linefile.  This needs better error-handling.  */
  26
  27 #include "as.h"
  28
  29 #if (__STDC__ != 1)
  30 #ifndef const
  31 #define const  /* empty */
  32 #endif
  33 #endif
  34
#ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H
   instead of as ordinary symbol characters.  This is a global (not
   static), so it can be set from outside this file.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  It is set by do_scrub_begin from its
   argument.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  The recogniser there also accepts '1'
   where this template has '0', so one template covers both ".mri 0"
   and ".mri 1".  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without m68k support MRI mode never applies; making this a constant
   lets every "if (scrub_m68k_mri)" test be resolved at compile time.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char   symver_pseudo[] = ".symver";
/* Progress of the ".symver" recogniser in do_scrub_chars: NULL when no
   match is in progress, otherwise a pointer to the next character of
   symver_pseudo that is expected.  */
static const char * symver_state;
#endif
  58
/* Part of the scrubber state: it is saved by app_push and restored by
   app_pop together with the other state variables.
   NOTE(review): the code that reads and updates it is not visible in
   this part of the file.  */
static char last_char;

/* Character classification table, indexed by the input byte value.
   Each entry is 0 (no special class) or one of the LEX_IS_* values
   below.  It is filled in by do_scrub_begin.  */
static char lex[256];
/* Characters that are symbol components on every target.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Character classes stored in lex[], as assigned by do_scrub_begin:

   LEX_IS_SYMBOL_COMPONENT     symbol_chars, bytes 128..255 and, when
                               defined, tc_symbol_chars.
   LEX_IS_WHITESPACE           space, tab and carriage return.
   LEX_IS_LINE_SEPARATOR       tc_line_separator_chars.
   LEX_IS_COMMENT_START        tc_comment_chars.
   LEX_IS_LINE_COMMENT_START   line_comment_chars.
   LEX_IS_TWOCHAR_COMMENT_1ST  '/', when it has no other class, so that
                               slash-star comments are recognised.
   LEX_IS_STRINGQUOTE          '"' (and '\'' with SINGLE_QUOTE_STRINGS
                               or in m68k MRI mode).
   LEX_IS_COLON                ':'.
   LEX_IS_NEWLINE              '\n'.
   LEX_IS_ONECHAR_QUOTE        '\'', except on TC_HPPA.
   LEX_IS_DOUBLEDASH_1ST       '-' on TC_V850.
   LEX_IS_DOUBLEBAR_1ST        '|' when DOUBLEBAR_PARALLEL is defined.
   LEX_IS_PARALLEL_SEPARATOR   tc_parallel_separator_chars.
   LEX_IS_H                    'h' and 'H' when enable_h_tick_hex is set.

   No class with the value 7 is defined here.  */
#define LEX_IS_SYMBOL_COMPONENT         1
#define LEX_IS_WHITESPACE               2
#define LEX_IS_LINE_SEPARATOR           3
#define LEX_IS_COMMENT_START            4
#define LEX_IS_LINE_COMMENT_START       5
#define LEX_IS_TWOCHAR_COMMENT_1ST      6
#define LEX_IS_STRINGQUOTE              8
#define LEX_IS_COLON                    9
#define LEX_IS_NEWLINE                  10
#define LEX_IS_ONECHAR_QUOTE            11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST           12
#endif
#ifdef TC_M32R
/* The M32R uses "||" to separate parallel instructions.  */
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST            13
#endif
#define LEX_IS_PARALLEL_SEPARATOR       14
#ifdef H_TICK_HEX
#define LEX_IS_H                        15
#endif
/* Class predicates.  C is used directly as an index into lex[], so it
   must be a byte value in the range 0..255; in particular it must not
   be EOF.  */
#define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)

/* Defined further down; maps the letter after a backslash to the
   character it denotes.  */
static int process_escape (int);
  97
  98 /* FIXME-soon: The entire lexer/parser thingy should be
  99    built statically at compile time rather than dynamically
 100    each and every time the assembler is run.  xoxorich.  */
 101
 102 void
 103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
 104 {
 105   const char *p;
 106   int c;
 107
 108   lex[' '] = LEX_IS_WHITESPACE;
 109   lex['\t'] = LEX_IS_WHITESPACE;
 110   lex['\r'] = LEX_IS_WHITESPACE;
 111   lex['\n'] = LEX_IS_NEWLINE;
 112   lex[':'] = LEX_IS_COLON;
 113
 114 #ifdef TC_M68K
 115   scrub_m68k_mri = m68k_mri;
 116
 117   if (! m68k_mri)
 118 #endif
 119     {
 120       lex['"'] = LEX_IS_STRINGQUOTE;
 121
 122 #if ! defined (TC_HPPA)
 123       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 124 #endif
 125
 126 #ifdef SINGLE_QUOTE_STRINGS
 127       lex['\''] = LEX_IS_STRINGQUOTE;
 128 #endif
 129     }
 130
 131   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 132      in state 5 of do_scrub_chars must be changed.  */
 133
 134   /* Note that these override the previous defaults, e.g. if ';' is a
 135      comment char, then it isn't a line separator.  */
 136   for (p = symbol_chars; *p; ++p)
 137     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 138
 139   for (c = 128; c < 256; ++c)
 140     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 141
 142 #ifdef tc_symbol_chars
 143   /* This macro permits the processor to specify all characters which
 144      may appears in an operand.  This will prevent the scrubber from
 145      discarding meaningful whitespace in certain cases.  The i386
 146      backend uses this to support prefixes, which can confuse the
 147      scrubber as to whether it is parsing operands or opcodes.  */
 148   for (p = tc_symbol_chars; *p; ++p)
 149     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 150 #endif
 151
 152   /* The m68k backend wants to be able to change comment_chars.  */
 153 #ifndef tc_comment_chars
 154 #define tc_comment_chars comment_chars
 155 #endif
 156   for (p = tc_comment_chars; *p; p++)
 157     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 158
 159   for (p = line_comment_chars; *p; p++)
 160     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 161
 162 #ifndef tc_line_separator_chars
 163 #define tc_line_separator_chars line_separator_chars
 164 #endif
 165   for (p = tc_line_separator_chars; *p; p++)
 166     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 167
 168 #ifdef tc_parallel_separator_chars
 169   /* This macro permits the processor to specify all characters which
 170      separate parallel insns on the same line.  */
 171   for (p = tc_parallel_separator_chars; *p; p++)
 172     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 173 #endif
 174
 175   /* Only allow slash-star comments if slash is not in use.
 176      FIXME: This isn't right.  We should always permit them.  */
 177   if (lex['/'] == 0)
 178     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 179
 180 #ifdef TC_M68K
 181   if (m68k_mri)
 182     {
 183       lex['\''] = LEX_IS_STRINGQUOTE;
 184       lex[';'] = LEX_IS_COMMENT_START;
 185       lex['*'] = LEX_IS_LINE_COMMENT_START;
 186       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 187          then it can't be used in an expression.  */
 188       lex['!'] = LEX_IS_LINE_COMMENT_START;
 189     }
 190 #endif
 191
 192 #ifdef TC_V850
 193   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 194 #endif
 195 #ifdef DOUBLEBAR_PARALLEL
 196   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 197 #endif
 198 #ifdef TC_D30V
 199   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 200   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 201 #endif
 202
 203 #ifdef H_TICK_HEX
 204   if (enable_h_tick_hex)
 205     {
 206       lex['h'] = LEX_IS_H;
 207       lex['H'] = LEX_IS_H;
 208     }
 209 #endif
 210 }
 211
 212 /* Saved state of the scrubber.  */
 213 static int state;
 214 static int old_state;
 215 static const char *out_string;
 216 static char out_buf[20];
 217 static int add_newlines;
 218 static char *saved_input;
 219 static size_t saved_input_len;
 220 static char input_buffer[32 * 1024];
 221 static const char *mri_state;
 222 static char mri_last_ch;
 223
 224 /* Data structure for saving the state of app across #include's.  Note that
 225    app is called asynchronously to the parsing of the .include's, so our
 226    state at the time .include is interpreted is completely unrelated.
 227    That's why we have to save it all.  */
 228
 229 struct app_save
 230 {
 231   int          state;
 232   int          old_state;
 233   const char * out_string;
 234   char         out_buf[sizeof (out_buf)];
 235   int          add_newlines;
 236   char *       saved_input;
 237   size_t       saved_input_len;
 238 #ifdef TC_M68K
 239   int          scrub_m68k_mri;
 240 #endif
 241   const char * mri_state;
 242   char         mri_last_ch;
 243 #if defined TC_ARM && defined OBJ_ELF
 244   const char * symver_state;
 245 #endif
 246   char         last_char;
 247 };
 248
 249 char *
 250 app_push (void)
 251 {
 252   struct app_save *saved;
 253
 254   saved = XNEW (struct app_save);
 255   saved->state = state;
 256   saved->old_state = old_state;
 257   saved->out_string = out_string;
 258   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 259   saved->add_newlines = add_newlines;
 260   if (saved_input == NULL)
 261     saved->saved_input = NULL;
 262   else
 263     {
 264       saved->saved_input = XNEWVEC (char, saved_input_len);
 265       memcpy (saved->saved_input, saved_input, saved_input_len);
 266       saved->saved_input_len = saved_input_len;
 267     }
 268 #ifdef TC_M68K
 269   saved->scrub_m68k_mri = scrub_m68k_mri;
 270 #endif
 271   saved->mri_state = mri_state;
 272   saved->mri_last_ch = mri_last_ch;
 273 #if defined TC_ARM && defined OBJ_ELF
 274   saved->symver_state = symver_state;
 275 #endif
 276   saved->last_char = last_char;
 277
 278   /* do_scrub_begin() is not useful, just wastes time.  */
 279
 280   state = 0;
 281   saved_input = NULL;
 282   add_newlines = 0;
 283
 284   return (char *) saved;
 285 }
 286
 287 void
 288 app_pop (char *arg)
 289 {
 290   struct app_save *saved = (struct app_save *) arg;
 291
 292   /* There is no do_scrub_end ().  */
 293   state = saved->state;
 294   old_state = saved->old_state;
 295   out_string = saved->out_string;
 296   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 297   add_newlines = saved->add_newlines;
 298   if (saved->saved_input == NULL)
 299     saved_input = NULL;
 300   else
 301     {
 302       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
 303       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 304       saved_input = input_buffer;
 305       saved_input_len = saved->saved_input_len;
 306       free (saved->saved_input);
 307     }
 308 #ifdef TC_M68K
 309   scrub_m68k_mri = saved->scrub_m68k_mri;
 310 #endif
 311   mri_state = saved->mri_state;
 312   mri_last_ch = saved->mri_last_ch;
 313 #if defined TC_ARM && defined OBJ_ELF
 314   symver_state = saved->symver_state;
 315 #endif
 316   last_char = saved->last_char;
 317
 318   free (arg);
 319 }
 320
 321 /* @@ This assumes that \n &c are the same on host and target.  This is not
 322    necessarily true.  */
 323
 324 static int
 325 process_escape (int ch)
 326 {
 327   switch (ch)
 328     {
 329     case 'b':
 330       return '\b';
 331     case 'f':
 332       return '\f';
 333     case 'n':
 334       return '\n';
 335     case 'r':
 336       return '\r';
 337     case 't':
 338       return '\t';
 339     case '\'':
 340       return '\'';
 341     case '"':
 342       return '\"';
 343     default:
 344       return ch;
 345     }
 346 }
 347
 348 /* This function is called to process input characters.  The GET
 349    parameter is used to retrieve more input characters.  GET should
 350    set its parameter to point to a buffer, and return the length of
 351    the buffer; it should return 0 at end of file.  The scrubbed output
 352    characters are put into the buffer starting at TOSTART; the TOSTART
 353    buffer is TOLEN bytes in length.  The function returns the number
 354    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 355    end of file was seen.  This function is arranged as a state
 356    machine, and saves its state so that it may return at any point.
 357    This is the way the old code used to work.  */
 358
 359 size_t
 360 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
 361 {
 362   char *to = tostart;
 363   char *toend = tostart + tolen;
 364   char *from;
 365   char *fromend;
 366   size_t fromlen;
 367   int ch, ch2 = 0;
 368   /* Character that started the string we're working on.  */
 369   static char quotechar;
 370
 371   /*State 0: beginning of normal line
 372           1: After first whitespace on line (flush more white)
 373           2: After first non-white (opcode) on line (keep 1white)
 374           3: after second white on line (into operands) (flush white)
 375           4: after putting out a .linefile, put out digits
 376           5: parsing a string, then go to old-state
 377           6: putting out \ escape in a "d string.
 378           7: no longer used
 379           8: no longer used
 380           9: After seeing symbol char in state 3 (keep 1white after symchar)
 381          10: After seeing whitespace in state 9 (keep white before symchar)
 382          11: After seeing a symbol character in state 0 (eg a label definition)
 383          -1: output string in out_string and go to the state in old_state
 384          -2: flush text until a '*' '/' is seen, then go to state old_state
 385 #ifdef TC_V850
 386          12: After seeing a dash, looking for a second dash as a start
 387              of comment.
 388 #endif
 389 #ifdef DOUBLEBAR_PARALLEL
 390          13: After seeing a vertical bar, looking for a second
 391              vertical bar as a parallel expression separator.
 392 #endif
 393 #ifdef TC_PREDICATE_START_CHAR
 394          14: After seeing a predicate start character at state 0, looking
 395              for a predicate end character as predicate.
 396          15: After seeing a predicate start character at state 1, looking
 397              for a predicate end character as predicate.
 398 #endif
 399 #ifdef TC_Z80
 400          16: After seeing an 'a' or an 'A' at the start of a symbol
 401          17: After seeing an 'f' or an 'F' in state 16
 402 #endif
 403           */
 404
 405   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 406      constructs like ``.loc 1 20''.  This was turning into ``.loc
 407      120''.  States 9 and 10 ensure that a space is never dropped in
 408      between characters which could appear in an identifier.  Ian
 409      Taylor, ian@cygnus.com.
 410
 411      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 412      correctly on the PA (and any other target where colons are optional).
 413      Jeff Law, law@cs.utah.edu.
 414
 415      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 416      get squashed into "cmp r1,r2||trap#1", with the all important space
 417      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 418
 419   /* This macro gets the next input character.  */
 420
 421 #define GET()                                                   \
 422   (from < fromend                                               \
 423    ? * (unsigned char *) (from++)                               \
 424    : (saved_input = NULL,                                       \
 425       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 426       from = input_buffer,                                      \
 427       fromend = from + fromlen,                                 \
 428       (fromlen == 0                                             \
 429        ? EOF                                                    \
 430        : * (unsigned char *) (from++))))
 431
 432   /* This macro pushes a character back on the input stream.  */
 433
 434 #define UNGET(uch) (*--from = (uch))
 435
 436   /* This macro puts a character into the output buffer.  If this
 437      character fills the output buffer, this macro jumps to the label
 438      TOFULL.  We use this rather ugly approach because we need to
 439      handle two different termination conditions: EOF on the input
 440      stream, and a full output buffer.  It would be simpler if we
 441      always read in the entire input stream before processing it, but
 442      I don't want to make such a significant change to the assembler's
 443      memory usage.  */
 444
 445 #define PUT(pch)                                \
 446   do                                            \
 447     {                                           \
 448       *to++ = (pch);                            \
 449       if (to >= toend)                          \
 450         goto tofull;                            \
 451     }                                           \
 452   while (0)
 453
 454   if (saved_input != NULL)
 455     {
 456       from = saved_input;
 457       fromend = from + saved_input_len;
 458     }
 459   else
 460     {
 461       fromlen = (*get) (input_buffer, sizeof input_buffer);
 462       if (fromlen == 0)
 463         return 0;
 464       from = input_buffer;
 465       fromend = from + fromlen;
 466     }
 467
 468   while (1)
 469     {
 470       /* The cases in this switch end with continue, in order to
 471          branch back to the top of this while loop and generate the
 472          next output character in the appropriate state.  */
 473       switch (state)
 474         {
 475         case -1:
 476           ch = *out_string++;
 477           if (*out_string == '\0')
 478             {
 479               state = old_state;
 480               old_state = 3;
 481             }
 482           PUT (ch);
 483           continue;
 484
 485         case -2:
 486           for (;;)
 487             {
 488               do
 489                 {
 490                   ch = GET ();
 491
 492                   if (ch == EOF)
 493                     {
 494                       as_warn (_("end of file in comment"));
 495                       goto fromeof;
 496                     }
 497
 498                   if (ch == '\n')
 499                     PUT ('\n');
 500                 }
 501               while (ch != '*');
 502
 503               while ((ch = GET ()) == '*')
 504                 ;
 505
 506               if (ch == EOF)
 507                 {
 508                   as_warn (_("end of file in comment"));
 509                   goto fromeof;
 510                 }
 511
 512               if (ch == '/')
 513                 break;
 514
 515               UNGET (ch);
 516             }
 517
 518           state = old_state;
 519           UNGET (' ');
 520           continue;
 521
 522         case 4:
 523           ch = GET ();
 524           if (ch == EOF)
 525             goto fromeof;
 526           else if (ch >= '0' && ch <= '9')
 527             PUT (ch);
 528           else
 529             {
 530               while (ch != EOF && IS_WHITESPACE (ch))
 531                 ch = GET ();
 532               if (ch == '"')
 533                 {
 534                   quotechar = ch;
 535                   state = 5;
 536                   old_state = 3;
 537                   PUT (ch);
 538                 }
 539               else
 540                 {
 541                   while (ch != EOF && ch != '\n')
 542                     ch = GET ();
 543                   state = 0;
 544                   PUT (ch);
 545                 }
 546             }
 547           continue;
 548
 549         case 5:
 550           /* We are going to copy everything up to a quote character,
 551              with special handling for a backslash.  We try to
 552              optimize the copying in the simple case without using the
 553              GET and PUT macros.  */
 554           {
 555             char *s;
 556             ptrdiff_t len;
 557
 558             for (s = from; s < fromend; s++)
 559               {
 560                 ch = *s;
 561                 if (ch == '\\'
 562                     || ch == quotechar
 563                     || ch == '\n')
 564                   break;
 565               }
 566             len = s - from;
 567             if (len > toend - to)
 568               len = toend - to;
 569             if (len > 0)
 570               {
 571                 memcpy (to, from, len);
 572                 to += len;
 573                 from += len;
 574                 if (to >= toend)
 575                   goto tofull;
 576               }
 577           }
 578
 579           ch = GET ();
 580           if (ch == EOF)
 581             {
 582               /* This buffer is here specifically so
 583                  that the UNGET below will work.  */
 584               static char one_char_buf[1];
 585
 586               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 587               state = old_state;
 588               from = fromend = one_char_buf + 1;
 589               fromlen = 1;
 590               UNGET ('\n');
 591               PUT (quotechar);
 592             }
 593           else if (ch == quotechar)
 594             {
 595               state = old_state;
 596               PUT (ch);
 597             }
 598           else if (TC_STRING_ESCAPES && ch == '\\')
 599             {
 600               state = 6;
 601               PUT (ch);
 602             }
 603           else if (scrub_m68k_mri && ch == '\n')
 604             {
 605               /* Just quietly terminate the string.  This permits lines like
 606                    bne  label   loop if we haven't reach end yet.  */
 607               state = old_state;
 608               UNGET (ch);
 609               PUT ('\'');
 610             }
 611           else
 612             {
 613               PUT (ch);
 614             }
 615           continue;
 616
 617         case 6:
 618           state = 5;
 619           ch = GET ();
 620           switch (ch)
 621             {
 622               /* Handle strings broken across lines, by turning '\n' into
 623                  '\\' and 'n'.  */
 624             case '\n':
 625               UNGET ('n');
 626               add_newlines++;
 627               PUT ('\\');
 628               continue;
 629
 630             case EOF:
 631               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 632               PUT (quotechar);
 633               continue;
 634
 635             case '"':
 636             case '\\':
 637             case 'b':
 638             case 'f':
 639             case 'n':
 640             case 'r':
 641             case 't':
 642             case 'v':
 643             case 'x':
 644             case 'X':
 645             case '0':
 646             case '1':
 647             case '2':
 648             case '3':
 649             case '4':
 650             case '5':
 651             case '6':
 652             case '7':
 653               break;
 654
 655             default:
 656 #ifdef ONLY_STANDARD_ESCAPES
 657               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 658 #endif
 659               break;
 660             }
 661           PUT (ch);
 662           continue;
 663
 664 #ifdef DOUBLEBAR_PARALLEL
 665         case 13:
 666           ch = GET ();
 667           if (ch != '|')
 668             abort ();
 669
 670           /* Reset back to state 1 and pretend that we are parsing a
 671              line from just after the first white space.  */
 672           state = 1;
 673           PUT ('|');
 674 #ifdef TC_TIC6X
 675           /* "||^" is used for SPMASKed instructions.  */
 676           ch = GET ();
 677           if (ch == EOF)
 678             goto fromeof;
 679           else if (ch == '^')
 680             PUT ('^');
 681           else
 682             UNGET (ch);
 683 #endif
 684           continue;
 685 #endif
 686 #ifdef TC_Z80
 687         case 16:
 688           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 689           ch = GET ();
 690           if (ch == 'f' || ch == 'F')
 691             {
 692               state = 17;
 693               PUT (ch);
 694             }
 695           else
 696             {
 697               state = 9;
 698               break;
 699             }
 700           /* Fall through.  */
 701         case 17:
 702           /* We have seen "af" at the start of a symbol,
 703              a ' here is a part of that symbol.  */
 704           ch = GET ();
 705           state = 9;
 706           if (ch == '\'')
 707             /* Change to avoid warning about unclosed string.  */
 708             PUT ('`');
 709           else if (ch != EOF)
 710             UNGET (ch);
 711           break;
 712 #endif
 713         }
 714
 715       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 716
 717       /* flushchar: */
 718       ch = GET ();
 719
 720 #ifdef TC_PREDICATE_START_CHAR
 721       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
 722         {
 723           state += 14;
 724           PUT (ch);
 725           continue;
 726         }
 727       else if (state == 14 || state == 15)
 728         {
 729           if (ch == TC_PREDICATE_END_CHAR)
 730             {
 731               state -= 14;
 732               PUT (ch);
 733               ch = GET ();
 734             }
 735           else
 736             {
 737               PUT (ch);
 738               continue;
 739             }
 740         }
 741 #endif
 742
 743     recycle:
 744
 745 #if defined TC_ARM && defined OBJ_ELF
 746       /* We need to watch out for .symver directives.  See the comment later
 747          in this function.  */
 748       if (symver_state == NULL)
 749         {
 750           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 751             symver_state = symver_pseudo + 1;
 752         }
 753       else
 754         {
 755           /* We advance to the next state if we find the right
 756              character.  */
 757           if (ch != '\0' && (*symver_state == ch))
 758             ++symver_state;
 759           else if (*symver_state != '\0')
 760             /* We did not get the expected character, or we didn't
 761                get a valid terminating character after seeing the
 762                entire pseudo-op, so we must go back to the beginning.  */
 763             symver_state = NULL;
 764           else
 765             {
 766               /* We've read the entire pseudo-op.  If this is the end
 767                  of the line, go back to the beginning.  */
 768               if (IS_NEWLINE (ch))
 769                 symver_state = NULL;
 770             }
 771         }
 772 #endif /* TC_ARM && OBJ_ELF */
 773
 774 #ifdef TC_M68K
 775       /* We want to have pseudo-ops which control whether we are in
 776          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 777          the scrubber, that means that we need a special purpose
 778          recognizer here.  */
 779       if (mri_state == NULL)
 780         {
 781           if ((state == 0 || state == 1)
 782               && ch == mri_pseudo[0])
 783             mri_state = mri_pseudo + 1;
 784         }
 785       else
 786         {
 787           /* We advance to the next state if we find the right
 788              character, or if we need a space character and we get any
 789              whitespace character, or if we need a '0' and we get a
 790              '1' (this is so that we only need one state to handle
 791              ``.mri 0'' and ``.mri 1'').  */
 792           if (ch != '\0'
 793               && (*mri_state == ch
 794                   || (*mri_state == ' '
 795                       && lex[ch] == LEX_IS_WHITESPACE)
 796                   || (*mri_state == '0'
 797                       && ch == '1')))
 798             {
 799               mri_last_ch = ch;
 800               ++mri_state;
 801             }
 802           else if (*mri_state != '\0'
 803                    || (lex[ch] != LEX_IS_WHITESPACE
 804                        && lex[ch] != LEX_IS_NEWLINE))
 805             {
 806               /* We did not get the expected character, or we didn't
 807                  get a valid terminating character after seeing the
 808                  entire pseudo-op, so we must go back to the
 809                  beginning.  */
 810               mri_state = NULL;
 811             }
 812           else
 813             {
 814               /* We've read the entire pseudo-op.  mri_last_ch is
 815                  either '0' or '1', indicating whether to leave or
 816                  enter MRI mode respectively.  */
 817               do_scrub_begin (mri_last_ch == '1');
 818               mri_state = NULL;
 819
 820               /* We continue handling the character as usual.  The
 821                  main gas reader must also handle the .mri pseudo-op
 822                  to control expression parsing and the like.  */
 823             }
 824         }
 825 #endif
 826
 827       if (ch == EOF)
 828         {
 829           if (state != 0)
 830             {
 831               as_warn (_("end of file not at end of a line; newline inserted"));
 832               state = 0;
 833               PUT ('\n');
 834             }
 835           goto fromeof;
 836         }
 837
 838       switch (lex[ch])
 839         {
 840         case LEX_IS_WHITESPACE:
 841           do
 842             {
 843               ch = GET ();
 844             }
 845           while (ch != EOF && IS_WHITESPACE (ch));
 846           if (ch == EOF)
 847             goto fromeof;
 848
 849           if (state == 0)
 850             {
 851               /* Preserve a single whitespace character at the
 852                  beginning of a line.  */
 853               state = 1;
 854               UNGET (ch);
 855               PUT (' ');
 856               break;
 857             }
 858
 859 #ifdef KEEP_WHITE_AROUND_COLON
 860           if (lex[ch] == LEX_IS_COLON)
 861             {
 862               /* Only keep this white if there's no white *after* the
 863                  colon.  */
 864               ch2 = GET ();
 865               if (ch2 != EOF)
 866                 UNGET (ch2);
 867               if (!IS_WHITESPACE (ch2))
 868                 {
 869                   state = 9;
 870                   UNGET (ch);
 871                   PUT (' ');
 872                   break;
 873                 }
 874             }
 875 #endif
 876           if (IS_COMMENT (ch)
 877               || ch == '/'
 878               || IS_LINE_SEPARATOR (ch)
 879               || IS_PARALLEL_SEPARATOR (ch))
 880             {
 881               if (scrub_m68k_mri)
 882                 {
 883                   /* In MRI mode, we keep these spaces.  */
 884                   UNGET (ch);
 885                   PUT (' ');
 886                   break;
 887                 }
 888               goto recycle;
 889             }
 890
 891           /* If we're in state 2 or 11, we've seen a non-white
 892              character followed by whitespace.  If the next character
 893              is ':', this is whitespace after a label name which we
 894              normally must ignore.  In MRI mode, though, spaces are
 895              not permitted between the label and the colon.  */
 896           if ((state == 2 || state == 11)
 897               && lex[ch] == LEX_IS_COLON
 898               && ! scrub_m68k_mri)
 899             {
 900               state = 1;
 901               PUT (ch);
 902               break;
 903             }
 904
 905           switch (state)
 906             {
 907             case 1:
 908               /* We can arrive here if we leave a leading whitespace
 909                  character at the beginning of a line.  */
 910               goto recycle;
 911             case 2:
 912               state = 3;
 913               if (to + 1 < toend)
 914                 {
 915                   /* Optimize common case by skipping UNGET/GET.  */
 916                   PUT (' ');    /* Sp after opco */
 917                   goto recycle;
 918                 }
 919               UNGET (ch);
 920               PUT (' ');
 921               break;
 922             case 3:
 923 #ifndef TC_KEEP_OPERAND_SPACES
 924               /* For TI C6X, we keep these spaces as they may separate
 925                  functional unit specifiers from operands.  */
 926               if (scrub_m68k_mri)
 927 #endif
 928                 {
 929                   /* In MRI mode, we keep these spaces.  */
 930                   UNGET (ch);
 931                   PUT (' ');
 932                   break;
 933                 }
 934               goto recycle;     /* Sp in operands */
 935             case 9:
 936             case 10:
 937 #ifndef TC_KEEP_OPERAND_SPACES
 938               if (scrub_m68k_mri)
 939 #endif
 940                 {
 941                   /* In MRI mode, we keep these spaces.  */
 942                   state = 3;
 943                   UNGET (ch);
 944                   PUT (' ');
 945                   break;
 946                 }
 947               state = 10;       /* Sp after symbol char */
 948               goto recycle;
 949             case 11:
 950               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 951                 state = 1;
 952               else
 953                 {
 954                   /* We know that ch is not ':', since we tested that
 955                      case above.  Therefore this is not a label, so it
 956                      must be the opcode, and we've just seen the
 957                      whitespace after it.  */
 958                   state = 3;
 959                 }
 960               UNGET (ch);
 961               PUT (' ');        /* Sp after label definition.  */
 962               break;
 963             default:
 964               BAD_CASE (state);
 965             }
 966           break;
 967
 968         case LEX_IS_TWOCHAR_COMMENT_1ST:
 969           ch2 = GET ();
 970           if (ch2 == '*')
 971             {
 972               for (;;)
 973                 {
 974                   do
 975                     {
 976                       ch2 = GET ();
 977                       if (ch2 != EOF && IS_NEWLINE (ch2))
 978                         add_newlines++;
 979                     }
 980                   while (ch2 != EOF && ch2 != '*');
 981
 982                   while (ch2 == '*')
 983                     ch2 = GET ();
 984
 985                   if (ch2 == EOF || ch2 == '/')
 986                     break;
 987
 988                   /* This UNGET will ensure that we count newlines
 989                      correctly.  */
 990                   UNGET (ch2);
 991                 }
 992
 993               if (ch2 == EOF)
 994                 as_warn (_("end of file in multiline comment"));
 995
 996               ch = ' ';
 997               goto recycle;
 998             }
 999 #ifdef DOUBLESLASH_LINE_COMMENTS
1000           else if (ch2 == '/')
1001             {
1002               do
1003                 {
1004                   ch = GET ();
1005                 }
1006               while (ch != EOF && !IS_NEWLINE (ch));
1007               if (ch == EOF)
1008                 as_warn ("end of file in comment; newline inserted");
1009               state = 0;
1010               PUT ('\n');
1011               break;
1012             }
1013 #endif
1014           else
1015             {
1016               if (ch2 != EOF)
1017                 UNGET (ch2);
1018               if (state == 9 || state == 10)
1019                 state = 3;
1020               PUT (ch);
1021             }
1022           break;
1023
1024         case LEX_IS_STRINGQUOTE:
1025           quotechar = ch;
1026           if (state == 10)
1027             {
1028               /* Preserve the whitespace in foo "bar".  */
1029               UNGET (ch);
1030               state = 3;
1031               PUT (' ');
1032
1033               /* PUT didn't jump out.  We could just break, but we
1034                  know what will happen, so optimize a bit.  */
1035               ch = GET ();
1036               old_state = 3;
1037             }
1038           else if (state == 9)
1039             old_state = 3;
1040           else
1041             old_state = state;
1042           state = 5;
1043           PUT (ch);
1044           break;
1045
1046         case LEX_IS_ONECHAR_QUOTE:
1047 #ifdef H_TICK_HEX
1048           if (state == 9 && enable_h_tick_hex)
1049             {
1050               char c;
1051
1052               c = GET ();
1053               as_warn ("'%c found after symbol", c);
1054               UNGET (c);
1055             }
1056 #endif
1057           if (state == 10)
1058             {
1059               /* Preserve the whitespace in foo 'b'.  */
1060               UNGET (ch);
1061               state = 3;
1062               PUT (' ');
1063               break;
1064             }
1065           ch = GET ();
1066           if (ch == EOF)
1067             {
1068               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1069               ch = 0;
1070             }
1071           if (ch == '\\')
1072             {
1073               ch = GET ();
1074               if (ch == EOF)
1075                 {
1076                   as_warn (_("end of file in escape character"));
1077                   ch = '\\';
1078                 }
1079               else
1080                 ch = process_escape (ch);
1081             }
1082           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1083
1084           /* None of these 'x constants for us.  We want 'x'.  */
1085           if ((ch = GET ()) != '\'')
1086             {
1087 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1088               as_warn (_("missing close quote; (assumed)"));
1089 #else
1090               if (ch != EOF)
1091                 UNGET (ch);
1092 #endif
1093             }
1094           if (strlen (out_buf) == 1)
1095             {
1096               PUT (out_buf[0]);
1097               break;
1098             }
1099           if (state == 9)
1100             old_state = 3;
1101           else
1102             old_state = state;
1103           state = -1;
1104           out_string = out_buf;
1105           PUT (*out_string++);
1106           break;
1107
1108         case LEX_IS_COLON:
1109 #ifdef KEEP_WHITE_AROUND_COLON
1110           state = 9;
1111 #else
1112           if (state == 9 || state == 10)
1113             state = 3;
1114           else if (state != 3)
1115             state = 1;
1116 #endif
1117           PUT (ch);
1118           break;
1119
1120         case LEX_IS_NEWLINE:
1121           /* Roll out a bunch of newlines from inside comments, etc.  */
1122           if (add_newlines)
1123             {
1124               --add_newlines;
1125               UNGET (ch);
1126             }
1127           /* Fall through.  */
1128
1129         case LEX_IS_LINE_SEPARATOR:
1130           state = 0;
1131           PUT (ch);
1132           break;
1133
1134         case LEX_IS_PARALLEL_SEPARATOR:
1135           state = 1;
1136           PUT (ch);
1137           break;
1138
1139 #ifdef TC_V850
1140         case LEX_IS_DOUBLEDASH_1ST:
1141           ch2 = GET ();
1142           if (ch2 != '-')
1143             {
1144               if (ch2 != EOF)
1145                 UNGET (ch2);
1146               goto de_fault;
1147             }
1148           /* Read and skip to end of line.  */
1149           do
1150             {
1151               ch = GET ();
1152             }
1153           while (ch != EOF && ch != '\n');
1154
1155           if (ch == EOF)
1156             as_warn (_("end of file in comment; newline inserted"));
1157
1158           state = 0;
1159           PUT ('\n');
1160           break;
1161 #endif
1162 #ifdef DOUBLEBAR_PARALLEL
1163         case LEX_IS_DOUBLEBAR_1ST:
1164           ch2 = GET ();
1165           if (ch2 != EOF)
1166             UNGET (ch2);
1167           if (ch2 != '|')
1168             goto de_fault;
1169
1170           /* Handle '||' in two states as invoking PUT twice might
1171              result in the first one jumping out of this loop.  We'd
1172              then lose track of the state and one '|' char.  */
1173           state = 13;
1174           PUT ('|');
1175           break;
1176 #endif
1177         case LEX_IS_LINE_COMMENT_START:
1178           /* FIXME-someday: The two character comment stuff was badly
1179              thought out.  On i386, we want '/' as line comment start
1180              AND we want C style comments.  hence this hack.  The
1181              whole lexical process should be reworked.  xoxorich.  */
1182           if (ch == '/')
1183             {
1184               ch2 = GET ();
1185               if (ch2 == '*')
1186                 {
1187                   old_state = 3;
1188                   state = -2;
1189                   break;
1190                 }
1191               else if (ch2 != EOF)
1192                 {
1193                   UNGET (ch2);
1194                 }
1195             }
1196
1197           if (state == 0 || state == 1) /* Only comment at start of line.  */
1198             {
1199               int startch;
1200
1201               startch = ch;
1202
1203               do
1204                 {
1205                   ch = GET ();
1206                 }
1207               while (ch != EOF && IS_WHITESPACE (ch));
1208
1209               if (ch == EOF)
1210                 {
1211                   as_warn (_("end of file in comment; newline inserted"));
1212                   PUT ('\n');
1213                   break;
1214                 }
1215
1216               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1217                 {
1218                   /* Not a cpp line.  */
1219                   while (ch != EOF && !IS_NEWLINE (ch))
1220                     ch = GET ();
1221                   if (ch == EOF)
1222                     {
1223                       as_warn (_("end of file in comment; newline inserted"));
1224                       PUT ('\n');
1225                     }
1226                   else /* IS_NEWLINE (ch) */
1227                     {
1228                       /* To process non-zero add_newlines.  */
1229                       UNGET (ch);
1230                     }
1231                   state = 0;
1232                   break;
1233                 }
1234               /* Looks like `# 123 "filename"' from cpp.  */
1235               UNGET (ch);
1236               old_state = 4;
1237               state = -1;
1238               if (scrub_m68k_mri)
1239                 out_string = "\tlinefile ";
1240               else
1241                 out_string = "\t.linefile ";
1242               PUT (*out_string++);
1243               break;
1244             }
1245
1246 #ifdef TC_D10V
1247           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1248              Trap is the only short insn that has a first operand that is
1249              neither register nor label.
1250              We must prevent exef0f ||trap #1 from degenerating into exef0f ||trap#1.
1251              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1252              already LEX_IS_LINE_COMMENT_START.  However, it is the
1253              only character in line_comment_chars for d10v, hence we
1254              can recognize it as such.  */
1255           /* An alternative approach would be to reset the state to 1 when
1256              we see '||', '<-' or '->', but that seems to be overkill.  */
1257           if (state == 10)
1258             PUT (' ');
1259 #endif
1260           /* We have a line comment character which is not at the
1261              start of a line.  If this is also a normal comment
1262              character, fall through.  Otherwise treat it as a default
1263              character.  */
1264           if (strchr (tc_comment_chars, ch) == NULL
1265               && (! scrub_m68k_mri
1266                   || (ch != '!' && ch != '*')))
1267             goto de_fault;
1268           if (scrub_m68k_mri
1269               && (ch == '!' || ch == '*' || ch == '#')
1270               && state != 1
1271               && state != 10)
1272             goto de_fault;
1273           /* Fall through.  */
1274         case LEX_IS_COMMENT_START:
1275 #if defined TC_ARM && defined OBJ_ELF
1276           /* On the ARM, `@' is the comment character.
1277              Unfortunately this is also a special character in ELF .symver
1278              directives (and .type, though we deal with those another way).
1279              So we check if this line is such a directive, and treat
1280              the character as default if so.  This is a hack.  */
1281           if ((symver_state != NULL) && (*symver_state == 0))
1282             goto de_fault;
1283 #endif
1284
1285           /* Care is needed not to damage occurrences of \<comment-char>
1286              by stripping the <comment-char> onwards.  Yuck.  */
1287           if ((to > tostart ? to[-1] : last_char) == '\\')
1288             /* Do not treat the <comment-char> as a start-of-comment.  */
1289             goto de_fault;
1290
1291 #ifdef WARN_COMMENTS
1292           if (!found_comment)
1293             found_comment_file = as_where (&found_comment);
1294 #endif
1295           do
1296             {
1297               ch = GET ();
1298             }
1299           while (ch != EOF && !IS_NEWLINE (ch));
1300           if (ch == EOF)
1301             as_warn (_("end of file in comment; newline inserted"));
1302           state = 0;
1303           PUT ('\n');
1304           break;
1305
1306 #ifdef H_TICK_HEX
1307         case LEX_IS_H:
1308           /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1309              the H' with 0x to make them gas-style hex characters.  */
1310           if (enable_h_tick_hex)
1311             {
1312               char quot;
1313
1314               quot = GET ();
1315               if (quot == '\'')
1316                 {
1317                   UNGET ('x');
1318                   ch = '0';
1319                 }
1320               else
1321                 UNGET (quot);
1322             }
1323 #endif
1324           /* Fall through.  */
1325
1326         case LEX_IS_SYMBOL_COMPONENT:
1327           if (state == 10)
1328             {
1329               /* This is a symbol character following another symbol
1330                  character, with whitespace in between.  We skipped
1331                  the whitespace earlier, so output it now.  */
1332               UNGET (ch);
1333               state = 3;
1334               PUT (' ');
1335               break;
1336             }
1337
1338 #ifdef TC_Z80
1339           /* "af'" is a symbol containing '\''.  */
1340           if (state == 3 && (ch == 'a' || ch == 'A'))
1341             {
1342               state = 16;
1343               PUT (ch);
1344               ch = GET ();
1345               if (ch == 'f' || ch == 'F')
1346                 {
1347                   state = 17;
1348                   PUT (ch);
1349                   break;
1350                 }
1351               else
1352                 {
1353                   state = 9;
1354                   if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1355                     {
1356                       if (ch != EOF)
1357                         UNGET (ch);
1358                       break;
1359                     }
1360                 }
1361             }
1362 #endif
1363           if (state == 3)
1364             state = 9;
1365
1366           /* This is a common case.  Quickly copy CH and all the
1367              following symbol component or normal characters.  */
1368           if (to + 1 < toend
1369               && mri_state == NULL
1370 #if defined TC_ARM && defined OBJ_ELF
1371               && symver_state == NULL
1372 #endif
1373               )
1374             {
1375               char *s;
1376               ptrdiff_t len;
1377
1378               for (s = from; s < fromend; s++)
1379                 {
1380                   int type;
1381
1382                   ch2 = *(unsigned char *) s;
1383                   type = lex[ch2];
1384                   if (type != 0
1385                       && type != LEX_IS_SYMBOL_COMPONENT)
1386                     break;
1387                 }
1388
1389               if (s > from)
1390                 /* Handle the last character normally, for
1391                    simplicity.  */
1392                 --s;
1393
1394               len = s - from;
1395
1396               if (len > (toend - to) - 1)
1397                 len = (toend - to) - 1;
1398
1399               if (len > 0)
1400                 {
1401                   PUT (ch);
1402                   memcpy (to, from, len);
1403                   to += len;
1404                   from += len;
1405                   if (to >= toend)
1406                     goto tofull;
1407                   ch = GET ();
1408                 }
1409             }
1410
1411           /* Fall through.  */
1412         default:
1413         de_fault:
1414           /* Some relatively `normal' character.  */
1415           if (state == 0)
1416             {
1417               state = 11;       /* Now seeing label definition.  */
1418             }
1419           else if (state == 1)
1420             {
1421               state = 2;        /* Ditto.  */
1422             }
1423           else if (state == 9)
1424             {
1425               if (!IS_SYMBOL_COMPONENT (ch))
1426                 state = 3;
1427             }
1428           else if (state == 10)
1429             {
1430               if (ch == '\\')
1431                 {
1432                   /* Special handling for backslash: a backslash may
1433                      be the beginning of a formal parameter (of a
1434                      macro) following another symbol character, with
1435                      whitespace in between.  If that is the case, we
1436                      output a space before the parameter.  Strictly
1437                      speaking, correct handling depends upon what the
1438                      macro parameter expands into; if the parameter
1439                      expands into something which does not start with
1440                      an operand character, then we don't want to keep
1441                      the space.  We don't have enough information to
1442                      make the right choice, so here we are making the
1443                      choice which is more likely to be correct.  */
1444                   if (to + 1 >= toend)
1445                     {
1446                       /* If we're near the end of the buffer, save the
1447                          character for the next time round.  Otherwise
1448                          we'll lose our state.  */
1449                       UNGET (ch);
1450                       goto tofull;
1451                     }
1452                   *to++ = ' ';
1453                 }
1454
1455               state = 3;
1456             }
1457           PUT (ch);
1458           break;
1459         }
1460     }
1461
1462   /*NOTREACHED*/
1463
1464  fromeof:
1465   /* We have reached the end of the input.  */
1466   if (to > tostart)
1467     last_char = to[-1];
1468   return to - tostart;
1469
1470  tofull:
1471   /* The output buffer is full.  Save any input we have not yet
1472      processed.  */
1473   if (fromend > from)
1474     {
1475       saved_input = from;
1476       saved_input_len = fromend - from;
1477     }
1478   else
1479     saved_input = NULL;
1480
1481   if (to > tostart)
1482     last_char = to[-1];
1483   return to - tostart;
1484 }