gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright (C) 1987-2021 Free Software Foundation, Inc.
   3
   4    This file is part of GAS, the GNU Assembler.
   5
   6    GAS is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    GAS is distributed in the hope that it will be useful, but WITHOUT
  12    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  13    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  14    License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with GAS; see the file COPYING.  If not, write to the Free
  18    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  19    02110-1301, USA.  */
  20
  21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  22 /* App, the assembler pre-processor.  This pre-processor strips out
  23    excess spaces, turns single-quoted characters into a decimal
  24    constant, and turns the # in # <number> <filename> <garbage> into a
  25    .linefile.  This needs better error-handling.  */
  26
  27 #include "as.h"
  28
  29 #if (__STDC__ != 1)
     /* __STDC__ is not 1 here, so the compiler is not claiming ISO C
        conformance.  Unless something has already defined `const', define
        it away so that the declarations below still compile.  */
  30 #ifndef const
  31 #define const  /* empty */
  32 #endif
  33 #endif
  34
  35 #ifdef H_TICK_HEX
     /* When nonzero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H.
        It has external linkage, so code outside this file can set it.  */
  36 int enable_h_tick_hex = 0;
  37 #endif
  38
  39 #ifdef TC_M68K
  40 /* Whether we are scrubbing in m68k MRI mode.  This is different from
  41    flag_m68k_mri, because the two flags will be affected by the .mri
  42    pseudo-op at different times.  */
  43 static int scrub_m68k_mri;
  44
  45 /* The pseudo-op which switches in and out of MRI mode.  See the
  46    comment in do_scrub_chars.  */
     /* The matcher in do_scrub_chars also accepts '1' where this string
        has '0', so this one string covers both ".mri 0" and ".mri 1".  */
  47 static const char mri_pseudo[] = ".mri 0";
  48 #else
     /* Without TC_M68K there is no MRI mode to scrub in, so every test of
        scrub_m68k_mri is a constant false.  */
  49 #define scrub_m68k_mri 0
  50 #endif
  51
  52 #if defined TC_ARM && defined OBJ_ELF
  53 /* The pseudo-op for which we need to special-case `@' characters.
  54    See the comment in do_scrub_chars.  */
  55 static const char   symver_pseudo[] = ".symver";
     /* Progress of the ".symver" matcher in do_scrub_chars.  NULL while no
        match is in progress; otherwise a pointer into symver_pseudo at the
        next character expected (at the terminating NUL once the whole
        name has been seen).  */
  56 static const char * symver_state;
  57 #endif
  58
     /* Part of the scrubber state that app_push saves and app_pop
        restores.  NOTE(review): presumably the most recently handled
        character -- confirm against its uses in do_scrub_chars.  */
  59 static char last_char;
  60
  61 static char lex[256];
     /* lex[] (above) maps every possible byte value to one of the
        LEX_IS_* classes defined below.  An entry of 0 means the byte has
        not been given a class.  do_scrub_begin fills the table in.  */

     /* The characters that do_scrub_begin marks as
        LEX_IS_SYMBOL_COMPONENT before it applies the comment, separator
        and other target-specific classifications, which may override
        individual entries.  */
  62 static const char symbol_chars[] =
  63 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  64
     /* Character classes stored in lex[].  None of the classes defined
        here uses the value 7.  */
  65 #define LEX_IS_SYMBOL_COMPONENT         1
  66 #define LEX_IS_WHITESPACE               2
  67 #define LEX_IS_LINE_SEPARATOR           3
  68 #define LEX_IS_COMMENT_START            4
  69 #define LEX_IS_LINE_COMMENT_START       5
  70 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
  71 #define LEX_IS_STRINGQUOTE              8
  72 #define LEX_IS_COLON                    9
  73 #define LEX_IS_NEWLINE                  10
  74 #define LEX_IS_ONECHAR_QUOTE            11
  75 #ifdef TC_V850
  76 #define LEX_IS_DOUBLEDASH_1ST           12
  77 #endif
     /* TC_M32R switches on DOUBLEBAR_PARALLEL, which in turn enables the
        LEX_IS_DOUBLEBAR_1ST class.  */
  78 #ifdef TC_M32R
  79 #define DOUBLEBAR_PARALLEL
  80 #endif
  81 #ifdef DOUBLEBAR_PARALLEL
  82 #define LEX_IS_DOUBLEBAR_1ST            13
  83 #endif
  84 #define LEX_IS_PARALLEL_SEPARATOR       14
  85 #ifdef H_TICK_HEX
  86 #define LEX_IS_H                        15
  87 #endif
     /* Class tests.  The argument is used directly as an index into
        lex[], so it must lie in the range 0..255; in particular EOF has
        to be ruled out before any of these is applied.  */
  88 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
  89 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  90 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  91 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
  92 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  93 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  94 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  95
     /* Defined further down, after app_pop.  */
  96 static int process_escape (int);
  97
  98 /* FIXME-soon: The entire lexer/parser thingy should be
  99    built statically at compile time rather than dynamically
 100    each and every time the assembler is run.  xoxorich.  */
 101
 102 void
 103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
 104 {
 105   const char *p;
 106   int c;
 107
 108   lex[' '] = LEX_IS_WHITESPACE;
 109   lex['\t'] = LEX_IS_WHITESPACE;
 110   lex['\r'] = LEX_IS_WHITESPACE;
 111   lex['\n'] = LEX_IS_NEWLINE;
 112   lex[':'] = LEX_IS_COLON;
 113
 114 #ifdef TC_M68K
 115   scrub_m68k_mri = m68k_mri;
 116
 117   if (! m68k_mri)
 118 #endif
 119     {
 120       lex['"'] = LEX_IS_STRINGQUOTE;
 121
 122 #if ! defined (TC_HPPA)
 123       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 124 #endif
 125
 126 #ifdef SINGLE_QUOTE_STRINGS
 127       lex['\''] = LEX_IS_STRINGQUOTE;
 128 #endif
 129     }
 130
 131   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 132      in state 5 of do_scrub_chars must be changed.  */
 133
 134   /* Note that these override the previous defaults, e.g. if ';' is a
 135      comment char, then it isn't a line separator.  */
 136   for (p = symbol_chars; *p; ++p)
 137     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 138
 139   for (c = 128; c < 256; ++c)
 140     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 141
 142 #ifdef tc_symbol_chars
 143   /* This macro permits the processor to specify all characters which
 144      may appears in an operand.  This will prevent the scrubber from
 145      discarding meaningful whitespace in certain cases.  The i386
 146      backend uses this to support prefixes, which can confuse the
 147      scrubber as to whether it is parsing operands or opcodes.  */
 148   for (p = tc_symbol_chars; *p; ++p)
 149     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 150 #endif
 151
 152   /* The m68k backend wants to be able to change comment_chars.  */
 153 #ifndef tc_comment_chars
 154 #define tc_comment_chars comment_chars
 155 #endif
 156   for (p = tc_comment_chars; *p; p++)
 157     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 158
 159   for (p = line_comment_chars; *p; p++)
 160     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 161
 162 #ifndef tc_line_separator_chars
 163 #define tc_line_separator_chars line_separator_chars
 164 #endif
 165   for (p = tc_line_separator_chars; *p; p++)
 166     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 167
 168 #ifdef tc_parallel_separator_chars
 169   /* This macro permits the processor to specify all characters which
 170      separate parallel insns on the same line.  */
 171   for (p = tc_parallel_separator_chars; *p; p++)
 172     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 173 #endif
 174
 175   /* Only allow slash-star comments if slash is not in use.
 176      FIXME: This isn't right.  We should always permit them.  */
 177   if (lex['/'] == 0)
 178     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 179
 180 #ifdef TC_M68K
 181   if (m68k_mri)
 182     {
 183       lex['\''] = LEX_IS_STRINGQUOTE;
 184       lex[';'] = LEX_IS_COMMENT_START;
 185       lex['*'] = LEX_IS_LINE_COMMENT_START;
 186       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 187          then it can't be used in an expression.  */
 188       lex['!'] = LEX_IS_LINE_COMMENT_START;
 189     }
 190 #endif
 191
 192 #ifdef TC_V850
 193   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 194 #endif
 195 #ifdef DOUBLEBAR_PARALLEL
 196   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 197 #endif
 198 #ifdef TC_D30V
 199   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 200   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 201 #endif
 202
 203 #ifdef H_TICK_HEX
 204   if (enable_h_tick_hex)
 205     {
 206       lex['h'] = LEX_IS_H;
 207       lex['H'] = LEX_IS_H;
 208     }
 209 #endif
 210 }
 211
 212 /* Saved state of the scrubber.  */
     /* Current state of the do_scrub_chars state machine; the states are
        listed in a comment at the top of that function.  */
 213 static int state;
     /* The state to go back to once a temporary state (-1, -2, 5) has
        finished its work.  */
 214 static int old_state;
     /* In state -1, the next character to be emitted; state -1 ends when
        the character after it is the terminating NUL.  */
 215 static const char *out_string;
     /* NOTE(review): presumably scratch storage that out_string can be
        pointed at -- confirm against its users in do_scrub_chars.  */
 216 static char out_buf[20];
     /* Incremented each time a backslash-newline is folded inside a string
        (state 6) and reset to 0 by app_push.  NOTE(review): presumably the
        number of newlines still owed to the output so that line numbers
        stay in step -- confirm.  */
 217 static int add_newlines;
     /* Input that had not been consumed when do_scrub_chars last returned,
        or NULL if there is none.  do_scrub_chars resumes from here before
        it asks its GET callback for more; saved_input_len is the number of
        bytes pending.  */
 218 static char *saved_input;
 219 static size_t saved_input_len;
     /* The buffer that do_scrub_chars asks its GET callback to fill with
        raw input.  app_pop also copies restored pending input into it.  */
 220 static char input_buffer[32 * 1024];
     /* Progress of the ".mri" matcher in do_scrub_chars (TC_M68K only):
        NULL while no match is in progress, otherwise a pointer into
        mri_pseudo at the next character expected.  */
 221 static const char *mri_state;
     /* The last character the ".mri" matcher accepted; once the whole
        pseudo-op has matched it is the '0' or '1' operand.  */
 222 static char mri_last_ch;
 223
 224 /* Data structure for saving the state of app across #include's.  Note that
 225    app is called asynchronously to the parsing of the .include's, so our
 226    state at the time .include is interpreted is completely unrelated.
 227    That's why we have to save it all.  */
 228
     /* Each member holds the value of the file-scope variable of the same
        name.  The one exception is saved_input: app_push stores a heap
        copy of the pending input there (or NULL when nothing is pending),
        and app_pop frees that copy after copying it back into
        input_buffer.  saved_input_len is only consulted when saved_input
        is not NULL.  */
 229 struct app_save
 230 {
 231   int          state;
 232   int          old_state;
 233   const char * out_string;
 234   char         out_buf[sizeof (out_buf)];
 235   int          add_newlines;
 236   char *       saved_input;
 237   size_t       saved_input_len;
 238 #ifdef TC_M68K
 239   int          scrub_m68k_mri;
 240 #endif
 241   const char * mri_state;
 242   char         mri_last_ch;
 243 #if defined TC_ARM && defined OBJ_ELF
 244   const char * symver_state;
 245 #endif
 246   char         last_char;
 247 };
 248
 249 char *
 250 app_push (void)
 251 {
 252   struct app_save *saved;
 253
 254   saved = XNEW (struct app_save);
 255   saved->state = state;
 256   saved->old_state = old_state;
 257   saved->out_string = out_string;
 258   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 259   saved->add_newlines = add_newlines;
 260   if (saved_input == NULL)
 261     saved->saved_input = NULL;
 262   else
 263     {
 264       saved->saved_input = XNEWVEC (char, saved_input_len);
 265       memcpy (saved->saved_input, saved_input, saved_input_len);
 266       saved->saved_input_len = saved_input_len;
 267     }
 268 #ifdef TC_M68K
 269   saved->scrub_m68k_mri = scrub_m68k_mri;
 270 #endif
 271   saved->mri_state = mri_state;
 272   saved->mri_last_ch = mri_last_ch;
 273 #if defined TC_ARM && defined OBJ_ELF
 274   saved->symver_state = symver_state;
 275 #endif
 276   saved->last_char = last_char;
 277
 278   /* do_scrub_begin() is not useful, just wastes time.  */
 279
 280   state = 0;
 281   saved_input = NULL;
 282   add_newlines = 0;
 283
 284   return (char *) saved;
 285 }
 286
 287 void
 288 app_pop (char *arg)
 289 {
 290   struct app_save *saved = (struct app_save *) arg;
 291
 292   /* There is no do_scrub_end ().  */
 293   state = saved->state;
 294   old_state = saved->old_state;
 295   out_string = saved->out_string;
 296   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 297   add_newlines = saved->add_newlines;
 298   if (saved->saved_input == NULL)
 299     saved_input = NULL;
 300   else
 301     {
 302       gas_assert (saved->saved_input_len <= sizeof (input_buffer));
 303       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 304       saved_input = input_buffer;
 305       saved_input_len = saved->saved_input_len;
 306       free (saved->saved_input);
 307     }
 308 #ifdef TC_M68K
 309   scrub_m68k_mri = saved->scrub_m68k_mri;
 310 #endif
 311   mri_state = saved->mri_state;
 312   mri_last_ch = saved->mri_last_ch;
 313 #if defined TC_ARM && defined OBJ_ELF
 314   symver_state = saved->symver_state;
 315 #endif
 316   last_char = saved->last_char;
 317
 318   free (arg);
 319 }
 320
 /* Return the character denoted by a backslash followed by CH.

    Only b, f, n, r and t are translated; any other value of CH,
    including either quote character, is returned unchanged.

    @@ The translation uses the host compiler's values for '\n' and
    friends, which assumes they are the same on host and target.  This
    is not necessarily true.  */

 static int
 process_escape (int ch)
 {
   /* names[i] is the letter written after the backslash and codes[i]
      is the control character it stands for.  */
   static const char names[] = { 'b', 'f', 'n', 'r', 't' };
   static const char codes[] = { '\b', '\f', '\n', '\r', '\t' };
   unsigned int i;

   for (i = 0; i < sizeof names; i++)
     if (ch == names[i])
       return codes[i];

   return ch;
 }
 347
 /* Maximum number of multibyte-character warnings issued in one run.  */
 #define MULTIBYTE_WARN_COUNT_LIMIT 10
 /* Number of such warnings issued so far.  */
 static unsigned int multibyte_warn_count = 0;

 /* Look for bytes above 0x7f in the range [START, END).

    With WARN false, return true as soon as one such byte is found and
    false if there is none; nothing is reported.

    With WARN true, issue one warning per such byte, giving the location
    reported by as_where, until MULTIBYTE_WARN_COUNT_LIMIT warnings have
    been issued in total; the last of them is followed by a notice that
    further warnings are suppressed.  The return value is true if a
    warning was issued by this call.  Once the limit has been reached,
    later calls with WARN true return false without scanning.

    An empty or inverted range yields false.  */

 bool
 scan_for_multibyte_characters (const unsigned char *  start,
                                const unsigned char *  end,
                                bool                   warn)
 {
   if (end <= start)
     return false;

   /* Use >= rather than >: the counter is left equal to the limit when
      the "suppressed" notice is printed, and a later call must not
      start warning again.  */
   if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
     return false;

   bool found = false;

   while (start < end)
     {
       unsigned char c;

       if ((c = * start++) <= 0x7f)
         continue;

       if (!warn)
         return true;

       found = true;

       const char * filename;
       unsigned int lineno;

       filename = as_where (& lineno);
       if (filename == NULL)
         as_warn (_("multibyte character (%#x) encountered in input"), c);
       else if (lineno == 0)
         as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
       else
         as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);

       if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT)
         {
           as_warn (_("further multibyte character warnings suppressed"));
           break;
         }
     }

   return found;
 }
 396
 397 /* This function is called to process input characters.  The GET
 398    parameter is used to retrieve more input characters.  GET should
 399    set its parameter to point to a buffer, and return the length of
 400    the buffer; it should return 0 at end of file.  The scrubbed output
 401    characters are put into the buffer starting at TOSTART; the TOSTART
 402    buffer is TOLEN bytes in length.  The function returns the number
 403    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 404    end of file was seen.  This function is arranged as a state
 405    machine, and saves its state so that it may return at any point.
 406    This is the way the old code used to work.  */
 407
 408 size_t
 409 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
 410 {
 411   char *to = tostart;
 412   char *toend = tostart + tolen;
 413   char *from;
 414   char *fromend;
 415   size_t fromlen;
 416   int ch, ch2 = 0;
 417   /* Character that started the string we're working on.  */
 418   static char quotechar;
 419
 420   /*State 0: beginning of normal line
 421           1: After first whitespace on line (flush more white)
 422           2: After first non-white (opcode) on line (keep 1white)
 423           3: after second white on line (into operands) (flush white)
 424           4: after putting out a .linefile, put out digits
 425           5: parsing a string, then go to old-state
 426           6: putting out \ escape in a "d string.
 427           7: no longer used
 428           8: no longer used
 429           9: After seeing symbol char in state 3 (keep 1white after symchar)
 430          10: After seeing whitespace in state 9 (keep white before symchar)
 431          11: After seeing a symbol character in state 0 (eg a label definition)
 432          -1: output string in out_string and go to the state in old_state
 433          -2: flush text until a '*' '/' is seen, then go to state old_state
 434 #ifdef TC_V850
 435          12: After seeing a dash, looking for a second dash as a start
 436              of comment.
 437 #endif
 438 #ifdef DOUBLEBAR_PARALLEL
 439          13: After seeing a vertical bar, looking for a second
 440              vertical bar as a parallel expression separator.
 441 #endif
 442 #ifdef TC_PREDICATE_START_CHAR
 443          14: After seeing a predicate start character at state 0, looking
 444              for a predicate end character as predicate.
 445          15: After seeing a predicate start character at state 1, looking
 446              for a predicate end character as predicate.
 447 #endif
 448 #ifdef TC_Z80
 449          16: After seeing an 'a' or an 'A' at the start of a symbol
 450          17: After seeing an 'f' or an 'F' in state 16
 451 #endif
 452           */
 453
 454   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 455      constructs like ``.loc 1 20''.  This was turning into ``.loc
 456      120''.  States 9 and 10 ensure that a space is never dropped in
 457      between characters which could appear in an identifier.  Ian
 458      Taylor, ian@cygnus.com.
 459
 460      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 461      correctly on the PA (and any other target where colons are optional).
 462      Jeff Law, law@cs.utah.edu.
 463
 464      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 465      get squashed into "cmp r1,r2||trap#1", with the all important space
 466      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 467
 468   /* This macro gets the next input character.  */
 469
 470 #define GET()                                                   \
 471   (from < fromend                                               \
 472    ? * (unsigned char *) (from++)                               \
 473    : (saved_input = NULL,                                       \
 474       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 475       from = input_buffer,                                      \
 476       fromend = from + fromlen,                                 \
 477       (fromlen == 0                                             \
 478        ? EOF                                                    \
 479        : * (unsigned char *) (from++))))
 480
 481   /* This macro pushes a character back on the input stream.  */
 482
 483 #define UNGET(uch) (*--from = (uch))
 484
 485   /* This macro puts a character into the output buffer.  If this
 486      character fills the output buffer, this macro jumps to the label
 487      TOFULL.  We use this rather ugly approach because we need to
 488      handle two different termination conditions: EOF on the input
 489      stream, and a full output buffer.  It would be simpler if we
 490      always read in the entire input stream before processing it, but
 491      I don't want to make such a significant change to the assembler's
 492      memory usage.  */
 493
 494 #define PUT(pch)                                \
 495   do                                            \
 496     {                                           \
 497       *to++ = (pch);                            \
 498       if (to >= toend)                          \
 499         goto tofull;                            \
 500     }                                           \
 501   while (0)
 502
 503   if (saved_input != NULL)
 504     {
 505       from = saved_input;
 506       fromend = from + saved_input_len;
 507     }
 508   else
 509     {
 510       fromlen = (*get) (input_buffer, sizeof input_buffer);
 511       if (fromlen == 0)
 512         return 0;
 513       from = input_buffer;
 514       fromend = from + fromlen;
 515
 516       if (multibyte_handling == multibyte_warn)
 517         (void) scan_for_multibyte_characters ((const unsigned char *) from,
 518                                               (const unsigned char* ) fromend,
 519                                               true /* Generate warnings.  */);
 520     }
 521
 522   while (1)
 523     {
 524       /* The cases in this switch end with continue, in order to
 525          branch back to the top of this while loop and generate the
 526          next output character in the appropriate state.  */
 527       switch (state)
 528         {
 529         case -1:
 530           ch = *out_string++;
 531           if (*out_string == '\0')
 532             {
 533               state = old_state;
 534               old_state = 3;
 535             }
 536           PUT (ch);
 537           continue;
 538
 539         case -2:
 540           for (;;)
 541             {
 542               do
 543                 {
 544                   ch = GET ();
 545
 546                   if (ch == EOF)
 547                     {
 548                       as_warn (_("end of file in comment"));
 549                       goto fromeof;
 550                     }
 551
 552                   if (ch == '\n')
 553                     PUT ('\n');
 554                 }
 555               while (ch != '*');
 556
 557               while ((ch = GET ()) == '*')
 558                 ;
 559
 560               if (ch == EOF)
 561                 {
 562                   as_warn (_("end of file in comment"));
 563                   goto fromeof;
 564                 }
 565
 566               if (ch == '/')
 567                 break;
 568
 569               UNGET (ch);
 570             }
 571
 572           state = old_state;
 573           UNGET (' ');
 574           continue;
 575
 576         case 4:
 577           ch = GET ();
 578           if (ch == EOF)
 579             goto fromeof;
 580           else if (ch >= '0' && ch <= '9')
 581             PUT (ch);
 582           else
 583             {
 584               while (ch != EOF && IS_WHITESPACE (ch))
 585                 ch = GET ();
 586               if (ch == '"')
 587                 {
 588                   quotechar = ch;
 589                   state = 5;
 590                   old_state = 3;
 591                   PUT (ch);
 592                 }
 593               else
 594                 {
 595                   while (ch != EOF && ch != '\n')
 596                     ch = GET ();
 597                   state = 0;
 598                   PUT (ch);
 599                 }
 600             }
 601           continue;
 602
 603         case 5:
 604           /* We are going to copy everything up to a quote character,
 605              with special handling for a backslash.  We try to
 606              optimize the copying in the simple case without using the
 607              GET and PUT macros.  */
 608           {
 609             char *s;
 610             ptrdiff_t len;
 611
 612             for (s = from; s < fromend; s++)
 613               {
 614                 ch = *s;
 615                 if (ch == '\\'
 616                     || ch == quotechar
 617                     || ch == '\n')
 618                   break;
 619               }
 620             len = s - from;
 621             if (len > toend - to)
 622               len = toend - to;
 623             if (len > 0)
 624               {
 625                 memcpy (to, from, len);
 626                 to += len;
 627                 from += len;
 628                 if (to >= toend)
 629                   goto tofull;
 630               }
 631           }
 632
 633           ch = GET ();
 634           if (ch == EOF)
 635             {
 636               /* This buffer is here specifically so
 637                  that the UNGET below will work.  */
 638               static char one_char_buf[1];
 639
 640               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 641               state = old_state;
 642               from = fromend = one_char_buf + 1;
 643               fromlen = 1;
 644               UNGET ('\n');
 645               PUT (quotechar);
 646             }
 647           else if (ch == quotechar)
 648             {
 649               state = old_state;
 650               PUT (ch);
 651             }
 652           else if (TC_STRING_ESCAPES && ch == '\\')
 653             {
 654               state = 6;
 655               PUT (ch);
 656             }
 657           else if (scrub_m68k_mri && ch == '\n')
 658             {
 659               /* Just quietly terminate the string.  This permits lines like
 660                    bne  label   loop if we haven't reach end yet.  */
 661               state = old_state;
 662               UNGET (ch);
 663               PUT ('\'');
 664             }
 665           else
 666             {
 667               PUT (ch);
 668             }
 669           continue;
 670
 671         case 6:
 672           state = 5;
 673           ch = GET ();
 674           switch (ch)
 675             {
 676               /* Handle strings broken across lines, by turning '\n' into
 677                  '\\' and 'n'.  */
 678             case '\n':
 679               UNGET ('n');
 680               add_newlines++;
 681               PUT ('\\');
 682               continue;
 683
 684             case EOF:
 685               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 686               PUT (quotechar);
 687               continue;
 688
 689             case '"':
 690             case '\\':
 691             case 'b':
 692             case 'f':
 693             case 'n':
 694             case 'r':
 695             case 't':
 696             case 'v':
 697             case 'x':
 698             case 'X':
 699             case '0':
 700             case '1':
 701             case '2':
 702             case '3':
 703             case '4':
 704             case '5':
 705             case '6':
 706             case '7':
 707               break;
 708
 709             default:
 710 #ifdef ONLY_STANDARD_ESCAPES
 711               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 712 #endif
 713               break;
 714             }
 715           PUT (ch);
 716           continue;
 717
 718 #ifdef DOUBLEBAR_PARALLEL
 719         case 13:
 720           ch = GET ();
 721           if (ch != '|')
 722             abort ();
 723
 724           /* Reset back to state 1 and pretend that we are parsing a
 725              line from just after the first white space.  */
 726           state = 1;
 727           PUT ('|');
 728 #ifdef TC_TIC6X
 729           /* "||^" is used for SPMASKed instructions.  */
 730           ch = GET ();
 731           if (ch == EOF)
 732             goto fromeof;
 733           else if (ch == '^')
 734             PUT ('^');
 735           else
 736             UNGET (ch);
 737 #endif
 738           continue;
 739 #endif
 740 #ifdef TC_Z80
 741         case 16:
 742           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 743           ch = GET ();
 744           if (ch == 'f' || ch == 'F')
 745             {
 746               state = 17;
 747               PUT (ch);
 748             }
 749           else
 750             {
 751               state = 9;
 752               break;
 753             }
 754           /* Fall through.  */
 755         case 17:
 756           /* We have seen "af" at the start of a symbol,
 757              a ' here is a part of that symbol.  */
 758           ch = GET ();
 759           state = 9;
 760           if (ch == '\'')
 761             /* Change to avoid warning about unclosed string.  */
 762             PUT ('`');
 763           else if (ch != EOF)
 764             UNGET (ch);
 765           break;
 766 #endif
 767         }
 768
 769       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 770
 771       /* flushchar: */
 772       ch = GET ();
 773
 774 #ifdef TC_PREDICATE_START_CHAR
 775       if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
 776         {
 777           state += 14;
 778           PUT (ch);
 779           continue;
 780         }
 781       else if (state == 14 || state == 15)
 782         {
 783           if (ch == TC_PREDICATE_END_CHAR)
 784             {
 785               state -= 14;
 786               PUT (ch);
 787               ch = GET ();
 788             }
 789           else
 790             {
 791               PUT (ch);
 792               continue;
 793             }
 794         }
 795 #endif
 796
 797     recycle:
 798
 799 #if defined TC_ARM && defined OBJ_ELF
 800       /* We need to watch out for .symver directives.  See the comment later
 801          in this function.  */
 802       if (symver_state == NULL)
 803         {
 804           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 805             symver_state = symver_pseudo + 1;
 806         }
 807       else
 808         {
 809           /* We advance to the next state if we find the right
 810              character.  */
 811           if (ch != '\0' && (*symver_state == ch))
 812             ++symver_state;
 813           else if (*symver_state != '\0')
 814             /* We did not get the expected character, or we didn't
 815                get a valid terminating character after seeing the
 816                entire pseudo-op, so we must go back to the beginning.  */
 817             symver_state = NULL;
 818           else
 819             {
 820               /* We've read the entire pseudo-op.  If this is the end
 821                  of the line, go back to the beginning.  */
 822               if (IS_NEWLINE (ch))
 823                 symver_state = NULL;
 824             }
 825         }
 826 #endif /* TC_ARM && OBJ_ELF */
 827
 828 #ifdef TC_M68K
 829       /* We want to have pseudo-ops which control whether we are in
 830          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 831          the scrubber, that means that we need a special purpose
 832          recognizer here.  */
 833       if (mri_state == NULL)
 834         {
 835           if ((state == 0 || state == 1)
 836               && ch == mri_pseudo[0])
 837             mri_state = mri_pseudo + 1;
 838         }
 839       else
 840         {
 841           /* We advance to the next state if we find the right
 842              character, or if we need a space character and we get any
 843              whitespace character, or if we need a '0' and we get a
 844              '1' (this is so that we only need one state to handle
 845              ``.mri 0'' and ``.mri 1'').  */
 846           if (ch != '\0'
 847               && (*mri_state == ch
 848                   || (*mri_state == ' '
 849                       && lex[ch] == LEX_IS_WHITESPACE)
 850                   || (*mri_state == '0'
 851                       && ch == '1')))
 852             {
 853               mri_last_ch = ch;
 854               ++mri_state;
 855             }
 856           else if (*mri_state != '\0'
 857                    || (lex[ch] != LEX_IS_WHITESPACE
 858                        && lex[ch] != LEX_IS_NEWLINE))
 859             {
 860               /* We did not get the expected character, or we didn't
 861                  get a valid terminating character after seeing the
 862                  entire pseudo-op, so we must go back to the
 863                  beginning.  */
 864               mri_state = NULL;
 865             }
 866           else
 867             {
 868               /* We've read the entire pseudo-op.  mri_last_ch is
 869                  either '0' or '1' indicating whether to enter or
 870                  leave MRI mode.  */
 871               do_scrub_begin (mri_last_ch == '1');
 872               mri_state = NULL;
 873
 874               /* We continue handling the character as usual.  The
 875                  main gas reader must also handle the .mri pseudo-op
 876                  to control expression parsing and the like.  */
 877             }
 878         }
 879 #endif
 880
 881       if (ch == EOF)
 882         {
 883           if (state != 0)
 884             {
 885               as_warn (_("end of file not at end of a line; newline inserted"));
 886               state = 0;
 887               PUT ('\n');
 888             }
 889           goto fromeof;
 890         }
 891
 892       switch (lex[ch])
 893         {
 894         case LEX_IS_WHITESPACE:
 895           do
 896             {
 897               ch = GET ();
 898             }
 899           while (ch != EOF && IS_WHITESPACE (ch));
 900           if (ch == EOF)
 901             goto fromeof;
 902
 903           if (state == 0)
 904             {
 905               /* Preserve a single whitespace character at the
 906                  beginning of a line.  */
 907               state = 1;
 908               UNGET (ch);
 909               PUT (' ');
 910               break;
 911             }
 912
 913 #ifdef KEEP_WHITE_AROUND_COLON
 914           if (lex[ch] == LEX_IS_COLON)
 915             {
 916               /* Only keep this white if there's no white *after* the
 917                  colon.  */
 918               ch2 = GET ();
 919               if (ch2 != EOF)
 920                 UNGET (ch2);
 921               if (!IS_WHITESPACE (ch2))
 922                 {
 923                   state = 9;
 924                   UNGET (ch);
 925                   PUT (' ');
 926                   break;
 927                 }
 928             }
 929 #endif
 930           if (IS_COMMENT (ch)
 931               || IS_LINE_SEPARATOR (ch)
 932               || IS_PARALLEL_SEPARATOR (ch))
 933             {
 934               if (scrub_m68k_mri)
 935                 {
 936                   /* In MRI mode, we keep these spaces.  */
 937                   UNGET (ch);
 938                   PUT (' ');
 939                   break;
 940                 }
 941               goto recycle;
 942             }
 943
 944           /* If we're in state 2 or 11, we've seen a non-white
 945              character followed by whitespace.  If the next character
 946              is ':', this is whitespace after a label name which we
 947              normally must ignore.  In MRI mode, though, spaces are
 948              not permitted between the label and the colon.  */
 949           if ((state == 2 || state == 11)
 950               && lex[ch] == LEX_IS_COLON
 951               && ! scrub_m68k_mri)
 952             {
 953               state = 1;
 954               PUT (ch);
 955               break;
 956             }
 957
 958           switch (state)
 959             {
 960             case 1:
 961               /* We can arrive here if we leave a leading whitespace
 962                  character at the beginning of a line.  */
 963               goto recycle;
 964             case 2:
 965               state = 3;
 966               if (to + 1 < toend)
 967                 {
 968                   /* Optimize common case by skipping UNGET/GET.  */
 969                   PUT (' ');    /* Sp after opco */
 970                   goto recycle;
 971                 }
 972               UNGET (ch);
 973               PUT (' ');
 974               break;
 975             case 3:
 976 #ifndef TC_KEEP_OPERAND_SPACES
 977               /* For TI C6X, we keep these spaces as they may separate
 978                  functional unit specifiers from operands.  */
 979               if (scrub_m68k_mri)
 980 #endif
 981                 {
 982                   /* In MRI mode, we keep these spaces.  */
 983                   UNGET (ch);
 984                   PUT (' ');
 985                   break;
 986                 }
 987               goto recycle;     /* Sp in operands */
 988             case 9:
 989             case 10:
 990 #ifndef TC_KEEP_OPERAND_SPACES
 991               if (scrub_m68k_mri)
 992 #endif
 993                 {
 994                   /* In MRI mode, we keep these spaces.  */
 995                   state = 3;
 996                   UNGET (ch);
 997                   PUT (' ');
 998                   break;
 999                 }
1000               state = 10;       /* Sp after symbol char */
1001               goto recycle;
1002             case 11:
1003               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
1004                 state = 1;
1005               else
1006                 {
1007                   /* We know that ch is not ':', since we tested that
1008                      case above.  Therefore this is not a label, so it
1009                      must be the opcode, and we've just seen the
1010                      whitespace after it.  */
1011                   state = 3;
1012                 }
1013               UNGET (ch);
1014               PUT (' ');        /* Sp after label definition.  */
1015               break;
1016             default:
1017               BAD_CASE (state);
1018             }
1019           break;
1020
1021         case LEX_IS_TWOCHAR_COMMENT_1ST:
1022           ch2 = GET ();
1023           if (ch2 == '*')
1024             {
1025               for (;;)
1026                 {
1027                   do
1028                     {
1029                       ch2 = GET ();
1030                       if (ch2 != EOF && IS_NEWLINE (ch2))
1031                         add_newlines++;
1032                     }
1033                   while (ch2 != EOF && ch2 != '*');
1034
1035                   while (ch2 == '*')
1036                     ch2 = GET ();
1037
1038                   if (ch2 == EOF || ch2 == '/')
1039                     break;
1040
1041                   /* This UNGET will ensure that we count newlines
1042                      correctly.  */
1043                   UNGET (ch2);
1044                 }
1045
1046               if (ch2 == EOF)
1047                 as_warn (_("end of file in multiline comment"));
1048
1049               ch = ' ';
1050               goto recycle;
1051             }
1052 #ifdef DOUBLESLASH_LINE_COMMENTS
1053           else if (ch2 == '/')
1054             {
1055               do
1056                 {
1057                   ch = GET ();
1058                 }
1059               while (ch != EOF && !IS_NEWLINE (ch));
1060               if (ch == EOF)
1061                 as_warn ("end of file in comment; newline inserted");
1062               state = 0;
1063               PUT ('\n');
1064               break;
1065             }
1066 #endif
1067           else
1068             {
1069               if (ch2 != EOF)
1070                 UNGET (ch2);
1071               if (state == 9 || state == 10)
1072                 state = 3;
1073               PUT (ch);
1074             }
1075           break;
1076
1077         case LEX_IS_STRINGQUOTE:
1078           quotechar = ch;
1079           if (state == 10)
1080             {
1081               /* Preserve the whitespace in foo "bar".  */
1082               UNGET (ch);
1083               state = 3;
1084               PUT (' ');
1085
1086               /* PUT didn't jump out.  We could just break, but we
1087                  know what will happen, so optimize a bit.  */
1088               ch = GET ();
1089               old_state = 3;
1090             }
1091           else if (state == 9)
1092             old_state = 3;
1093           else
1094             old_state = state;
1095           state = 5;
1096           PUT (ch);
1097           break;
1098
1099         case LEX_IS_ONECHAR_QUOTE:
1100 #ifdef H_TICK_HEX
1101           if (state == 9 && enable_h_tick_hex)
1102             {
1103               char c;
1104
1105               c = GET ();
1106               as_warn ("'%c found after symbol", c);
1107               UNGET (c);
1108             }
1109 #endif
1110           if (state == 10)
1111             {
1112               /* Preserve the whitespace in foo 'b'.  */
1113               UNGET (ch);
1114               state = 3;
1115               PUT (' ');
1116               break;
1117             }
1118           ch = GET ();
1119           if (ch == EOF)
1120             {
1121               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1122               ch = 0;
1123             }
1124           if (ch == '\\')
1125             {
1126               ch = GET ();
1127               if (ch == EOF)
1128                 {
1129                   as_warn (_("end of file in escape character"));
1130                   ch = '\\';
1131                 }
1132               else
1133                 ch = process_escape (ch);
1134             }
1135           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1136
1137           /* None of these 'x constants for us.  We want 'x'.  */
1138           if ((ch = GET ()) != '\'')
1139             {
1140 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1141               as_warn (_("missing close quote; (assumed)"));
1142 #else
1143               if (ch != EOF)
1144                 UNGET (ch);
1145 #endif
1146             }
1147           if (strlen (out_buf) == 1)
1148             {
1149               PUT (out_buf[0]);
1150               break;
1151             }
1152           if (state == 9)
1153             old_state = 3;
1154           else
1155             old_state = state;
1156           state = -1;
1157           out_string = out_buf;
1158           PUT (*out_string++);
1159           break;
1160
1161         case LEX_IS_COLON:
1162 #ifdef KEEP_WHITE_AROUND_COLON
1163           state = 9;
1164 #else
1165           if (state == 9 || state == 10)
1166             state = 3;
1167           else if (state != 3)
1168             state = 1;
1169 #endif
1170           PUT (ch);
1171           break;
1172
1173         case LEX_IS_NEWLINE:
1174           /* Roll out a bunch of newlines from inside comments, etc.  */
1175           if (add_newlines)
1176             {
1177               --add_newlines;
1178               UNGET (ch);
1179             }
1180           /* Fall through.  */
1181
1182         case LEX_IS_LINE_SEPARATOR:
1183           state = 0;
1184           PUT (ch);
1185           break;
1186
1187         case LEX_IS_PARALLEL_SEPARATOR:
1188           state = 1;
1189           PUT (ch);
1190           break;
1191
1192 #ifdef TC_V850
1193         case LEX_IS_DOUBLEDASH_1ST:
1194           ch2 = GET ();
1195           if (ch2 != '-')
1196             {
1197               if (ch2 != EOF)
1198                 UNGET (ch2);
1199               goto de_fault;
1200             }
1201           /* Read and skip to end of line.  */
1202           do
1203             {
1204               ch = GET ();
1205             }
1206           while (ch != EOF && ch != '\n');
1207
1208           if (ch == EOF)
1209             as_warn (_("end of file in comment; newline inserted"));
1210
1211           state = 0;
1212           PUT ('\n');
1213           break;
1214 #endif
1215 #ifdef DOUBLEBAR_PARALLEL
1216         case LEX_IS_DOUBLEBAR_1ST:
1217           ch2 = GET ();
1218           if (ch2 != EOF)
1219             UNGET (ch2);
1220           if (ch2 != '|')
1221             goto de_fault;
1222
1223           /* Handle '||' in two states as invoking PUT twice might
1224              result in the first one jumping out of this loop.  We'd
1225              then lose track of the state and one '|' char.  */
1226           state = 13;
1227           PUT ('|');
1228           break;
1229 #endif
1230         case LEX_IS_LINE_COMMENT_START:
1231           /* FIXME-someday: The two character comment stuff was badly
1232              thought out.  On i386, we want '/' as line comment start
1233              AND we want C style comments.  hence this hack.  The
1234              whole lexical process should be reworked.  xoxorich.  */
1235           if (ch == '/')
1236             {
1237               ch2 = GET ();
1238               if (ch2 == '*')
1239                 {
1240                   old_state = 3;
1241                   state = -2;
1242                   break;
1243                 }
1244               else if (ch2 != EOF)
1245                 {
1246                   UNGET (ch2);
1247                 }
1248             }
1249
1250           if (state == 0 || state == 1) /* Only comment at start of line.  */
1251             {
1252               int startch;
1253
1254               startch = ch;
1255
1256               do
1257                 {
1258                   ch = GET ();
1259                 }
1260               while (ch != EOF && IS_WHITESPACE (ch));
1261
1262               if (ch == EOF)
1263                 {
1264                   as_warn (_("end of file in comment; newline inserted"));
1265                   PUT ('\n');
1266                   break;
1267                 }
1268
1269               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1270                 {
1271                   /* Not a cpp line.  */
1272                   while (ch != EOF && !IS_NEWLINE (ch))
1273                     ch = GET ();
1274                   if (ch == EOF)
1275                     {
1276                       as_warn (_("end of file in comment; newline inserted"));
1277                       PUT ('\n');
1278                     }
1279                   else /* IS_NEWLINE (ch) */
1280                     {
1281                       /* To process non-zero add_newlines.  */
1282                       UNGET (ch);
1283                     }
1284                   state = 0;
1285                   break;
1286                 }
1287               /* Looks like `# 123 "filename"' from cpp.  */
1288               UNGET (ch);
1289               old_state = 4;
1290               state = -1;
1291               if (scrub_m68k_mri)
1292                 out_string = "\tlinefile ";
1293               else
1294                 out_string = "\t.linefile ";
1295               PUT (*out_string++);
1296               break;
1297             }
1298
1299 #ifdef TC_D10V
1300           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1301              Trap is the only short insn that has a first operand that is
1302              neither register nor label.
1303              We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1304              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1305              already LEX_IS_LINE_COMMENT_START.  However, it is the
1306              only character in line_comment_chars for d10v, hence we
1307              can recognize it as such.  */
1308           /* An alternative approach would be to reset the state to 1 when
1309              we see '||', '<-' or '->', but that seems to be overkill.  */
1310           if (state == 10)
1311             PUT (' ');
1312 #endif
1313           /* We have a line comment character which is not at the
1314              start of a line.  If this is also a normal comment
1315              character, fall through.  Otherwise treat it as a default
1316              character.  */
1317           if (strchr (tc_comment_chars, ch) == NULL
1318               && (! scrub_m68k_mri
1319                   || (ch != '!' && ch != '*')))
1320             goto de_fault;
1321           if (scrub_m68k_mri
1322               && (ch == '!' || ch == '*' || ch == '#')
1323               && state != 1
1324               && state != 10)
1325             goto de_fault;
1326           /* Fall through.  */
1327         case LEX_IS_COMMENT_START:
1328 #if defined TC_ARM && defined OBJ_ELF
1329           /* On the ARM, `@' is the comment character.
1330              Unfortunately this is also a special character in ELF .symver
1331              directives (and .type, though we deal with those another way).
1332              So we check if this line is such a directive, and treat
1333              the character as default if so.  This is a hack.  */
1334           if ((symver_state != NULL) && (*symver_state == 0))
1335             goto de_fault;
1336 #endif
1337
1338           /* Care is needed not to damage occurrences of \<comment-char>
1339              by stripping the <comment-char> onwards.  Yuck.  */
1340           if ((to > tostart ? to[-1] : last_char) == '\\')
1341             /* Do not treat the <comment-char> as a start-of-comment.  */
1342             goto de_fault;
1343
1344 #ifdef WARN_COMMENTS
1345           if (!found_comment)
1346             found_comment_file = as_where (&found_comment);
1347 #endif
1348           do
1349             {
1350               ch = GET ();
1351             }
1352           while (ch != EOF && !IS_NEWLINE (ch));
1353           if (ch == EOF)
1354             as_warn (_("end of file in comment; newline inserted"));
1355           state = 0;
1356           PUT ('\n');
1357           break;
1358
1359 #ifdef H_TICK_HEX
1360         case LEX_IS_H:
1361           /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1362              the H' with 0x to make them gas-style hex characters.  */
1363           if (enable_h_tick_hex)
1364             {
1365               char quot;
1366
1367               quot = GET ();
1368               if (quot == '\'')
1369                 {
1370                   UNGET ('x');
1371                   ch = '0';
1372                 }
1373               else
1374                 UNGET (quot);
1375             }
1376 #endif
1377           /* Fall through.  */
1378
1379         case LEX_IS_SYMBOL_COMPONENT:
1380           if (state == 10)
1381             {
1382               /* This is a symbol character following another symbol
1383                  character, with whitespace in between.  We skipped
1384                  the whitespace earlier, so output it now.  */
1385               UNGET (ch);
1386               state = 3;
1387               PUT (' ');
1388               break;
1389             }
1390
1391 #ifdef TC_Z80
1392           /* "af'" is a symbol containing '\''.  */
1393           if (state == 3 && (ch == 'a' || ch == 'A'))
1394             {
1395               state = 16;
1396               PUT (ch);
1397               ch = GET ();
1398               if (ch == 'f' || ch == 'F')
1399                 {
1400                   state = 17;
1401                   PUT (ch);
1402                   break;
1403                 }
1404               else
1405                 {
1406                   state = 9;
1407                   if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1408                     {
1409                       if (ch != EOF)
1410                         UNGET (ch);
1411                       break;
1412                     }
1413                 }
1414             }
1415 #endif
1416           if (state == 3)
1417             state = 9;
1418
1419           /* This is a common case.  Quickly copy CH and all the
1420              following symbol component or normal characters.  */
1421           if (to + 1 < toend
1422               && mri_state == NULL
1423 #if defined TC_ARM && defined OBJ_ELF
1424               && symver_state == NULL
1425 #endif
1426               )
1427             {
1428               char *s;
1429               ptrdiff_t len;
1430
1431               for (s = from; s < fromend; s++)
1432                 {
1433                   int type;
1434
1435                   ch2 = *(unsigned char *) s;
1436                   type = lex[ch2];
1437                   if (type != 0
1438                       && type != LEX_IS_SYMBOL_COMPONENT)
1439                     break;
1440                 }
1441
1442               if (s > from)
1443                 /* Handle the last character normally, for
1444                    simplicity.  */
1445                 --s;
1446
1447               len = s - from;
1448
1449               if (len > (toend - to) - 1)
1450                 len = (toend - to) - 1;
1451
1452               if (len > 0)
1453                 {
1454                   PUT (ch);
1455                   memcpy (to, from, len);
1456                   to += len;
1457                   from += len;
1458                   if (to >= toend)
1459                     goto tofull;
1460                   ch = GET ();
1461                 }
1462             }
1463
1464           /* Fall through.  */
1465         default:
1466         de_fault:
1467           /* Some relatively `normal' character.  */
1468           if (state == 0)
1469             {
1470               state = 11;       /* Now seeing label definition.  */
1471             }
1472           else if (state == 1)
1473             {
1474               state = 2;        /* Ditto.  */
1475             }
1476           else if (state == 9)
1477             {
1478               if (!IS_SYMBOL_COMPONENT (ch))
1479                 state = 3;
1480             }
1481           else if (state == 10)
1482             {
1483               if (ch == '\\')
1484                 {
1485                   /* Special handling for backslash: a backslash may
1486                      be the beginning of a formal parameter (of a
1487                      macro) following another symbol character, with
1488                      whitespace in between.  If that is the case, we
1489                      output a space before the parameter.  Strictly
1490                      speaking, correct handling depends upon what the
1491                      macro parameter expands into; if the parameter
1492                      expands into something which does not start with
1493                      an operand character, then we don't want to keep
1494                      the space.  We don't have enough information to
1495                      make the right choice, so here we are making the
1496                      choice which is more likely to be correct.  */
1497                   if (to + 1 >= toend)
1498                     {
1499                       /* If we're near the end of the buffer, save the
1500                          character for the next time round.  Otherwise
1501                          we'll lose our state.  */
1502                       UNGET (ch);
1503                       goto tofull;
1504                     }
1505                   *to++ = ' ';
1506                 }
1507
1508               state = 3;
1509             }
1510           PUT (ch);
1511           break;
1512         }
1513     }
1514
1515   /*NOTREACHED*/
1516
1517  fromeof:
1518   /* We have reached the end of the input.  */
1519   if (to > tostart)
1520     last_char = to[-1];
1521   return to - tostart;
1522
1523  tofull:
1524   /* The output buffer is full.  Save any input we have not yet
1525      processed.  */
1526   if (fromend > from)
1527     {
1528       saved_input = from;
1529       saved_input_len = fromend - from;
1530     }
1531   else
1532     saved_input = NULL;
1533
1534   if (to > tostart)
1535     last_char = to[-1];
1536   return to - tostart;
1537 }