]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blame - gas/app.c
Update year range in copyright notice of binutils files
[thirdparty/binutils-gdb.git] / gas / app.c
CommitLineData
252b5132 1/* This is the Assembler Pre-Processor
a2c58332 2 Copyright (C) 1987-2022 Free Software Foundation, Inc.
252b5132
RH
3
4 This file is part of GAS, the GNU Assembler.
5
6 GAS is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
ec2655a6 8 the Free Software Foundation; either version 3, or (at your option)
252b5132
RH
9 any later version.
10
ec2655a6
NC
11 GAS is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
14 License for more details.
252b5132
RH
15
16 You should have received a copy of the GNU General Public License
17 along with GAS; see the file COPYING. If not, write to the Free
4b4da160
NC
18 Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
19 02110-1301, USA. */
252b5132 20
204cd129 21/* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */
93e914b2
AO
22/* App, the assembler pre-processor. This pre-processor strips out
23 excess spaces, turns single-quoted characters into a decimal
24 constant, and turns the # in # <number> <filename> <garbage> into a
25 .linefile. This needs better error-handling. */
252b5132 26
ebd1c875 27#include "as.h"
252b5132
RH
28
#if (__STDC__ != 1)
#ifndef const
#define const /* empty */
#endif
#endif

#ifdef H_TICK_HEX
/* When nonzero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K
/* Whether we are scrubbing in m68k MRI mode.  This is different from
   flag_m68k_mri, because the two flags will be affected by the .mri
   pseudo-op at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  */
static const char mri_pseudo[] = ".mri 0";
#else
/* Without m68k support the scrubber is never in MRI mode.  */
#define scrub_m68k_mri 0
#endif

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which we need to special-case `@' characters.
   See the comment in do_scrub_chars.  */
static const char symver_pseudo[] = ".symver";
/* Progress through symver_pseudo while matching it in the input, or
   NULL when no match is in progress.  */
static const char * symver_state;
#endif

/* Part of the scrubber state saved and restored by app_push and
   app_pop.  */
static char last_char;

/* Lexical class of every possible input byte, filled in by
   do_scrub_begin.  Zero means that no class has been assigned.  */
static char lex[256];

/* Characters that are symbol components on every target.  */
static const char symbol_chars[] =
"$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* The lexical classes that may be stored in lex[].  */
#define LEX_IS_SYMBOL_COMPONENT 1
#define LEX_IS_WHITESPACE 2
#define LEX_IS_LINE_SEPARATOR 3
#define LEX_IS_COMMENT_START 4
#define LEX_IS_LINE_COMMENT_START 5
#define LEX_IS_TWOCHAR_COMMENT_1ST 6
#define LEX_IS_STRINGQUOTE 8
#define LEX_IS_COLON 9
#define LEX_IS_NEWLINE 10
#define LEX_IS_ONECHAR_QUOTE 11
#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST 12
#endif
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif
#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST 13
#endif
#define LEX_IS_PARALLEL_SEPARATOR 14
#ifdef H_TICK_HEX
#define LEX_IS_H 15
#endif

/* Class tests.  C is used directly as an index into lex[], so it must
   be a value in the range 0..255 (in particular, not EOF).  */
#define IS_SYMBOL_COMPONENT(c) (lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c) (lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c) (lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c) (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c) (lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c) (lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c) (lex[c] == LEX_IS_NEWLINE)

static int process_escape (int);
252b5132
RH
97
98/* FIXME-soon: The entire lexer/parser thingy should be
99 built statically at compile time rather than dynamically
3ee4defc 100 each and every time the assembler is run. xoxorich. */
252b5132 101
3ee4defc 102void
73ee5e4c 103do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
252b5132
RH
104{
105 const char *p;
106 int c;
107
252b5132
RH
108 lex[' '] = LEX_IS_WHITESPACE;
109 lex['\t'] = LEX_IS_WHITESPACE;
110 lex['\r'] = LEX_IS_WHITESPACE;
111 lex['\n'] = LEX_IS_NEWLINE;
252b5132
RH
112 lex[':'] = LEX_IS_COLON;
113
abd63a32
AM
114#ifdef TC_M68K
115 scrub_m68k_mri = m68k_mri;
116
252b5132 117 if (! m68k_mri)
abd63a32 118#endif
252b5132
RH
119 {
120 lex['"'] = LEX_IS_STRINGQUOTE;
121
6793974d 122#if ! defined (TC_HPPA)
252b5132
RH
123 lex['\''] = LEX_IS_ONECHAR_QUOTE;
124#endif
125
126#ifdef SINGLE_QUOTE_STRINGS
127 lex['\''] = LEX_IS_STRINGQUOTE;
128#endif
129 }
130
131 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
132 in state 5 of do_scrub_chars must be changed. */
133
134 /* Note that these override the previous defaults, e.g. if ';' is a
135 comment char, then it isn't a line separator. */
136 for (p = symbol_chars; *p; ++p)
204cd129 137 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
252b5132
RH
138
139 for (c = 128; c < 256; ++c)
140 lex[c] = LEX_IS_SYMBOL_COMPONENT;
141
142#ifdef tc_symbol_chars
143 /* This macro permits the processor to specify all characters which
144 may appears in an operand. This will prevent the scrubber from
145 discarding meaningful whitespace in certain cases. The i386
146 backend uses this to support prefixes, which can confuse the
147 scrubber as to whether it is parsing operands or opcodes. */
148 for (p = tc_symbol_chars; *p; ++p)
149 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
150#endif
151
152 /* The m68k backend wants to be able to change comment_chars. */
153#ifndef tc_comment_chars
154#define tc_comment_chars comment_chars
155#endif
156 for (p = tc_comment_chars; *p; p++)
204cd129 157 lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
252b5132
RH
158
159 for (p = line_comment_chars; *p; p++)
204cd129 160 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
252b5132 161
2e6976a8
DG
162#ifndef tc_line_separator_chars
163#define tc_line_separator_chars line_separator_chars
164#endif
165 for (p = tc_line_separator_chars; *p; p++)
204cd129 166 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
252b5132 167
62f65a7b
DB
168#ifdef tc_parallel_separator_chars
169 /* This macro permits the processor to specify all characters which
170 separate parallel insns on the same line. */
171 for (p = tc_parallel_separator_chars; *p; p++)
204cd129 172 lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
62f65a7b
DB
173#endif
174
252b5132
RH
175 /* Only allow slash-star comments if slash is not in use.
176 FIXME: This isn't right. We should always permit them. */
177 if (lex['/'] == 0)
204cd129 178 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
252b5132 179
abd63a32 180#ifdef TC_M68K
252b5132
RH
181 if (m68k_mri)
182 {
183 lex['\''] = LEX_IS_STRINGQUOTE;
184 lex[';'] = LEX_IS_COMMENT_START;
185 lex['*'] = LEX_IS_LINE_COMMENT_START;
186 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
b1ac4c66 187 then it can't be used in an expression. */
252b5132
RH
188 lex['!'] = LEX_IS_LINE_COMMENT_START;
189 }
abd63a32 190#endif
252b5132
RH
191
192#ifdef TC_V850
193 lex['-'] = LEX_IS_DOUBLEDASH_1ST;
194#endif
f28e8eb3 195#ifdef DOUBLEBAR_PARALLEL
252b5132
RH
196 lex['|'] = LEX_IS_DOUBLEBAR_1ST;
197#endif
198#ifdef TC_D30V
204cd129 199 /* Must do this is we want VLIW instruction with "->" or "<-". */
252b5132
RH
200 lex['-'] = LEX_IS_SYMBOL_COMPONENT;
201#endif
c54b5932
DD
202
203#ifdef H_TICK_HEX
204 if (enable_h_tick_hex)
205 {
206 lex['h'] = LEX_IS_H;
207 lex['H'] = LEX_IS_H;
208 }
209#endif
204cd129 210}
252b5132 211
/* Saved state of the scrubber.  These persist between calls to
   do_scrub_chars and are what app_push and app_pop save and restore.  */

/* Current and previous state of the do_scrub_chars state machine.  */
static int state;
static int old_state;
/* Text still to be emitted while in state -1.  */
static const char *out_string;
/* Holds the decimal text generated for a one-character quote.  */
static char out_buf[20];
/* Count of newlines still owed to the output (from comments and
   strings broken across lines).  */
static int add_newlines;
/* Unconsumed input left over from the previous call, or NULL, and its
   length in bytes.  */
static char *saved_input;
static size_t saved_input_len;
/* Buffer that the GET callback of do_scrub_chars fills with raw input.  */
static char input_buffer[32 * 1024];
/* Progress of the ".mri" pseudo-op recognizer, or NULL when idle, and
   the last character it matched.  */
static const char *mri_state;
static char mri_last_ch;
223
224/* Data structure for saving the state of app across #include's. Note that
225 app is called asynchronously to the parsing of the .include's, so our
226 state at the time .include is interpreted is completely unrelated.
227 That's why we have to save it all. */
228
204cd129
NC
229struct app_save
230{
30a2b4ef
KH
231 int state;
232 int old_state;
cd0bbe6e 233 const char * out_string;
30a2b4ef
KH
234 char out_buf[sizeof (out_buf)];
235 int add_newlines;
236 char * saved_input;
39a45edc 237 size_t saved_input_len;
abd63a32 238#ifdef TC_M68K
30a2b4ef 239 int scrub_m68k_mri;
abd63a32 240#endif
30a2b4ef
KH
241 const char * mri_state;
242 char mri_last_ch;
252b5132 243#if defined TC_ARM && defined OBJ_ELF
30a2b4ef 244 const char * symver_state;
252b5132 245#endif
750e4bf7 246 char last_char;
30a2b4ef 247};
252b5132
RH
248
249char *
73ee5e4c 250app_push (void)
252b5132 251{
ed9e98c2 252 struct app_save *saved;
252b5132 253
325801bd 254 saved = XNEW (struct app_save);
252b5132
RH
255 saved->state = state;
256 saved->old_state = old_state;
257 saved->out_string = out_string;
258 memcpy (saved->out_buf, out_buf, sizeof (out_buf));
259 saved->add_newlines = add_newlines;
2b47531b
ILT
260 if (saved_input == NULL)
261 saved->saved_input = NULL;
262 else
263 {
add39d23 264 saved->saved_input = XNEWVEC (char, saved_input_len);
2b47531b
ILT
265 memcpy (saved->saved_input, saved_input, saved_input_len);
266 saved->saved_input_len = saved_input_len;
267 }
abd63a32 268#ifdef TC_M68K
252b5132 269 saved->scrub_m68k_mri = scrub_m68k_mri;
abd63a32 270#endif
252b5132
RH
271 saved->mri_state = mri_state;
272 saved->mri_last_ch = mri_last_ch;
273#if defined TC_ARM && defined OBJ_ELF
274 saved->symver_state = symver_state;
275#endif
ab1fadc6 276 saved->last_char = last_char;
252b5132 277
3ee4defc 278 /* do_scrub_begin() is not useful, just wastes time. */
252b5132
RH
279
280 state = 0;
281 saved_input = NULL;
f8819316 282 add_newlines = 0;
252b5132
RH
283
284 return (char *) saved;
285}
286
3ee4defc 287void
73ee5e4c 288app_pop (char *arg)
252b5132 289{
ed9e98c2 290 struct app_save *saved = (struct app_save *) arg;
252b5132 291
3ee4defc 292 /* There is no do_scrub_end (). */
252b5132
RH
293 state = saved->state;
294 old_state = saved->old_state;
295 out_string = saved->out_string;
296 memcpy (out_buf, saved->out_buf, sizeof (out_buf));
297 add_newlines = saved->add_newlines;
2b47531b
ILT
298 if (saved->saved_input == NULL)
299 saved_input = NULL;
300 else
301 {
39a45edc 302 gas_assert (saved->saved_input_len <= sizeof (input_buffer));
2b47531b
ILT
303 memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
304 saved_input = input_buffer;
305 saved_input_len = saved->saved_input_len;
306 free (saved->saved_input);
307 }
abd63a32 308#ifdef TC_M68K
252b5132 309 scrub_m68k_mri = saved->scrub_m68k_mri;
abd63a32 310#endif
252b5132
RH
311 mri_state = saved->mri_state;
312 mri_last_ch = saved->mri_last_ch;
313#if defined TC_ARM && defined OBJ_ELF
314 symver_state = saved->symver_state;
315#endif
ab1fadc6 316 last_char = saved->last_char;
252b5132
RH
317
318 free (arg);
204cd129 319}
252b5132
RH
320
/* Translate CH, the character that followed a backslash in a
   one-character quote, into the character it stands for.  The letters
   b, f, n, r and t yield the corresponding control character; every
   other value, including the quote characters, is returned unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is not
   necessarily true.  */

static int
process_escape (int ch)
{
  switch (ch)
    {
    case 'b':
      return '\b';
    case 'f':
      return '\f';
    case 'n':
      return '\n';
    case 'r':
      return '\r';
    case 't':
      return '\t';
    case '\'':
      return '\'';
    case '"':
      return '\"';
    default:
      return ch;
    }
}
347
578c64a4
NC
/* Maximum number of multibyte character warnings to issue.  */
#define MULTIBYTE_WARN_COUNT_LIMIT 10
/* Number of multibyte character warnings issued so far.  */
static unsigned int multibyte_warn_count = 0;

/* Scan the bytes in [START, END) for any with the top bit set.

   If WARN is false, return true as soon as one is found and false if
   there are none.

   If WARN is true, issue a warning for each such byte, up to a total of
   MULTIBYTE_WARN_COUNT_LIMIT warnings across all calls, followed by a
   single notice that further warnings are suppressed.  Return true if
   at least one such byte was seen during this call; once the limit has
   been reached the buffer is no longer scanned and false is returned.  */

bool
scan_for_multibyte_characters (const unsigned char * start,
                               const unsigned char * end,
                               bool warn)
{
  if (end <= start)
    return false;

  /* Use >= here.  The counter stops at exactly the limit when the
     suppression notice is printed, so testing with > would let the next
     call through, push the counter past the limit and then warn about
     every remaining multibyte byte in that buffer.  */
  if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
    return false;

  bool found = false;

  while (start < end)
    {
      unsigned char c;

      if ((c = * start++) <= 0x7f)
        continue;

      if (!warn)
        return true;

      found = true;

      const char * filename;
      unsigned int lineno;

      filename = as_where (& lineno);
      if (filename == NULL)
        as_warn (_("multibyte character (%#x) encountered in input"), c);
      else if (lineno == 0)
        as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
      else
        as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);

      if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT)
        {
          as_warn (_("further multibyte character warnings suppressed"));
          break;
        }
    }

  return found;
}
396
252b5132
RH
397/* This function is called to process input characters. The GET
398 parameter is used to retrieve more input characters. GET should
399 set its parameter to point to a buffer, and return the length of
400 the buffer; it should return 0 at end of file. The scrubbed output
401 characters are put into the buffer starting at TOSTART; the TOSTART
402 buffer is TOLEN bytes in length. The function returns the number
403 of scrubbed characters put into TOSTART. This will be TOLEN unless
404 end of file was seen. This function is arranged as a state
405 machine, and saves its state so that it may return at any point.
406 This is the way the old code used to work. */
407
39a45edc
AM
408size_t
409do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
252b5132
RH
410{
411 char *to = tostart;
412 char *toend = tostart + tolen;
413 char *from;
414 char *fromend;
39a45edc 415 size_t fromlen;
ed9e98c2 416 int ch, ch2 = 0;
c9c5dcda
AM
417 /* Character that started the string we're working on. */
418 static char quotechar;
252b5132
RH
419
420 /*State 0: beginning of normal line
421 1: After first whitespace on line (flush more white)
422 2: After first non-white (opcode) on line (keep 1white)
423 3: after second white on line (into operands) (flush white)
93e914b2 424 4: after putting out a .linefile, put out digits
252b5132
RH
425 5: parsing a string, then go to old-state
426 6: putting out \ escape in a "d string.
93e914b2 427 7: no longer used
e9fc6c21 428 8: no longer used
252b5132
RH
429 9: After seeing symbol char in state 3 (keep 1white after symchar)
430 10: After seeing whitespace in state 9 (keep white before symchar)
431 11: After seeing a symbol character in state 0 (eg a label definition)
432 -1: output string in out_string and go to the state in old_state
433 -2: flush text until a '*' '/' is seen, then go to state old_state
434#ifdef TC_V850
b1ac4c66
AM
435 12: After seeing a dash, looking for a second dash as a start
436 of comment.
252b5132 437#endif
f28e8eb3 438#ifdef DOUBLEBAR_PARALLEL
b1ac4c66
AM
439 13: After seeing a vertical bar, looking for a second
440 vertical bar as a parallel expression separator.
52628315 441#endif
40b36596
JM
442#ifdef TC_PREDICATE_START_CHAR
443 14: After seeing a predicate start character at state 0, looking
444 for a predicate end character as predicate.
445 15: After seeing a predicate start character at state 1, looking
446 for a predicate end character as predicate.
3c9b82ba
NC
447#endif
448#ifdef TC_Z80
449 16: After seeing an 'a' or an 'A' at the start of a symbol
450 17: After seeing an 'f' or an 'F' in state 16
252b5132
RH
451#endif
452 */
453
454 /* I added states 9 and 10 because the MIPS ECOFF assembler uses
455 constructs like ``.loc 1 20''. This was turning into ``.loc
456 120''. States 9 and 10 ensure that a space is never dropped in
3b37fd66 457 between characters which could appear in an identifier. Ian
252b5132
RH
458 Taylor, ian@cygnus.com.
459
460 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
461 correctly on the PA (and any other target where colons are optional).
462 Jeff Law, law@cs.utah.edu.
463
464 I added state 13 so that something like "cmp r1, r2 || trap #1" does not
465 get squashed into "cmp r1,r2||trap#1", with the all important space
466 between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */
467
468 /* This macro gets the next input character. */
469
2b47531b
ILT
470#define GET() \
471 (from < fromend \
472 ? * (unsigned char *) (from++) \
473 : (saved_input = NULL, \
474 fromlen = (*get) (input_buffer, sizeof input_buffer), \
475 from = input_buffer, \
476 fromend = from + fromlen, \
477 (fromlen == 0 \
478 ? EOF \
252b5132
RH
479 : * (unsigned char *) (from++))))
480
481 /* This macro pushes a character back on the input stream. */
482
483#define UNGET(uch) (*--from = (uch))
484
485 /* This macro puts a character into the output buffer. If this
486 character fills the output buffer, this macro jumps to the label
487 TOFULL. We use this rather ugly approach because we need to
488 handle two different termination conditions: EOF on the input
489 stream, and a full output buffer. It would be simpler if we
490 always read in the entire input stream before processing it, but
491 I don't want to make such a significant change to the assembler's
492 memory usage. */
493
411863a4
KH
494#define PUT(pch) \
495 do \
496 { \
497 *to++ = (pch); \
498 if (to >= toend) \
499 goto tofull; \
500 } \
252b5132
RH
501 while (0)
502
503 if (saved_input != NULL)
504 {
505 from = saved_input;
506 fromend = from + saved_input_len;
507 }
508 else
509 {
2b47531b 510 fromlen = (*get) (input_buffer, sizeof input_buffer);
252b5132
RH
511 if (fromlen == 0)
512 return 0;
2b47531b 513 from = input_buffer;
252b5132 514 fromend = from + fromlen;
578c64a4
NC
515
516 if (multibyte_handling == multibyte_warn)
517 (void) scan_for_multibyte_characters ((const unsigned char *) from,
518 (const unsigned char* ) fromend,
519 true /* Generate warnings. */);
252b5132
RH
520 }
521
522 while (1)
523 {
524 /* The cases in this switch end with continue, in order to
b1ac4c66
AM
525 branch back to the top of this while loop and generate the
526 next output character in the appropriate state. */
252b5132
RH
527 switch (state)
528 {
529 case -1:
530 ch = *out_string++;
531 if (*out_string == '\0')
532 {
533 state = old_state;
534 old_state = 3;
535 }
536 PUT (ch);
537 continue;
538
539 case -2:
540 for (;;)
541 {
542 do
543 {
544 ch = GET ();
545
546 if (ch == EOF)
547 {
548 as_warn (_("end of file in comment"));
549 goto fromeof;
550 }
551
552 if (ch == '\n')
553 PUT ('\n');
554 }
555 while (ch != '*');
556
557 while ((ch = GET ()) == '*')
558 ;
559
560 if (ch == EOF)
561 {
562 as_warn (_("end of file in comment"));
563 goto fromeof;
564 }
565
566 if (ch == '/')
567 break;
568
569 UNGET (ch);
570 }
571
572 state = old_state;
573 UNGET (' ');
574 continue;
575
576 case 4:
577 ch = GET ();
578 if (ch == EOF)
579 goto fromeof;
580 else if (ch >= '0' && ch <= '9')
581 PUT (ch);
582 else
583 {
584 while (ch != EOF && IS_WHITESPACE (ch))
585 ch = GET ();
586 if (ch == '"')
587 {
93e914b2
AO
588 quotechar = ch;
589 state = 5;
e9fc6c21 590 old_state = 3;
4061927e 591 PUT (ch);
252b5132
RH
592 }
593 else
594 {
595 while (ch != EOF && ch != '\n')
596 ch = GET ();
597 state = 0;
598 PUT (ch);
599 }
600 }
601 continue;
602
603 case 5:
604 /* We are going to copy everything up to a quote character,
b1ac4c66
AM
605 with special handling for a backslash. We try to
606 optimize the copying in the simple case without using the
607 GET and PUT macros. */
252b5132
RH
608 {
609 char *s;
39a45edc 610 ptrdiff_t len;
252b5132
RH
611
612 for (s = from; s < fromend; s++)
613 {
614 ch = *s;
252b5132 615 if (ch == '\\'
c9c5dcda 616 || ch == quotechar
252b5132
RH
617 || ch == '\n')
618 break;
619 }
620 len = s - from;
621 if (len > toend - to)
622 len = toend - to;
623 if (len > 0)
624 {
625 memcpy (to, from, len);
626 to += len;
627 from += len;
df816087
AM
628 if (to >= toend)
629 goto tofull;
252b5132
RH
630 }
631 }
632
633 ch = GET ();
634 if (ch == EOF)
635 {
fc5910c0
NC
636 /* This buffer is here specifically so
637 that the UNGET below will work. */
638 static char one_char_buf[1];
639
c9c5dcda 640 as_warn (_("end of file in string; '%c' inserted"), quotechar);
252b5132 641 state = old_state;
fc5910c0
NC
642 from = fromend = one_char_buf + 1;
643 fromlen = 1;
252b5132 644 UNGET ('\n');
c9c5dcda 645 PUT (quotechar);
252b5132 646 }
c9c5dcda 647 else if (ch == quotechar)
252b5132
RH
648 {
649 state = old_state;
650 PUT (ch);
651 }
16d87673 652 else if (TC_STRING_ESCAPES && ch == '\\')
252b5132
RH
653 {
654 state = 6;
655 PUT (ch);
656 }
252b5132
RH
657 else if (scrub_m68k_mri && ch == '\n')
658 {
659 /* Just quietly terminate the string. This permits lines like
204cd129 660 bne label loop if we haven't reach end yet. */
252b5132
RH
661 state = old_state;
662 UNGET (ch);
663 PUT ('\'');
664 }
665 else
666 {
667 PUT (ch);
668 }
669 continue;
670
671 case 6:
672 state = 5;
673 ch = GET ();
674 switch (ch)
675 {
676 /* Handle strings broken across lines, by turning '\n' into
677 '\\' and 'n'. */
678 case '\n':
679 UNGET ('n');
680 add_newlines++;
681 PUT ('\\');
682 continue;
683
4252e537 684 case EOF:
c9c5dcda
AM
685 as_warn (_("end of file in string; '%c' inserted"), quotechar);
686 PUT (quotechar);
4252e537
AM
687 continue;
688
252b5132
RH
689 case '"':
690 case '\\':
691 case 'b':
692 case 'f':
693 case 'n':
694 case 'r':
695 case 't':
696 case 'v':
697 case 'x':
698 case 'X':
699 case '0':
700 case '1':
701 case '2':
702 case '3':
703 case '4':
704 case '5':
705 case '6':
706 case '7':
707 break;
4252e537 708
252b5132 709 default:
4252e537 710#ifdef ONLY_STANDARD_ESCAPES
0e389e77 711 as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
4252e537 712#endif
252b5132 713 break;
252b5132
RH
714 }
715 PUT (ch);
716 continue;
717
b1ac4c66
AM
718#ifdef DOUBLEBAR_PARALLEL
719 case 13:
720 ch = GET ();
721 if (ch != '|')
722 abort ();
723
724 /* Reset back to state 1 and pretend that we are parsing a
725 line from just after the first white space. */
726 state = 1;
727 PUT ('|');
40b36596
JM
728#ifdef TC_TIC6X
729 /* "||^" is used for SPMASKed instructions. */
730 ch = GET ();
731 if (ch == EOF)
732 goto fromeof;
733 else if (ch == '^')
734 PUT ('^');
735 else
736 UNGET (ch);
737#endif
b1ac4c66 738 continue;
3c9b82ba
NC
739#endif
740#ifdef TC_Z80
741 case 16:
742 /* We have seen an 'a' at the start of a symbol, look for an 'f'. */
743 ch = GET ();
34bca508 744 if (ch == 'f' || ch == 'F')
3c9b82ba
NC
745 {
746 state = 17;
747 PUT (ch);
748 }
749 else
750 {
751 state = 9;
752 break;
753 }
1a0670f3 754 /* Fall through. */
3c9b82ba
NC
755 case 17:
756 /* We have seen "af" at the start of a symbol,
757 a ' here is a part of that symbol. */
758 ch = GET ();
759 state = 9;
760 if (ch == '\'')
761 /* Change to avoid warning about unclosed string. */
762 PUT ('`');
0146fc9d 763 else if (ch != EOF)
3c9b82ba
NC
764 UNGET (ch);
765 break;
b1ac4c66 766#endif
252b5132
RH
767 }
768
204cd129 769 /* OK, we are somewhere in states 0 through 4 or 9 through 11. */
252b5132
RH
770
771 /* flushchar: */
772 ch = GET ();
773
40b36596
JM
774#ifdef TC_PREDICATE_START_CHAR
775 if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
52628315
L
776 {
777 state += 14;
778 PUT (ch);
779 continue;
780 }
781 else if (state == 14 || state == 15)
782 {
40b36596 783 if (ch == TC_PREDICATE_END_CHAR)
70b911ad
JJ
784 {
785 state -= 14;
786 PUT (ch);
787 ch = GET ();
788 }
52628315
L
789 else
790 {
791 PUT (ch);
792 continue;
793 }
794 }
795#endif
796
252b5132
RH
797 recycle:
798
799#if defined TC_ARM && defined OBJ_ELF
800 /* We need to watch out for .symver directives. See the comment later
801 in this function. */
802 if (symver_state == NULL)
803 {
804 if ((state == 0 || state == 1) && ch == symver_pseudo[0])
805 symver_state = symver_pseudo + 1;
806 }
807 else
808 {
809 /* We advance to the next state if we find the right
810 character. */
811 if (ch != '\0' && (*symver_state == ch))
812 ++symver_state;
813 else if (*symver_state != '\0')
814 /* We did not get the expected character, or we didn't
815 get a valid terminating character after seeing the
816 entire pseudo-op, so we must go back to the beginning. */
817 symver_state = NULL;
818 else
819 {
820 /* We've read the entire pseudo-op. If this is the end
821 of the line, go back to the beginning. */
822 if (IS_NEWLINE (ch))
823 symver_state = NULL;
824 }
825 }
826#endif /* TC_ARM && OBJ_ELF */
827
828#ifdef TC_M68K
829 /* We want to have pseudo-ops which control whether we are in
b1ac4c66
AM
830 MRI mode or not. Unfortunately, since m68k MRI mode affects
831 the scrubber, that means that we need a special purpose
832 recognizer here. */
252b5132
RH
833 if (mri_state == NULL)
834 {
835 if ((state == 0 || state == 1)
836 && ch == mri_pseudo[0])
837 mri_state = mri_pseudo + 1;
838 }
839 else
840 {
841 /* We advance to the next state if we find the right
842 character, or if we need a space character and we get any
843 whitespace character, or if we need a '0' and we get a
844 '1' (this is so that we only need one state to handle
845 ``.mri 0'' and ``.mri 1''). */
846 if (ch != '\0'
847 && (*mri_state == ch
848 || (*mri_state == ' '
849 && lex[ch] == LEX_IS_WHITESPACE)
850 || (*mri_state == '0'
851 && ch == '1')))
852 {
853 mri_last_ch = ch;
854 ++mri_state;
855 }
856 else if (*mri_state != '\0'
857 || (lex[ch] != LEX_IS_WHITESPACE
858 && lex[ch] != LEX_IS_NEWLINE))
859 {
860 /* We did not get the expected character, or we didn't
861 get a valid terminating character after seeing the
862 entire pseudo-op, so we must go back to the
863 beginning. */
864 mri_state = NULL;
865 }
866 else
867 {
868 /* We've read the entire pseudo-op. mri_last_ch is
b1ac4c66
AM
869 either '0' or '1' indicating whether to enter or
870 leave MRI mode. */
252b5132
RH
871 do_scrub_begin (mri_last_ch == '1');
872 mri_state = NULL;
873
874 /* We continue handling the character as usual. The
b1ac4c66
AM
875 main gas reader must also handle the .mri pseudo-op
876 to control expression parsing and the like. */
252b5132
RH
877 }
878 }
879#endif
880
881 if (ch == EOF)
882 {
883 if (state != 0)
884 {
885 as_warn (_("end of file not at end of a line; newline inserted"));
886 state = 0;
887 PUT ('\n');
888 }
889 goto fromeof;
890 }
891
892 switch (lex[ch])
893 {
894 case LEX_IS_WHITESPACE:
895 do
896 {
897 ch = GET ();
898 }
899 while (ch != EOF && IS_WHITESPACE (ch));
900 if (ch == EOF)
901 goto fromeof;
902
903 if (state == 0)
904 {
905 /* Preserve a single whitespace character at the
906 beginning of a line. */
907 state = 1;
908 UNGET (ch);
909 PUT (' ');
910 break;
911 }
912
f28e8eb3 913#ifdef KEEP_WHITE_AROUND_COLON
30a2b4ef
KH
914 if (lex[ch] == LEX_IS_COLON)
915 {
916 /* Only keep this white if there's no white *after* the
b1ac4c66 917 colon. */
30a2b4ef 918 ch2 = GET ();
83bd7402
NC
919 if (ch2 != EOF)
920 UNGET (ch2);
30a2b4ef
KH
921 if (!IS_WHITESPACE (ch2))
922 {
923 state = 9;
924 UNGET (ch);
925 PUT (' ');
926 break;
927 }
928 }
f28e8eb3 929#endif
252b5132 930 if (IS_COMMENT (ch)
62f65a7b
DB
931 || IS_LINE_SEPARATOR (ch)
932 || IS_PARALLEL_SEPARATOR (ch))
252b5132
RH
933 {
934 if (scrub_m68k_mri)
935 {
936 /* In MRI mode, we keep these spaces. */
937 UNGET (ch);
938 PUT (' ');
939 break;
940 }
941 goto recycle;
942 }
943
944 /* If we're in state 2 or 11, we've seen a non-white
945 character followed by whitespace. If the next character
946 is ':', this is whitespace after a label name which we
947 normally must ignore. In MRI mode, though, spaces are
948 not permitted between the label and the colon. */
949 if ((state == 2 || state == 11)
950 && lex[ch] == LEX_IS_COLON
951 && ! scrub_m68k_mri)
952 {
953 state = 1;
954 PUT (ch);
955 break;
956 }
957
958 switch (state)
959 {
252b5132
RH
960 case 1:
961 /* We can arrive here if we leave a leading whitespace
962 character at the beginning of a line. */
963 goto recycle;
964 case 2:
965 state = 3;
966 if (to + 1 < toend)
967 {
968 /* Optimize common case by skipping UNGET/GET. */
969 PUT (' '); /* Sp after opco */
970 goto recycle;
971 }
972 UNGET (ch);
973 PUT (' ');
974 break;
975 case 3:
40b36596
JM
976#ifndef TC_KEEP_OPERAND_SPACES
977 /* For TI C6X, we keep these spaces as they may separate
978 functional unit specifiers from operands. */
252b5132 979 if (scrub_m68k_mri)
40b36596 980#endif
252b5132
RH
981 {
982 /* In MRI mode, we keep these spaces. */
983 UNGET (ch);
984 PUT (' ');
985 break;
986 }
987 goto recycle; /* Sp in operands */
988 case 9:
989 case 10:
40b36596 990#ifndef TC_KEEP_OPERAND_SPACES
252b5132 991 if (scrub_m68k_mri)
40b36596 992#endif
252b5132
RH
993 {
994 /* In MRI mode, we keep these spaces. */
995 state = 3;
996 UNGET (ch);
997 PUT (' ');
998 break;
999 }
1000 state = 10; /* Sp after symbol char */
1001 goto recycle;
1002 case 11:
abd63a32 1003 if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
252b5132
RH
1004 state = 1;
1005 else
1006 {
1007 /* We know that ch is not ':', since we tested that
b1ac4c66
AM
1008 case above. Therefore this is not a label, so it
1009 must be the opcode, and we've just seen the
1010 whitespace after it. */
252b5132
RH
1011 state = 3;
1012 }
1013 UNGET (ch);
1014 PUT (' '); /* Sp after label definition. */
1015 break;
1016 default:
1017 BAD_CASE (state);
1018 }
1019 break;
1020
1021 case LEX_IS_TWOCHAR_COMMENT_1ST:
1022 ch2 = GET ();
1023 if (ch2 == '*')
1024 {
1025 for (;;)
1026 {
1027 do
1028 {
1029 ch2 = GET ();
1030 if (ch2 != EOF && IS_NEWLINE (ch2))
1031 add_newlines++;
1032 }
1033 while (ch2 != EOF && ch2 != '*');
1034
1035 while (ch2 == '*')
1036 ch2 = GET ();
1037
1038 if (ch2 == EOF || ch2 == '/')
1039 break;
1040
1041 /* This UNGET will ensure that we count newlines
b1ac4c66 1042 correctly. */
252b5132
RH
1043 UNGET (ch2);
1044 }
1045
1046 if (ch2 == EOF)
1047 as_warn (_("end of file in multiline comment"));
1048
1049 ch = ' ';
1050 goto recycle;
1051 }
800eeca4
JW
1052#ifdef DOUBLESLASH_LINE_COMMENTS
1053 else if (ch2 == '/')
1054 {
1055 do
1056 {
1057 ch = GET ();
1058 }
1059 while (ch != EOF && !IS_NEWLINE (ch));
1060 if (ch == EOF)
1061 as_warn ("end of file in comment; newline inserted");
1062 state = 0;
1063 PUT ('\n');
1064 break;
1065 }
1066#endif
252b5132
RH
1067 else
1068 {
1069 if (ch2 != EOF)
1070 UNGET (ch2);
1071 if (state == 9 || state == 10)
1072 state = 3;
1073 PUT (ch);
1074 }
1075 break;
1076
1077 case LEX_IS_STRINGQUOTE:
c9c5dcda 1078 quotechar = ch;
252b5132
RH
1079 if (state == 10)
1080 {
204cd129 1081 /* Preserve the whitespace in foo "bar". */
252b5132
RH
1082 UNGET (ch);
1083 state = 3;
1084 PUT (' ');
1085
1086 /* PUT didn't jump out. We could just break, but we
b1ac4c66 1087 know what will happen, so optimize a bit. */
252b5132
RH
1088 ch = GET ();
1089 old_state = 3;
1090 }
1091 else if (state == 9)
1092 old_state = 3;
1093 else
1094 old_state = state;
1095 state = 5;
1096 PUT (ch);
1097 break;
1098
252b5132 1099 case LEX_IS_ONECHAR_QUOTE:
c0a139c7
NC
1100#ifdef H_TICK_HEX
1101 if (state == 9 && enable_h_tick_hex)
c54b5932
DD
1102 {
1103 char c;
1104
1105 c = GET ();
1106 as_warn ("'%c found after symbol", c);
1107 UNGET (c);
1108 }
c0a139c7 1109#endif
252b5132
RH
1110 if (state == 10)
1111 {
204cd129 1112 /* Preserve the whitespace in foo 'b'. */
252b5132
RH
1113 UNGET (ch);
1114 state = 3;
1115 PUT (' ');
1116 break;
1117 }
1118 ch = GET ();
1119 if (ch == EOF)
1120 {
1121 as_warn (_("end of file after a one-character quote; \\0 inserted"));
1122 ch = 0;
1123 }
1124 if (ch == '\\')
1125 {
1126 ch = GET ();
1127 if (ch == EOF)
1128 {
1129 as_warn (_("end of file in escape character"));
1130 ch = '\\';
1131 }
1132 else
1133 ch = process_escape (ch);
1134 }
1135 sprintf (out_buf, "%d", (int) (unsigned char) ch);
1136
1137 /* None of these 'x constants for us. We want 'x'. */
1138 if ((ch = GET ()) != '\'')
1139 {
1140#ifdef REQUIRE_CHAR_CLOSE_QUOTE
0e389e77 1141 as_warn (_("missing close quote; (assumed)"));
252b5132
RH
1142#else
1143 if (ch != EOF)
1144 UNGET (ch);
1145#endif
1146 }
1147 if (strlen (out_buf) == 1)
1148 {
1149 PUT (out_buf[0]);
1150 break;
1151 }
1152 if (state == 9)
1153 old_state = 3;
1154 else
1155 old_state = state;
1156 state = -1;
1157 out_string = out_buf;
1158 PUT (*out_string++);
1159 break;
252b5132
RH
1160
1161 case LEX_IS_COLON:
f28e8eb3 1162#ifdef KEEP_WHITE_AROUND_COLON
30a2b4ef 1163 state = 9;
f28e8eb3 1164#else
252b5132
RH
1165 if (state == 9 || state == 10)
1166 state = 3;
1167 else if (state != 3)
1168 state = 1;
f28e8eb3 1169#endif
252b5132
RH
1170 PUT (ch);
1171 break;
1172
1173 case LEX_IS_NEWLINE:
1174 /* Roll out a bunch of newlines from inside comments, etc. */
1175 if (add_newlines)
1176 {
1177 --add_newlines;
1178 UNGET (ch);
1179 }
3ee4defc 1180 /* Fall through. */
252b5132
RH
1181
1182 case LEX_IS_LINE_SEPARATOR:
1183 state = 0;
1184 PUT (ch);
1185 break;
1186
62f65a7b
DB
1187 case LEX_IS_PARALLEL_SEPARATOR:
1188 state = 1;
1189 PUT (ch);
1190 break;
1191
252b5132
RH
1192#ifdef TC_V850
1193 case LEX_IS_DOUBLEDASH_1ST:
30a2b4ef 1194 ch2 = GET ();
252b5132
RH
1195 if (ch2 != '-')
1196 {
0146fc9d
NC
1197 if (ch2 != EOF)
1198 UNGET (ch2);
252b5132
RH
1199 goto de_fault;
1200 }
3ee4defc 1201 /* Read and skip to end of line. */
252b5132
RH
1202 do
1203 {
1204 ch = GET ();
1205 }
1206 while (ch != EOF && ch != '\n');
204cd129 1207
252b5132 1208 if (ch == EOF)
204cd129
NC
1209 as_warn (_("end of file in comment; newline inserted"));
1210
252b5132
RH
1211 state = 0;
1212 PUT ('\n');
1213 break;
3ee4defc 1214#endif
f28e8eb3 1215#ifdef DOUBLEBAR_PARALLEL
252b5132 1216 case LEX_IS_DOUBLEBAR_1ST:
30a2b4ef 1217 ch2 = GET ();
83bd7402
NC
1218 if (ch2 != EOF)
1219 UNGET (ch2);
252b5132 1220 if (ch2 != '|')
204cd129
NC
1221 goto de_fault;
1222
b1ac4c66
AM
1223 /* Handle '||' in two states as invoking PUT twice might
1224 result in the first one jumping out of this loop. We'd
1225 then lose track of the state and one '|' char. */
1226 state = 13;
252b5132
RH
1227 PUT ('|');
1228 break;
3ee4defc 1229#endif
252b5132
RH
1230 case LEX_IS_LINE_COMMENT_START:
1231 /* FIXME-someday: The two character comment stuff was badly
1232 thought out. On i386, we want '/' as line comment start
1233 AND we want C style comments. hence this hack. The
1234 whole lexical process should be reworked. xoxorich. */
1235 if (ch == '/')
1236 {
1237 ch2 = GET ();
1238 if (ch2 == '*')
1239 {
1240 old_state = 3;
1241 state = -2;
1242 break;
1243 }
69ace220 1244 else if (ch2 != EOF)
252b5132
RH
1245 {
1246 UNGET (ch2);
1247 }
204cd129 1248 }
252b5132
RH
1249
1250 if (state == 0 || state == 1) /* Only comment at start of line. */
1251 {
1252 int startch;
1253
1254 startch = ch;
1255
1256 do
1257 {
1258 ch = GET ();
1259 }
1260 while (ch != EOF && IS_WHITESPACE (ch));
204cd129 1261
252b5132
RH
1262 if (ch == EOF)
1263 {
1264 as_warn (_("end of file in comment; newline inserted"));
1265 PUT ('\n');
1266 break;
1267 }
204cd129 1268
252b5132
RH
1269 if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1270 {
1271 /* Not a cpp line. */
1272 while (ch != EOF && !IS_NEWLINE (ch))
1273 ch = GET ();
1274 if (ch == EOF)
cf3f45fa
AM
1275 {
1276 as_warn (_("end of file in comment; newline inserted"));
1277 PUT ('\n');
1278 }
1279 else /* IS_NEWLINE (ch) */
1280 {
1281 /* To process non-zero add_newlines. */
1282 UNGET (ch);
1283 }
252b5132 1284 state = 0;
252b5132
RH
1285 break;
1286 }
3ee4defc 1287 /* Looks like `# 123 "filename"' from cpp. */
252b5132
RH
1288 UNGET (ch);
1289 old_state = 4;
1290 state = -1;
1291 if (scrub_m68k_mri)
93e914b2 1292 out_string = "\tlinefile ";
252b5132 1293 else
93e914b2 1294 out_string = "\t.linefile ";
252b5132
RH
1295 PUT (*out_string++);
1296 break;
1297 }
1298
1299#ifdef TC_D10V
1300 /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1301 Trap is the only short insn that has a first operand that is
1302 neither register nor label.
1303 We must prevent exef0f ||trap #1 from degenerating into exef0f ||trap#1.
30a2b4ef
KH
1304 We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1305 already LEX_IS_LINE_COMMENT_START. However, it is the
1306 only character in line_comment_chars for d10v, hence we
1307 can recognize it as such. */
252b5132
RH
1308 /* An alternative approach would be to reset the state to 1 when
1309 we see '||', '<-' or '->', but that seems to be overkill. */
30a2b4ef
KH
1310 if (state == 10)
1311 PUT (' ');
252b5132
RH
1312#endif
1313 /* We have a line comment character which is not at the
1314 start of a line. If this is also a normal comment
1315 character, fall through. Otherwise treat it as a default
1316 character. */
1317 if (strchr (tc_comment_chars, ch) == NULL
1318 && (! scrub_m68k_mri
1319 || (ch != '!' && ch != '*')))
1320 goto de_fault;
1321 if (scrub_m68k_mri
1322 && (ch == '!' || ch == '*' || ch == '#')
1323 && state != 1
1324 && state != 10)
1325 goto de_fault;
1326 /* Fall through. */
1327 case LEX_IS_COMMENT_START:
1328#if defined TC_ARM && defined OBJ_ELF
1329 /* On the ARM, `@' is the comment character.
1330 Unfortunately this is also a special character in ELF .symver
30a2b4ef
KH
1331 directives (and .type, though we deal with those another way).
1332 So we check if this line is such a directive, and treat
1333 the character as default if so. This is a hack. */
252b5132
RH
1334 if ((symver_state != NULL) && (*symver_state == 0))
1335 goto de_fault;
4c400d5e 1336#endif
2a676888 1337
750e4bf7
JB
1338 /* Care is needed not to damage occurrences of \<comment-char>
1339 by stripping the <comment-char> onwards. Yuck. */
ab1fadc6 1340 if ((to > tostart ? to[-1] : last_char) == '\\')
750e4bf7 1341 /* Do not treat the <comment-char> as a start-of-comment. */
2a676888 1342 goto de_fault;
2a676888 1343
4c400d5e
AM
1344#ifdef WARN_COMMENTS
1345 if (!found_comment)
3b4dbbbf 1346 found_comment_file = as_where (&found_comment);
252b5132
RH
1347#endif
1348 do
1349 {
1350 ch = GET ();
1351 }
1352 while (ch != EOF && !IS_NEWLINE (ch));
1353 if (ch == EOF)
1354 as_warn (_("end of file in comment; newline inserted"));
1355 state = 0;
1356 PUT ('\n');
1357 break;
1358
c54b5932
DD
1359#ifdef H_TICK_HEX
1360 case LEX_IS_H:
1361 /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1362 the H' with 0x to make them gas-style hex characters. */
1363 if (enable_h_tick_hex)
1364 {
1365 char quot;
1366
1367 quot = GET ();
1368 if (quot == '\'')
1369 {
1370 UNGET ('x');
1371 ch = '0';
1372 }
1373 else
1374 UNGET (quot);
1375 }
c54b5932 1376#endif
fcddde94 1377 /* Fall through. */
c54b5932 1378
252b5132
RH
1379 case LEX_IS_SYMBOL_COMPONENT:
1380 if (state == 10)
1381 {
1382 /* This is a symbol character following another symbol
1383 character, with whitespace in between. We skipped
1384 the whitespace earlier, so output it now. */
1385 UNGET (ch);
1386 state = 3;
1387 PUT (' ');
1388 break;
1389 }
1390
3c9b82ba
NC
1391#ifdef TC_Z80
1392 /* "af'" is a symbol containing '\''. */
34bca508 1393 if (state == 3 && (ch == 'a' || ch == 'A'))
3c9b82ba
NC
1394 {
1395 state = 16;
1396 PUT (ch);
1397 ch = GET ();
34bca508 1398 if (ch == 'f' || ch == 'F')
3c9b82ba
NC
1399 {
1400 state = 17;
1401 PUT (ch);
1402 break;
1403 }
1404 else
1405 {
1406 state = 9;
536695d0 1407 if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
3c9b82ba 1408 {
0146fc9d
NC
1409 if (ch != EOF)
1410 UNGET (ch);
3c9b82ba
NC
1411 break;
1412 }
1413 }
1414 }
1415#endif
252b5132
RH
1416 if (state == 3)
1417 state = 9;
1418
1419 /* This is a common case. Quickly copy CH and all the
b1ac4c66 1420 following symbol component or normal characters. */
252b5132
RH
1421 if (to + 1 < toend
1422 && mri_state == NULL
1423#if defined TC_ARM && defined OBJ_ELF
1424 && symver_state == NULL
1425#endif
1426 )
1427 {
1428 char *s;
39a45edc 1429 ptrdiff_t len;
252b5132
RH
1430
1431 for (s = from; s < fromend; s++)
1432 {
1433 int type;
1434
30a2b4ef 1435 ch2 = *(unsigned char *) s;
252b5132
RH
1436 type = lex[ch2];
1437 if (type != 0
1438 && type != LEX_IS_SYMBOL_COMPONENT)
1439 break;
1440 }
204cd129 1441
252b5132 1442 if (s > from)
204cd129
NC
1443 /* Handle the last character normally, for
1444 simplicity. */
1445 --s;
1446
252b5132 1447 len = s - from;
204cd129 1448
252b5132
RH
1449 if (len > (toend - to) - 1)
1450 len = (toend - to) - 1;
204cd129 1451
252b5132
RH
1452 if (len > 0)
1453 {
1454 PUT (ch);
518051dc
BE
1455 memcpy (to, from, len);
1456 to += len;
1457 from += len;
37b75c0c
AM
1458 if (to >= toend)
1459 goto tofull;
252b5132
RH
1460 ch = GET ();
1461 }
1462 }
1463
1464 /* Fall through. */
1465 default:
1466 de_fault:
1467 /* Some relatively `normal' character. */
1468 if (state == 0)
1469 {
9a124774 1470 state = 11; /* Now seeing label definition. */
252b5132
RH
1471 }
1472 else if (state == 1)
1473 {
9a124774 1474 state = 2; /* Ditto. */
252b5132
RH
1475 }
1476 else if (state == 9)
1477 {
2cdb18a7 1478 if (!IS_SYMBOL_COMPONENT (ch))
252b5132
RH
1479 state = 3;
1480 }
1481 else if (state == 10)
1482 {
c5c834aa
AH
1483 if (ch == '\\')
1484 {
1485 /* Special handling for backslash: a backslash may
1486 be the beginning of a formal parameter (of a
1487 macro) following another symbol character, with
1488 whitespace in between. If that is the case, we
1489 output a space before the parameter. Strictly
1490 speaking, correct handling depends upon what the
1491 macro parameter expands into; if the parameter
1492 expands into something which does not start with
1493 an operand character, then we don't want to keep
1494 the space. We don't have enough information to
1495 make the right choice, so here we are making the
1496 choice which is more likely to be correct. */
1740b7b1
NS
1497 if (to + 1 >= toend)
1498 {
1499 /* If we're near the end of the buffer, save the
1500 character for the next time round. Otherwise
1501 we'll lose our state. */
1502 UNGET (ch);
1503 goto tofull;
1504 }
1505 *to++ = ' ';
c5c834aa
AH
1506 }
1507
252b5132
RH
1508 state = 3;
1509 }
1510 PUT (ch);
1511 break;
1512 }
1513 }
1514
1515 /*NOTREACHED*/
1516
1517 fromeof:
1518 /* We have reached the end of the input. */
ab1fadc6
AM
1519 if (to > tostart)
1520 last_char = to[-1];
252b5132
RH
1521 return to - tostart;
1522
1523 tofull:
1524 /* The output buffer is full. Save any input we have not yet
1525 processed. */
1526 if (fromend > from)
1527 {
2b47531b 1528 saved_input = from;
252b5132
RH
1529 saved_input_len = fromend - from;
1530 }
1531 else
2b47531b
ILT
1532 saved_input = NULL;
1533
ab1fadc6
AM
1534 if (to > tostart)
1535 last_char = to[-1];
252b5132
RH
1536 return to - tostart;
1537}