]> git.ipfire.org Git - thirdparty/binutils-gdb.git/blob - gas/app.c
Add multibyte character warning option to the assembler.
[thirdparty/binutils-gdb.git] / gas / app.c
1 /* This is the Assembler Pre-Processor
2 Copyright (C) 1987-2021 Free Software Foundation, Inc.
3
4 This file is part of GAS, the GNU Assembler.
5
6 GAS is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GAS is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
14 License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GAS; see the file COPYING. If not, write to the Free
18 Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
19 02110-1301, USA. */
20
21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */
22 /* App, the assembler pre-processor. This pre-processor strips out
23 excess spaces, turns single-quoted characters into a decimal
24 constant, and turns the # in # <number> <filename> <garbage> into a
25 .linefile. This needs better error-handling. */
26
27 #include "as.h"
28
/* Compilers that are not ISO C conforming may lack the `const' keyword;
   make it expand to nothing for them unless it is already a macro.  */
#if (__STDC__ != 1) && !defined (const)
#define const /* empty */
#endif

#ifdef H_TICK_HEX
/* When non-zero, do_scrub_begin gives 'h' and 'H' the LEX_IS_H class.  */
int enable_h_tick_hex = 0;
#endif

#ifdef TC_M68K

/* Whether we are scrubbing in m68k MRI mode.  This is kept separately
   from flag_m68k_mri because the .mri pseudo-op affects the two flags
   at different times.  */
static int scrub_m68k_mri;

/* The pseudo-op which switches in and out of MRI mode.  See the
   comment in do_scrub_chars.  */
static const char mri_pseudo[] = ".mri 0";

#else /* ! TC_M68K */

/* Without m68k support the scrubber is never in MRI mode.  */
#define scrub_m68k_mri 0

#endif /* TC_M68K */

#if defined TC_ARM && defined OBJ_ELF
/* The pseudo-op for which `@' characters need special treatment.
   See the comment in do_scrub_chars.  */
static const char symver_pseudo[] = ".symver";

/* How far into symver_pseudo the current line has matched; NULL while
   no match is in progress.  */
static const char *symver_state;
#endif
58
/* Part of the scrubber state that app_push and app_pop preserve.  */
static char last_char;

/* Lexical class of each possible input byte, filled in by
   do_scrub_begin.  Zero means the byte has no special class.  */
static char lex[256];

/* Characters that do_scrub_begin always classifies as symbol
   components.  */
static const char symbol_chars[] =
  "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* The classes that may be stored in LEX.  */
#define LEX_IS_SYMBOL_COMPONENT		1
#define LEX_IS_WHITESPACE		2
#define LEX_IS_LINE_SEPARATOR		3
#define LEX_IS_COMMENT_START		4
#define LEX_IS_LINE_COMMENT_START	5
#define LEX_IS_TWOCHAR_COMMENT_1ST	6
#define LEX_IS_STRINGQUOTE		8
#define LEX_IS_COLON			9
#define LEX_IS_NEWLINE			10
#define LEX_IS_ONECHAR_QUOTE		11

#ifdef TC_V850
#define LEX_IS_DOUBLEDASH_1ST		12
#endif

/* The M32R always uses `||' as a parallel separator.  */
#ifdef TC_M32R
#define DOUBLEBAR_PARALLEL
#endif

#ifdef DOUBLEBAR_PARALLEL
#define LEX_IS_DOUBLEBAR_1ST		13
#endif

#define LEX_IS_PARALLEL_SEPARATOR	14

#ifdef H_TICK_HEX
#define LEX_IS_H			15
#endif

/* Class predicates.  C indexes LEX directly.  */
#define IS_SYMBOL_COMPONENT(c)		(lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_WHITESPACE(c)		(lex[c] == LEX_IS_WHITESPACE)
#define IS_LINE_SEPARATOR(c)		(lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c)	(lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)			(lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)		(lex[c] == LEX_IS_LINE_COMMENT_START)
#define IS_NEWLINE(c)			(lex[c] == LEX_IS_NEWLINE)

static int process_escape (int);
97
98 /* FIXME-soon: The entire lexer/parser thingy should be
99 built statically at compile time rather than dynamically
100 each and every time the assembler is run. xoxorich. */
101
102 void
103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
104 {
105 const char *p;
106 int c;
107
108 lex[' '] = LEX_IS_WHITESPACE;
109 lex['\t'] = LEX_IS_WHITESPACE;
110 lex['\r'] = LEX_IS_WHITESPACE;
111 lex['\n'] = LEX_IS_NEWLINE;
112 lex[':'] = LEX_IS_COLON;
113
114 #ifdef TC_M68K
115 scrub_m68k_mri = m68k_mri;
116
117 if (! m68k_mri)
118 #endif
119 {
120 lex['"'] = LEX_IS_STRINGQUOTE;
121
122 #if ! defined (TC_HPPA)
123 lex['\''] = LEX_IS_ONECHAR_QUOTE;
124 #endif
125
126 #ifdef SINGLE_QUOTE_STRINGS
127 lex['\''] = LEX_IS_STRINGQUOTE;
128 #endif
129 }
130
131 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
132 in state 5 of do_scrub_chars must be changed. */
133
134 /* Note that these override the previous defaults, e.g. if ';' is a
135 comment char, then it isn't a line separator. */
136 for (p = symbol_chars; *p; ++p)
137 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
138
139 for (c = 128; c < 256; ++c)
140 lex[c] = LEX_IS_SYMBOL_COMPONENT;
141
142 #ifdef tc_symbol_chars
143 /* This macro permits the processor to specify all characters which
144 may appears in an operand. This will prevent the scrubber from
145 discarding meaningful whitespace in certain cases. The i386
146 backend uses this to support prefixes, which can confuse the
147 scrubber as to whether it is parsing operands or opcodes. */
148 for (p = tc_symbol_chars; *p; ++p)
149 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
150 #endif
151
152 /* The m68k backend wants to be able to change comment_chars. */
153 #ifndef tc_comment_chars
154 #define tc_comment_chars comment_chars
155 #endif
156 for (p = tc_comment_chars; *p; p++)
157 lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
158
159 for (p = line_comment_chars; *p; p++)
160 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
161
162 #ifndef tc_line_separator_chars
163 #define tc_line_separator_chars line_separator_chars
164 #endif
165 for (p = tc_line_separator_chars; *p; p++)
166 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
167
168 #ifdef tc_parallel_separator_chars
169 /* This macro permits the processor to specify all characters which
170 separate parallel insns on the same line. */
171 for (p = tc_parallel_separator_chars; *p; p++)
172 lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
173 #endif
174
175 /* Only allow slash-star comments if slash is not in use.
176 FIXME: This isn't right. We should always permit them. */
177 if (lex['/'] == 0)
178 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
179
180 #ifdef TC_M68K
181 if (m68k_mri)
182 {
183 lex['\''] = LEX_IS_STRINGQUOTE;
184 lex[';'] = LEX_IS_COMMENT_START;
185 lex['*'] = LEX_IS_LINE_COMMENT_START;
186 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
187 then it can't be used in an expression. */
188 lex['!'] = LEX_IS_LINE_COMMENT_START;
189 }
190 #endif
191
192 #ifdef TC_V850
193 lex['-'] = LEX_IS_DOUBLEDASH_1ST;
194 #endif
195 #ifdef DOUBLEBAR_PARALLEL
196 lex['|'] = LEX_IS_DOUBLEBAR_1ST;
197 #endif
198 #ifdef TC_D30V
199 /* Must do this is we want VLIW instruction with "->" or "<-". */
200 lex['-'] = LEX_IS_SYMBOL_COMPONENT;
201 #endif
202
203 #ifdef H_TICK_HEX
204 if (enable_h_tick_hex)
205 {
206 lex['h'] = LEX_IS_H;
207 lex['H'] = LEX_IS_H;
208 }
209 #endif
210 }
211
/* Saved state of the scrubber.  do_scrub_chars can return at any point,
   so everything it needs in order to resume lives here.  */

/* Current state of the do_scrub_chars state machine.  */
static int state;

/* State to go back to once a temporary state (string, comment, pending
   output string) has finished.  */
static int old_state;

/* Next character to emit while in state -1.  */
static const char *out_string;

/* Holds the decimal text generated for a one-character quote.  */
static char out_buf[20];

/* Number of newlines that were swallowed and are still owed to the
   output.  */
static int add_newlines;

/* When SAVED_INPUT is not NULL, do_scrub_chars resumes reading the
   SAVED_INPUT_LEN bytes found there instead of fetching fresh input.  */
static char *saved_input;
static size_t saved_input_len;

/* Buffer that the input callback of do_scrub_chars fills.  */
static char input_buffer[32 * 1024];

/* Progress of the m68k .mri pseudo-op recognizer: the next expected
   character, or NULL when no match is in progress.  */
static const char *mri_state;

/* The last character accepted by the .mri recognizer.  */
static char mri_last_ch;
223
224 /* Data structure for saving the state of app across #include's. Note that
225 app is called asynchronously to the parsing of the .include's, so our
226 state at the time .include is interpreted is completely unrelated.
227 That's why we have to save it all. */
228
229 struct app_save
230 {
231 int state;
232 int old_state;
233 const char * out_string;
234 char out_buf[sizeof (out_buf)];
235 int add_newlines;
236 char * saved_input;
237 size_t saved_input_len;
238 #ifdef TC_M68K
239 int scrub_m68k_mri;
240 #endif
241 const char * mri_state;
242 char mri_last_ch;
243 #if defined TC_ARM && defined OBJ_ELF
244 const char * symver_state;
245 #endif
246 char last_char;
247 };
248
249 char *
250 app_push (void)
251 {
252 struct app_save *saved;
253
254 saved = XNEW (struct app_save);
255 saved->state = state;
256 saved->old_state = old_state;
257 saved->out_string = out_string;
258 memcpy (saved->out_buf, out_buf, sizeof (out_buf));
259 saved->add_newlines = add_newlines;
260 if (saved_input == NULL)
261 saved->saved_input = NULL;
262 else
263 {
264 saved->saved_input = XNEWVEC (char, saved_input_len);
265 memcpy (saved->saved_input, saved_input, saved_input_len);
266 saved->saved_input_len = saved_input_len;
267 }
268 #ifdef TC_M68K
269 saved->scrub_m68k_mri = scrub_m68k_mri;
270 #endif
271 saved->mri_state = mri_state;
272 saved->mri_last_ch = mri_last_ch;
273 #if defined TC_ARM && defined OBJ_ELF
274 saved->symver_state = symver_state;
275 #endif
276 saved->last_char = last_char;
277
278 /* do_scrub_begin() is not useful, just wastes time. */
279
280 state = 0;
281 saved_input = NULL;
282 add_newlines = 0;
283
284 return (char *) saved;
285 }
286
287 void
288 app_pop (char *arg)
289 {
290 struct app_save *saved = (struct app_save *) arg;
291
292 /* There is no do_scrub_end (). */
293 state = saved->state;
294 old_state = saved->old_state;
295 out_string = saved->out_string;
296 memcpy (out_buf, saved->out_buf, sizeof (out_buf));
297 add_newlines = saved->add_newlines;
298 if (saved->saved_input == NULL)
299 saved_input = NULL;
300 else
301 {
302 gas_assert (saved->saved_input_len <= sizeof (input_buffer));
303 memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
304 saved_input = input_buffer;
305 saved_input_len = saved->saved_input_len;
306 free (saved->saved_input);
307 }
308 #ifdef TC_M68K
309 scrub_m68k_mri = saved->scrub_m68k_mri;
310 #endif
311 mri_state = saved->mri_state;
312 mri_last_ch = saved->mri_last_ch;
313 #if defined TC_ARM && defined OBJ_ELF
314 symver_state = saved->symver_state;
315 #endif
316 last_char = saved->last_char;
317
318 free (arg);
319 }
320
321 /* @@ This assumes that \n &c are the same on host and target. This is not
322 necessarily true. */
323
/* Translate CH, the character that followed a backslash, into the
   character the escape sequence stands for.  Only b, f, n, r, t, the
   single quote and the double quote are recognized; any other value
   is returned unchanged.  */

static int
process_escape (int ch)
{
  static const struct
  {
    int letter;
    int value;
  }
  escapes[] =
  {
    { 'b', '\b' },
    { 'f', '\f' },
    { 'n', '\n' },
    { 'r', '\r' },
    { 't', '\t' },
    { '\'', '\'' },
    { '"', '\"' }
  };
  unsigned int i;

  for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++)
    if (escapes[i].letter == ch)
      return escapes[i].value;

  return ch;
}
347
/* Number of multibyte character warnings after which further ones are
   suppressed; the warnings issued so far are counted in
   multibyte_warn_count.  */
#define MULTIBYTE_WARN_COUNT_LIMIT 10
static unsigned int multibyte_warn_count = 0;

/* Look for bytes above 0x7f in the range [START, END).

   With WARN false the scan is silent and returns true as soon as such
   a byte is seen, false if there is none.

   With WARN true a warning is issued for each such byte, naming the
   file and line reported by as_where when available.  The result is
   true if this call reported at least one byte.  After
   MULTIBYTE_WARN_COUNT_LIMIT warnings a final "suppressed" notice is
   given; from then on warning scans return false immediately without
   examining the input.

   An empty or inverted range yields false.  */

bool
scan_for_multibyte_characters (const unsigned char * start,
                               const unsigned char * end,
                               bool warn)
{
  if (end <= start)
    return false;

  /* The counter stops at exactly MULTIBYTE_WARN_COUNT_LIMIT when the
     suppression notice is given, so the test must be ">=".  With ">"
     the next buffer would still be scanned and, because the counter
     then moves past the limit without ever being equal to it again,
     every multibyte byte in that buffer would be warned about.  */
  if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
    return false;

  bool found = false;

  while (start < end)
    {
      unsigned char c;

      if ((c = * start++) <= 0x7f)
        continue;

      /* Silent mode only wants to know whether one exists.  */
      if (!warn)
        return true;

      found = true;

      const char * filename;
      unsigned int lineno;

      filename = as_where (& lineno);
      if (filename == NULL)
        as_warn (_("multibyte character (%#x) encountered in input"), c);
      else if (lineno == 0)
        as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
      else
        as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);

      if (++ multibyte_warn_count == MULTIBYTE_WARN_COUNT_LIMIT)
        {
          as_warn (_("further multibyte character warnings suppressed"));
          break;
        }
    }

  return found;
}
396
397 /* This function is called to process input characters. The GET
398 parameter is used to retrieve more input characters. GET should
399 set its parameter to point to a buffer, and return the length of
400 the buffer; it should return 0 at end of file. The scrubbed output
401 characters are put into the buffer starting at TOSTART; the TOSTART
402 buffer is TOLEN bytes in length. The function returns the number
403 of scrubbed characters put into TOSTART. This will be TOLEN unless
404 end of file was seen. This function is arranged as a state
405 machine, and saves its state so that it may return at any point.
406 This is the way the old code used to work. */
407
408 size_t
409 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
410 {
411 char *to = tostart;
412 char *toend = tostart + tolen;
413 char *from;
414 char *fromend;
415 size_t fromlen;
416 int ch, ch2 = 0;
417 /* Character that started the string we're working on. */
418 static char quotechar;
419
420 /*State 0: beginning of normal line
421 1: After first whitespace on line (flush more white)
422 2: After first non-white (opcode) on line (keep 1white)
423 3: after second white on line (into operands) (flush white)
424 4: after putting out a .linefile, put out digits
425 5: parsing a string, then go to old-state
426 6: putting out \ escape in a "d string.
427 7: no longer used
428 8: no longer used
429 9: After seeing symbol char in state 3 (keep 1white after symchar)
430 10: After seeing whitespace in state 9 (keep white before symchar)
431 11: After seeing a symbol character in state 0 (eg a label definition)
432 -1: output string in out_string and go to the state in old_state
433 -2: flush text until a '*' '/' is seen, then go to state old_state
434 #ifdef TC_V850
435 12: After seeing a dash, looking for a second dash as a start
436 of comment.
437 #endif
438 #ifdef DOUBLEBAR_PARALLEL
439 13: After seeing a vertical bar, looking for a second
440 vertical bar as a parallel expression separator.
441 #endif
442 #ifdef TC_PREDICATE_START_CHAR
443 14: After seeing a predicate start character at state 0, looking
444 for a predicate end character as predicate.
445 15: After seeing a predicate start character at state 1, looking
446 for a predicate end character as predicate.
447 #endif
448 #ifdef TC_Z80
449 16: After seeing an 'a' or an 'A' at the start of a symbol
450 17: After seeing an 'f' or an 'F' in state 16
451 #endif
452 */
453
454 /* I added states 9 and 10 because the MIPS ECOFF assembler uses
455 constructs like ``.loc 1 20''. This was turning into ``.loc
456 120''. States 9 and 10 ensure that a space is never dropped in
457 between characters which could appear in an identifier. Ian
458 Taylor, ian@cygnus.com.
459
460 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
461 correctly on the PA (and any other target where colons are optional).
462 Jeff Law, law@cs.utah.edu.
463
464 I added state 13 so that something like "cmp r1, r2 || trap #1" does not
465 get squashed into "cmp r1,r2||trap#1", with the all important space
466 between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */
467
468 /* This macro gets the next input character. */
469
470 #define GET() \
471 (from < fromend \
472 ? * (unsigned char *) (from++) \
473 : (saved_input = NULL, \
474 fromlen = (*get) (input_buffer, sizeof input_buffer), \
475 from = input_buffer, \
476 fromend = from + fromlen, \
477 (fromlen == 0 \
478 ? EOF \
479 : * (unsigned char *) (from++))))
480
481 /* This macro pushes a character back on the input stream. */
482
483 #define UNGET(uch) (*--from = (uch))
484
485 /* This macro puts a character into the output buffer. If this
486 character fills the output buffer, this macro jumps to the label
487 TOFULL. We use this rather ugly approach because we need to
488 handle two different termination conditions: EOF on the input
489 stream, and a full output buffer. It would be simpler if we
490 always read in the entire input stream before processing it, but
491 I don't want to make such a significant change to the assembler's
492 memory usage. */
493
494 #define PUT(pch) \
495 do \
496 { \
497 *to++ = (pch); \
498 if (to >= toend) \
499 goto tofull; \
500 } \
501 while (0)
502
503 if (saved_input != NULL)
504 {
505 from = saved_input;
506 fromend = from + saved_input_len;
507 }
508 else
509 {
510 fromlen = (*get) (input_buffer, sizeof input_buffer);
511 if (fromlen == 0)
512 return 0;
513 from = input_buffer;
514 fromend = from + fromlen;
515
516 if (multibyte_handling == multibyte_warn)
517 (void) scan_for_multibyte_characters ((const unsigned char *) from,
518 (const unsigned char* ) fromend,
519 true /* Generate warnings. */);
520 }
521
522 while (1)
523 {
524 /* The cases in this switch end with continue, in order to
525 branch back to the top of this while loop and generate the
526 next output character in the appropriate state. */
527 switch (state)
528 {
529 case -1:
530 ch = *out_string++;
531 if (*out_string == '\0')
532 {
533 state = old_state;
534 old_state = 3;
535 }
536 PUT (ch);
537 continue;
538
539 case -2:
540 for (;;)
541 {
542 do
543 {
544 ch = GET ();
545
546 if (ch == EOF)
547 {
548 as_warn (_("end of file in comment"));
549 goto fromeof;
550 }
551
552 if (ch == '\n')
553 PUT ('\n');
554 }
555 while (ch != '*');
556
557 while ((ch = GET ()) == '*')
558 ;
559
560 if (ch == EOF)
561 {
562 as_warn (_("end of file in comment"));
563 goto fromeof;
564 }
565
566 if (ch == '/')
567 break;
568
569 UNGET (ch);
570 }
571
572 state = old_state;
573 UNGET (' ');
574 continue;
575
576 case 4:
577 ch = GET ();
578 if (ch == EOF)
579 goto fromeof;
580 else if (ch >= '0' && ch <= '9')
581 PUT (ch);
582 else
583 {
584 while (ch != EOF && IS_WHITESPACE (ch))
585 ch = GET ();
586 if (ch == '"')
587 {
588 quotechar = ch;
589 state = 5;
590 old_state = 3;
591 PUT (ch);
592 }
593 else
594 {
595 while (ch != EOF && ch != '\n')
596 ch = GET ();
597 state = 0;
598 PUT (ch);
599 }
600 }
601 continue;
602
603 case 5:
604 /* We are going to copy everything up to a quote character,
605 with special handling for a backslash. We try to
606 optimize the copying in the simple case without using the
607 GET and PUT macros. */
608 {
609 char *s;
610 ptrdiff_t len;
611
612 for (s = from; s < fromend; s++)
613 {
614 ch = *s;
615 if (ch == '\\'
616 || ch == quotechar
617 || ch == '\n')
618 break;
619 }
620 len = s - from;
621 if (len > toend - to)
622 len = toend - to;
623 if (len > 0)
624 {
625 memcpy (to, from, len);
626 to += len;
627 from += len;
628 if (to >= toend)
629 goto tofull;
630 }
631 }
632
633 ch = GET ();
634 if (ch == EOF)
635 {
636 /* This buffer is here specifically so
637 that the UNGET below will work. */
638 static char one_char_buf[1];
639
640 as_warn (_("end of file in string; '%c' inserted"), quotechar);
641 state = old_state;
642 from = fromend = one_char_buf + 1;
643 fromlen = 1;
644 UNGET ('\n');
645 PUT (quotechar);
646 }
647 else if (ch == quotechar)
648 {
649 state = old_state;
650 PUT (ch);
651 }
652 else if (TC_STRING_ESCAPES && ch == '\\')
653 {
654 state = 6;
655 PUT (ch);
656 }
657 else if (scrub_m68k_mri && ch == '\n')
658 {
659 /* Just quietly terminate the string. This permits lines like
660 bne label loop if we haven't reach end yet. */
661 state = old_state;
662 UNGET (ch);
663 PUT ('\'');
664 }
665 else
666 {
667 PUT (ch);
668 }
669 continue;
670
671 case 6:
672 state = 5;
673 ch = GET ();
674 switch (ch)
675 {
676 /* Handle strings broken across lines, by turning '\n' into
677 '\\' and 'n'. */
678 case '\n':
679 UNGET ('n');
680 add_newlines++;
681 PUT ('\\');
682 continue;
683
684 case EOF:
685 as_warn (_("end of file in string; '%c' inserted"), quotechar);
686 PUT (quotechar);
687 continue;
688
689 case '"':
690 case '\\':
691 case 'b':
692 case 'f':
693 case 'n':
694 case 'r':
695 case 't':
696 case 'v':
697 case 'x':
698 case 'X':
699 case '0':
700 case '1':
701 case '2':
702 case '3':
703 case '4':
704 case '5':
705 case '6':
706 case '7':
707 break;
708
709 default:
710 #ifdef ONLY_STANDARD_ESCAPES
711 as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
712 #endif
713 break;
714 }
715 PUT (ch);
716 continue;
717
718 #ifdef DOUBLEBAR_PARALLEL
719 case 13:
720 ch = GET ();
721 if (ch != '|')
722 abort ();
723
724 /* Reset back to state 1 and pretend that we are parsing a
725 line from just after the first white space. */
726 state = 1;
727 PUT ('|');
728 #ifdef TC_TIC6X
729 /* "||^" is used for SPMASKed instructions. */
730 ch = GET ();
731 if (ch == EOF)
732 goto fromeof;
733 else if (ch == '^')
734 PUT ('^');
735 else
736 UNGET (ch);
737 #endif
738 continue;
739 #endif
740 #ifdef TC_Z80
741 case 16:
742 /* We have seen an 'a' at the start of a symbol, look for an 'f'. */
743 ch = GET ();
744 if (ch == 'f' || ch == 'F')
745 {
746 state = 17;
747 PUT (ch);
748 }
749 else
750 {
751 state = 9;
752 break;
753 }
754 /* Fall through. */
755 case 17:
756 /* We have seen "af" at the start of a symbol,
757 a ' here is a part of that symbol. */
758 ch = GET ();
759 state = 9;
760 if (ch == '\'')
761 /* Change to avoid warning about unclosed string. */
762 PUT ('`');
763 else if (ch != EOF)
764 UNGET (ch);
765 break;
766 #endif
767 }
768
769 /* OK, we are somewhere in states 0 through 4 or 9 through 11. */
770
771 /* flushchar: */
772 ch = GET ();
773
774 #ifdef TC_PREDICATE_START_CHAR
775 if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
776 {
777 state += 14;
778 PUT (ch);
779 continue;
780 }
781 else if (state == 14 || state == 15)
782 {
783 if (ch == TC_PREDICATE_END_CHAR)
784 {
785 state -= 14;
786 PUT (ch);
787 ch = GET ();
788 }
789 else
790 {
791 PUT (ch);
792 continue;
793 }
794 }
795 #endif
796
797 recycle:
798
799 #if defined TC_ARM && defined OBJ_ELF
800 /* We need to watch out for .symver directives. See the comment later
801 in this function. */
802 if (symver_state == NULL)
803 {
804 if ((state == 0 || state == 1) && ch == symver_pseudo[0])
805 symver_state = symver_pseudo + 1;
806 }
807 else
808 {
809 /* We advance to the next state if we find the right
810 character. */
811 if (ch != '\0' && (*symver_state == ch))
812 ++symver_state;
813 else if (*symver_state != '\0')
814 /* We did not get the expected character, or we didn't
815 get a valid terminating character after seeing the
816 entire pseudo-op, so we must go back to the beginning. */
817 symver_state = NULL;
818 else
819 {
820 /* We've read the entire pseudo-op. If this is the end
821 of the line, go back to the beginning. */
822 if (IS_NEWLINE (ch))
823 symver_state = NULL;
824 }
825 }
826 #endif /* TC_ARM && OBJ_ELF */
827
828 #ifdef TC_M68K
829 /* We want to have pseudo-ops which control whether we are in
830 MRI mode or not. Unfortunately, since m68k MRI mode affects
831 the scrubber, that means that we need a special purpose
832 recognizer here. */
833 if (mri_state == NULL)
834 {
835 if ((state == 0 || state == 1)
836 && ch == mri_pseudo[0])
837 mri_state = mri_pseudo + 1;
838 }
839 else
840 {
841 /* We advance to the next state if we find the right
842 character, or if we need a space character and we get any
843 whitespace character, or if we need a '0' and we get a
844 '1' (this is so that we only need one state to handle
845 ``.mri 0'' and ``.mri 1''). */
846 if (ch != '\0'
847 && (*mri_state == ch
848 || (*mri_state == ' '
849 && lex[ch] == LEX_IS_WHITESPACE)
850 || (*mri_state == '0'
851 && ch == '1')))
852 {
853 mri_last_ch = ch;
854 ++mri_state;
855 }
856 else if (*mri_state != '\0'
857 || (lex[ch] != LEX_IS_WHITESPACE
858 && lex[ch] != LEX_IS_NEWLINE))
859 {
860 /* We did not get the expected character, or we didn't
861 get a valid terminating character after seeing the
862 entire pseudo-op, so we must go back to the
863 beginning. */
864 mri_state = NULL;
865 }
866 else
867 {
868 /* We've read the entire pseudo-op. mri_last_ch is
869 either '1' or '0', indicating whether to enter or
870 leave MRI mode. */
871 do_scrub_begin (mri_last_ch == '1');
872 mri_state = NULL;
873
874 /* We continue handling the character as usual. The
875 main gas reader must also handle the .mri pseudo-op
876 to control expression parsing and the like. */
877 }
878 }
879 #endif
880
881 if (ch == EOF)
882 {
883 if (state != 0)
884 {
885 as_warn (_("end of file not at end of a line; newline inserted"));
886 state = 0;
887 PUT ('\n');
888 }
889 goto fromeof;
890 }
891
892 switch (lex[ch])
893 {
894 case LEX_IS_WHITESPACE:
895 do
896 {
897 ch = GET ();
898 }
899 while (ch != EOF && IS_WHITESPACE (ch));
900 if (ch == EOF)
901 goto fromeof;
902
903 if (state == 0)
904 {
905 /* Preserve a single whitespace character at the
906 beginning of a line. */
907 state = 1;
908 UNGET (ch);
909 PUT (' ');
910 break;
911 }
912
913 #ifdef KEEP_WHITE_AROUND_COLON
914 if (lex[ch] == LEX_IS_COLON)
915 {
916 /* Only keep this white if there's no white *after* the
917 colon. */
918 ch2 = GET ();
919 if (ch2 != EOF)
920 UNGET (ch2);
921 if (!IS_WHITESPACE (ch2))
922 {
923 state = 9;
924 UNGET (ch);
925 PUT (' ');
926 break;
927 }
928 }
929 #endif
930 if (IS_COMMENT (ch)
931 || IS_LINE_SEPARATOR (ch)
932 || IS_PARALLEL_SEPARATOR (ch))
933 {
934 if (scrub_m68k_mri)
935 {
936 /* In MRI mode, we keep these spaces. */
937 UNGET (ch);
938 PUT (' ');
939 break;
940 }
941 goto recycle;
942 }
943
944 /* If we're in state 2 or 11, we've seen a non-white
945 character followed by whitespace. If the next character
946 is ':', this is whitespace after a label name which we
947 normally must ignore. In MRI mode, though, spaces are
948 not permitted between the label and the colon. */
949 if ((state == 2 || state == 11)
950 && lex[ch] == LEX_IS_COLON
951 && ! scrub_m68k_mri)
952 {
953 state = 1;
954 PUT (ch);
955 break;
956 }
957
958 switch (state)
959 {
960 case 1:
961 /* We can arrive here if we leave a leading whitespace
962 character at the beginning of a line. */
963 goto recycle;
964 case 2:
965 state = 3;
966 if (to + 1 < toend)
967 {
968 /* Optimize common case by skipping UNGET/GET. */
969 PUT (' '); /* Sp after opco */
970 goto recycle;
971 }
972 UNGET (ch);
973 PUT (' ');
974 break;
975 case 3:
976 #ifndef TC_KEEP_OPERAND_SPACES
977 /* For TI C6X, we keep these spaces as they may separate
978 functional unit specifiers from operands. */
979 if (scrub_m68k_mri)
980 #endif
981 {
982 /* In MRI mode, we keep these spaces. */
983 UNGET (ch);
984 PUT (' ');
985 break;
986 }
987 goto recycle; /* Sp in operands */
988 case 9:
989 case 10:
990 #ifndef TC_KEEP_OPERAND_SPACES
991 if (scrub_m68k_mri)
992 #endif
993 {
994 /* In MRI mode, we keep these spaces. */
995 state = 3;
996 UNGET (ch);
997 PUT (' ');
998 break;
999 }
1000 state = 10; /* Sp after symbol char */
1001 goto recycle;
1002 case 11:
1003 if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
1004 state = 1;
1005 else
1006 {
1007 /* We know that ch is not ':', since we tested that
1008 case above. Therefore this is not a label, so it
1009 must be the opcode, and we've just seen the
1010 whitespace after it. */
1011 state = 3;
1012 }
1013 UNGET (ch);
1014 PUT (' '); /* Sp after label definition. */
1015 break;
1016 default:
1017 BAD_CASE (state);
1018 }
1019 break;
1020
1021 case LEX_IS_TWOCHAR_COMMENT_1ST:
1022 ch2 = GET ();
1023 if (ch2 == '*')
1024 {
1025 for (;;)
1026 {
1027 do
1028 {
1029 ch2 = GET ();
1030 if (ch2 != EOF && IS_NEWLINE (ch2))
1031 add_newlines++;
1032 }
1033 while (ch2 != EOF && ch2 != '*');
1034
1035 while (ch2 == '*')
1036 ch2 = GET ();
1037
1038 if (ch2 == EOF || ch2 == '/')
1039 break;
1040
1041 /* This UNGET will ensure that we count newlines
1042 correctly. */
1043 UNGET (ch2);
1044 }
1045
1046 if (ch2 == EOF)
1047 as_warn (_("end of file in multiline comment"));
1048
1049 ch = ' ';
1050 goto recycle;
1051 }
1052 #ifdef DOUBLESLASH_LINE_COMMENTS
1053 else if (ch2 == '/')
1054 {
1055 do
1056 {
1057 ch = GET ();
1058 }
1059 while (ch != EOF && !IS_NEWLINE (ch));
1060 if (ch == EOF)
1061 as_warn ("end of file in comment; newline inserted");
1062 state = 0;
1063 PUT ('\n');
1064 break;
1065 }
1066 #endif
1067 else
1068 {
1069 if (ch2 != EOF)
1070 UNGET (ch2);
1071 if (state == 9 || state == 10)
1072 state = 3;
1073 PUT (ch);
1074 }
1075 break;
1076
1077 case LEX_IS_STRINGQUOTE:
1078 quotechar = ch;
1079 if (state == 10)
1080 {
1081 /* Preserve the whitespace in foo "bar". */
1082 UNGET (ch);
1083 state = 3;
1084 PUT (' ');
1085
1086 /* PUT didn't jump out. We could just break, but we
1087 know what will happen, so optimize a bit. */
1088 ch = GET ();
1089 old_state = 3;
1090 }
1091 else if (state == 9)
1092 old_state = 3;
1093 else
1094 old_state = state;
1095 state = 5;
1096 PUT (ch);
1097 break;
1098
1099 case LEX_IS_ONECHAR_QUOTE:
1100 #ifdef H_TICK_HEX
1101 if (state == 9 && enable_h_tick_hex)
1102 {
1103 char c;
1104
1105 c = GET ();
1106 as_warn ("'%c found after symbol", c);
1107 UNGET (c);
1108 }
1109 #endif
1110 if (state == 10)
1111 {
1112 /* Preserve the whitespace in foo 'b'. */
1113 UNGET (ch);
1114 state = 3;
1115 PUT (' ');
1116 break;
1117 }
1118 ch = GET ();
1119 if (ch == EOF)
1120 {
1121 as_warn (_("end of file after a one-character quote; \\0 inserted"));
1122 ch = 0;
1123 }
1124 if (ch == '\\')
1125 {
1126 ch = GET ();
1127 if (ch == EOF)
1128 {
1129 as_warn (_("end of file in escape character"));
1130 ch = '\\';
1131 }
1132 else
1133 ch = process_escape (ch);
1134 }
1135 sprintf (out_buf, "%d", (int) (unsigned char) ch);
1136
1137 /* None of these 'x constants for us. We want 'x'. */
1138 if ((ch = GET ()) != '\'')
1139 {
1140 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1141 as_warn (_("missing close quote; (assumed)"));
1142 #else
1143 if (ch != EOF)
1144 UNGET (ch);
1145 #endif
1146 }
1147 if (strlen (out_buf) == 1)
1148 {
1149 PUT (out_buf[0]);
1150 break;
1151 }
1152 if (state == 9)
1153 old_state = 3;
1154 else
1155 old_state = state;
1156 state = -1;
1157 out_string = out_buf;
1158 PUT (*out_string++);
1159 break;
1160
1161 case LEX_IS_COLON:
1162 #ifdef KEEP_WHITE_AROUND_COLON
1163 state = 9;
1164 #else
1165 if (state == 9 || state == 10)
1166 state = 3;
1167 else if (state != 3)
1168 state = 1;
1169 #endif
1170 PUT (ch);
1171 break;
1172
1173 case LEX_IS_NEWLINE:
1174 /* Roll out a bunch of newlines from inside comments, etc. */
1175 if (add_newlines)
1176 {
1177 --add_newlines;
1178 UNGET (ch);
1179 }
1180 /* Fall through. */
1181
1182 case LEX_IS_LINE_SEPARATOR:
1183 state = 0;
1184 PUT (ch);
1185 break;
1186
1187 case LEX_IS_PARALLEL_SEPARATOR:
1188 state = 1;
1189 PUT (ch);
1190 break;
1191
1192 #ifdef TC_V850
1193 case LEX_IS_DOUBLEDASH_1ST:
1194 ch2 = GET ();
1195 if (ch2 != '-')
1196 {
1197 if (ch2 != EOF)
1198 UNGET (ch2);
1199 goto de_fault;
1200 }
1201 /* Read and skip to end of line. */
1202 do
1203 {
1204 ch = GET ();
1205 }
1206 while (ch != EOF && ch != '\n');
1207
1208 if (ch == EOF)
1209 as_warn (_("end of file in comment; newline inserted"));
1210
1211 state = 0;
1212 PUT ('\n');
1213 break;
1214 #endif
1215 #ifdef DOUBLEBAR_PARALLEL
1216 case LEX_IS_DOUBLEBAR_1ST:
1217 ch2 = GET ();
1218 if (ch2 != EOF)
1219 UNGET (ch2);
1220 if (ch2 != '|')
1221 goto de_fault;
1222
1223 /* Handle '||' in two states as invoking PUT twice might
1224 result in the first one jumping out of this loop. We'd
1225 then lose track of the state and one '|' char. */
1226 state = 13;
1227 PUT ('|');
1228 break;
1229 #endif
1230 case LEX_IS_LINE_COMMENT_START:
1231 /* FIXME-someday: The two character comment stuff was badly
1232 thought out. On i386, we want '/' as line comment start
1233 AND we want C style comments. hence this hack. The
1234 whole lexical process should be reworked. xoxorich. */
1235 if (ch == '/')
1236 {
1237 ch2 = GET ();
1238 if (ch2 == '*')
1239 {
1240 old_state = 3;
1241 state = -2;
1242 break;
1243 }
1244 else if (ch2 != EOF)
1245 {
1246 UNGET (ch2);
1247 }
1248 }
1249
1250 if (state == 0 || state == 1) /* Only comment at start of line. */
1251 {
1252 int startch;
1253
1254 startch = ch;
1255
1256 do
1257 {
1258 ch = GET ();
1259 }
1260 while (ch != EOF && IS_WHITESPACE (ch));
1261
1262 if (ch == EOF)
1263 {
1264 as_warn (_("end of file in comment; newline inserted"));
1265 PUT ('\n');
1266 break;
1267 }
1268
1269 if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1270 {
1271 /* Not a cpp line. */
1272 while (ch != EOF && !IS_NEWLINE (ch))
1273 ch = GET ();
1274 if (ch == EOF)
1275 {
1276 as_warn (_("end of file in comment; newline inserted"));
1277 PUT ('\n');
1278 }
1279 else /* IS_NEWLINE (ch) */
1280 {
1281 /* To process non-zero add_newlines. */
1282 UNGET (ch);
1283 }
1284 state = 0;
1285 break;
1286 }
1287 /* Looks like `# 123 "filename"' from cpp. */
1288 UNGET (ch);
1289 old_state = 4;
1290 state = -1;
1291 if (scrub_m68k_mri)
1292 out_string = "\tlinefile ";
1293 else
1294 out_string = "\t.linefile ";
1295 PUT (*out_string++);
1296 break;
1297 }
1298
1299 #ifdef TC_D10V
1300 /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1301 Trap is the only short insn that has a first operand that is
1302 neither register nor label.
1303 We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1304 We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1305 already LEX_IS_LINE_COMMENT_START. However, it is the
1306 only character in line_comment_chars for d10v, hence we
1307 can recognize it as such. */
1308 /* An alternative approach would be to reset the state to 1 when
1309 we see '||', '<-' or '->', but that seems to be overkill. */
1310 if (state == 10)
1311 PUT (' ');
1312 #endif
1313 /* We have a line comment character which is not at the
1314 start of a line. If this is also a normal comment
1315 character, fall through. Otherwise treat it as a default
1316 character. */
1317 if (strchr (tc_comment_chars, ch) == NULL
1318 && (! scrub_m68k_mri
1319 || (ch != '!' && ch != '*')))
1320 goto de_fault;
1321 if (scrub_m68k_mri
1322 && (ch == '!' || ch == '*' || ch == '#')
1323 && state != 1
1324 && state != 10)
1325 goto de_fault;
1326 /* Fall through. */
1327 case LEX_IS_COMMENT_START:
1328 #if defined TC_ARM && defined OBJ_ELF
1329 /* On the ARM, `@' is the comment character.
1330 Unfortunately this is also a special character in ELF .symver
1331 directives (and .type, though we deal with those another way).
1332 So we check if this line is such a directive, and treat
1333 the character as default if so. This is a hack. */
1334 if ((symver_state != NULL) && (*symver_state == 0))
1335 goto de_fault;
1336 #endif
1337
1338 /* Care is needed not to damage occurrences of \<comment-char>
1339 by stripping the <comment-char> onwards. Yuck. */
1340 if ((to > tostart ? to[-1] : last_char) == '\\')
1341 /* Do not treat the <comment-char> as a start-of-comment. */
1342 goto de_fault;
1343
1344 #ifdef WARN_COMMENTS
1345 if (!found_comment)
1346 found_comment_file = as_where (&found_comment);
1347 #endif
1348 do
1349 {
1350 ch = GET ();
1351 }
1352 while (ch != EOF && !IS_NEWLINE (ch));
1353 if (ch == EOF)
1354 as_warn (_("end of file in comment; newline inserted"));
1355 state = 0;
1356 PUT ('\n');
1357 break;
1358
1359 #ifdef H_TICK_HEX
1360 case LEX_IS_H:
1361 /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1362 the H' with 0x to make them gas-style hex characters. */
1363 if (enable_h_tick_hex)
1364 {
1365 char quot;
1366
1367 quot = GET ();
1368 if (quot == '\'')
1369 {
1370 UNGET ('x');
1371 ch = '0';
1372 }
1373 else
1374 UNGET (quot);
1375 }
1376 #endif
1377 /* Fall through. */
1378
1379 case LEX_IS_SYMBOL_COMPONENT:
1380 if (state == 10)
1381 {
1382 /* This is a symbol character following another symbol
1383 character, with whitespace in between. We skipped
1384 the whitespace earlier, so output it now. */
1385 UNGET (ch);
1386 state = 3;
1387 PUT (' ');
1388 break;
1389 }
1390
1391 #ifdef TC_Z80
1392 /* "af'" is a symbol containing '\''. */
1393 if (state == 3 && (ch == 'a' || ch == 'A'))
1394 {
1395 state = 16;
1396 PUT (ch);
1397 ch = GET ();
1398 if (ch == 'f' || ch == 'F')
1399 {
1400 state = 17;
1401 PUT (ch);
1402 break;
1403 }
1404 else
1405 {
1406 state = 9;
1407 if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1408 {
1409 if (ch != EOF)
1410 UNGET (ch);
1411 break;
1412 }
1413 }
1414 }
1415 #endif
1416 if (state == 3)
1417 state = 9;
1418
1419 /* This is a common case. Quickly copy CH and all the
1420 following symbol component or normal characters. */
1421 if (to + 1 < toend
1422 && mri_state == NULL
1423 #if defined TC_ARM && defined OBJ_ELF
1424 && symver_state == NULL
1425 #endif
1426 )
1427 {
1428 char *s;
1429 ptrdiff_t len;
1430
1431 for (s = from; s < fromend; s++)
1432 {
1433 int type;
1434
1435 ch2 = *(unsigned char *) s;
1436 type = lex[ch2];
1437 if (type != 0
1438 && type != LEX_IS_SYMBOL_COMPONENT)
1439 break;
1440 }
1441
1442 if (s > from)
1443 /* Handle the last character normally, for
1444 simplicity. */
1445 --s;
1446
1447 len = s - from;
1448
1449 if (len > (toend - to) - 1)
1450 len = (toend - to) - 1;
1451
1452 if (len > 0)
1453 {
1454 PUT (ch);
1455 memcpy (to, from, len);
1456 to += len;
1457 from += len;
1458 if (to >= toend)
1459 goto tofull;
1460 ch = GET ();
1461 }
1462 }
1463
1464 /* Fall through. */
1465 default:
1466 de_fault:
1467 /* Some relatively `normal' character. */
1468 if (state == 0)
1469 {
1470 state = 11; /* Now seeing label definition. */
1471 }
1472 else if (state == 1)
1473 {
1474 state = 2; /* Ditto. */
1475 }
1476 else if (state == 9)
1477 {
1478 if (!IS_SYMBOL_COMPONENT (ch))
1479 state = 3;
1480 }
1481 else if (state == 10)
1482 {
1483 if (ch == '\\')
1484 {
1485 /* Special handling for backslash: a backslash may
1486 be the beginning of a formal parameter (of a
1487 macro) following another symbol character, with
1488 whitespace in between. If that is the case, we
1489 output a space before the parameter. Strictly
1490 speaking, correct handling depends upon what the
1491 macro parameter expands into; if the parameter
1492 expands into something which does not start with
1493 an operand character, then we don't want to keep
1494 the space. We don't have enough information to
1495 make the right choice, so here we are making the
1496 choice which is more likely to be correct. */
1497 if (to + 1 >= toend)
1498 {
1499 /* If we're near the end of the buffer, save the
1500 character for the next time round. Otherwise
1501 we'll lose our state. */
1502 UNGET (ch);
1503 goto tofull;
1504 }
1505 *to++ = ' ';
1506 }
1507
1508 state = 3;
1509 }
1510 PUT (ch);
1511 break;
1512 }
1513 }
1514
1515 /*NOTREACHED*/
1516
1517 fromeof:
1518 /* We have reached the end of the input. */
1519 if (to > tostart)
1520 last_char = to[-1];
1521 return to - tostart;
1522
1523 tofull:
1524 /* The output buffer is full. Save any input we have not yet
1525 processed. */
1526 if (fromend > from)
1527 {
1528 saved_input = from;
1529 saved_input_len = fromend - from;
1530 }
1531 else
1532 saved_input = NULL;
1533
1534 if (to > tostart)
1535 last_char = to[-1];
1536 return to - tostart;
1537 }