]> git.ipfire.org Git - thirdparty/gcc.git/blame - libcpp/lex.cc
c: Handle scoped attributes in __has*attribute and scoped attribute parsing changes...
[thirdparty/gcc.git] / libcpp / lex.cc
CommitLineData
45b966db 1/* CPP Library - lexical analysis.
a945c346 2 Copyright (C) 2000-2024 Free Software Foundation, Inc.
45b966db
ZW
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8This program is free software; you can redistribute it and/or modify it
9under the terms of the GNU General Public License as published by the
748086b7 10Free Software Foundation; either version 3, or (at your option) any
45b966db
ZW
11later version.
12
13This program is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
17
18You should have received a copy of the GNU General Public License
748086b7
JJ
19along with this program; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
45b966db
ZW
21
22#include "config.h"
23#include "system.h"
45b966db 24#include "cpplib.h"
4f4e53dd 25#include "internal.h"
45b966db 26
/* The spelling categories that an entry of the token_spellings table
   can have.  SPELL_OPERATOR is used by every OP () entry of
   TTYPE_TABLE; the remaining values are selected by the second
   argument of the TK () entries.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT = 1,
  SPELL_LITERAL = 2,
  SPELL_NONE = 3
};
34
93c80368 35struct token_spelling
f9a0e96c 36{
93c80368
NB
37 enum spell_type category;
38 const unsigned char *name;
f9a0e96c
ZW
39};
40
8206c799 41static const unsigned char *const digraph_spellings[] =
b6baa67d 42{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
93c80368 43
b6baa67d
KVH
44#define OP(e, s) { SPELL_OPERATOR, UC s },
45#define TK(e, s) { SPELL_ ## s, UC #e },
8206c799 46static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
93c80368
NB
47#undef OP
48#undef TK
49
50#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
f2d5f0cc 52
0b8c57ed
JJ
/* Largest value in the UCS codespace, which ISO 10646 defines as the
   inclusive range 0 to 0x10FFFF.  */
#define UCS_LIMIT 0x10FFFF
55
6cf87ca4
ZW
56static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
57static int skip_line_comment (cpp_reader *);
58static void skip_whitespace (cpp_reader *, cppchar_t);
6cf87ca4
ZW
59static void lex_string (cpp_reader *, cpp_token *, const uchar *);
60static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
631d0d36 61static void store_comment (cpp_reader *, cpp_token *);
6cf87ca4
ZW
62static void create_literal (cpp_reader *, cpp_token *, const uchar *,
63 unsigned int, enum cpp_ttype);
64static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
65static int name_p (cpp_reader *, const cpp_string *);
6cf87ca4
ZW
66static tokenrun *next_tokenrun (tokenrun *);
67
6cf87ca4 68static _cpp_buff *new_buff (size_t);
15dad1d9 69
9d10c9a9 70
041c3194 71/* Utility routine:
9e62c811 72
bfb9dc7f
ZW
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
041c3194 75int
6cf87ca4 76cpp_ideq (const cpp_token *token, const char *string)
041c3194 77{
bfb9dc7f 78 if (token->type != CPP_NAME)
041c3194 79 return 0;
bfb9dc7f 80
9a0c6187 81 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
15dad1d9 82}
1368ee70 83
26aea073
NB
84/* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
87062813 86static void
6cf87ca4 87add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
0d9f234d 88{
26aea073
NB
89 if (buffer->notes_used == buffer->notes_cap)
90 {
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
c3f829c1
GDR
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
26aea073 94 }
0d9f234d 95
26aea073
NB
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
0d9f234d
NB
99}
100
246a2fcb
RH
101\f
102/* Fast path to find line special characters using optimized character
103 scanning algorithms. Anything complicated falls back to the slow
104 path below. Since this loop is very hot it's worth doing these kinds
105 of optimizations.
106
107 One of the paths through the ifdefs should provide
108
109 const uchar *search_line_fast (const uchar *s, const uchar *end);
110
111 Between S and END, search for \n, \r, \\, ?. Return a pointer to
112 the found character.
113
114 Note that the last character of the buffer is *always* a newline,
115 as forced by _cpp_convert_input. This fact can be used to avoid
116 explicitly looking for the end of the buffer. */
117
/* Configure only gives us an ifdef test.  Supply a value of 0 when
   the macro is absent so that it can be used in ordinary
   conditions.  */
#if !defined (WORDS_BIGENDIAN)
# define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  Nothing
   in <stdint.h> names that type.  On most hosts it is unsigned long,
   but MS chose an LLP64 model.  When building with GCC we can ask for
   the "real" word size directly.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below only expects word sizes of 4 or 8.  Any other size
   gives this array type a negative length, so compilation fails.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
137
138/* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
140
141static inline word_type
142acc_char_mask_misalign (word_type val, unsigned int n)
143{
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
150}
151
152/* Return X replicated to all byte positions within WORD_TYPE. */
153
154static inline word_type
155acc_char_replicate (uchar x)
156{
157 word_type ret;
158
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
163}
164
165/* Return non-zero if some byte of VAL is (probably) C. */
166
167static inline word_type
168acc_char_cmp (word_type val, word_type c)
169{
170#if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174#else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
179
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182#endif
183}
184
185/* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
187
188static inline int
189acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
191{
192#if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196#else
197 unsigned int i;
198
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
202 {
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
208
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
211 }
212
213 return -1;
214#endif
215}
216
217/* A version of the fast scanner using bit fiddling techniques.
218
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
223
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
227
228static const uchar * search_line_acc_char (const uchar *, const uchar *)
229 ATTRIBUTE_UNUSED;
230
231static const uchar *
232search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
233{
234 const word_type repl_nl = acc_char_replicate ('\n');
235 const word_type repl_cr = acc_char_replicate ('\r');
236 const word_type repl_bs = acc_char_replicate ('\\');
237 const word_type repl_qm = acc_char_replicate ('?');
238
239 unsigned int misalign;
240 const word_type *p;
241 word_type val, t;
242
243 /* Align the buffer. Mask out any bytes from before the beginning. */
244 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
245 val = *p;
246 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
247 if (misalign)
248 val = acc_char_mask_misalign (val, misalign);
249
250 /* Main loop. */
251 while (1)
252 {
253 t = acc_char_cmp (val, repl_nl);
254 t |= acc_char_cmp (val, repl_cr);
255 t |= acc_char_cmp (val, repl_bs);
256 t |= acc_char_cmp (val, repl_qm);
257
258 if (__builtin_expect (t != 0, 0))
259 {
260 int i = acc_char_index (t, val);
261 if (i >= 0)
262 return (const uchar *)p + i;
263 }
264
265 val = *++p;
266 }
267}
268
d9f069ab 269/* Disable on Solaris 2/x86 until the following problem can be properly
789d73cb
RO
270 autoconfed:
271
789d73cb
RO
272 The Solaris 10+ assembler tags objects with the instruction set
273 extensions used, so SSE4.2 executables cannot run on machines that
274 don't support that extension. */
275
1b6b13f3 276#if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
246a2fcb
RH
277
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.  Row 0 holds sixteen '\n', row 1
   sixteen '\r', row 2 sixteen '\\' and row 3 sixteen '?'.  */
#define REPL16(C) { C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
292
293/* A version of the fast scanner using MMX vectorized byte compare insns.
294
295 This uses the PMOVMSKB instruction which was introduced with "MMX2",
ef230b38 296 which was packaged into SSE1; it is also present in the AMD MMX
246a2fcb
RH
297 extension. Mark the function as using "sse" so that we emit a real
298 "emms" instruction, rather than the 3dNOW "femms" instruction. */
299
300static const uchar *
301#ifndef __SSE__
302__attribute__((__target__("sse")))
303#endif
304search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
305{
306 typedef char v8qi __attribute__ ((__vector_size__ (8)));
307 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
308
309 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
310 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
311 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
312 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
313
314 unsigned int misalign, found, mask;
315 const v8qi *p;
316 v8qi data, t, c;
317
318 /* Align the source pointer. While MMX doesn't generate unaligned data
319 faults, this allows us to safely scan to the end of the buffer without
320 reading beyond the end of the last page. */
321 misalign = (uintptr_t)s & 7;
322 p = (const v8qi *)((uintptr_t)s & -8);
323 data = *p;
324
325 /* Create a mask for the bytes that are valid within the first
326 16-byte block. The Idea here is that the AND with the mask
327 within the loop is "free", since we need some AND or TEST
328 insn in order to set the flags for the branch anyway. */
329 mask = -1u << misalign;
330
331 /* Main loop processing 8 bytes at a time. */
332 goto start;
333 do
334 {
335 data = *++p;
336 mask = -1;
337
338 start:
339 t = __builtin_ia32_pcmpeqb(data, repl_nl);
340 c = __builtin_ia32_pcmpeqb(data, repl_cr);
341 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
342 c = __builtin_ia32_pcmpeqb(data, repl_bs);
343 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
344 c = __builtin_ia32_pcmpeqb(data, repl_qm);
345 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
346 found = __builtin_ia32_pmovmskb (t);
347 found &= mask;
348 }
349 while (!found);
350
351 __builtin_ia32_emms ();
352
353 /* FOUND contains 1 in bits for which we matched a relevant
354 character. Conversion to the byte index is trivial. */
355 found = __builtin_ctz(found);
356 return (const uchar *)p + found;
357}
358
359/* A version of the fast scanner using SSE2 vectorized byte compare insns. */
360
361static const uchar *
362#ifndef __SSE2__
363__attribute__((__target__("sse2")))
364#endif
365search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
366{
367 typedef char v16qi __attribute__ ((__vector_size__ (16)));
368
369 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
370 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
371 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
372 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
373
374 unsigned int misalign, found, mask;
375 const v16qi *p;
376 v16qi data, t;
377
378 /* Align the source pointer. */
379 misalign = (uintptr_t)s & 15;
380 p = (const v16qi *)((uintptr_t)s & -16);
381 data = *p;
382
383 /* Create a mask for the bytes that are valid within the first
384 16-byte block. The Idea here is that the AND with the mask
385 within the loop is "free", since we need some AND or TEST
386 insn in order to set the flags for the branch anyway. */
387 mask = -1u << misalign;
388
389 /* Main loop processing 16 bytes at a time. */
390 goto start;
391 do
392 {
393 data = *++p;
394 mask = -1;
395
396 start:
530b1d68 397 t = data == repl_nl;
398 t |= data == repl_cr;
399 t |= data == repl_bs;
400 t |= data == repl_qm;
246a2fcb
RH
401 found = __builtin_ia32_pmovmskb128 (t);
402 found &= mask;
403 }
404 while (!found);
405
406 /* FOUND contains 1 in bits for which we matched a relevant
407 character. Conversion to the byte index is trivial. */
408 found = __builtin_ctz(found);
409 return (const uchar *)p + found;
410}
411
6f173e52 412#ifdef HAVE_SSE4
246a2fcb
RH
413/* A version of the fast scanner using SSE 4.2 vectorized string insns. */
414
415static const uchar *
416#ifndef __SSE4_2__
417__attribute__((__target__("sse4.2")))
418#endif
419search_line_sse42 (const uchar *s, const uchar *end)
420{
421 typedef char v16qi __attribute__ ((__vector_size__ (16)));
422 static const v16qi search = { '\n', '\r', '?', '\\' };
423
424 uintptr_t si = (uintptr_t)s;
425 uintptr_t index;
426
427 /* Check for unaligned input. */
428 if (si & 15)
429 {
d35d1c0f
UB
430 v16qi sv;
431
246a2fcb
RH
432 if (__builtin_expect (end - s < 16, 0)
433 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
434 {
435 /* There are less than 16 bytes left in the buffer, and less
436 than 16 bytes left on the page. Reading 16 bytes at this
437 point might generate a spurious page fault. Defer to the
438 SSE2 implementation, which already handles alignment. */
439 return search_line_sse2 (s, end);
440 }
441
442 /* ??? The builtin doesn't understand that the PCMPESTRI read from
443 memory need not be aligned. */
d35d1c0f
UB
444 sv = __builtin_ia32_loaddqu ((const char *) s);
445 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
446
246a2fcb
RH
447 if (__builtin_expect (index < 16, 0))
448 goto found;
449
450 /* Advance the pointer to an aligned address. We will re-scan a
451 few bytes, but we no longer need care for reading past the
452 end of a page, since we're guaranteed a match. */
49445904 453 s = (const uchar *)((si + 15) & -16);
246a2fcb
RH
454 }
455
dc6bcf52
UB
456 /* Main loop, processing 16 bytes at a time. */
457#ifdef __GCC_ASM_FLAG_OUTPUTS__
458 while (1)
459 {
460 char f;
461
462 /* By using inline assembly instead of the builtin,
463 we can use the result, as well as the flags set. */
464 __asm ("%vpcmpestri\t$0, %2, %3"
465 : "=c"(index), "=@ccc"(f)
466 : "m"(*s), "x"(search), "a"(4), "d"(16));
467 if (f)
468 break;
469
470 s += 16;
471 }
472#else
473 s -= 16;
474 /* By doing the whole loop in inline assembly,
475 we can make proper use of the flags set. */
476 __asm ( ".balign 16\n"
246a2fcb 477 "0: add $16, %1\n"
dc6bcf52 478 " %vpcmpestri\t$0, (%1), %2\n"
246a2fcb
RH
479 " jnc 0b"
480 : "=&c"(index), "+r"(s)
481 : "x"(search), "a"(4), "d"(16));
dc6bcf52 482#endif
246a2fcb
RH
483
484 found:
485 return s + index;
486}
487
6f173e52
RH
488#else
489/* Work around out-dated assemblers without sse4 support. */
490#define search_line_sse42 search_line_sse2
491#endif
492
246a2fcb
RH
493/* Check the CPU capabilities. */
494
495#include "../gcc/config/i386/cpuid.h"
496
497typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
498static search_line_fast_type search_line_fast;
499
b0c084b7
JJ
500#define HAVE_init_vectorized_lexer 1
501static inline void
246a2fcb
RH
502init_vectorized_lexer (void)
503{
504 unsigned dummy, ecx = 0, edx = 0;
505 search_line_fast_type impl = search_line_acc_char;
506 int minimum = 0;
507
508#if defined(__SSE4_2__)
509 minimum = 3;
510#elif defined(__SSE2__)
511 minimum = 2;
ef230b38 512#elif defined(__SSE__)
246a2fcb
RH
513 minimum = 1;
514#endif
515
516 if (minimum == 3)
517 impl = search_line_sse42;
518 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
519 {
520 if (minimum == 3 || (ecx & bit_SSE4_2))
521 impl = search_line_sse42;
522 else if (minimum == 2 || (edx & bit_SSE2))
523 impl = search_line_sse2;
524 else if (minimum == 1 || (edx & bit_SSE))
525 impl = search_line_mmx;
526 }
527 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
528 {
5e70c0b5
UB
529 if (minimum == 1
530 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
246a2fcb
RH
531 impl = search_line_mmx;
532 }
533
534 search_line_fast = impl;
535}
536
d00b1b02 537#elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
246a2fcb 538
0ccaaab0
BS
539/* A vection of the fast scanner using AltiVec vectorized byte compares
540 and VSX unaligned loads (when VSX is available). This is otherwise
d00b1b02 541 the same as the AltiVec version. */
0ccaaab0 542
44d95244 543ATTRIBUTE_NO_SANITIZE_UNDEFINED
0ccaaab0
BS
544static const uchar *
545search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
546{
547 typedef __attribute__((altivec(vector))) unsigned char vc;
548
549 const vc repl_nl = {
550 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
551 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
552 };
553 const vc repl_cr = {
554 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
555 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
556 };
557 const vc repl_bs = {
558 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
559 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
560 };
561 const vc repl_qm = {
562 '?', '?', '?', '?', '?', '?', '?', '?',
563 '?', '?', '?', '?', '?', '?', '?', '?',
564 };
565 const vc zero = { 0 };
566
567 vc data, t;
568
569 /* Main loop processing 16 bytes at a time. */
570 do
571 {
572 vc m_nl, m_cr, m_bs, m_qm;
573
a3a821c9 574 data = __builtin_vec_vsx_ld (0, s);
0ccaaab0
BS
575 s += 16;
576
577 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
578 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
579 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
580 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
581 t = (m_nl | m_cr) | (m_bs | m_qm);
582
583 /* T now contains 0xff in bytes for which we matched one of the relevant
584 characters. We want to exit the loop if any byte in T is non-zero.
585 Below is the expansion of vec_any_ne(t, zero). */
586 }
587 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
588
589 /* Restore s to to point to the 16 bytes we just processed. */
590 s -= 16;
591
592 {
593#define N (sizeof(vc) / sizeof(long))
594
595 union {
596 vc v;
597 /* Statically assert that N is 2 or 4. */
598 unsigned long l[(N == 2 || N == 4) ? N : -1];
599 } u;
600 unsigned long l, i = 0;
601
602 u.v = t;
603
604 /* Find the first word of T that is non-zero. */
605 switch (N)
606 {
607 case 4:
608 l = u.l[i++];
609 if (l != 0)
610 break;
611 s += sizeof(unsigned long);
612 l = u.l[i++];
613 if (l != 0)
614 break;
615 s += sizeof(unsigned long);
191816a3 616 /* FALLTHRU */
0ccaaab0
BS
617 case 2:
618 l = u.l[i++];
619 if (l != 0)
620 break;
621 s += sizeof(unsigned long);
622 l = u.l[i];
623 }
624
625 /* L now contains 0xff in bytes for which we matched one of the
626 relevant characters. We can find the byte index by finding
627 its bit index and dividing by 8. */
628#ifdef __BIG_ENDIAN__
629 l = __builtin_clzl(l) >> 3;
630#else
631 l = __builtin_ctzl(l) >> 3;
632#endif
633 return s + l;
634
635#undef N
636 }
637}
638
639#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
640
641/* A vection of the fast scanner using AltiVec vectorized byte compares.
642 This cannot be used for little endian because vec_lvsl/lvsr are
643 deprecated for little endian and the code won't work properly. */
246a2fcb
RH
644/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
645 so we can't compile this function without -maltivec on the command line
646 (or implied by some other switch). */
647
648static const uchar *
649search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
650{
651 typedef __attribute__((altivec(vector))) unsigned char vc;
652
653 const vc repl_nl = {
654 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
655 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
656 };
657 const vc repl_cr = {
658 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
659 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
660 };
661 const vc repl_bs = {
662 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
663 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
664 };
665 const vc repl_qm = {
666 '?', '?', '?', '?', '?', '?', '?', '?',
667 '?', '?', '?', '?', '?', '?', '?', '?',
668 };
669 const vc ones = {
670 -1, -1, -1, -1, -1, -1, -1, -1,
671 -1, -1, -1, -1, -1, -1, -1, -1,
672 };
673 const vc zero = { 0 };
674
675 vc data, mask, t;
676
677 /* Altivec loads automatically mask addresses with -16. This lets us
678 issue the first load as early as possible. */
679 data = __builtin_vec_ld(0, (const vc *)s);
680
681 /* Discard bytes before the beginning of the buffer. Do this by
682 beginning with all ones and shifting in zeros according to the
683 mis-alignment. The LVSR instruction pulls the exact shift we
684 want from the address. */
685 mask = __builtin_vec_lvsr(0, s);
686 mask = __builtin_vec_perm(zero, ones, mask);
687 data &= mask;
688
689 /* While altivec loads mask addresses, we still need to align S so
690 that the offset we compute at the end is correct. */
691 s = (const uchar *)((uintptr_t)s & -16);
692
693 /* Main loop processing 16 bytes at a time. */
694 goto start;
695 do
696 {
697 vc m_nl, m_cr, m_bs, m_qm;
698
699 s += 16;
700 data = __builtin_vec_ld(0, (const vc *)s);
701
702 start:
703 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
704 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
705 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
706 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
707 t = (m_nl | m_cr) | (m_bs | m_qm);
708
709 /* T now contains 0xff in bytes for which we matched one of the relevant
710 characters. We want to exit the loop if any byte in T is non-zero.
711 Below is the expansion of vec_any_ne(t, zero). */
712 }
713 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
714
715 {
716#define N (sizeof(vc) / sizeof(long))
717
246a2fcb
RH
718 union {
719 vc v;
53a103d3
DS
720 /* Statically assert that N is 2 or 4. */
721 unsigned long l[(N == 2 || N == 4) ? N : -1];
246a2fcb
RH
722 } u;
723 unsigned long l, i = 0;
724
725 u.v = t;
726
727 /* Find the first word of T that is non-zero. */
728 switch (N)
729 {
730 case 4:
731 l = u.l[i++];
732 if (l != 0)
733 break;
734 s += sizeof(unsigned long);
735 l = u.l[i++];
736 if (l != 0)
737 break;
738 s += sizeof(unsigned long);
67ef83c6 739 /* FALLTHROUGH */
246a2fcb
RH
740 case 2:
741 l = u.l[i++];
742 if (l != 0)
743 break;
744 s += sizeof(unsigned long);
745 l = u.l[i];
746 }
747
748 /* L now contains 0xff in bytes for which we matched one of the
749 relevant characters. We can find the byte index by finding
750 its bit index and dividing by 8. */
751 l = __builtin_clzl(l) >> 3;
752 return s + l;
753
754#undef N
755 }
756}
757
a6ac871c
RE
758#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
759#include "arm_neon.h"
760
761/* This doesn't have to be the exact page size, but no system may use
762 a size smaller than this. ARMv8 requires a minimum page size of
763 4k. The impact of being conservative here is a small number of
764 cases will take the slightly slower entry path into the main
765 loop. */
766
767#define AARCH64_MIN_PAGE_SIZE 4096
768
769static const uchar *
770search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
771{
772 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
773 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
774 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
775 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
776 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
777
35c4515b 778#ifdef __ARM_BIG_ENDIAN
a6ac871c
RE
779 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
780#else
781 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
782#endif
783
784 unsigned int found;
785 const uint8_t *p;
786 uint8x16_t data;
787 uint8x16_t t;
788 uint16x8_t m;
789 uint8x16_t u, v, w;
790
791 /* Align the source pointer. */
792 p = (const uint8_t *)((uintptr_t)s & -16);
793
794 /* Assuming random string start positions, with a 4k page size we'll take
795 the slow path about 0.37% of the time. */
796 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
797 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
798 < 16, 0))
799 {
800 /* Slow path: the string starts near a possible page boundary. */
801 uint32_t misalign, mask;
802
803 misalign = (uintptr_t)s & 15;
804 mask = (-1u << misalign) & 0xffff;
805 data = vld1q_u8 (p);
806 t = vceqq_u8 (data, repl_nl);
807 u = vceqq_u8 (data, repl_cr);
808 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
809 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
810 t = vorrq_u8 (v, w);
811 t = vandq_u8 (t, xmask);
812 m = vpaddlq_u8 (t);
813 m = vshlq_u16 (m, shift);
814 found = vaddvq_u16 (m);
815 found &= mask;
816 if (found)
817 return (const uchar*)p + __builtin_ctz (found);
818 }
819 else
820 {
821 data = vld1q_u8 ((const uint8_t *) s);
822 t = vceqq_u8 (data, repl_nl);
823 u = vceqq_u8 (data, repl_cr);
824 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
825 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
826 t = vorrq_u8 (v, w);
8c00ae24 827 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
a6ac871c
RE
828 goto done;
829 }
830
831 do
832 {
833 p += 16;
834 data = vld1q_u8 (p);
835 t = vceqq_u8 (data, repl_nl);
836 u = vceqq_u8 (data, repl_cr);
837 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
838 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
839 t = vorrq_u8 (v, w);
840 } while (!vpaddd_u64 ((uint64x2_t)t));
841
842done:
843 /* Now that we've found the terminating substring, work out precisely where
844 we need to stop. */
845 t = vandq_u8 (t, xmask);
846 m = vpaddlq_u8 (t);
847 m = vshlq_u16 (m, shift);
848 found = vaddvq_u16 (m);
849 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
850 + __builtin_ctz (found));
851}
852
95d0610c 853#elif defined (__ARM_NEON)
e75b54a2
RE
854#include "arm_neon.h"
855
856static const uchar *
857search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
858{
859 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
860 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
861 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
862 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
863 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
864
865 unsigned int misalign, found, mask;
866 const uint8_t *p;
867 uint8x16_t data;
868
869 /* Align the source pointer. */
870 misalign = (uintptr_t)s & 15;
871 p = (const uint8_t *)((uintptr_t)s & -16);
872 data = vld1q_u8 (p);
873
874 /* Create a mask for the bytes that are valid within the first
875 16-byte block. The Idea here is that the AND with the mask
876 within the loop is "free", since we need some AND or TEST
877 insn in order to set the flags for the branch anyway. */
878 mask = (-1u << misalign) & 0xffff;
879
880 /* Main loop, processing 16 bytes at a time. */
881 goto start;
882
883 do
884 {
885 uint8x8_t l;
886 uint16x4_t m;
887 uint32x2_t n;
888 uint8x16_t t, u, v, w;
889
890 p += 16;
891 data = vld1q_u8 (p);
892 mask = 0xffff;
893
894 start:
895 t = vceqq_u8 (data, repl_nl);
896 u = vceqq_u8 (data, repl_cr);
897 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
898 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
899 t = vandq_u8 (vorrq_u8 (v, w), xmask);
900 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
901 m = vpaddl_u8 (l);
902 n = vpaddl_u16 (m);
903
904 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
905 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
906 found &= mask;
907 }
908 while (!found);
909
910 /* FOUND contains 1 in bits for which we matched a relevant
911 character. Conversion to the byte index is trivial. */
912 found = __builtin_ctz (found);
913 return (const uchar *)p + found;
914}
915
246a2fcb
RH
916#else
917
5764ee3c 918/* We only have one accelerated alternative. Use a direct call so that
246a2fcb
RH
919 we encourage inlining. */
920
921#define search_line_fast search_line_acc_char
922
923#endif
924
b0c084b7
JJ
/* Initialize the lexer if needed.  When a vectorized line scanner has
   to be selected at run time, do that now; otherwise there is nothing
   to do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
934
26aea073
NB
935/* Returns with a logical line that contains no escaped newlines or
936 trigraphs. This is a time-critical inner loop. */
937void
6cf87ca4 938_cpp_clean_line (cpp_reader *pfile)
45b966db 939{
26aea073
NB
940 cpp_buffer *buffer;
941 const uchar *s;
942 uchar c, *d, *p;
87062813 943
26aea073
NB
944 buffer = pfile->buffer;
945 buffer->cur_note = buffer->notes_used = 0;
946 buffer->cur = buffer->line_base = buffer->next_line;
947 buffer->need_line = false;
246a2fcb 948 s = buffer->next_line;
87062813 949
26aea073 950 if (!buffer->from_stage3)
45b966db 951 {
7af45bd4
ILT
952 const uchar *pbackslash = NULL;
953
246a2fcb 954 /* Fast path. This is the common case of an un-escaped line with
d08dcf87
ZW
955 no trigraphs. The primary win here is by not writing any
956 data back to memory until we have to. */
246a2fcb 957 while (1)
d08dcf87 958 {
246a2fcb
RH
959 /* Perform an optimized search for \n, \r, \\, ?. */
960 s = search_line_fast (s, buffer->rlimit);
d08dcf87 961
246a2fcb
RH
962 c = *s;
963 if (c == '\\')
964 {
965 /* Record the location of the backslash and continue. */
966 pbackslash = s++;
d08dcf87 967 }
246a2fcb 968 else if (__builtin_expect (c == '?', 0))
d08dcf87 969 {
246a2fcb
RH
970 if (__builtin_expect (s[1] == '?', false)
971 && _cpp_trigraph_map[s[2]])
d08dcf87 972 {
246a2fcb
RH
973 /* Have a trigraph. We may or may not have to convert
974 it. Add a line note regardless, for -Wtrigraphs. */
975 add_line_note (buffer, s, s[2]);
976 if (CPP_OPTION (pfile, trigraphs))
977 {
978 /* We do, and that means we have to switch to the
979 slow path. */
980 d = (uchar *) s;
981 *d = _cpp_trigraph_map[s[2]];
982 s += 2;
983 goto slow_path;
984 }
d08dcf87 985 }
246a2fcb
RH
986 /* Not a trigraph. Continue on fast-path. */
987 s++;
d08dcf87 988 }
246a2fcb
RH
989 else
990 break;
d08dcf87
ZW
991 }
992
246a2fcb
RH
993 /* This must be \r or \n. We're either done, or we'll be forced
994 to write back to the buffer and continue on the slow path. */
995 d = (uchar *) s;
996
997 if (__builtin_expect (s == buffer->rlimit, false))
998 goto done;
999
1000 /* DOS line ending? */
1001 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
1002 {
1003 s++;
1004 if (s == buffer->rlimit)
1005 goto done;
1006 }
1007
1008 if (__builtin_expect (pbackslash == NULL, true))
1009 goto done;
1010
1011 /* Check for escaped newline. */
1012 p = d;
1013 while (is_nvspace (p[-1]))
1014 p--;
1015 if (p - 1 != pbackslash)
1016 goto done;
1017
1018 /* Have an escaped newline; process it and proceed to
1019 the slow path. */
1020 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1021 d = p - 2;
1022 buffer->next_line = p - 1;
26aea073 1023
246a2fcb
RH
1024 slow_path:
1025 while (1)
4a5b68a2 1026 {
26aea073
NB
1027 c = *++s;
1028 *++d = c;
1029
1030 if (c == '\n' || c == '\r')
1031 {
246a2fcb 1032 /* Handle DOS line endings. */
26aea073
NB
1033 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1034 s++;
1035 if (s == buffer->rlimit)
1036 break;
1037
1038 /* Escaped? */
1039 p = d;
1040 while (p != buffer->next_line && is_nvspace (p[-1]))
1041 p--;
1042 if (p == buffer->next_line || p[-1] != '\\')
1043 break;
1044
41c32c98 1045 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
26aea073
NB
1046 d = p - 2;
1047 buffer->next_line = p - 1;
1048 }
1049 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1050 {
1051 /* Add a note regardless, for the benefit of -Wtrigraphs. */
41c32c98 1052 add_line_note (buffer, d, s[2]);
26aea073
NB
1053 if (CPP_OPTION (pfile, trigraphs))
1054 {
1055 *d = _cpp_trigraph_map[s[2]];
1056 s += 2;
1057 }
1058 }
4a5b68a2 1059 }
45b966db 1060 }
26aea073
NB
1061 else
1062 {
246a2fcb 1063 while (*s != '\n' && *s != '\r')
26aea073 1064 s++;
26aea073
NB
1065 d = (uchar *) s;
1066
1067 /* Handle DOS line endings. */
082a7b23 1068 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
26aea073
NB
1069 s++;
1070 }
0d9f234d 1071
d08dcf87 1072 done:
26aea073 1073 *d = '\n';
41c32c98
NB
1074 /* A sentinel note that should never be processed. */
1075 add_line_note (buffer, d + 1, '\n');
26aea073 1076 buffer->next_line = s + 1;
45b966db
ZW
1077}
1078
3ad2167b
LH
1079template <bool lexing_raw_string>
1080static bool get_fresh_line_impl (cpp_reader *pfile);
1081
a8eb6044
NB
1082/* Return true if the trigraph indicated by NOTE should be warned
1083 about in a comment. */
1084static bool
6cf87ca4 1085warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
a8eb6044
NB
1086{
1087 const uchar *p;
1088
1089 /* Within comments we don't warn about trigraphs, unless the
1090 trigraph forms an escaped newline, as that may change
6356f892 1091 behavior. */
a8eb6044
NB
1092 if (note->type != '/')
1093 return false;
1094
1095 /* If -trigraphs, then this was an escaped newline iff the next note
1096 is coincident. */
1097 if (CPP_OPTION (pfile, trigraphs))
1098 return note[1].pos == note->pos;
1099
1100 /* Otherwise, see if this forms an escaped newline. */
1101 p = note->pos + 3;
1102 while (is_nvspace (*p))
1103 p++;
1104
1105 /* There might have been escaped newlines between the trigraph and the
1106 newline we found. Hence the position test. */
1107 return (*p == '\n' && p < note[1].pos);
1108}
1109
26aea073
NB
1110/* Process the notes created by add_line_note as far as the current
1111 location. */
1112void
6cf87ca4 1113_cpp_process_line_notes (cpp_reader *pfile, int in_comment)
45b966db 1114{
29401c30
NB
1115 cpp_buffer *buffer = pfile->buffer;
1116
26aea073 1117 for (;;)
041c3194 1118 {
26aea073
NB
1119 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1120 unsigned int col;
a5c3cccd 1121
26aea073
NB
1122 if (note->pos > buffer->cur)
1123 break;
a5c3cccd 1124
26aea073
NB
1125 buffer->cur_note++;
1126 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
4d6baafa 1127
41c32c98 1128 if (note->type == '\\' || note->type == ' ')
26aea073 1129 {
41c32c98 1130 if (note->type == ' ' && !in_comment)
500bee0a 1131 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
26aea073 1132 "backslash and newline separated by space");
41c32c98 1133
26aea073 1134 if (buffer->next_line > buffer->rlimit)
87062813 1135 {
500bee0a 1136 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
26aea073
NB
1137 "backslash-newline at end of file");
1138 /* Prevent "no newline at end of file" warning. */
1139 buffer->next_line = buffer->rlimit;
87062813 1140 }
26aea073
NB
1141
1142 buffer->line_base = note->pos;
12f9df4e 1143 CPP_INCREMENT_LINE (pfile, 0);
0d9f234d 1144 }
41c32c98
NB
1145 else if (_cpp_trigraph_map[note->type])
1146 {
a8eb6044
NB
1147 if (CPP_OPTION (pfile, warn_trigraphs)
1148 && (!in_comment || warn_in_comment (pfile, note)))
41c32c98
NB
1149 {
1150 if (CPP_OPTION (pfile, trigraphs))
87cf0651
SB
1151 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1152 pfile->line_table->highest_line, col,
1153 "trigraph ??%c converted to %c",
1154 note->type,
1155 (int) _cpp_trigraph_map[note->type]);
41c32c98 1156 else
905bd7b5 1157 {
87cf0651
SB
1158 cpp_warning_with_line
1159 (pfile, CPP_W_TRIGRAPHS,
1160 pfile->line_table->highest_line, col,
905bd7b5
GK
1161 "trigraph ??%c ignored, use -trigraphs to enable",
1162 note->type);
1163 }
41c32c98
NB
1164 }
1165 }
00a81b8b
JM
1166 else if (note->type == 0)
1167 /* Already processed in lex_raw_string. */;
41c32c98
NB
1168 else
1169 abort ();
041c3194 1170 }
45b966db
ZW
1171}
1172
51c50026
MP
1173namespace bidi {
1174 enum class kind {
1175 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1176 };
1177
1178 /* All the UTF-8 encodings of bidi characters start with E2. */
1179 constexpr uchar utf8_start = 0xe2;
1180
bef32d4a
DM
1181 struct context
1182 {
1183 context () {}
1184 context (location_t loc, kind k, bool pdf, bool ucn)
1185 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1186 {
1187 }
1188
1189 kind get_pop_kind () const
1190 {
1191 return m_pdf ? kind::PDF : kind::PDI;
1192 }
1193 bool ucn_p () const
1194 {
1195 return m_ucn;
1196 }
1197
1198 location_t m_loc;
1199 kind m_kind;
1200 unsigned m_pdf : 1;
1201 unsigned m_ucn : 1;
1202 };
1203
51c50026
MP
1204 /* A vector holding currently open bidi contexts. We use a char for
1205 each context, its LSB is 1 if it represents a PDF context, 0 if it
1206 represents a PDI context. The next bit is 1 if this context was open
1207 by a bidi character written as a UCN, and 0 when it was UTF-8. */
bef32d4a 1208 semi_embedded_vec <context, 16> vec;
51c50026
MP
1209
1210 /* Close the whole comment/identifier/string literal/character constant
1211 context. */
1212 void on_close ()
1213 {
1214 vec.truncate (0);
1215 }
1216
1217 /* Pop the last element in the vector. */
1218 void pop ()
1219 {
1220 unsigned int len = vec.count ();
1221 gcc_checking_assert (len > 0);
1222 vec.truncate (len - 1);
1223 }
1224
bef32d4a
DM
1225 /* Return the pop kind of the context of the Ith element. */
1226 kind pop_kind_at (unsigned int i)
51c50026 1227 {
bef32d4a 1228 return vec[i].get_pop_kind ();
51c50026
MP
1229 }
1230
bef32d4a 1231 /* Return the pop kind of the context that is currently opened. */
51c50026
MP
1232 kind current_ctx ()
1233 {
1234 unsigned int len = vec.count ();
1235 if (len == 0)
1236 return kind::NONE;
bef32d4a 1237 return vec[len - 1].get_pop_kind ();
51c50026
MP
1238 }
1239
1240 /* Return true if the current context comes from a UCN origin, that is,
1241 the bidi char which started this bidi context was written as a UCN. */
1242 bool current_ctx_ucn_p ()
1243 {
1244 unsigned int len = vec.count ();
1245 gcc_checking_assert (len > 0);
bef32d4a 1246 return vec[len - 1].m_ucn;
51c50026
MP
1247 }
1248
bef32d4a
DM
1249 location_t current_ctx_loc ()
1250 {
1251 unsigned int len = vec.count ();
1252 gcc_checking_assert (len > 0);
1253 return vec[len - 1].m_loc;
1254 }
1255
1256 /* We've read a bidi char, update the current vector as necessary.
1257 LOC is only valid when K is not kind::NONE. */
1258 void on_char (kind k, bool ucn_p, location_t loc)
51c50026
MP
1259 {
1260 switch (k)
1261 {
1262 case kind::LRE:
1263 case kind::RLE:
1264 case kind::LRO:
1265 case kind::RLO:
bef32d4a 1266 vec.push (context (loc, k, true, ucn_p));
51c50026
MP
1267 break;
1268 case kind::LRI:
1269 case kind::RLI:
1270 case kind::FSI:
bef32d4a 1271 vec.push (context (loc, k, false, ucn_p));
51c50026
MP
1272 break;
1273 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1274 whose scope has not yet been terminated. */
1275 case kind::PDF:
1276 if (current_ctx () == kind::PDF)
1277 pop ();
1278 break;
1279 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1280 scope has not yet been terminated, as well as the scopes of
1281 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1282 yet been terminated. */
1283 case kind::PDI:
1284 for (int i = vec.count () - 1; i >= 0; --i)
bef32d4a 1285 if (pop_kind_at (i) == kind::PDI)
51c50026
MP
1286 {
1287 vec.truncate (i);
1288 break;
1289 }
1290 break;
1291 case kind::LTR:
1292 case kind::RTL:
1293 /* These aren't popped by a PDF/PDI. */
1294 break;
630686f9 1295 ATTR_LIKELY case kind::NONE:
51c50026
MP
1296 break;
1297 default:
1298 abort ();
1299 }
1300 }
1301
1302 /* Return a descriptive string for K. */
1303 const char *to_str (kind k)
1304 {
1305 switch (k)
1306 {
1307 case kind::LRE:
1308 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1309 case kind::RLE:
1310 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1311 case kind::LRO:
1312 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1313 case kind::RLO:
1314 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1315 case kind::LRI:
1316 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1317 case kind::RLI:
1318 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1319 case kind::FSI:
1320 return "U+2068 (FIRST STRONG ISOLATE)";
1321 case kind::PDF:
1322 return "U+202C (POP DIRECTIONAL FORMATTING)";
1323 case kind::PDI:
1324 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1325 case kind::LTR:
1326 return "U+200E (LEFT-TO-RIGHT MARK)";
1327 case kind::RTL:
1328 return "U+200F (RIGHT-TO-LEFT MARK)";
1329 default:
1330 abort ();
1331 }
1332 }
1333}
1334
bef32d4a
DM
1335/* Get location_t for the range of bytes [START, START + NUM_BYTES)
1336 within the current line in FILE, with the caret at START. */
1337
1338static location_t
1339get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1340 const unsigned char *const start,
1341 size_t num_bytes)
1342{
1343 gcc_checking_assert (num_bytes > 0);
1344
1345 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1346 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1347 whereas linemap_position_for_column is 1-based. */
1348
1349 /* Get 0-based offsets within the line. */
1350 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1351 size_t end_offset = start_offset + num_bytes - 1;
1352
1353 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1354 location_t start_loc = linemap_position_for_column (pfile->line_table,
1355 start_offset + 1);
1356 location_t end_loc = linemap_position_for_column (pfile->line_table,
1357 end_offset + 1);
1358
1359 if (start_loc == end_loc)
1360 return start_loc;
1361
1362 source_range src_range;
1363 src_range.m_start = start_loc;
1364 src_range.m_finish = end_loc;
1f68a3e8
DM
1365 location_t combined_loc
1366 = pfile->line_table->get_or_create_combined_loc (start_loc,
1367 src_range,
1368 nullptr,
1369 0);
bef32d4a
DM
1370 return combined_loc;
1371}
1372
51c50026
MP
1373/* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1374
1375static bidi::kind
bef32d4a 1376get_bidi_utf8_1 (const unsigned char *const p)
51c50026
MP
1377{
1378 gcc_checking_assert (p[0] == bidi::utf8_start);
1379
1380 if (p[1] == 0x80)
1381 switch (p[2])
1382 {
1383 case 0xaa:
1384 return bidi::kind::LRE;
1385 case 0xab:
1386 return bidi::kind::RLE;
1387 case 0xac:
1388 return bidi::kind::PDF;
1389 case 0xad:
1390 return bidi::kind::LRO;
1391 case 0xae:
1392 return bidi::kind::RLO;
1393 case 0x8e:
1394 return bidi::kind::LTR;
1395 case 0x8f:
1396 return bidi::kind::RTL;
1397 default:
1398 break;
1399 }
1400 else if (p[1] == 0x81)
1401 switch (p[2])
1402 {
1403 case 0xa6:
1404 return bidi::kind::LRI;
1405 case 0xa7:
1406 return bidi::kind::RLI;
1407 case 0xa8:
1408 return bidi::kind::FSI;
1409 case 0xa9:
1410 return bidi::kind::PDI;
1411 default:
1412 break;
1413 }
1414
1415 return bidi::kind::NONE;
1416}
1417
bef32d4a
DM
1418/* Parse a sequence of 3 bytes starting with P and return its bidi code.
1419 If the kind is not NONE, write the location to *OUT.*/
1420
1421static bidi::kind
1422get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1423{
1424 bidi::kind result = get_bidi_utf8_1 (p);
1425 if (result != bidi::kind::NONE)
1426 {
1427 /* We have a sequence of 3 bytes starting at P. */
1428 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1429 }
1430 return result;
1431}
1432
51c50026
MP
1433/* Parse a UCN where P points just past \u or \U and return its bidi code. */
1434
1435static bidi::kind
e9dd050e 1436get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
51c50026
MP
1437{
1438 /* 6.4.3 Universal Character Names
1439 \u hex-quad
1440 \U hex-quad hex-quad
e9dd050e 1441 \u { simple-hexadecimal-digit-sequence }
51c50026
MP
1442 where \unnnn means \U0000nnnn. */
1443
e9dd050e 1444 *end = p + 4;
51c50026
MP
1445 if (is_U)
1446 {
1447 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1448 return bidi::kind::NONE;
1449 /* Skip 4B so we can treat \u and \U the same below. */
1450 p += 4;
e9dd050e
JJ
1451 *end += 4;
1452 }
1453 else if (p[0] == '{')
1454 {
1455 p++;
1456 while (*p == '0')
1457 p++;
1458 if (p[0] != '2'
1459 || p[1] != '0'
1460 || !ISXDIGIT (p[2])
1461 || !ISXDIGIT (p[3])
1462 || p[4] != '}')
1463 return bidi::kind::NONE;
1464 *end = p + 5;
51c50026
MP
1465 }
1466
1467 /* All code points we are looking for start with 20xx. */
1468 if (p[0] != '2' || p[1] != '0')
1469 return bidi::kind::NONE;
1470 else if (p[2] == '2')
1471 switch (p[3])
1472 {
1473 case 'a':
1474 case 'A':
1475 return bidi::kind::LRE;
1476 case 'b':
1477 case 'B':
1478 return bidi::kind::RLE;
1479 case 'c':
1480 case 'C':
1481 return bidi::kind::PDF;
1482 case 'd':
1483 case 'D':
1484 return bidi::kind::LRO;
1485 case 'e':
1486 case 'E':
1487 return bidi::kind::RLO;
1488 default:
1489 break;
1490 }
1491 else if (p[2] == '6')
1492 switch (p[3])
1493 {
1494 case '6':
1495 return bidi::kind::LRI;
1496 case '7':
1497 return bidi::kind::RLI;
1498 case '8':
1499 return bidi::kind::FSI;
1500 case '9':
1501 return bidi::kind::PDI;
1502 default:
1503 break;
1504 }
1505 else if (p[2] == '0')
1506 switch (p[3])
1507 {
1508 case 'e':
1509 case 'E':
1510 return bidi::kind::LTR;
1511 case 'f':
1512 case 'F':
1513 return bidi::kind::RTL;
1514 default:
1515 break;
1516 }
1517
1518 return bidi::kind::NONE;
1519}
1520
bef32d4a 1521/* Parse a UCN where P points just past \u or \U and return its bidi code.
eb4879ab 1522 If the kind is not NONE, write the location to *OUT. */
bef32d4a
DM
1523
1524static bidi::kind
e9dd050e 1525get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
bef32d4a
DM
1526 location_t *out)
1527{
e9dd050e
JJ
1528 const unsigned char *end;
1529 bidi::kind result = get_bidi_ucn_1 (p, is_U, &end);
bef32d4a
DM
1530 if (result != bidi::kind::NONE)
1531 {
1532 const unsigned char *start = p - 2;
e9dd050e 1533 size_t num_bytes = end - start;
bef32d4a
DM
1534 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1535 }
1536 return result;
1537}
1538
eb4879ab
JJ
1539/* Parse a named universal character escape where P points just past \N and
1540 return its bidi code. If the kind is not NONE, write the location to
1541 *OUT. */
1542
1543static bidi::kind
1544get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1545{
1546 bidi::kind result = bidi::kind::NONE;
1547 if (*p != '{')
1548 return bidi::kind::NONE;
1549 if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
1550 {
1551 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1552 result = bidi::kind::LTR;
1553 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1554 result = bidi::kind::LRE;
1555 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1556 result = bidi::kind::LRO;
1557 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1558 result = bidi::kind::LRI;
1559 }
1560 else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
1561 {
1562 if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
1563 result = bidi::kind::RTL;
1564 else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
1565 result = bidi::kind::RLE;
1566 else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
1567 result = bidi::kind::RLO;
1568 else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
1569 result = bidi::kind::RLI;
1570 }
1571 else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
1572 {
1573 if (strncmp ((const char *) (p + 16), "FORMATTING}", 11) == 0)
1574 result = bidi::kind::PDF;
1575 else if (strncmp ((const char *) (p + 16), "ISOLATE}", 8) == 0)
1576 result = bidi::kind::PDI;
1577 }
1578 else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
1579 result = bidi::kind::FSI;
1580 if (result != bidi::kind::NONE)
1581 *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
1582 (strchr ((const char *)
1583 (p + 1), '}')
1584 - (const char *) p)
1585 + 3);
1586 return result;
1587}
1588
bef32d4a
DM
1589/* Subclass of rich_location for reporting on unpaired UTF-8
1590 bidirectional control character(s).
1591 Escape the source lines on output, and show all unclosed
1592 bidi context, labelling everything. */
1593
1594class unpaired_bidi_rich_location : public rich_location
1595{
1596 public:
1597 class custom_range_label : public range_label
1598 {
1599 public:
ff171cb1 1600 label_text get_text (unsigned range_idx) const final override
bef32d4a
DM
1601 {
1602 /* range 0 is the primary location; each subsequent range i + 1
1603 is for bidi::vec[i]. */
1604 if (range_idx > 0)
1605 {
1606 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1607 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1608 }
1609 else
1610 return label_text::borrow (_("end of bidirectional context"));
1611 }
1612 };
1613
1614 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1615 : rich_location (pfile->line_table, loc, &m_custom_label)
1616 {
1617 set_escape_on_output (true);
1618 for (unsigned i = 0; i < bidi::vec.count (); i++)
1619 add_range (bidi::vec[i].m_loc,
1620 SHOW_RANGE_WITHOUT_CARET,
1621 &m_custom_label);
1622 }
1623
1624 private:
1625 custom_range_label m_custom_label;
1626};
1627
51c50026
MP
1628/* We're closing a bidi context, that is, we've encountered a newline,
1629 are closing a C-style comment, or are at the end of a string literal,
1630 character constant, or identifier. Warn if this context was not
1631 properly terminated by a PDI or PDF. P points to the last character
1632 in this context. */
1633
1634static void
1635maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1636{
ae36f839
MP
1637 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1638 if (bidi::vec.count () > 0
1639 && (warn_bidi & bidirectional_unpaired
1640 && (!bidi::current_ctx_ucn_p ()
1641 || (warn_bidi & bidirectional_ucn))))
51c50026
MP
1642 {
1643 const location_t loc
1644 = linemap_position_for_column (pfile->line_table,
1645 CPP_BUF_COLUMN (pfile->buffer, p));
bef32d4a
DM
1646 unpaired_bidi_rich_location rich_loc (pfile, loc);
1647 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1648 forms of a diagnostic, so fake it for now. */
1649 if (bidi::vec.count () > 1)
1650 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1651 "unpaired UTF-8 bidirectional control characters "
1652 "detected");
1653 else
1654 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1655 "unpaired UTF-8 bidirectional control character "
1656 "detected");
51c50026
MP
1657 }
1658 /* We're done with this context. */
1659 bidi::on_close ();
1660}
1661
1662/* We're at the beginning or in the middle of an identifier/comment/string
1663 literal/character constant. Warn if we've encountered a bidi character.
bef32d4a
DM
1664 KIND says which bidi control character it was; UCN_P is true iff this bidi
1665 control character was written as a UCN. LOC is the location of the
1666 character, but is only valid if KIND != bidi::kind::NONE. */
51c50026
MP
1667
1668static void
bef32d4a
DM
1669maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1670 bool ucn_p, location_t loc)
51c50026
MP
1671{
1672 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1673 return;
1674
1675 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1676
ae36f839 1677 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
51c50026 1678 {
1a7f2c07
DM
1679 rich_location rich_loc (pfile->line_table, loc);
1680 rich_loc.set_escape_on_output (true);
1681
51c50026
MP
1682 /* It seems excessive to warn about a PDI/PDF that is closing
1683 an opened context because we've already warned about the
1684 opening character. Except warn when we have a UCN x UTF-8
ae36f839 1685 mismatch, if UCN checking is enabled. */
51c50026
MP
1686 if (kind == bidi::current_ctx ())
1687 {
ae36f839 1688 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
51c50026 1689 && bidi::current_ctx_ucn_p () != ucn_p)
bef32d4a
DM
1690 {
1691 rich_loc.add_range (bidi::current_ctx_loc ());
1692 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1693 "UTF-8 vs UCN mismatch when closing "
1694 "a context by \"%s\"", bidi::to_str (kind));
1695 }
51c50026 1696 }
ae36f839
MP
1697 else if (warn_bidi & bidirectional_any
1698 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
51c50026
MP
1699 {
1700 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1a7f2c07
DM
1701 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1702 "\"%s\" is closing an unopened context",
1703 bidi::to_str (kind));
51c50026 1704 else
1a7f2c07
DM
1705 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1706 "found problematic Unicode character \"%s\"",
1707 bidi::to_str (kind));
51c50026
MP
1708 }
1709 }
1710 /* We're done with this context. */
bef32d4a 1711 bidi::on_char (kind, ucn_p, loc);
51c50026
MP
1712}
1713
0b8c57ed
JJ
1714static const cppchar_t utf8_continuation = 0x80;
1715static const cppchar_t utf8_signifier = 0xC0;
1716
1717/* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1718 at PFILE->buffer->cur. Return a pointer after the diagnosed
1719 invalid character. */
1720
1721static const uchar *
1722_cpp_warn_invalid_utf8 (cpp_reader *pfile)
1723{
1724 cpp_buffer *buffer = pfile->buffer;
1725 const uchar *cur = buffer->cur;
1726 bool pedantic = (CPP_PEDANTIC (pfile)
1727 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1728
1729 if (cur[0] < utf8_signifier
1730 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1731 {
1732 if (pedantic)
1733 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1734 pfile->line_table->highest_line,
1735 CPP_BUF_COL (buffer),
1736 "invalid UTF-8 character <%x>",
1737 cur[0]);
1738 else
1739 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1740 pfile->line_table->highest_line,
1741 CPP_BUF_COL (buffer),
1742 "invalid UTF-8 character <%x>",
1743 cur[0]);
1744 return cur + 1;
1745 }
1746 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1747 {
1748 if (pedantic)
1749 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1750 pfile->line_table->highest_line,
1751 CPP_BUF_COL (buffer),
1752 "invalid UTF-8 character <%x><%x>",
1753 cur[0], cur[1]);
1754 else
1755 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1756 pfile->line_table->highest_line,
1757 CPP_BUF_COL (buffer),
1758 "invalid UTF-8 character <%x><%x>",
1759 cur[0], cur[1]);
1760 return cur + 2;
1761 }
1762 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1763 {
1764 if (pedantic)
1765 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1766 pfile->line_table->highest_line,
1767 CPP_BUF_COL (buffer),
1768 "invalid UTF-8 character <%x><%x><%x>",
1769 cur[0], cur[1], cur[2]);
1770 else
1771 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1772 pfile->line_table->highest_line,
1773 CPP_BUF_COL (buffer),
1774 "invalid UTF-8 character <%x><%x><%x>",
1775 cur[0], cur[1], cur[2]);
1776 return cur + 3;
1777 }
1778 else
1779 {
1780 if (pedantic)
1781 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1782 pfile->line_table->highest_line,
1783 CPP_BUF_COL (buffer),
1784 "invalid UTF-8 character <%x><%x><%x><%x>",
1785 cur[0], cur[1], cur[2], cur[3]);
1786 else
1787 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1788 pfile->line_table->highest_line,
1789 CPP_BUF_COL (buffer),
1790 "invalid UTF-8 character <%x><%x><%x><%x>",
1791 cur[0], cur[1], cur[2], cur[3]);
1792 return cur + 4;
1793 }
1794}
1795
1796/* Helper function of *skip_*_comment and lex*_string. For C,
1797 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1798 -Winvalid-utf8 diagnostics and return pointer to first character
1799 that should be processed next. */
1800
1801static inline const uchar *
1802_cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1803 const uchar *cur, bool warn_bidi_p,
1804 bool warn_invalid_utf8_p)
1805{
1806 /* If this is a beginning of a UTF-8 encoding, it might be
1807 a bidirectional control character. */
1808 if (c == bidi::utf8_start && warn_bidi_p)
1809 {
1810 location_t loc;
1811 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1812 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1813 }
1814 if (!warn_invalid_utf8_p)
1815 return cur;
1816 if (c >= utf8_signifier)
1817 {
1818 cppchar_t s;
1819 const uchar *pstr = cur - 1;
1820 if (_cpp_valid_utf8 (pfile, &pstr, pfile->buffer->rlimit, 0, NULL, &s)
1821 && s <= UCS_LIMIT)
1822 return pstr;
1823 }
1824 pfile->buffer->cur = cur - 1;
1825 return _cpp_warn_invalid_utf8 (pfile);
1826}
1827
0d9f234d
NB
1828/* Skip a C-style block comment. We find the end of the comment by
1829 seeing if an asterisk is before every '/' we encounter. Returns
6f572ac2
NB
1830 nonzero if comment terminated by EOF, zero otherwise.
1831
1832 Buffer->cur points to the initial asterisk of the comment. */
26aea073 1833bool
6cf87ca4 1834_cpp_skip_block_comment (cpp_reader *pfile)
45b966db 1835{
041c3194 1836 cpp_buffer *buffer = pfile->buffer;
d08dcf87
ZW
1837 const uchar *cur = buffer->cur;
1838 uchar c;
51c50026 1839 const bool warn_bidi_p = pfile->warn_bidi_p ();
0b8c57ed
JJ
1840 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1841 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
0d9f234d 1842
d08dcf87
ZW
1843 cur++;
1844 if (*cur == '/')
1845 cur++;
0d9f234d 1846
26aea073
NB
1847 for (;;)
1848 {
0d9f234d
NB
1849 /* People like decorating comments with '*', so check for '/'
1850 instead for efficiency. */
d08dcf87
ZW
1851 c = *cur++;
1852
041c3194 1853 if (c == '/')
45b966db 1854 {
d08dcf87 1855 if (cur[-2] == '*')
51c50026
MP
1856 {
1857 if (warn_bidi_p)
1858 maybe_warn_bidi_on_close (pfile, cur);
1859 break;
1860 }
041c3194 1861
0d9f234d 1862 /* Warn about potential nested comments, but not if the '/'
a1f300c0 1863 comes immediately before the true comment delimiter.
041c3194 1864 Don't bother to get it right across escaped newlines. */
0d9f234d 1865 if (CPP_OPTION (pfile, warn_comments)
d08dcf87
ZW
1866 && cur[0] == '*' && cur[1] != '/')
1867 {
1868 buffer->cur = cur;
87cf0651
SB
1869 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1870 pfile->line_table->highest_line,
1871 CPP_BUF_COL (buffer),
1872 "\"/*\" within comment");
d08dcf87 1873 }
45b966db 1874 }
26aea073
NB
1875 else if (c == '\n')
1876 {
12f9df4e 1877 unsigned int cols;
d08dcf87 1878 buffer->cur = cur - 1;
51c50026
MP
1879 if (warn_bidi_p)
1880 maybe_warn_bidi_on_close (pfile, cur);
26aea073
NB
1881 _cpp_process_line_notes (pfile, true);
1882 if (buffer->next_line >= buffer->rlimit)
1883 return true;
1884 _cpp_clean_line (pfile);
12f9df4e
PB
1885
1886 cols = buffer->next_line - buffer->line_base;
1887 CPP_INCREMENT_LINE (pfile, cols);
1888
d08dcf87 1889 cur = buffer->cur;
26aea073 1890 }
0b8c57ed
JJ
1891 else if (__builtin_expect (c >= utf8_continuation, 0)
1892 && warn_bidi_or_invalid_utf8_p)
1893 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1894 warn_invalid_utf8_p);
45b966db 1895 }
041c3194 1896
d08dcf87 1897 buffer->cur = cur;
a8eb6044 1898 _cpp_process_line_notes (pfile, true);
26aea073 1899 return false;
45b966db
ZW
1900}
1901
480709cc 1902/* Skip a C++ line comment, leaving buffer->cur pointing to the
da7d8304 1903 terminating newline. Handles escaped newlines. Returns nonzero
480709cc 1904 if a multiline comment. */
041c3194 1905static int
6cf87ca4 1906skip_line_comment (cpp_reader *pfile)
45b966db 1907{
cbcff6df 1908 cpp_buffer *buffer = pfile->buffer;
620e594b 1909 location_t orig_line = pfile->line_table->highest_line;
51c50026 1910 const bool warn_bidi_p = pfile->warn_bidi_p ();
0b8c57ed
JJ
1911 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1912 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
041c3194 1913
0b8c57ed 1914 if (!warn_bidi_or_invalid_utf8_p)
51c50026
MP
1915 while (*buffer->cur != '\n')
1916 buffer->cur++;
0b8c57ed 1917 else if (!warn_invalid_utf8_p)
51c50026
MP
1918 {
1919 while (*buffer->cur != '\n'
1920 && *buffer->cur != bidi::utf8_start)
1921 buffer->cur++;
1922 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1923 {
1924 while (*buffer->cur != '\n')
1925 {
1926 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1927 {
bef32d4a
DM
1928 location_t loc;
1929 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1930 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
51c50026
MP
1931 }
1932 buffer->cur++;
1933 }
1934 maybe_warn_bidi_on_close (pfile, buffer->cur);
1935 }
1936 }
0b8c57ed
JJ
1937 else
1938 {
1939 while (*buffer->cur != '\n')
1940 {
1941 if (*buffer->cur < utf8_continuation)
1942 {
1943 buffer->cur++;
1944 continue;
1945 }
1946 buffer->cur
1947 = _cpp_handle_multibyte_utf8 (pfile, *buffer->cur, buffer->cur + 1,
1948 warn_bidi_p, warn_invalid_utf8_p);
1949 }
1950 if (warn_bidi_p)
1951 maybe_warn_bidi_on_close (pfile, buffer->cur);
1952 }
480709cc 1953
26aea073 1954 _cpp_process_line_notes (pfile, true);
500bee0a 1955 return orig_line != pfile->line_table->highest_line;
041c3194 1956}
45b966db 1957
26aea073 1958/* Skips whitespace, saving the next non-whitespace character. */
52fadca8 1959static void
6cf87ca4 1960skip_whitespace (cpp_reader *pfile, cppchar_t c)
041c3194
ZW
1961{
1962 cpp_buffer *buffer = pfile->buffer;
f7d151fb 1963 bool saw_NUL = false;
45b966db 1964
0d9f234d 1965 do
041c3194 1966 {
91fcd158 1967 /* Horizontal space always OK. */
26aea073 1968 if (c == ' ' || c == '\t')
0d9f234d 1969 ;
0d9f234d 1970 /* Just \f \v or \0 left. */
91fcd158 1971 else if (c == '\0')
f7d151fb 1972 saw_NUL = true;
93c80368 1973 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
500bee0a 1974 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
ebef4e8c
NB
1975 CPP_BUF_COL (buffer),
1976 "%s in preprocessing directive",
1977 c == '\f' ? "form feed" : "vertical tab");
0d9f234d 1978
0d9f234d 1979 c = *buffer->cur++;
45b966db 1980 }
ec5c56db 1981 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
0d9f234d
NB
1982 while (is_nvspace (c));
1983
f7d151fb 1984 if (saw_NUL)
bd5e882c
DM
1985 {
1986 encoding_rich_location rich_loc (pfile);
1987 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1988 "null character(s) ignored");
1989 }
f7d151fb 1990
480709cc 1991 buffer->cur--;
041c3194 1992}
45b966db 1993
93c80368
NB
1994/* See if the characters of a number token are valid in a name (no
1995 '.', '+' or '-'). */
1996static int
6cf87ca4 1997name_p (cpp_reader *pfile, const cpp_string *string)
93c80368
NB
1998{
1999 unsigned int i;
2000
2001 for (i = 0; i < string->len; i++)
2002 if (!is_idchar (string->text[i]))
2003 return 0;
2004
df383483 2005 return 1;
93c80368
NB
2006}
2007
50668cf6
GK
2008/* After parsing an identifier or other sequence, produce a warning about
2009 sequences not in NFC/NFKC. */
2010static void
2011warn_about_normalization (cpp_reader *pfile,
2012 const cpp_token *token,
36d20fa4
JM
2013 const struct normalize_state *s,
2014 bool identifier)
50668cf6
GK
2015{
2016 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2017 && !pfile->state.skipping)
2018 {
bd5e882c
DM
2019 location_t loc = token->src_loc;
2020
2021 /* If possible, create a location range for the token. */
2022 if (loc >= RESERVED_LOCATION_COUNT
2023 && token->type != CPP_EOF
2024 /* There must be no line notes to process. */
2025 && (!(pfile->buffer->cur
2026 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2027 && !pfile->overlaid_buffer)))
2028 {
2029 source_range tok_range;
2030 tok_range.m_start = loc;
2031 tok_range.m_finish
2032 = linemap_position_for_column (pfile->line_table,
2033 CPP_BUF_COLUMN (pfile->buffer,
2034 pfile->buffer->cur));
1f68a3e8
DM
2035 loc = pfile->line_table->get_or_create_combined_loc (loc, tok_range,
2036 nullptr, 0);
bd5e882c
DM
2037 }
2038
2039 encoding_rich_location rich_loc (pfile, loc);
2040
50668cf6
GK
2041 /* Make sure that the token is printed using UCNs, even
2042 if we'd otherwise happily print UTF-8. */
c3f829c1 2043 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
50668cf6
GK
2044 size_t sz;
2045
2046 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2047 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
bd5e882c
DM
2048 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2049 "`%.*s' is not in NFKC", (int) sz, buf);
36d20fa4 2050 else if (identifier && CPP_OPTION (pfile, xid_identifiers))
bd5e882c 2051 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
c4d6dcac 2052 "`%.*s' is not in NFC", (int) sz, buf);
50668cf6 2053 else
bd5e882c
DM
2054 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
2055 "`%.*s' is not in NFC", (int) sz, buf);
55e7f907 2056 free (buf);
50668cf6
GK
2057 }
2058}
2059
1d3e4f4e
LH
2060/* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2061 extended character in an identifier. If FIRST is TRUE, then the character
2062 must be valid at the beginning of an identifier as well. If the return
2063 value is TRUE, then pfile->buffer->cur has been moved to point to the next
2064 byte after the extended character. */
51c50026 2065
bced6edf 2066static bool
50668cf6
GK
2067forms_identifier_p (cpp_reader *pfile, int first,
2068 struct normalize_state *state)
bced6edf 2069{
1613e52b 2070 cpp_buffer *buffer = pfile->buffer;
51c50026 2071 const bool warn_bidi_p = pfile->warn_bidi_p ();
1613e52b
NB
2072
2073 if (*buffer->cur == '$')
2074 {
2075 if (!CPP_OPTION (pfile, dollars_in_ident))
2076 return false;
2077
2078 buffer->cur++;
78b8811a 2079 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1613e52b 2080 {
78b8811a 2081 CPP_OPTION (pfile, warn_dollars) = 0;
0527bc4e 2082 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1613e52b
NB
2083 }
2084
2085 return true;
2086 }
bced6edf 2087
7d112d66
LH
2088 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2089 if (CPP_OPTION (pfile, extended_identifiers))
bced6edf 2090 {
fbb22910 2091 cppchar_t s;
7d112d66
LH
2092 if (*buffer->cur >= utf8_signifier)
2093 {
51c50026
MP
2094 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2095 && warn_bidi_p)
2096 {
bef32d4a
DM
2097 location_t loc;
2098 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
2099 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
51c50026 2100 }
7d112d66
LH
2101 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2102 state, &s))
2103 return true;
2104 }
2105 else if (*buffer->cur == '\\'
eb4879ab
JJ
2106 && (buffer->cur[1] == 'u'
2107 || buffer->cur[1] == 'U'
2108 || buffer->cur[1] == 'N'))
7d112d66
LH
2109 {
2110 buffer->cur += 2;
51c50026
MP
2111 if (warn_bidi_p)
2112 {
bef32d4a 2113 location_t loc;
eb4879ab
JJ
2114 bidi::kind kind;
2115 if (buffer->cur[-1] == 'N')
2116 kind = get_bidi_named (pfile, buffer->cur, &loc);
2117 else
2118 kind = get_bidi_ucn (pfile, buffer->cur,
2119 buffer->cur[-1] == 'U', &loc);
bef32d4a 2120 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
51c50026 2121 }
7d112d66
LH
2122 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2123 state, &s, NULL, NULL))
2124 return true;
2125 buffer->cur -= 2;
2126 }
bced6edf 2127 }
bced6edf 2128
1613e52b 2129 return false;
bced6edf
NB
2130}
2131
fb771b9d
TT
2132/* Helper function to issue error about improper __VA_OPT__ use. */
2133static void
2134maybe_va_opt_error (cpp_reader *pfile)
2135{
2136 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2137 {
2138 /* __VA_OPT__ should not be accepted at all, but allow it in
2139 system headers. */
bf425849 2140 if (!_cpp_in_system_header (pfile))
ce53cf7b
JM
2141 {
2142 if (CPP_OPTION (pfile, cplusplus))
2143 cpp_error (pfile, CPP_DL_PEDWARN,
2144 "__VA_OPT__ is not available until C++20");
2145 else
2146 cpp_error (pfile, CPP_DL_PEDWARN,
094a609c 2147 "__VA_OPT__ is not available until C23");
ce53cf7b 2148 }
fb771b9d
TT
2149 }
2150 else if (!pfile->state.va_args_ok)
2151 {
2152 /* __VA_OPT__ should only appear in the replacement list of a
2153 variadic macro. */
2154 cpp_error (pfile, CPP_DL_PEDWARN,
2155 "__VA_OPT__ can only appear in the expansion"
b04445d4 2156 " of a C++20 variadic macro");
fb771b9d
TT
2157 }
2158}
2159
1d3e4f4e
LH
2160/* Helper function to perform diagnostics that are needed (rarely)
2161 when an identifier is lexed. */
2162static void
2163identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2164{
2165 if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2166 || pfile->state.skipping, 1))
2167 return;
2168
2169 /* It is allowed to poison the same identifier twice. */
2170 if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
cb05acdc
LH
2171 {
2172 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2173 NODE_NAME (node));
2174 const auto data = (cpp_hashnode_extra *)
2175 ht_lookup (pfile->extra_hash_table, node->ident, HT_NO_INSERT);
2176 if (data && data->poisoned_loc)
2177 cpp_error_at (pfile, CPP_DL_NOTE, data->poisoned_loc, "poisoned here");
2178 }
1d3e4f4e
LH
2179
2180 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2181 replacement list of a variadic macro. */
2182 if (node == pfile->spec_nodes.n__VA_ARGS__
2183 && !pfile->state.va_args_ok)
2184 {
2185 if (CPP_OPTION (pfile, cplusplus))
2186 cpp_error (pfile, CPP_DL_PEDWARN,
2187 "__VA_ARGS__ can only appear in the expansion"
2188 " of a C++11 variadic macro");
2189 else
2190 cpp_error (pfile, CPP_DL_PEDWARN,
2191 "__VA_ARGS__ can only appear in the expansion"
2192 " of a C99 variadic macro");
2193 }
2194
2195 /* __VA_OPT__ should only appear in the replacement list of a
2196 variadic macro. */
2197 if (node == pfile->spec_nodes.n__VA_OPT__)
2198 maybe_va_opt_error (pfile);
2199
2200 /* For -Wc++-compat, warn about use of C++ named operators. */
2201 if (node->flags & NODE_WARN_OPERATOR)
2202 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2203 "identifier \"%s\" is a special operator name in C++",
2204 NODE_NAME (node));
2205}
2206
17e7cb85
KT
2207/* Helper function to get the cpp_hashnode of the identifier BASE. */
2208static cpp_hashnode *
2209lex_identifier_intern (cpp_reader *pfile, const uchar *base)
2210{
2211 cpp_hashnode *result;
2212 const uchar *cur;
2213 unsigned int len;
2214 unsigned int hash = HT_HASHSTEP (0, *base);
2215
2216 cur = base + 1;
2217 while (ISIDNUM (*cur))
2218 {
2219 hash = HT_HASHSTEP (hash, *cur);
2220 cur++;
2221 }
2222 len = cur - base;
2223 hash = HT_HASHFINISH (hash, len);
2224 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2225 base, len, hash, HT_ALLOC));
1d3e4f4e 2226 identifier_diagnostics_on_lex (pfile, result);
17e7cb85
KT
2227 return result;
2228}
2229
2230/* Get the cpp_hashnode of an identifier specified by NAME in
2231 the current cpp_reader object. If none is found, NULL is returned. */
2232cpp_hashnode *
2233_cpp_lex_identifier (cpp_reader *pfile, const char *name)
2234{
2235 cpp_hashnode *result;
2236 result = lex_identifier_intern (pfile, (uchar *) name);
2237 return result;
2238}
2239
1d3e4f4e
LH
2240/* Lex an identifier starting at BASE. BUFFER->CUR is expected to point
2241 one past the first character at BASE, which may be a (possibly multi-byte)
2242 character if STARTS_UCN is true. */
0d9f234d 2243static cpp_hashnode *
50668cf6 2244lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
be5ffc59 2245 struct normalize_state *nst, cpp_hashnode **spelling)
45b966db 2246{
93c80368 2247 cpp_hashnode *result;
47e20491 2248 const uchar *cur;
c6e83800
ZW
2249 unsigned int len;
2250 unsigned int hash = HT_HASHSTEP (0, *base);
51c50026 2251 const bool warn_bidi_p = pfile->warn_bidi_p ();
2c3fcba6 2252
c6e83800 2253 cur = pfile->buffer->cur;
47e20491 2254 if (! starts_ucn)
d3f4ff8b
JM
2255 {
2256 while (ISIDNUM (*cur))
2257 {
2258 hash = HT_HASHSTEP (hash, *cur);
2259 cur++;
2260 }
2261 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2262 }
47e20491 2263 pfile->buffer->cur = cur;
50668cf6 2264 if (starts_ucn || forms_identifier_p (pfile, false, nst))
10cf9bde 2265 {
7d112d66
LH
2266 /* Slower version for identifiers containing UCNs
2267 or extended chars (including $). */
47e20491
GK
2268 do {
2269 while (ISIDNUM (*pfile->buffer->cur))
50668cf6 2270 {
d3f4ff8b 2271 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
50668cf6 2272 pfile->buffer->cur++;
50668cf6
GK
2273 }
2274 } while (forms_identifier_p (pfile, false, nst));
51c50026
MP
2275 if (warn_bidi_p)
2276 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
47e20491
GK
2277 result = _cpp_interpret_identifier (pfile, base,
2278 pfile->buffer->cur - base);
be5ffc59 2279 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2c3fcba6 2280 }
47e20491
GK
2281 else
2282 {
2283 len = cur - base;
2284 hash = HT_HASHFINISH (hash, len);
bced6edf 2285
2bf41bf0
TT
2286 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2287 base, len, hash, HT_ALLOC));
be5ffc59 2288 *spelling = result;
47e20491 2289 }
2c3fcba6 2290
1d3e4f4e
LH
2291 return result;
2292}
3d8b2a98 2293
1d3e4f4e
LH
2294/* Struct to hold the return value of the scan_cur_identifier () helper
2295 function below. */
2c3fcba6 2296
1d3e4f4e
LH
2297struct scan_id_result
2298{
2299 cpp_hashnode *node;
2300 normalize_state nst;
2301
2302 scan_id_result ()
2303 : node (nullptr)
2304 {
2305 nst = INITIAL_NORMALIZE_STATE;
2306 }
2307
2308 explicit operator bool () const { return node; }
2309};
2310
2311/* Helper function to scan an entire identifier beginning at
2312 pfile->buffer->cur, and possibly containing extended characters (UCNs
2313 and/or UTF-8). Returns the cpp_hashnode for the identifier on success, or
2314 else nullptr, as well as a normalize_state so that normalization warnings
2315 may be issued once the token lexing is complete. */
2316
2317static scan_id_result
2318scan_cur_identifier (cpp_reader *pfile)
2319{
2320 const auto buffer = pfile->buffer;
2321 const auto begin = buffer->cur;
2322 scan_id_result result;
2323 if (ISIDST (*buffer->cur))
2324 {
2325 ++buffer->cur;
2326 cpp_hashnode *ignore;
2327 result.node = lex_identifier (pfile, begin, false, &result.nst, &ignore);
2328 }
2329 else if (forms_identifier_p (pfile, true, &result.nst))
2330 {
2331 /* buffer->cur has been moved already by the call
2332 to forms_identifier_p. */
2333 cpp_hashnode *ignore;
2334 result.node = lex_identifier (pfile, begin, true, &result.nst, &ignore);
2335 }
2c3fcba6
ZW
2336 return result;
2337}
2338
bced6edf 2339/* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
45b966db 2340static void
50668cf6
GK
2341lex_number (cpp_reader *pfile, cpp_string *number,
2342 struct normalize_state *nst)
45b966db 2343{
562a5c27 2344 const uchar *cur;
bced6edf
NB
2345 const uchar *base;
2346 uchar *dest;
45b966db 2347
bced6edf
NB
2348 base = pfile->buffer->cur - 1;
2349 do
041c3194 2350 {
8f51cf38 2351 const uchar *adj_digit_sep = NULL;
bced6edf 2352 cur = pfile->buffer->cur;
0d9f234d 2353
bced6edf 2354 /* N.B. ISIDNUM does not include $. */
8f51cf38
JM
2355 while (ISIDNUM (*cur)
2356 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2357 || DIGIT_SEP (*cur)
2358 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
50668cf6 2359 {
d3f4ff8b 2360 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
8f51cf38
JM
2361 /* Adjacent digit separators do not form part of the pp-number syntax.
2362 However, they can safely be diagnosed here as an error, since '' is
2363 not a valid preprocessing token. */
2364 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2365 adj_digit_sep = cur;
50668cf6 2366 cur++;
50668cf6 2367 }
a5858a3d
ESR
2368 /* A number can't end with a digit separator. */
2369 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2370 --cur;
8f51cf38
JM
2371 if (adj_digit_sep && adj_digit_sep < cur)
2372 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
45b966db 2373
10cf9bde 2374 pfile->buffer->cur = cur;
45b966db 2375 }
50668cf6 2376 while (forms_identifier_p (pfile, false, nst));
93c80368 2377
bced6edf
NB
2378 number->len = cur - base;
2379 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2380 memcpy (dest, base, number->len);
2381 dest[number->len] = '\0';
2382 number->text = dest;
93c80368
NB
2383}
2384
6338b358
NB
2385/* Create a token of type TYPE with a literal spelling. */
2386static void
6cf87ca4
ZW
2387create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2388 unsigned int len, enum cpp_ttype type)
6338b358 2389{
6338b358
NB
2390 token->type = type;
2391 token->val.str.len = len;
13f93cf5
NS
2392 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2393}
2394
1d3e4f4e
LH
2395/* Like create_literal(), but construct it from two separate strings
2396 which are concatenated. LEN2 may be 0 if no second string is
2397 required. */
2398static void
2399create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2400 unsigned int len1, const uchar *base2, unsigned int len2,
2401 enum cpp_ttype type)
2402{
2403 token->type = type;
2404 token->val.str.len = len1 + len2;
2405 uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2406 memcpy (dest, base1, len1);
2407 if (len2)
2408 memcpy (dest+len1, base2, len2);
2409 dest[len1 + len2] = 0;
2410 token->val.str.text = dest;
2411}
2412
13f93cf5
NS
2413const uchar *
2414cpp_alloc_token_string (cpp_reader *pfile,
2415 const unsigned char *ptr, unsigned len)
2416{
2417 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2418
2419 dest[len] = 0;
2420 memcpy (dest, ptr, len);
2421 return dest;
6338b358
NB
2422}
2423
ed63c387
NS
2424/* A pair of raw buffer pointers. The currently open one is [1], the
2425 first one is [0]. Used for string literal lexing. */
2426struct lit_accum {
2427 _cpp_buff *first;
2428 _cpp_buff *last;
2429 const uchar *rpos;
2430 size_t accum;
2431
2432 lit_accum ()
2433 : first (NULL), last (NULL), rpos (0), accum (0)
2434 {
2435 }
2436
2437 void append (cpp_reader *, const uchar *, size_t);
2438
2439 void read_begin (cpp_reader *);
2440 bool reading_p () const
2441 {
2442 return rpos != NULL;
2443 }
2444 char read_char ()
2445 {
2446 char c = *rpos++;
2447 if (rpos == BUFF_FRONT (last))
2448 rpos = NULL;
2449 return c;
2450 }
1d3e4f4e
LH
2451
2452 void create_literal2 (cpp_reader *pfile, cpp_token *token,
2453 const uchar *base1, unsigned int len1,
2454 const uchar *base2, unsigned int len2,
2455 enum cpp_ttype type);
ed63c387
NS
2456};
2457
00a81b8b
JM
2458/* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2459 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2460
ed63c387
NS
2461void
2462lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
00a81b8b 2463{
ed63c387
NS
2464 if (!last)
2465 /* Starting. */
2466 first = last = _cpp_get_buff (pfile, len);
2467 else if (len > BUFF_ROOM (last))
00a81b8b 2468 {
ed63c387
NS
2469 /* There is insufficient room in the buffer. Copy what we can,
2470 and then either extend or create a new one. */
2471 size_t room = BUFF_ROOM (last);
2472 memcpy (BUFF_FRONT (last), base, room);
2473 BUFF_FRONT (last) += room;
00a81b8b
JM
2474 base += room;
2475 len -= room;
ed63c387 2476 accum += room;
00a81b8b 2477
ed63c387
NS
2478 gcc_checking_assert (!rpos);
2479
2480 last = _cpp_append_extend_buff (pfile, last, len);
2481 }
00a81b8b 2482
ed63c387
NS
2483 memcpy (BUFF_FRONT (last), base, len);
2484 BUFF_FRONT (last) += len;
2485 accum += len;
00a81b8b
JM
2486}
2487
ed63c387
NS
2488void
2489lit_accum::read_begin (cpp_reader *pfile)
2490{
2491 /* We never accumulate more than 4 chars to read. */
2492 if (BUFF_ROOM (last) < 4)
2493
2494 last = _cpp_append_extend_buff (pfile, last, 4);
2495 rpos = BUFF_FRONT (last);
2496}
c865f923 2497
1d3e4f4e
LH
2498/* Helper function to check if a string format macro, say from inttypes.h, is
2499 placed touching a string literal, in which case it could be parsed as a C++11
2500 user-defined string literal thus breaking the program. Return TRUE if the
2501 UDL should be ignored for now and preserved for potential macro
2502 expansion. */
c865f923
ESR
2503
2504static bool
1d3e4f4e
LH
2505maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2506 const uchar *suffix_begin, cpp_hashnode *node)
b44f8ad8
JW
2507{
2508 /* User-defined literals outside of namespace std must start with a single
2509 underscore, so assume anything of that form really is a UDL suffix.
2510 We don't need to worry about UDLs defined inside namespace std because
2511 their names are reserved, so cannot be used as macro names in valid
2512 programs. */
1d3e4f4e
LH
2513 if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2514 || !cpp_macro_p (node))
b44f8ad8 2515 return false;
1d3e4f4e
LH
2516
2517 /* Maybe raise a warning here; caller should arrange not to consume
2518 the tokens. */
2519 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2520 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2521 "invalid suffix on literal; C++11 requires a space "
2522 "between literal and string macro");
2523 return true;
2524}
2525
2526/* Like create_literal2(), but also prepend all the accumulated data from
2527 the lit_accum struct. */
2528void
2529lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2530 const uchar *base1, unsigned int len1,
2531 const uchar *base2, unsigned int len2,
2532 enum cpp_ttype type)
2533{
2534 const unsigned int tot_len = accum + len1 + len2;
2535 uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2536 token->type = type;
2537 token->val.str.len = tot_len;
2538 token->val.str.text = dest;
2539 for (_cpp_buff *buf = first; buf; buf = buf->next)
2540 {
2541 size_t len = BUFF_FRONT (buf) - buf->base;
2542 memcpy (dest, buf->base, len);
2543 dest += len;
2544 }
2545 memcpy (dest, base1, len1);
2546 dest += len1;
2547 if (len2)
2548 memcpy (dest, base2, len2);
2549 dest += len2;
2550 *dest = '\0';
b44f8ad8 2551}
c865f923 2552
ed63c387
NS
2553/* Lexes a raw string. The stored string contains the spelling,
2554 including double quotes, delimiter string, '(' and ')', any leading
2555 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2556 the type of the literal, or CPP_OTHER if it was not properly
2557 terminated.
2558
2559 BASE is the start of the token. Updates pfile->buffer->cur to just
2560 after the lexed string.
2c6e3f55
JJ
2561
2562 The spelling is NUL-terminated, but it is not guaranteed that this
2563 is the first NUL since embedded NULs are preserved. */
2564
2565static void
ed63c387 2566lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2c6e3f55 2567{
ed63c387 2568 const uchar *pos = base;
51c50026 2569 const bool warn_bidi_p = pfile->warn_bidi_p ();
0b8c57ed
JJ
2570 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2571 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
ed63c387
NS
2572
2573 /* 'tis a pity this information isn't passed down from the lexer's
2574 initial categorization of the token. */
2575 enum cpp_ttype type = CPP_STRING;
2576
2577 if (*pos == 'L')
2578 {
2579 type = CPP_WSTRING;
2580 pos++;
2581 }
2582 else if (*pos == 'U')
2583 {
2584 type = CPP_STRING32;
2585 pos++;
2586 }
2587 else if (*pos == 'u')
2588 {
2589 if (pos[1] == '8')
2590 {
2591 type = CPP_UTF8STRING;
2592 pos++;
2593 }
2594 else
2595 type = CPP_STRING16;
2596 pos++;
2597 }
2598
2599 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2600 pos += 2;
2601
00a81b8b 2602 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2c6e3f55 2603
ed63c387
NS
2604 /* Skip notes before the ". */
2605 while (note->pos < pos)
2606 ++note;
2607
2608 lit_accum accum;
2609
2610 uchar prefix[17];
2611 unsigned prefix_len = 0;
2612 enum Phase
2613 {
2614 PHASE_PREFIX = -2,
2615 PHASE_NONE = -1,
2616 PHASE_SUFFIX = 0
2617 } phase = PHASE_PREFIX;
2618
8cf88735
JJ
2619 for (;;)
2620 {
ed63c387 2621 gcc_checking_assert (note->pos >= pos);
00a81b8b 2622
ed63c387
NS
2623 /* Undo any escaped newlines and trigraphs. */
2624 if (!accum.reading_p () && note->pos == pos)
2625 switch (note->type)
2626 {
2627 case '\\':
2628 case ' ':
2629 /* Restore backslash followed by newline. */
2630 accum.append (pfile, base, pos - base);
2631 base = pos;
2632 accum.read_begin (pfile);
2633 accum.append (pfile, UC"\\", 1);
2634
2635 after_backslash:
2636 if (note->type == ' ')
2637 /* GNU backslash whitespace newline extension. FIXME
2638 could be any sequence of non-vertical space. When we
2639 can properly restore any such sequence, we should
2640 mark this note as handled so _cpp_process_line_notes
2641 doesn't warn. */
2642 accum.append (pfile, UC" ", 1);
2643
2644 accum.append (pfile, UC"\n", 1);
2645 note++;
2646 break;
00a81b8b 2647
ed63c387
NS
2648 case '\n':
2649 /* This can happen for ??/<NEWLINE> when trigraphs are not
2650 being interpretted. */
2651 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2652 note->type = 0;
2653 note++;
2654 break;
00a81b8b 2655
ed63c387
NS
2656 default:
2657 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2658
2659 /* Don't warn about this trigraph in
2660 _cpp_process_line_notes, since trigraphs show up as
2661 trigraphs in raw strings. */
2662 uchar type = note->type;
2663 note->type = 0;
2664
2665 if (CPP_OPTION (pfile, trigraphs))
2666 {
2667 accum.append (pfile, base, pos - base);
2668 base = pos;
2669 accum.read_begin (pfile);
2670 accum.append (pfile, UC"??", 2);
2671 accum.append (pfile, &type, 1);
2672
2673 /* ??/ followed by newline gets two line notes, one for
2674 the trigraph and one for the backslash/newline. */
2675 if (type == '/' && note[1].pos == pos)
2676 {
2677 note++;
2678 gcc_assert (note->type == '\\' || note->type == ' ');
2679 goto after_backslash;
2680 }
2681 /* Skip the replacement character. */
2682 base = ++pos;
2683 }
2684
2685 note++;
2686 break;
2687 }
2688
2689 /* Now get a char to process. Either from an expanded note, or
2690 from the line buffer. */
2691 bool read_note = accum.reading_p ();
2692 char c = read_note ? accum.read_char () : *pos++;
2c6e3f55 2693
ed63c387 2694 if (phase == PHASE_PREFIX)
2c6e3f55 2695 {
ed63c387 2696 if (c == '(')
8cf88735 2697 {
ed63c387
NS
2698 /* Done. */
2699 phase = PHASE_NONE;
2700 prefix[prefix_len++] = '"';
2701 }
2702 else if (prefix_len < 16
2703 /* Prefix chars are any of the basic character set,
2704 [lex.charset] except for '
2705 ()\\\t\v\f\n'. Optimized for a contiguous
2706 alphabet. */
2707 /* Unlike a switch, this collapses down to one or
2708 two shift and bitmask operations on an ASCII
2709 system, with an outlier or two. */
2710 && (('Z' - 'A' == 25
2711 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2712 : ISIDST (c))
2713 || (c >= '0' && c <= '9')
2714 || c == '_' || c == '{' || c == '}'
2715 || c == '[' || c == ']' || c == '#'
2716 || c == '<' || c == '>' || c == '%'
2717 || c == ':' || c == ';' || c == '.' || c == '?'
2718 || c == '*' || c == '+' || c == '-' || c == '/'
2719 || c == '^' || c == '&' || c == '|' || c == '~'
2720 || c == '!' || c == '=' || c == ','
2721 || c == '"' || c == '\''))
2722 prefix[prefix_len++] = c;
2723 else
2724 {
2725 /* Something is wrong. */
2726 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2727 if (prefix_len == 16)
2728 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2729 col, "raw string delimiter longer "
2730 "than 16 characters");
2731 else if (c == '\n')
2732 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2733 col, "invalid new-line in raw "
2734 "string delimiter");
2735 else
2736 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2737 col, "invalid character '%c' in "
2738 "raw string delimiter", c);
2739 type = CPP_OTHER;
2740 phase = PHASE_NONE;
2741 /* Continue until we get a close quote, that's probably
2742 the best failure mode. */
2743 prefix_len = 0;
8cf88735 2744 }
ae49af94
JJ
2745 if (c != '\n')
2746 continue;
8cf88735 2747 }
ed63c387
NS
2748
2749 if (phase != PHASE_NONE)
8cf88735 2750 {
ed63c387
NS
2751 if (prefix[phase] != c)
2752 phase = PHASE_NONE;
2753 else if (unsigned (phase + 1) == prefix_len)
8cf88735 2754 break;
ed63c387
NS
2755 else
2756 {
2757 phase = Phase (phase + 1);
2758 continue;
2759 }
2c6e3f55 2760 }
ed63c387
NS
2761
2762 if (!prefix_len && c == '"')
2763 /* Failure mode lexing. */
2764 goto out;
2765 else if (prefix_len && c == ')')
2766 phase = PHASE_SUFFIX;
2767 else if (!read_note && c == '\n')
2c6e3f55 2768 {
ed63c387
NS
2769 pos--;
2770 pfile->buffer->cur = pos;
3ad2167b
LH
2771 if ((pfile->state.in_directive || pfile->state.parsing_args)
2772 && pfile->buffer->next_line >= pfile->buffer->rlimit)
2c6e3f55 2773 {
2c6e3f55
JJ
2774 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2775 "unterminated raw string");
ed63c387
NS
2776 type = CPP_OTHER;
2777 goto out;
2c6e3f55
JJ
2778 }
2779
ed63c387 2780 accum.append (pfile, base, pos - base + 1);
2a0225e4
NS
2781 _cpp_process_line_notes (pfile, false);
2782
2783 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2c6e3f55
JJ
2784 CPP_INCREMENT_LINE (pfile, 0);
2785 pfile->buffer->need_line = true;
2786
3ad2167b 2787 if (!get_fresh_line_impl<true> (pfile))
2c6e3f55 2788 {
ed63c387 2789 /* We ran out of file and failed to get a line. */
620e594b 2790 location_t src_loc = token->src_loc;
2c6e3f55
JJ
2791 token->type = CPP_EOF;
2792 /* Tell the compiler the line number of the EOF token. */
2793 token->src_loc = pfile->line_table->highest_line;
2794 token->flags = BOL;
ed63c387
NS
2795 if (accum.first)
2796 _cpp_release_buff (pfile, accum.first);
2c6e3f55
JJ
2797 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2798 "unterminated raw string");
3ad2167b
LH
2799
2800 /* Now pop the buffer that get_fresh_line_impl() did not. Popping
2801 is not safe if processing a directive, however this cannot
2802 happen as we already checked above that a line would be
2803 available, and get_fresh_line_impl() can't fail in this
2804 case. */
2805 gcc_assert (!pfile->state.in_directive);
2a0225e4 2806 _cpp_pop_buffer (pfile);
3ad2167b 2807
2c6e3f55
JJ
2808 return;
2809 }
2810
ed63c387 2811 pos = base = pfile->buffer->cur;
00a81b8b 2812 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2c6e3f55 2813 }
0b8c57ed
JJ
2814 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2815 && warn_bidi_or_invalid_utf8_p)
2816 pos = _cpp_handle_multibyte_utf8 (pfile, c, pos, warn_bidi_p,
2817 warn_invalid_utf8_p);
2c6e3f55
JJ
2818 }
2819
51c50026
MP
2820 if (warn_bidi_p)
2821 maybe_warn_bidi_on_close (pfile, pos);
2822
3ce4f9e4
ESR
2823 if (CPP_OPTION (pfile, user_literals))
2824 {
1d3e4f4e
LH
2825 const uchar *const suffix_begin = pos;
2826 pfile->buffer->cur = pos;
7f5f5f98 2827
1d3e4f4e
LH
2828 if (const auto sr = scan_cur_identifier (pfile))
2829 {
2830 if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2831 suffix_begin, sr.node))
2832 pfile->buffer->cur = suffix_begin;
2833 else
2834 {
2835 type = cpp_userdef_string_add_type (type);
2836 accum.create_literal2 (pfile, token, base, suffix_begin - base,
2837 NODE_NAME (sr.node), NODE_LEN (sr.node),
2838 type);
2839 if (accum.first)
2840 _cpp_release_buff (pfile, accum.first);
2841 warn_about_normalization (pfile, token, &sr.nst, true);
2842 return;
2843 }
3ce4f9e4 2844 }
3ce4f9e4
ESR
2845 }
2846
ed63c387
NS
2847 out:
2848 pfile->buffer->cur = pos;
2849 if (!accum.accum)
2850 create_literal (pfile, token, base, pos - base, type);
2c6e3f55
JJ
2851 else
2852 {
1d3e4f4e 2853 accum.create_literal2 (pfile, token, base, pos - base, nullptr, 0, type);
ed63c387 2854 _cpp_release_buff (pfile, accum.first);
2c6e3f55
JJ
2855 }
2856}
2857
bced6edf 2858/* Lexes a string, character constant, or angle-bracketed header file
6338b358 2859 name. The stored string contains the spelling, including opening
2c6e3f55
JJ
2860 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2861 'R' modifier. It returns the type of the literal, or CPP_OTHER
2862 if it was not properly terminated, or CPP_LESS for an unterminated
2863 header name which must be relexed as normal tokens.
6338b358
NB
2864
2865 The spelling is NUL-terminated, but it is not guaranteed that this
2866 is the first NUL since embedded NULs are preserved. */
041c3194 2867static void
6cf87ca4 2868lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
45b966db 2869{
6338b358
NB
2870 bool saw_NUL = false;
2871 const uchar *cur;
bced6edf 2872 cppchar_t terminator;
6338b358
NB
2873 enum cpp_ttype type;
2874
2875 cur = base;
2876 terminator = *cur++;
2c6e3f55 2877 if (terminator == 'L' || terminator == 'U')
6338b358 2878 terminator = *cur++;
2c6e3f55
JJ
2879 else if (terminator == 'u')
2880 {
2881 terminator = *cur++;
2882 if (terminator == '8')
2883 terminator = *cur++;
2884 }
2885 if (terminator == 'R')
2886 {
ed63c387 2887 lex_raw_string (pfile, token, base);
2c6e3f55
JJ
2888 return;
2889 }
2890 if (terminator == '"')
b6baa67d
KVH
2891 type = (*base == 'L' ? CPP_WSTRING :
2892 *base == 'U' ? CPP_STRING32 :
2c6e3f55
JJ
2893 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2894 : CPP_STRING);
6338b358 2895 else if (terminator == '\'')
b6baa67d
KVH
2896 type = (*base == 'L' ? CPP_WCHAR :
2897 *base == 'U' ? CPP_CHAR32 :
fe95b036
ESR
2898 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2899 : CPP_CHAR);
6338b358
NB
2900 else
2901 terminator = '>', type = CPP_HEADER_NAME;
93c80368 2902
51c50026 2903 const bool warn_bidi_p = pfile->warn_bidi_p ();
0b8c57ed
JJ
2904 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2905 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
0d9f234d 2906 for (;;)
45b966db 2907 {
6338b358 2908 cppchar_t c = *cur++;
7868b4a2 2909
6f572ac2 2910 /* In #include-style directives, terminators are not escapable. */
6338b358 2911 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
51c50026 2912 {
eb4879ab 2913 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
51c50026 2914 {
bef32d4a 2915 location_t loc;
eb4879ab
JJ
2916 bidi::kind kind;
2917 if (cur[0] == 'N')
2918 kind = get_bidi_named (pfile, cur + 1, &loc);
2919 else
2920 kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc);
bef32d4a 2921 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
51c50026
MP
2922 }
2923 cur++;
2924 }
6338b358 2925 else if (c == terminator)
51c50026
MP
2926 {
2927 if (warn_bidi_p)
2928 maybe_warn_bidi_on_close (pfile, cur - 1);
2929 break;
2930 }
6338b358 2931 else if (c == '\n')
0d9f234d 2932 {
6338b358 2933 cur--;
4bb09c26
JM
2934 /* Unmatched quotes always yield undefined behavior, but
2935 greedy lexing means that what appears to be an unterminated
2936 header name may actually be a legitimate sequence of tokens. */
2937 if (terminator == '>')
2938 {
2939 token->type = CPP_LESS;
2940 return;
2941 }
6338b358
NB
2942 type = CPP_OTHER;
2943 break;
45b966db 2944 }
6338b358
NB
2945 else if (c == '\0')
2946 saw_NUL = true;
0b8c57ed
JJ
2947 else if (__builtin_expect (c >= utf8_continuation, 0)
2948 && warn_bidi_or_invalid_utf8_p)
2949 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2950 warn_invalid_utf8_p);
45b966db
ZW
2951 }
2952
6338b358 2953 if (saw_NUL && !pfile->state.skipping)
0527bc4e
JDA
2954 cpp_error (pfile, CPP_DL_WARNING,
2955 "null character(s) preserved in literal");
45b966db 2956
c663e301
JM
2957 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2958 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2959 (int) terminator);
2960
1d3e4f4e
LH
2961 pfile->buffer->cur = cur;
2962 const uchar *const suffix_begin = cur;
2963
3ce4f9e4
ESR
2964 if (CPP_OPTION (pfile, user_literals))
2965 {
1d3e4f4e 2966 if (const auto sr = scan_cur_identifier (pfile))
3ce4f9e4 2967 {
1d3e4f4e
LH
2968 if (maybe_ignore_udl_macro_suffix (pfile, token->src_loc,
2969 suffix_begin, sr.node))
2970 pfile->buffer->cur = suffix_begin;
2971 else
2972 {
2973 /* Grab user defined literal suffix. */
2974 type = cpp_userdef_char_add_type (type);
2975 type = cpp_userdef_string_add_type (type);
2976 create_literal2 (pfile, token, base, suffix_begin - base,
2977 NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2978 warn_about_normalization (pfile, token, &sr.nst, true);
2979 return;
2980 }
3ce4f9e4 2981 }
3ce4f9e4 2982 }
fe191308 2983 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
fe191308 2984 && !pfile->state.skipping)
1d3e4f4e
LH
2985 {
2986 const auto sr = scan_cur_identifier (pfile);
2987 /* Maybe raise a warning, but do not consume the tokens. */
2988 pfile->buffer->cur = suffix_begin;
2989 if (sr && cpp_macro_p (sr.node))
2990 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2991 token->src_loc, 0, "C++11 requires a space "
2992 "between string literal and macro");
2993 }
3ce4f9e4 2994
6338b358 2995 create_literal (pfile, token, base, cur - base, type);
0d9f234d 2996}
041c3194 2997
631d0d36
MG
2998/* Return the comment table. The client may not make any assumption
2999 about the ordering of the table. */
3000cpp_comment_table *
3001cpp_get_comments (cpp_reader *pfile)
3002{
3003 return &pfile->comments;
3004}
3005
3006/* Append a comment to the end of the comment table. */
3007static void
3008store_comment (cpp_reader *pfile, cpp_token *token)
3009{
3010 int len;
3011
3012 if (pfile->comments.allocated == 0)
3013 {
3014 pfile->comments.allocated = 256;
3015 pfile->comments.entries = (cpp_comment *) xmalloc
3016 (pfile->comments.allocated * sizeof (cpp_comment));
3017 }
3018
3019 if (pfile->comments.count == pfile->comments.allocated)
3020 {
3021 pfile->comments.allocated *= 2;
3022 pfile->comments.entries = (cpp_comment *) xrealloc
3023 (pfile->comments.entries,
3024 pfile->comments.allocated * sizeof (cpp_comment));
3025 }
3026
3027 len = token->val.str.len;
3028
3029 /* Copy comment. Note, token may not be NULL terminated. */
3030 pfile->comments.entries[pfile->comments.count].comment =
3031 (char *) xmalloc (sizeof (char) * (len + 1));
3032 memcpy (pfile->comments.entries[pfile->comments.count].comment,
3033 token->val.str.text, len);
3034 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3035
3036 /* Set source location. */
3037 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3038
3039 /* Increment the count of entries in the comment table. */
3040 pfile->comments.count++;
3041}
3042
93c80368 3043/* The stored comment includes the comment start and any terminator. */
9e62c811 3044static void
6cf87ca4
ZW
3045save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3046 cppchar_t type)
9e62c811 3047{
041c3194 3048 unsigned char *buffer;
651a20b5 3049 unsigned int len, clen, i;
df383483 3050
1c6d33ef 3051 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
480709cc 3052
3542203b
NB
3053 /* C++ comments probably (not definitely) have moved past a new
3054 line, which we don't want to save in the comment. */
480709cc 3055 if (is_vspace (pfile->buffer->cur[-1]))
3542203b 3056 len--;
477cdac7 3057
651a20b5
KT
3058 /* If we are currently in a directive or in argument parsing, then
3059 we need to store all C++ comments as C comments internally, and
3060 so we need to allocate a little extra space in that case.
477cdac7
JT
3061
3062 Note that the only time we encounter a directive here is
3063 when we are saving comments in a "#define". */
651a20b5
KT
3064 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3065 && type == '/') ? len + 2 : len;
477cdac7
JT
3066
3067 buffer = _cpp_unaligned_alloc (pfile, clen);
df383483 3068
041c3194 3069 token->type = CPP_COMMENT;
477cdac7 3070 token->val.str.len = clen;
0d9f234d 3071 token->val.str.text = buffer;
45b966db 3072
1c6d33ef
NB
3073 buffer[0] = '/';
3074 memcpy (buffer + 1, from, len - 1);
477cdac7 3075
1eeeb6a4 3076 /* Finish conversion to a C comment, if necessary. */
651a20b5 3077 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
477cdac7
JT
3078 {
3079 buffer[1] = '*';
3080 buffer[clen - 2] = '*';
3081 buffer[clen - 1] = '/';
651a20b5
KT
3082 /* As there can be in a C++ comments illegal sequences for C comments
3083 we need to filter them out. */
3084 for (i = 2; i < (clen - 2); i++)
3085 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3086 buffer[i] = '|';
477cdac7 3087 }
631d0d36
MG
3088
3089 /* Finally store this comment for use by clients of libcpp. */
3090 store_comment (pfile, token);
0d9f234d 3091}
45b966db 3092
81fea426
MP
3093/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3094 comment. */
3095
3096static bool
3097fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3098{
3099 const unsigned char *from = comment_start + 1;
70f6d5e1
JJ
3100
3101 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3102 {
3103 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3104 don't recognize any comments. The latter only checks attributes,
3105 the former doesn't warn. */
3106 case 0:
3107 default:
3108 return false;
3109 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3110 content it has. */
3111 case 1:
3112 return true;
3113 case 2:
3114 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3115 .*falls?[ \t-]*thr(u|ough).* regex. */
3116 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3117 from++)
3118 {
3119 /* Is there anything like strpbrk with upper boundary, or
3120 memchr looking for 2 characters rather than just one? */
3121 if (from[0] != 'f' && from[0] != 'F')
3122 continue;
3123 if (from[1] != 'a' && from[1] != 'A')
3124 continue;
3125 if (from[2] != 'l' && from[2] != 'L')
3126 continue;
3127 if (from[3] != 'l' && from[3] != 'L')
3128 continue;
3129 from += sizeof "fall" - 1;
3130 if (from[0] == 's' || from[0] == 'S')
3131 from++;
3132 while (*from == ' ' || *from == '\t' || *from == '-')
3133 from++;
3134 if (from[0] != 't' && from[0] != 'T')
3135 continue;
3136 if (from[1] != 'h' && from[1] != 'H')
3137 continue;
3138 if (from[2] != 'r' && from[2] != 'R')
3139 continue;
3140 if (from[3] == 'u' || from[3] == 'U')
3141 return true;
3142 if (from[3] != 'o' && from[3] != 'O')
3143 continue;
3144 if (from[4] != 'u' && from[4] != 'U')
3145 continue;
3146 if (from[5] != 'g' && from[5] != 'G')
3147 continue;
3148 if (from[6] != 'h' && from[6] != 'H')
3149 continue;
3150 return true;
3151 }
3152 return false;
3153 case 3:
3154 case 4:
3155 break;
3156 }
3157
81fea426
MP
3158 /* Whole comment contents:
3159 -fallthrough
3160 @fallthrough@
3161 */
3162 if (*from == '-' || *from == '@')
3163 {
3164 size_t len = sizeof "fallthrough" - 1;
3165 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3166 return false;
3167 if (memcmp (from + 1, "fallthrough", len))
3168 return false;
3169 if (*from == '@')
3170 {
3171 if (from[len + 1] != '@')
3172 return false;
3173 len++;
3174 }
3175 from += 1 + len;
3176 }
3177 /* Whole comment contents (regex):
70f6d5e1 3178 lint -fallthrough[ \t]*
81b02905
JJ
3179 */
3180 else if (*from == 'l')
3181 {
3182 size_t len = sizeof "int -fallthrough" - 1;
3183 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3184 return false;
3185 if (memcmp (from + 1, "int -fallthrough", len))
70f6d5e1 3186 return false;
81b02905 3187 from += 1 + len;
70f6d5e1
JJ
3188 while (*from == ' ' || *from == '\t')
3189 from++;
3190 }
3191 /* Whole comment contents (regex):
3192 [ \t]*FALLTHR(U|OUGH)[ \t]*
3193 */
3194 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3195 {
3196 while (*from == ' ' || *from == '\t')
3197 from++;
3198 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3199 return false;
3200 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
3201 return false;
3202 from += sizeof "FALLTHR" - 1;
3203 if (*from == 'U')
3204 from++;
3205 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3206 return false;
3207 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
3208 return false;
3209 else
3210 from += sizeof "OUGH" - 1;
3211 while (*from == ' ' || *from == '\t')
3212 from++;
81b02905
JJ
3213 }
3214 /* Whole comment contents (regex):
ee19ef45
JJ
3215 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3216 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3217 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
81fea426
MP
3218 */
3219 else
3220 {
81b02905 3221 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
81fea426
MP
3222 from++;
3223 unsigned char f = *from;
81b02905
JJ
3224 bool all_upper = false;
3225 if (f == 'E' || f == 'e')
70f6d5e1
JJ
3226 {
3227 if ((size_t) (pfile->buffer->cur - from)
81b02905
JJ
3228 < sizeof "else fallthru" - 1)
3229 return false;
ee19ef45 3230 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
81b02905 3231 all_upper = true;
ee19ef45
JJ
3232 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
3233 return false;
3234 from += sizeof "else" - 1;
3235 if (*from == ',')
3236 from++;
70f6d5e1 3237 if (*from != ' ')
ee19ef45
JJ
3238 return false;
3239 from++;
3240 if (all_upper && *from == 'f')
81b02905 3241 return false;
81b02905
JJ
3242 if (f == 'e' && *from == 'F')
3243 return false;
3244 f = *from;
70f6d5e1 3245 }
81b02905 3246 else if (f == 'I' || f == 'i')
70f6d5e1
JJ
3247 {
3248 if ((size_t) (pfile->buffer->cur - from)
81b02905
JJ
3249 < sizeof "intentional fallthru" - 1)
3250 return false;
3251 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
3252 sizeof "NTENTIONAL" - 1) == 0)
3253 all_upper = true;
3254 else if (memcmp (from + 1, "ntentional",
3255 sizeof "ntentional" - 1))
3256 return false;
3257 from += sizeof "intentional" - 1;
3258 if (*from == ' ')
3259 {
3260 from++;
3261 if (all_upper && *from == 'f')
3262 return false;
3263 }
3264 else if (all_upper)
3265 {
3266 if (memcmp (from, "LY F", sizeof "LY F" - 1))
3267 return false;
3268 from += sizeof "LY " - 1;
3269 }
3270 else
3271 {
3272 if (memcmp (from, "ly ", sizeof "ly " - 1))
3273 return false;
3274 from += sizeof "ly " - 1;
3275 }
3276 if (f == 'i' && *from == 'F')
3277 return false;
3278 f = *from;
70f6d5e1 3279 }
81fea426
MP
3280 if (f != 'F' && f != 'f')
3281 return false;
7bad794a 3282 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
81fea426 3283 return false;
81fea426
MP
3284 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3285 all_upper = true;
81b02905
JJ
3286 else if (all_upper)
3287 return false;
81fea426
MP
3288 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3289 return false;
7bad794a
JJ
3290 from += sizeof "fall" - 1;
3291 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3292 from += 2;
3293 else if (*from == ' ' || *from == '-')
3294 from++;
3295 else if (*from != (all_upper ? 'T' : 't'))
81fea426 3296 return false;
81fea426
MP
3297 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3298 return false;
7bad794a 3299 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
81fea426
MP
3300 return false;
3301 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3302 {
7bad794a 3303 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
81fea426
MP
3304 return false;
3305 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3306 sizeof "hrough" - 1))
3307 return false;
3308 from += sizeof "through" - 1;
3309 }
3310 else
3311 from += sizeof "thru" - 1;
81b02905 3312 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
81fea426 3313 from++;
81b02905
JJ
3314 if (*from == '-')
3315 {
3316 from++;
3317 if (*comment_start == '*')
3318 {
3319 do
3320 {
3321 while (*from && *from != '*'
3322 && *from != '\n' && *from != '\r')
3323 from++;
3324 if (*from != '*' || from[1] == '/')
3325 break;
3326 from++;
3327 }
3328 while (1);
3329 }
3330 else
3331 while (*from && *from != '\n' && *from != '\r')
3332 from++;
3333 }
81fea426
MP
3334 }
3335 /* C block comment. */
3336 if (*comment_start == '*')
3337 {
3338 if (*from != '*' || from[1] != '/')
3339 return false;
3340 }
3341 /* C++ line comment. */
3342 else if (*from != '\n')
3343 return false;
3344
3345 return true;
3346}
3347
5fddcffc
NB
3348/* Allocate COUNT tokens for RUN. */
3349void
6cf87ca4 3350_cpp_init_tokenrun (tokenrun *run, unsigned int count)
5fddcffc 3351{
72bb2c39 3352 run->base = XNEWVEC (cpp_token, count);
5fddcffc
NB
3353 run->limit = run->base + count;
3354 run->next = NULL;
3355}
3356
3357/* Returns the next tokenrun, or creates one if there is none. */
3358static tokenrun *
6cf87ca4 3359next_tokenrun (tokenrun *run)
5fddcffc
NB
3360{
3361 if (run->next == NULL)
3362 {
72bb2c39 3363 run->next = XNEW (tokenrun);
bdcbe496 3364 run->next->prev = run;
5fddcffc
NB
3365 _cpp_init_tokenrun (run->next, 250);
3366 }
3367
3368 return run->next;
3369}
3370
ad2305ad 3371/* Return the number of not yet processed token in a given
92582b75
TT
3372 context. */
3373int
ad2305ad 3374_cpp_remaining_tokens_num_in_context (cpp_context *context)
92582b75 3375{
92582b75 3376 if (context->tokens_kind == TOKENS_KIND_DIRECT)
cbbcf655 3377 return (LAST (context).token - FIRST (context).token);
92582b75
TT
3378 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3379 || context->tokens_kind == TOKENS_KIND_EXTENDED)
cbbcf655 3380 return (LAST (context).ptoken - FIRST (context).ptoken);
92582b75
TT
3381 else
3382 abort ();
3383}
3384
ad2305ad
DS
3385/* Returns the token present at index INDEX in a given context. If
3386 INDEX is zero, the next token to be processed is returned. */
92582b75 3387static const cpp_token*
ad2305ad 3388_cpp_token_from_context_at (cpp_context *context, int index)
92582b75 3389{
92582b75
TT
3390 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3391 return &(FIRST (context).token[index]);
3392 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3393 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3394 return FIRST (context).ptoken[index];
3395 else
3396 abort ();
3397}
3398
5950c3c9
BE
3399/* Look ahead in the input stream. */
3400const cpp_token *
3401cpp_peek_token (cpp_reader *pfile, int index)
3402{
3403 cpp_context *context = pfile->context;
3404 const cpp_token *peektok;
3405 int count;
3406
3407 /* First, scan through any pending cpp_context objects. */
3408 while (context->prev)
3409 {
ad2305ad 3410 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
5950c3c9
BE
3411
3412 if (index < (int) sz)
ad2305ad 3413 return _cpp_token_from_context_at (context, index);
5950c3c9
BE
3414 index -= (int) sz;
3415 context = context->prev;
3416 }
3417
3418 /* We will have to read some new tokens after all (and do so
3419 without invalidating preceding tokens). */
3420 count = index;
3421 pfile->keep_tokens++;
3422
b8cd77f4
JJ
3423 /* For peeked tokens temporarily disable line_change reporting,
3424 until the tokens are parsed for real. */
3425 void (*line_change) (cpp_reader *, const cpp_token *, int)
3426 = pfile->cb.line_change;
3427 pfile->cb.line_change = NULL;
3428
5950c3c9
BE
3429 do
3430 {
3431 peektok = _cpp_lex_token (pfile);
3432 if (peektok->type == CPP_EOF)
e4b33ee5
JJ
3433 {
3434 index--;
3435 break;
3436 }
8bd9a00f
NS
3437 else if (peektok->type == CPP_PRAGMA)
3438 {
3439 /* Don't peek past a pragma. */
3440 if (peektok == &pfile->directive_result)
3441 /* Save the pragma in the buffer. */
3442 *pfile->cur_token++ = *peektok;
3443 index--;
3444 break;
3445 }
5950c3c9
BE
3446 }
3447 while (index--);
3448
e4b33ee5 3449 _cpp_backup_tokens_direct (pfile, count - index);
5950c3c9 3450 pfile->keep_tokens--;
b8cd77f4 3451 pfile->cb.line_change = line_change;
5950c3c9
BE
3452
3453 return peektok;
3454}
3455
4ed5bcfb
NB
3456/* Allocate a single token that is invalidated at the same time as the
3457 rest of the tokens on the line. Has its line and col set to the
3458 same as the last lexed token, so that diagnostics appear in the
3459 right place. */
/* Any tokens currently held as lookaheads are shifted up by one slot so
   that the new token can be handed out at pfile->cur_token without
   overwriting them.  Returns the newly claimed token.  */
3460cpp_token *
6cf87ca4 3461_cpp_temp_token (cpp_reader *pfile)
4ed5bcfb
NB
3462{
3463 cpp_token *old, *result;
5950c3c9
BE
/* SZ is the number of slots from cur_token to the end of the current
   run; LA is the number of tokens currently held as lookaheads.  */
3464 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3465 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
4ed5bcfb
NB
3466
/* OLD is the token just before cur_token; its source location is copied
   into the result at the end.  */
3467 old = pfile->cur_token - 1;
5950c3c9
BE
3468 /* Any pre-existing lookaheads must not be clobbered. */
3469 if (la)
3470 {
/* The lookaheads reach the end of the current run (or beyond it), so
   shifting them by one slot spills into the next run.  */
3471 if (sz <= la)
3472 {
3473 tokenrun *next = next_tokenrun (pfile->cur_run);
3474
/* Lookaheads already occupy the start of NEXT: move those LA - SZ
   tokens up by one slot first.  */
3475 if (sz < la)
3476 memmove (next->base + 1, next->base,
3477 (la - sz) * sizeof (cpp_token));
3478
/* The last slot of the current run moves into the first slot of NEXT.  */
3479 next->base[0] = pfile->cur_run->limit[-1];
3480 }
3481
/* Shift the lookaheads that remain inside the current run up by one
   slot; at most SZ - 1 of them fit after the shift.  */
3482 if (sz > 1)
3483 memmove (pfile->cur_token + 1, pfile->cur_token,
3484 MIN (la, sz - 1) * sizeof (cpp_token));
3485 }
3486
/* No slot is left in the current run, so the new token is taken from
   the start of the next run.  */
3487 if (!sz && pfile->cur_token == pfile->cur_run->limit)
4ed5bcfb
NB
3488 {
3489 pfile->cur_run = next_tokenrun (pfile->cur_run);
3490 pfile->cur_token = pfile->cur_run->base;
3491 }
3492
/* Claim the slot at cur_token and give it the location of OLD.  */
3493 result = pfile->cur_token++;
12f9df4e 3494 result->src_loc = old->src_loc;
4ed5bcfb
NB
3495 return result;
3496}
3497
c9c3d5f2
NS
3498/* We're at the beginning of a logical line (so not in
3499 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3500 if we should enter deferred_pragma mode to tokenize the rest of the
3501 line as a module control-line. */
/* The function peeks at up to two further tokens with _cpp_lex_direct.
   If the line is recognized, the keyword tokens are marked NO_EXPAND and
   remapped to the second ("underbar") column of n_modules, and the
   reader is left in deferred-pragma mode.  Otherwise the reader state
   changed here is put back.  In both cases the peeked tokens are backed
   up so that they are lexed again by the caller.  */
3502
3503static void
3504cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3505{
3506 unsigned backup = 0; /* Tokens we peeked. */
3507 cpp_hashnode *node = result->val.node.node;
3508 cpp_token *peek = result;
3509 cpp_token *keyword = peek;
3510 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
/* HEADER_COUNT stays zero for 'module' and is nonzero for the import
   forms; it is later stored in pfile->state.directive_file_token.
   NOTE(review): the meaning of the "+ 2 + 16" encoding is defined by
   the consumer of directive_file_token, which is not visible here.  */
3511 int header_count = 0;
3512
3513 /* Make sure the incoming state is as we expect it. This way we
3514 can restore it using constants. */
3515 gcc_checking_assert (!pfile->state.in_deferred_pragma
3516 && !pfile->state.skipping
3517 && !pfile->state.parsing_args
3518 && !pfile->state.angled_headers
3519 && (pfile->state.save_comments
3520 == !CPP_OPTION (pfile, discard_comments)));
3521
3522 /* Enter directives mode sufficiently for peeking. We don't have
3523 to actually set in_directive. */
3524 pfile->state.in_deferred_pragma = true;
3525
3526 /* These two fields are needed to process tokenization in deferred
3527 pragma mode. They are not used outside deferred pragma mode or
3528 directives mode. */
3529 pfile->state.pragma_allow_expansion = true;
3530 pfile->directive_line = result->src_loc;
3531
3532 /* Saving comments is incompatible with directives mode. */
3533 pfile->state.save_comments = 0;
3534
/* A leading 'export' must be followed by another NODE_MODULE name,
   which then becomes the keyword to classify.  */
3535 if (node == n_modules[spec_nodes::M_EXPORT][0])
3536 {
3537 peek = _cpp_lex_direct (pfile);
3538 keyword = peek;
3539 backup++;
3540 if (keyword->type != CPP_NAME)
3541 goto not_module;
3542 node = keyword->val.node.node;
3543 if (!(node->flags & NODE_MODULE))
3544 goto not_module;
3545 }
3546
3547 if (node == n_modules[spec_nodes::M__IMPORT][0])
3548 /* __import */
3549 header_count = backup + 2 + 16;
3550 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3551 /* import */
3552 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3553 else if (node == n_modules[spec_nodes::M_MODULE][0])
3554 ; /* module */
3555 else
3556 goto not_module;
3557
3558 /* We've seen [export] {module|import|__import}. Check the next token. */
3559 if (header_count)
3560 /* After '{,__}import' a header name may appear. */
3561 pfile->state.angled_headers = true;
3562 peek = _cpp_lex_direct (pfile);
3563 backup++;
3564
3565 /* ... import followed by identifier, ':', '<' or
3566 header-name preprocessing tokens, or module
3567 followed by cpp-identifier, ':' or ';' preprocessing
3568 tokens. C++ keywords are not yet relevant. */
3569 if (peek->type == CPP_NAME
3570 || peek->type == CPP_COLON
3571 || (header_count
3572 ? (peek->type == CPP_LESS
3573 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3574 || peek->type == CPP_HEADER_NAME)
3575 : peek->type == CPP_SEMICOLON))
3576 {
/* Macro expansion on the rest of the line is allowed unless the input
   is already preprocessed; when it is not allowed, prevent_expansion is
   raised here.  */
3577 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3578 if (!pfile->state.pragma_allow_expansion)
3579 pfile->state.prevent_expansion++;
3580
3581 if (!header_count && linemap_included_from
3582 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3583 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3584 "module control-line cannot be in included file");
3585
3586 /* The first one or two tokens cannot be macro names. */
/* IX counts down from BACKUP - 1 to 0; index 0 is RESULT, any other
   index is KEYWORD.  */
3587 for (int ix = backup; ix--;)
3588 {
3589 cpp_token *tok = ix ? keyword : result;
3590 cpp_hashnode *node = tok->val.node.node;
3591
3592 /* Don't attempt to expand the token. */
3593 tok->flags |= NO_EXPAND;
3594 if (_cpp_defined_macro_p (node)
13f93cf5 3595 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
c9c3d5f2
NS
3596 && !cpp_fun_like_macro_p (node))
3597 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3598 "module control-line \"%s\" cannot be"
3599 " an object-like macro",
3600 NODE_NAME (node));
3601 }
3602
3603 /* Map to underbar variants. */
3604 keyword->val.node.node = n_modules[header_count
3605 ? spec_nodes::M_IMPORT
3606 : spec_nodes::M_MODULE][1];
/* BACKUP is 1 only when no 'export' was peeked past; otherwise RESULT
   is the 'export' token and is remapped as well.  */
3607 if (backup != 1)
3608 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3609
3610 /* Maybe tell the tokenizer we expect a header-name down the
3611 road. */
3612 pfile->state.directive_file_token = header_count;
3613 }
3614 else
3615 {
/* The gotos above jump into this branch to share the state restore.  */
3616 not_module:
3617 /* Drop out of directive mode. */
3618 /* We asserted save_comments had this value upon entry. */
3619 pfile->state.save_comments
3620 = !CPP_OPTION (pfile, discard_comments);
3621 pfile->state.in_deferred_pragma = false;
3622 /* Do not let this remain on. */
3623 pfile->state.angled_headers = false;
3624 }
3625
3626 /* In either case we want to backup the peeked tokens. */
3627 if (backup)
3628 {
3629 /* If we saw EOL, we should drop it, because this isn't a module
3630 control-line after all. */
3631 bool eol = peek->type == CPP_PRAGMA_EOL;
3632 if (!eol || backup > 1)
3633 {
3634 /* Put the peeked tokens back. */
3635 _cpp_backup_tokens_direct (pfile, backup);
3636 /* But if the last one was an EOL, forget it. */
3637 if (eol)
3638 pfile->lookaheads--;
3639 }
3640 }
3641}
3642
14baae01
NB
3643/* Lex a token into RESULT (external interface). Takes care of issues
3644 like directive handling, token lookahead, multiple include
a1f300c0 3645 optimization and skipping. */
/* Loops until it has a token to hand back: tokens are dropped while
   pfile->state.skipping is set (except CPP_EOF), and a directive whose
   result is CPP_PADDING yields no token of its own.  Returns the
   token.  */
345894b4 3646const cpp_token *
6cf87ca4 3647_cpp_lex_token (cpp_reader *pfile)
5fddcffc 3648{
bdcbe496 3649 cpp_token *result;
5fddcffc 3650
bdcbe496 3651 for (;;)
5fddcffc 3652 {
/* The current run is exhausted: continue in the next one.  */
bdcbe496 3653 if (pfile->cur_token == pfile->cur_run->limit)
5fddcffc 3654 {
bdcbe496
NB
3655 pfile->cur_run = next_tokenrun (pfile->cur_run);
3656 pfile->cur_token = pfile->cur_run->base;
5fddcffc 3657 }
ee380365
TT
3658 /* We assume that the current token is somewhere in the current
3659 run. */
3660 if (pfile->cur_token < pfile->cur_run->base
3661 || pfile->cur_token >= pfile->cur_run->limit)
3662 abort ();
5fddcffc 3663
/* A lookahead token is pending: hand it out from the token buffer
   instead of lexing a new one.  */
bdcbe496 3664 if (pfile->lookaheads)
14baae01
NB
3665 {
3666 pfile->lookaheads--;
3667 result = pfile->cur_token++;
3668 }
bdcbe496 3669 else
14baae01 3670 result = _cpp_lex_direct (pfile);
bdcbe496
NB
3671
/* Work that only applies to tokens flagged BOL.  */
3672 if (result->flags & BOL)
5fddcffc 3673 {
bdcbe496
NB
3674 /* Is this a directive. If _cpp_handle_directive returns
3675 false, it is an assembler #. */
3676 if (result->type == CPP_HASH
e808ec9c
NB
3677 /* 6.10.3 p 11: Directives in a list of macro arguments
3678 gives undefined behavior. This implementation
3679 handles the directive as normal. */
bc4071dd 3680 && pfile->state.parsing_args != 1)
21b11495 3681 {
bc4071dd 3682 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
21b11495 3683 {
/* The directive left only padding behind: go round again for the next
   token.  Otherwise the directive's result token is returned.  */
bc4071dd
RH
3684 if (pfile->directive_result.type == CPP_PADDING)
3685 continue;
21b11495 3686 result = &pfile->directive_result;
21b11495
ZW
3687 }
3688 }
bc4071dd
RH
3689 else if (pfile->state.in_deferred_pragma)
3690 result = &pfile->directive_result;
c9c3d5f2
NS
3691 else if (result->type == CPP_NAME
3692 && (result->val.node.node->flags & NODE_MODULE)
3693 && !pfile->state.skipping
3694 /* Unlike regular directives, we do not deal with
3695 tokenizing module directives as macro arguments.
3696 That's not permitted. */
3697 && !pfile->state.parsing_args)
3698 {
3699 /* P1857. Before macro expansion, At start of logical
3700 line ... */
3701 /* We don't have to consider lookaheads at this point. */
3702 gcc_checking_assert (!pfile->lookaheads);
3703
3704 cpp_maybe_module_directive (pfile, result);
3705 }
21b11495 3706
/* Report the line change to the client callback, if one is installed,
   unless tokens are currently being skipped.  */
97293897 3707 if (pfile->cb.line_change && !pfile->state.skipping)
6cf87ca4 3708 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
5fddcffc 3709 }
5fddcffc 3710
bdcbe496 3711 /* We don't skip tokens in directives. */
bc4071dd 3712 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
bdcbe496 3713 break;
5fddcffc 3714
bdcbe496 3715 /* Outside a directive, invalidate controlling macros. At file
14baae01 3716 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
6356f892 3717 get here and MI optimization works. */
5fddcffc 3718 pfile->mi_valid = false;
bdcbe496
NB
3719
/* While skipping, only CPP_EOF ends the loop; every other token is
   dropped and the next one is lexed.  */
3720 if (!pfile->state.skipping || result->type == CPP_EOF)
3721 break;
5fddcffc
NB
3722 }
3723
345894b4 3724 return result;
5fddcffc
NB
3725}
3726
26aea073 3727/* Returns true if a fresh line has been loaded. */
/* LEXING_RAW_STRING selects how an enclosing directive is treated: when
   false, nothing is fetched while pfile->state.in_directive is set;
   when true, a new line may still be fetched from the current buffer
   inside a directive, but the function returns false rather than move
   on from that buffer.  Also returns false while macro arguments are
   being parsed and the buffer has no further line, and at the end of
   the last buffer.  */
3ad2167b
LH
3728template <bool lexing_raw_string>
3729static bool
3730get_fresh_line_impl (cpp_reader *pfile)
004cb263 3731{
3ad2167b
LH
3732 /* We can't get a new line until we leave the current directive, unless we
3733 are lexing a raw string, in which case it will be OK as long as we don't
3734 pop the current buffer. */
3735 if (!lexing_raw_string && pfile->state.in_directive)
26aea073 3736 return false;
df383483 3737
/* Each iteration looks at the current buffer; the loop only repeats
   after a finished buffer has been popped.  */
26aea073 3738 for (;;)
1a76916c 3739 {
26aea073 3740 cpp_buffer *buffer = pfile->buffer;
1a76916c 3741
/* The buffer is not flagged as needing a new line: nothing to do.  */
26aea073
NB
3742 if (!buffer->need_line)
3743 return true;
3744
/* There is still unread input before rlimit: prepare the next line.  */
3745 if (buffer->next_line < buffer->rlimit)
004cb263 3746 {
26aea073
NB
3747 _cpp_clean_line (pfile);
3748 return true;
3749 }
004cb263 3750
3ad2167b
LH
3751 /* We can't change buffers until we leave the current directive. */
3752 if (lexing_raw_string && pfile->state.in_directive)
3753 return false;
3754
26aea073
NB
3755 /* First, get out of parsing arguments state. */
3756 if (pfile->state.parsing_args)
3757 return false;
3758
3759 /* End of buffer. Non-empty files should end in a newline. */
3760 if (buffer->buf != buffer->rlimit
3761 && buffer->next_line > buffer->rlimit
3762 && !buffer->from_stage3)
3763 {
ed0e74e0 3764 /* Clip to buffer size. */
26aea073 3765 buffer->next_line = buffer->rlimit;
26aea073 3766 }
22234f56 3767
/* A buffer that has a predecessor and is not marked return_at_eof is
   popped, and the loop continues with whatever buffer is current
   afterwards.  */
2a0225e4
NS
3768 if (buffer->prev && !buffer->return_at_eof)
3769 _cpp_pop_buffer (pfile);
3770 else
3771 {
3772 /* End of translation. Do not pop the buffer yet. Increment
3773 line number so that the EOF token is on a line of its own
3774 (_cpp_lex_direct doesn't increment in that case, because
3775 it's hard for it to distinguish this special case). */
3776 CPP_INCREMENT_LINE (pfile, 0);
3777 return false;
3778 }
26aea073 3779 }
004cb263
NB
3780}
3781
3ad2167b
LH
3782bool
3783_cpp_get_fresh_line (cpp_reader *pfile)
3784{
3785 return get_fresh_line_impl<false> (pfile);
3786}
3787
3788
6f572ac2
NB
/* Set result->type to THEN_TYPE and consume one character if the next
   character in the buffer is CHAR; otherwise set it to ELSE_TYPE and
   consume nothing.  Expands to a single statement and relies on
   variables named `result' and `buffer' being in scope where it is
   used.  */
3789#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE) \
3790 do \
3791 { \
3792 result->type = ELSE_TYPE; \
3793 if (*buffer->cur == CHAR) \
3794 buffer->cur++, result->type = THEN_TYPE; \
3795 } \
3796 while (0)
480709cc 3797
14baae01
NB
3798/* Lex a token into pfile->cur_token, which is also incremented, to
3799 get diagnostics pointing to the correct location.
3800
3801 Does not handle issues such as token lookahead, multiple-include
f1ba665b 3802 optimization, directives, skipping etc. This function is only
14baae01
NB
3803 suitable for use by _cpp_lex_token, and in special cases like
3804 lex_expansion_token which doesn't care for any of these issues.
3805
3806 When meeting a newline, returns CPP_EOF if parsing a directive,
3807 otherwise returns to the start of the token buffer if permissible.
3808 Returns the location of the lexed token. */
3809cpp_token *
6cf87ca4 3810_cpp_lex_direct (cpp_reader *pfile)
45b966db 3811{
819bc4f6 3812 cppchar_t c = 0;
adb84b42 3813 cpp_buffer *buffer;
7bad794a
JJ
3814 const unsigned char *comment_start;
3815 bool fallthrough_comment = false;
14baae01 3816 cpp_token *result = pfile->cur_token++;
9ec7291f 3817
5fddcffc 3818 fresh_line:
26aea073 3819 result->flags = 0;
2be570f9 3820 buffer = pfile->buffer;
a506c55c 3821 if (buffer->need_line)
26aea073 3822 {
55dfce4d
JJ
3823 if (pfile->state.in_deferred_pragma)
3824 {
3825 /* This can happen in cases like:
3826 #define loop(x) whatever
3827 #pragma omp loop
3828 where when trying to expand loop we need to peek
3829 next token after loop, but aren't still in_deferred_pragma
3830 mode but are in in_directive mode, so buffer->need_line
3831 is set, a CPP_EOF is peeked. */
3832 result->type = CPP_PRAGMA_EOL;
3833 pfile->state.in_deferred_pragma = false;
3834 if (!pfile->state.pragma_allow_expansion)
3835 pfile->state.prevent_expansion--;
819bc4f6 3836 result->src_loc = pfile->line_table->highest_line;
55dfce4d
JJ
3837 return result;
3838 }
26aea073
NB
3839 if (!_cpp_get_fresh_line (pfile))
3840 {
3841 result->type = CPP_EOF;
dbcc6b15
NS
3842 /* Not a real EOF in a directive or arg parsing -- we refuse
3843 to advance to the next file now, and will once we're out
3844 of those modes. */
3845 if (!pfile->state.in_directive && !pfile->state.parsing_args)
9ff7868d
NB
3846 {
3847 /* Tell the compiler the line number of the EOF token. */
500bee0a 3848 result->src_loc = pfile->line_table->highest_line;
9ff7868d 3849 result->flags = BOL;
2a0225e4
NS
3850 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3851 _cpp_pop_buffer (pfile);
9ff7868d 3852 }
819bc4f6
JJ
3853 else if (c == 0)
3854 result->src_loc = pfile->line_table->highest_line;
26aea073
NB
3855 return result;
3856 }
81fea426 3857 if (buffer != pfile->buffer)
7bad794a 3858 fallthrough_comment = false;
26aea073
NB
3859 if (!pfile->keep_tokens)
3860 {
3861 pfile->cur_run = &pfile->base_run;
3862 result = pfile->base_run.base;
3863 pfile->cur_token = result + 1;
3864 }
3865 result->flags = BOL;
3866 if (pfile->state.parsing_args == 2)
3867 result->flags |= PREV_WHITE;
3868 }
a506c55c 3869 buffer = pfile->buffer;
5fddcffc 3870 update_tokens_line:
500bee0a 3871 result->src_loc = pfile->line_table->highest_line;
041c3194 3872
5fddcffc 3873 skipped_white:
26aea073
NB
3874 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3875 && !pfile->overlaid_buffer)
3876 {
3877 _cpp_process_line_notes (pfile, false);
500bee0a 3878 result->src_loc = pfile->line_table->highest_line;
26aea073 3879 }
480709cc 3880 c = *buffer->cur++;
12f9df4e 3881
f3f6029d
NS
3882 if (pfile->forced_token_location)
3883 result->src_loc = pfile->forced_token_location;
e3dfef44
GC
3884 else
3885 result->src_loc = linemap_position_for_column (pfile->line_table,
3886 CPP_BUF_COLUMN (buffer, buffer->cur));
5fddcffc 3887
0d9f234d 3888 switch (c)
45b966db 3889 {
4d6baafa
NB
3890 case ' ': case '\t': case '\f': case '\v': case '\0':
3891 result->flags |= PREV_WHITE;
26aea073
NB
3892 skip_whitespace (pfile, c);
3893 goto skipped_white;
0d9f234d 3894
26aea073 3895 case '\n':
056f95ec
NS
3896 /* Increment the line, unless this is the last line ... */
3897 if (buffer->cur < buffer->rlimit
3898 /* ... or this is a #include, (where _cpp_stack_file needs to
3899 unwind by one line) ... */
3900 || (pfile->state.in_directive > 1
3901 /* ... except traditional-cpp increments this elsewhere. */
3902 && !CPP_OPTION (pfile, traditional)))
12f9df4e 3903 CPP_INCREMENT_LINE (pfile, 0);
26aea073 3904 buffer->need_line = true;
8bd9a00f
NS
3905 if (pfile->state.in_deferred_pragma)
3906 {
3907 /* Produce the PRAGMA_EOL on this line. File reading
3908 ensures there is always a \n at end of the buffer, thus
3909 in a deferred pragma we always see CPP_PRAGMA_EOL before
3910 any CPP_EOF. */
3911 result->type = CPP_PRAGMA_EOL;
3912 result->flags &= ~PREV_WHITE;
3913 pfile->state.in_deferred_pragma = false;
3914 if (!pfile->state.pragma_allow_expansion)
3915 pfile->state.prevent_expansion--;
3916 return result;
3917 }
26aea073 3918 goto fresh_line;
46d07497 3919
0d9f234d
NB
3920 case '0': case '1': case '2': case '3': case '4':
3921 case '5': case '6': case '7': case '8': case '9':
50668cf6
GK
3922 {
3923 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3924 result->type = CPP_NUMBER;
3925 lex_number (pfile, &result->val.str, &nst);
36d20fa4 3926 warn_about_normalization (pfile, result, &nst, false);
50668cf6
GK
3927 break;
3928 }
46d07497 3929
0abc6a6a 3930 case 'L':
b6baa67d
KVH
3931 case 'u':
3932 case 'U':
2c6e3f55
JJ
3933 case 'R':
3934 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3935 wide strings or raw strings. */
a48e3dd1
JM
3936 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3937 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
bced6edf 3938 {
2c6e3f55
JJ
3939 if ((*buffer->cur == '\'' && c != 'R')
3940 || *buffer->cur == '"'
3941 || (*buffer->cur == 'R'
3942 && c != 'R'
3943 && buffer->cur[1] == '"'
a48e3dd1 3944 && CPP_OPTION (pfile, rliterals))
2c6e3f55
JJ
3945 || (*buffer->cur == '8'
3946 && c == 'u'
fe95b036
ESR
3947 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3948 && CPP_OPTION (pfile, utf8_char_literals)))
a48e3dd1
JM
3949 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3950 && CPP_OPTION (pfile, rliterals)))))
b6baa67d
KVH
3951 {
3952 lex_string (pfile, result, buffer->cur - 1);
3953 break;
3954 }
bced6edf 3955 }
df383483 3956 /* Fall through. */
0abc6a6a 3957
0d9f234d
NB
3958 case '_':
3959 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3960 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3961 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
b6baa67d 3962 case 's': case 't': case 'v': case 'w': case 'x':
0d9f234d
NB
3963 case 'y': case 'z':
3964 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
0abc6a6a 3965 case 'G': case 'H': case 'I': case 'J': case 'K':
2c6e3f55 3966 case 'M': case 'N': case 'O': case 'P': case 'Q':
b6baa67d 3967 case 'S': case 'T': case 'V': case 'W': case 'X':
0d9f234d
NB
3968 case 'Y': case 'Z':
3969 result->type = CPP_NAME;
50668cf6
GK
3970 {
3971 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1d3e4f4e
LH
3972 const auto node = lex_identifier (pfile, buffer->cur - 1, false, &nst,
3973 &result->val.node.spelling);
3974 result->val.node.node = node;
3975 identifier_diagnostics_on_lex (pfile, node);
36d20fa4 3976 warn_about_normalization (pfile, result, &nst, true);
50668cf6 3977 }
0d9f234d 3978
0d9f234d 3979 /* Convert named operators to their proper types. */
9a0c6187 3980 if (result->val.node.node->flags & NODE_OPERATOR)
0d9f234d
NB
3981 {
3982 result->flags |= NAMED_OP;
9a0c6187 3983 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
0d9f234d 3984 }
81fea426
MP
3985
3986 /* Signal FALLTHROUGH comment followed by another token. */
7bad794a 3987 if (fallthrough_comment)
81fea426 3988 result->flags |= PREV_FALLTHROUGH;
0d9f234d
NB
3989 break;
3990
3991 case '\'':
3992 case '"':
6338b358 3993 lex_string (pfile, result, buffer->cur - 1);
0d9f234d 3994 break;
041c3194 3995
0d9f234d 3996 case '/':
1c6d33ef
NB
3997 /* A potential block or line comment. */
3998 comment_start = buffer->cur;
6f572ac2
NB
3999 c = *buffer->cur;
4000
1c6d33ef
NB
4001 if (c == '*')
4002 {
26aea073 4003 if (_cpp_skip_block_comment (pfile))
0527bc4e 4004 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
0d9f234d 4005 }
909eb89c 4006 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
0d9f234d 4007 {
909eb89c 4008 /* Don't warn for system headers. */
bf425849 4009 if (_cpp_in_system_header (pfile))
909eb89c 4010 ;
f3bede71 4011 /* Warn about comments if pedantically GNUC89, and not
bdb05a7b 4012 in system headers. */
909eb89c
MP
4013 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4014 && CPP_PEDANTIC (pfile)
4015 && ! buffer->warned_cplusplus_comments)
041c3194 4016 {
0c86a39d
JJ
4017 if (cpp_error (pfile, CPP_DL_PEDWARN,
4018 "C++ style comments are not allowed in ISO C90"))
4019 cpp_error (pfile, CPP_DL_NOTE,
4020 "(this will be reported only once per input file)");
1c6d33ef
NB
4021 buffer->warned_cplusplus_comments = 1;
4022 }
f3bede71 4023 /* Or if specifically desired via -Wc90-c99-compat. */
177cce46 4024 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
dd3ff077 4025 && ! CPP_OPTION (pfile, cplusplus)
f3bede71
MP
4026 && ! buffer->warned_cplusplus_comments)
4027 {
0c86a39d
JJ
4028 if (cpp_error (pfile, CPP_DL_WARNING,
4029 "C++ style comments are incompatible with C90"))
4030 cpp_error (pfile, CPP_DL_NOTE,
4031 "(this will be reported only once per input file)");
f3bede71
MP
4032 buffer->warned_cplusplus_comments = 1;
4033 }
909eb89c
MP
4034 /* In C89/C94, C++ style comments are forbidden. */
4035 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4036 || CPP_OPTION (pfile, lang) == CLK_STDC94))
4037 {
4038 /* But don't be confused about valid code such as
4039 - // immediately followed by *,
4040 - // in a preprocessing directive,
4041 - // in an #if 0 block. */
4042 if (buffer->cur[1] == '*'
4043 || pfile->state.in_directive
4044 || pfile->state.skipping)
4045 {
4046 result->type = CPP_DIV;
4047 break;
4048 }
4049 else if (! buffer->warned_cplusplus_comments)
4050 {
0c86a39d
JJ
4051 if (cpp_error (pfile, CPP_DL_ERROR,
4052 "C++ style comments are not allowed in "
4053 "ISO C90"))
4054 cpp_error (pfile, CPP_DL_NOTE,
4055 "(this will be reported only once per input "
4056 "file)");
909eb89c
MP
4057 buffer->warned_cplusplus_comments = 1;
4058 }
4059 }
01ef6563 4060 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
87cf0651 4061 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
1c6d33ef 4062 }
480709cc
NB
4063 else if (c == '=')
4064 {
6f572ac2 4065 buffer->cur++;
480709cc
NB
4066 result->type = CPP_DIV_EQ;
4067 break;
4068 }
4069 else
4070 {
480709cc
NB
4071 result->type = CPP_DIV;
4072 break;
4073 }
0d9f234d 4074
7bad794a
JJ
4075 if (fallthrough_comment_p (pfile, comment_start))
4076 fallthrough_comment = true;
4077
05945a1b
DM
4078 if (pfile->cb.comment)
4079 {
4080 size_t len = pfile->buffer->cur - comment_start;
4081 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4082 len + 1);
4083 }
4084
1c6d33ef
NB
4085 if (!pfile->state.save_comments)
4086 {
4087 result->flags |= PREV_WHITE;
5fddcffc 4088 goto update_tokens_line;
0d9f234d 4089 }
1c6d33ef 4090
7bad794a 4091 if (fallthrough_comment)
81fea426
MP
4092 result->flags |= PREV_FALLTHROUGH;
4093
1c6d33ef 4094 /* Save the comment as a token in its own right. */
477cdac7 4095 save_comment (pfile, result, comment_start, c);
bdcbe496 4096 break;
0d9f234d
NB
4097
4098 case '<':
4099 if (pfile->state.angled_headers)
4100 {
6338b358 4101 lex_string (pfile, result, buffer->cur - 1);
4bb09c26
JM
4102 if (result->type != CPP_LESS)
4103 break;
0d9f234d 4104 }
45b966db 4105
6f572ac2
NB
4106 result->type = CPP_LESS;
4107 if (*buffer->cur == '=')
b7689b96
JM
4108 {
4109 buffer->cur++, result->type = CPP_LESS_EQ;
4110 if (*buffer->cur == '>'
4111 && CPP_OPTION (pfile, cplusplus)
b04445d4 4112 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
b7689b96
JM
4113 buffer->cur++, result->type = CPP_SPACESHIP;
4114 }
6f572ac2 4115 else if (*buffer->cur == '<')
0d9f234d 4116 {
6f572ac2
NB
4117 buffer->cur++;
4118 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
0d9f234d 4119 }
6f572ac2 4120 else if (CPP_OPTION (pfile, digraphs))
480709cc 4121 {
6f572ac2
NB
4122 if (*buffer->cur == ':')
4123 {
1582c677
PC
4124 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4125 three characters are <:: and the subsequent character
4126 is neither : nor >, the < is treated as a preprocessor
4127 token by itself". */
4128 if (CPP_OPTION (pfile, cplusplus)
61949153
PC
4129 && CPP_OPTION (pfile, lang) != CLK_CXX98
4130 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
1582c677
PC
4131 && buffer->cur[1] == ':'
4132 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4133 break;
4134
6f572ac2
NB
4135 buffer->cur++;
4136 result->flags |= DIGRAPH;
4137 result->type = CPP_OPEN_SQUARE;
4138 }
4139 else if (*buffer->cur == '%')
4140 {
4141 buffer->cur++;
4142 result->flags |= DIGRAPH;
4143 result->type = CPP_OPEN_BRACE;
4144 }
480709cc 4145 }
0d9f234d
NB
4146 break;
4147
4148 case '>':
6f572ac2
NB
4149 result->type = CPP_GREATER;
4150 if (*buffer->cur == '=')
4151 buffer->cur++, result->type = CPP_GREATER_EQ;
4152 else if (*buffer->cur == '>')
0d9f234d 4153 {
6f572ac2
NB
4154 buffer->cur++;
4155 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4156 }
0d9f234d
NB
4157 break;
4158
cbcff6df 4159 case '%':
6f572ac2
NB
4160 result->type = CPP_MOD;
4161 if (*buffer->cur == '=')
4162 buffer->cur++, result->type = CPP_MOD_EQ;
4163 else if (CPP_OPTION (pfile, digraphs))
480709cc 4164 {
6f572ac2 4165 if (*buffer->cur == ':')
480709cc 4166 {
6f572ac2
NB
4167 buffer->cur++;
4168 result->flags |= DIGRAPH;
4169 result->type = CPP_HASH;
4170 if (*buffer->cur == '%' && buffer->cur[1] == ':')
9a0c6187 4171 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
6f572ac2
NB
4172 }
4173 else if (*buffer->cur == '>')
4174 {
4175 buffer->cur++;
4176 result->flags |= DIGRAPH;
4177 result->type = CPP_CLOSE_BRACE;
480709cc 4178 }
480709cc 4179 }
0d9f234d
NB
4180 break;
4181
cbcff6df 4182 case '.':
480709cc 4183 result->type = CPP_DOT;
6f572ac2 4184 if (ISDIGIT (*buffer->cur))
480709cc 4185 {
50668cf6 4186 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
480709cc 4187 result->type = CPP_NUMBER;
50668cf6 4188 lex_number (pfile, &result->val.str, &nst);
36d20fa4 4189 warn_about_normalization (pfile, result, &nst, false);
480709cc 4190 }
6f572ac2
NB
4191 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4192 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4193 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4194 buffer->cur++, result->type = CPP_DOT_STAR;
0d9f234d 4195 break;
45b966db 4196
0d9f234d 4197 case '+':
6f572ac2
NB
4198 result->type = CPP_PLUS;
4199 if (*buffer->cur == '+')
4200 buffer->cur++, result->type = CPP_PLUS_PLUS;
4201 else if (*buffer->cur == '=')
4202 buffer->cur++, result->type = CPP_PLUS_EQ;
0d9f234d 4203 break;
04e3ec78 4204
0d9f234d 4205 case '-':
6f572ac2
NB
4206 result->type = CPP_MINUS;
4207 if (*buffer->cur == '>')
0d9f234d 4208 {
6f572ac2 4209 buffer->cur++;
480709cc 4210 result->type = CPP_DEREF;
6f572ac2
NB
4211 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4212 buffer->cur++, result->type = CPP_DEREF_STAR;
480709cc 4213 }
6f572ac2
NB
4214 else if (*buffer->cur == '-')
4215 buffer->cur++, result->type = CPP_MINUS_MINUS;
4216 else if (*buffer->cur == '=')
4217 buffer->cur++, result->type = CPP_MINUS_EQ;
0d9f234d 4218 break;
45b966db 4219
0d9f234d 4220 case '&':
6f572ac2
NB
4221 result->type = CPP_AND;
4222 if (*buffer->cur == '&')
4223 buffer->cur++, result->type = CPP_AND_AND;
4224 else if (*buffer->cur == '=')
4225 buffer->cur++, result->type = CPP_AND_EQ;
0d9f234d 4226 break;
df383483 4227
0d9f234d 4228 case '|':
6f572ac2
NB
4229 result->type = CPP_OR;
4230 if (*buffer->cur == '|')
4231 buffer->cur++, result->type = CPP_OR_OR;
4232 else if (*buffer->cur == '=')
4233 buffer->cur++, result->type = CPP_OR_EQ;
0d9f234d 4234 break;
45b966db 4235
0d9f234d 4236 case ':':
6f572ac2 4237 result->type = CPP_COLON;
37127ed9
JJ
4238 if (*buffer->cur == ':')
4239 {
4240 if (CPP_OPTION (pfile, scope))
4241 buffer->cur++, result->type = CPP_SCOPE;
4242 else
4243 result->flags |= COLON_SCOPE;
4244 }
6f572ac2 4245 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
0d9f234d 4246 {
6f572ac2 4247 buffer->cur++;
0d9f234d 4248 result->flags |= DIGRAPH;
480709cc
NB
4249 result->type = CPP_CLOSE_SQUARE;
4250 }
0d9f234d 4251 break;
45b966db 4252
480709cc
NB
4253 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4254 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4255 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4256 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
9a0c6187 4257 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
480709cc 4258
26aea073 4259 case '?': result->type = CPP_QUERY; break;
0d9f234d
NB
4260 case '~': result->type = CPP_COMPL; break;
4261 case ',': result->type = CPP_COMMA; break;
4262 case '(': result->type = CPP_OPEN_PAREN; break;
4263 case ')': result->type = CPP_CLOSE_PAREN; break;
4264 case '[': result->type = CPP_OPEN_SQUARE; break;
4265 case ']': result->type = CPP_CLOSE_SQUARE; break;
4266 case '{': result->type = CPP_OPEN_BRACE; break;
4267 case '}': result->type = CPP_CLOSE_BRACE; break;
4268 case ';': result->type = CPP_SEMICOLON; break;
4269
40f03658 4270 /* @ is a punctuator in Objective-C. */
cc937581 4271 case '@': result->type = CPP_ATSIGN; break;
0d9f234d 4272
7d112d66 4273 default:
1613e52b
NB
4274 {
4275 const uchar *base = --buffer->cur;
0b8c57ed 4276 static int no_warn_cnt;
0abc6a6a 4277
7d112d66
LH
4278 /* Check for an extended identifier ($ or UCN or UTF-8). */
4279 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
50668cf6 4280 if (forms_identifier_p (pfile, true, &nst))
1613e52b
NB
4281 {
4282 result->type = CPP_NAME;
1d3e4f4e
LH
4283 const auto node = lex_identifier (pfile, base, true, &nst,
4284 &result->val.node.spelling);
4285 result->val.node.node = node;
4286 identifier_diagnostics_on_lex (pfile, node);
36d20fa4 4287 warn_about_normalization (pfile, result, &nst, true);
1613e52b
NB
4288 break;
4289 }
7d112d66
LH
4290
4291 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4292 single token. */
1613e52b 4293 buffer->cur++;
7d112d66
LH
4294 if (c >= utf8_signifier)
4295 {
4296 const uchar *pstr = base;
4297 cppchar_t s;
4298 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
0b8c57ed
JJ
4299 {
4300 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4301 {
4302 buffer->cur = base;
4303 _cpp_warn_invalid_utf8 (pfile);
4304 }
4305 buffer->cur = pstr;
4306 }
4307 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4308 {
4309 buffer->cur = base;
4310 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4311 buffer->cur = base + 1;
4312 no_warn_cnt = end - buffer->cur;
4313 }
4314 }
4315 else if (c >= utf8_continuation
4316 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4317 {
4318 if (no_warn_cnt)
4319 --no_warn_cnt;
4320 else
4321 {
4322 buffer->cur = base;
4323 _cpp_warn_invalid_utf8 (pfile);
4324 buffer->cur = base + 1;
4325 }
7d112d66
LH
4326 }
4327 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4328 break;
1067694a 4329 }
1613e52b 4330
0d9f234d 4331 }
bdcbe496 4332
a3998c2f
DM
4333 /* Potentially convert the location of the token to a range. */
4334 if (result->src_loc >= RESERVED_LOCATION_COUNT
4335 && result->type != CPP_EOF)
4336 {
4337 /* Ensure that any line notes are processed, so that we have the
4338 correct physical line/column for the end-point of the token even
4339 when a logical line is split via one or more backslashes. */
4340 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4341 && !pfile->overlaid_buffer)
4342 _cpp_process_line_notes (pfile, false);
4343
4344 source_range tok_range;
4345 tok_range.m_start = result->src_loc;
4346 tok_range.m_finish
4347 = linemap_position_for_column (pfile->line_table,
4348 CPP_BUF_COLUMN (buffer, buffer->cur));
4349
1f68a3e8
DM
4350 result->src_loc
4351 = pfile->line_table->get_or_create_combined_loc (result->src_loc,
4352 tok_range, nullptr, 0);
a3998c2f 4353 }
ebedc9a3 4354
bdcbe496 4355 return result;
0d9f234d
NB
4356}
4357
59325650
NB
4358/* An upper bound on the number of bytes needed to spell TOKEN.
4359 Does not include preceding whitespace. */
93c80368 4360unsigned int
6cf87ca4 4361cpp_token_len (const cpp_token *token)
0d9f234d 4362{
93c80368 4363 unsigned int len;
6d2c2047 4364
93c80368 4365 switch (TOKEN_SPELL (token))
041c3194 4366 {
cc955282 4367 default: len = 6; break;
6338b358 4368 case SPELL_LITERAL: len = token->val.str.len; break;
9a0c6187 4369 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
041c3194 4370 }
59325650
NB
4371
4372 return len;
6d2c2047
ZW
4373}
4374
47e20491
GK
/* Decode the UTF-8 sequence starting at NAME and store the matching
   \U escape (a backslash, 'U' and eight lower-case hex digits) in
   BUFFER.  Exactly 10 bytes are written to BUFFER and no terminating
   NUL is added.  Returns the number of bytes consumed from NAME.
   Aborts if a continuation byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int idx;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  for (lead = *name; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* Payload bits of the lead byte, then six bits from each
     continuation byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (idx = 1; idx < seq_len; idx++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  /* Emit the eight nibbles, most significant first.  */
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
4408
cfc93532
MLI
4409/* Given a token TYPE corresponding to a digraph, return a pointer to
4410 the spelling of the digraph. */
4411static const unsigned char *
4412cpp_digraph2name (enum cpp_ttype type)
4413{
4414 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4415}
47e20491 4416
be5ffc59 4417/* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
1d3e4f4e 4418 The buffer must already contain enough space to hold the
be5ffc59
JM
4419 token's spelling. Returns a pointer to the character after the
4420 last character written. */
4421unsigned char *
4422_cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4423{
4424 size_t i;
4425 const unsigned char *name = NODE_NAME (ident);
4426
4427 for (i = 0; i < NODE_LEN (ident); i++)
4428 if (name[i] & ~0x7F)
4429 {
4430 i += utf8_to_ucn (buffer, name + i) - 1;
4431 buffer += 10;
4432 }
4433 else
4434 *buffer++ = name[i];
4435
4436 return buffer;
4437}
4438
041c3194 4439/* Write the spelling of a token TOKEN to BUFFER. The buffer must
1d3e4f4e 4440 already contain enough space to hold the token's spelling.
6cf87ca4 4441 Returns a pointer to the character after the last character written.
47e20491 4442 FORSTRING is true if this is to be the spelling after translation
be5ffc59
JM
4443 phase 1 (with the original spelling of extended identifiers), false
4444 if extended identifiers should always be written using UCNs (there is
4445 no option for always writing them in the internal UTF-8 form).
6cf87ca4 4446 FIXME: Would be nice if we didn't need the PFILE argument. */
93c80368 4447unsigned char *
6cf87ca4 4448cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
47e20491 4449 unsigned char *buffer, bool forstring)
041c3194 4450{
96be6998 4451 switch (TOKEN_SPELL (token))
041c3194
ZW
4452 {
4453 case SPELL_OPERATOR:
4454 {
4455 const unsigned char *spelling;
4456 unsigned char c;
d6d5f795 4457
041c3194 4458 if (token->flags & DIGRAPH)
cfc93532 4459 spelling = cpp_digraph2name (token->type);
92936ecf
ZW
4460 else if (token->flags & NAMED_OP)
4461 goto spell_ident;
041c3194 4462 else
96be6998 4463 spelling = TOKEN_NAME (token);
df383483 4464
041c3194
ZW
4465 while ((c = *spelling++) != '\0')
4466 *buffer++ = c;
4467 }
4468 break;
d6d5f795 4469
47ad4138 4470 spell_ident:
041c3194 4471 case SPELL_IDENT:
47e20491
GK
4472 if (forstring)
4473 {
be5ffc59
JM
4474 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4475 NODE_LEN (token->val.node.spelling));
4476 buffer += NODE_LEN (token->val.node.spelling);
47e20491
GK
4477 }
4478 else
be5ffc59 4479 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
041c3194 4480 break;
d6d5f795 4481
6338b358 4482 case SPELL_LITERAL:
47ad4138
ZW
4483 memcpy (buffer, token->val.str.text, token->val.str.len);
4484 buffer += token->val.str.len;
4485 break;
4486
041c3194 4487 case SPELL_NONE:
0527bc4e
JDA
4488 cpp_error (pfile, CPP_DL_ICE,
4489 "unspellable token %s", TOKEN_NAME (token));
041c3194
ZW
4490 break;
4491 }
d6d5f795 4492
041c3194
ZW
4493 return buffer;
4494}
d6d5f795 4495
5d8ebbd8
NB
4496/* Returns TOKEN spelt as a null-terminated string. The string is
4497 freed when the reader is destroyed. Useful for diagnostics. */
93c80368 4498unsigned char *
6cf87ca4 4499cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
59325650
NB
4500{
4501 unsigned int len = cpp_token_len (token) + 1;
ece54d54 4502 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
c5a04734 4503
47e20491 4504 end = cpp_spell_token (pfile, token, start, false);
93c80368 4505 end[0] = '\0';
c5a04734 4506
93c80368
NB
4507 return start;
4508}
c5a04734 4509
cfc93532
MLI
4510/* Returns a pointer to a string which spells the token defined by
4511 TYPE and FLAGS. Used by C front ends, which really should move to
4512 using cpp_token_as_text. */
93c80368 4513const char *
cfc93532 4514cpp_type2name (enum cpp_ttype type, unsigned char flags)
93c80368 4515{
cfc93532
MLI
4516 if (flags & DIGRAPH)
4517 return (const char *) cpp_digraph2name (type);
4518 else if (flags & NAMED_OP)
4519 return cpp_named_operator2name (type);
4520
93c80368
NB
4521 return (const char *) token_spellings[type].name;
4522}
c5a04734 4523
4ed5bcfb
NB
4524/* Writes the spelling of token to FP, without any preceding space.
4525 Separated from cpp_spell_token for efficiency - to avoid stdio
4526 double-buffering. */
93c80368 4527void
6cf87ca4 4528cpp_output_token (const cpp_token *token, FILE *fp)
93c80368 4529{
93c80368 4530 switch (TOKEN_SPELL (token))
c5a04734 4531 {
93c80368
NB
4532 case SPELL_OPERATOR:
4533 {
4534 const unsigned char *spelling;
3b681e9d 4535 int c;
c5a04734 4536
93c80368 4537 if (token->flags & DIGRAPH)
cfc93532 4538 spelling = cpp_digraph2name (token->type);
93c80368
NB
4539 else if (token->flags & NAMED_OP)
4540 goto spell_ident;
4541 else
4542 spelling = TOKEN_NAME (token);
041c3194 4543
3b681e9d
ZW
4544 c = *spelling;
4545 do
4546 putc (c, fp);
4547 while ((c = *++spelling) != '\0');
93c80368
NB
4548 }
4549 break;
041c3194 4550
93c80368
NB
4551 spell_ident:
4552 case SPELL_IDENT:
47e20491
GK
4553 {
4554 size_t i;
9a0c6187 4555 const unsigned char * name = NODE_NAME (token->val.node.node);
47e20491 4556
9a0c6187 4557 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
47e20491
GK
4558 if (name[i] & ~0x7F)
4559 {
4560 unsigned char buffer[10];
4561 i += utf8_to_ucn (buffer, name + i) - 1;
4562 fwrite (buffer, 1, 10, fp);
4563 }
4564 else
9a0c6187 4565 fputc (NODE_NAME (token->val.node.node)[i], fp);
47e20491
GK
4566 }
4567 break;
041c3194 4568
6338b358 4569 case SPELL_LITERAL:
c9c3d5f2
NS
4570 if (token->type == CPP_HEADER_NAME)
4571 fputc ('"', fp);
47ad4138 4572 fwrite (token->val.str.text, 1, token->val.str.len, fp);
c9c3d5f2
NS
4573 if (token->type == CPP_HEADER_NAME)
4574 fputc ('"', fp);
47ad4138
ZW
4575 break;
4576
93c80368
NB
4577 case SPELL_NONE:
4578 /* An error, most probably. */
4579 break;
041c3194 4580 }
c5a04734
ZW
4581}
4582
93c80368
NB
4583/* Compare two tokens. */
4584int
6cf87ca4 4585_cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
c5a04734 4586{
93c80368
NB
4587 if (a->type == b->type && a->flags == b->flags)
4588 switch (TOKEN_SPELL (a))
4589 {
4590 default: /* Keep compiler happy. */
4591 case SPELL_OPERATOR:
9a0c6187 4592 /* token_no is used to track where multiple consecutive ##
aa508502 4593 tokens were originally located. */
9a0c6187 4594 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
93c80368 4595 case SPELL_NONE:
9a0c6187 4596 return (a->type != CPP_MACRO_ARG
be5ffc59
JM
4597 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4598 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
93c80368 4599 case SPELL_IDENT:
be5ffc59
JM
4600 return (a->val.node.node == b->val.node.node
4601 && a->val.node.spelling == b->val.node.spelling);
6338b358 4602 case SPELL_LITERAL:
93c80368
NB
4603 return (a->val.str.len == b->val.str.len
4604 && !memcmp (a->val.str.text, b->val.str.text,
4605 a->val.str.len));
4606 }
c5a04734 4607
041c3194
ZW
4608 return 0;
4609}
4610
93c80368
NB
4611/* Returns nonzero if a space should be inserted to avoid an
4612 accidental token paste for output. For simplicity, it is
4613 conservative, and occasionally advises a space where one is not
4614 needed, e.g. "." and ".2". */
93c80368 4615int
6cf87ca4
ZW
4616cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4617 const cpp_token *token2)
c5a04734 4618{
93c80368
NB
4619 enum cpp_ttype a = token1->type, b = token2->type;
4620 cppchar_t c;
c5a04734 4621
93c80368
NB
4622 if (token1->flags & NAMED_OP)
4623 a = CPP_NAME;
4624 if (token2->flags & NAMED_OP)
4625 b = CPP_NAME;
c5a04734 4626
93c80368
NB
4627 c = EOF;
4628 if (token2->flags & DIGRAPH)
37b8524c 4629 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
93c80368
NB
4630 else if (token_spellings[b].category == SPELL_OPERATOR)
4631 c = token_spellings[b].name[0];
c5a04734 4632
93c80368 4633 /* Quickly get everything that can paste with an '='. */
37b8524c 4634 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
93c80368 4635 return 1;
c5a04734 4636
93c80368 4637 switch (a)
c5a04734 4638 {
b52dbbf8
SE
4639 case CPP_GREATER: return c == '>';
4640 case CPP_LESS: return c == '<' || c == '%' || c == ':';
93c80368
NB
4641 case CPP_PLUS: return c == '+';
4642 case CPP_MINUS: return c == '-' || c == '>';
4643 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4644 case CPP_MOD: return c == ':' || c == '>';
4645 case CPP_AND: return c == '&';
4646 case CPP_OR: return c == '|';
4647 case CPP_COLON: return c == ':' || c == '>';
4648 case CPP_DEREF: return c == '*';
26ec42ee 4649 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
93c80368 4650 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
170c850e 4651 case CPP_PRAGMA:
93c80368
NB
4652 case CPP_NAME: return ((b == CPP_NUMBER
4653 && name_p (pfile, &token2->val.str))
4654 || b == CPP_NAME
4655 || b == CPP_CHAR || b == CPP_STRING); /* L */
4656 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
3e3fdf3d 4657 || b == CPP_CHAR
93c80368 4658 || c == '.' || c == '+' || c == '-');
1613e52b 4659 /* UCNs */
1067694a
NB
4660 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4661 && b == CPP_NAME)
1613e52b 4662 || (CPP_OPTION (pfile, objc)
1067694a 4663 && token1->val.str.text[0] == '@'
1613e52b 4664 && (b == CPP_NAME || b == CPP_STRING)));
b7689b96 4665 case CPP_LESS_EQ: return c == '>';
87e356ba
JJ
4666 case CPP_STRING:
4667 case CPP_WSTRING:
4668 case CPP_UTF8STRING:
4669 case CPP_STRING16:
4670 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4671 && (b == CPP_NAME
4672 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4673 && ISIDST (token2->val.str.text[0]))));
4674
93c80368 4675 default: break;
c5a04734 4676 }
c5a04734 4677
417f3e3a 4678 return 0;
c5a04734
ZW
4679}
4680
93c80368 4681/* Output all the remaining tokens on the current line, and a newline
4ed5bcfb
NB
4682 character, to FP. Leading whitespace is removed. If there are
4683 macros, special token padding is not performed. */
c5a04734 4684void
6cf87ca4 4685cpp_output_line (cpp_reader *pfile, FILE *fp)
c5a04734 4686{
4ed5bcfb 4687 const cpp_token *token;
96be6998 4688
4ed5bcfb
NB
4689 token = cpp_get_token (pfile);
4690 while (token->type != CPP_EOF)
96be6998 4691 {
4ed5bcfb
NB
4692 cpp_output_token (token, fp);
4693 token = cpp_get_token (pfile);
4694 if (token->flags & PREV_WHITE)
4695 putc (' ', fp);
96be6998
ZW
4696 }
4697
93c80368 4698 putc ('\n', fp);
041c3194 4699}
c5a04734 4700
5d6342eb
TT
4701/* Return a string representation of all the remaining tokens on the
4702 current line. The result is allocated using xmalloc and must be
4703 freed by the caller. */
4704unsigned char *
4705cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4706{
4707 const cpp_token *token;
4708 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4709 unsigned int alloced = 120 + out;
4710 unsigned char *result = (unsigned char *) xmalloc (alloced);
4711
4712 /* If DIR_NAME is empty, there are no initial contents. */
4713 if (dir_name)
4714 {
4715 sprintf ((char *) result, "#%s ", dir_name);
4716 out += 2;
4717 }
4718
4719 token = cpp_get_token (pfile);
4720 while (token->type != CPP_EOF)
4721 {
4722 unsigned char *last;
4723 /* Include room for a possible space and the terminating nul. */
4724 unsigned int len = cpp_token_len (token) + 2;
4725
4726 if (out + len > alloced)
4727 {
4728 alloced *= 2;
4729 if (out + len > alloced)
4730 alloced = out + len;
4731 result = (unsigned char *) xrealloc (result, alloced);
4732 }
4733
4734 last = cpp_spell_token (pfile, token, &result[out], 0);
4735 out = last - result;
4736
4737 token = cpp_get_token (pfile);
4738 if (token->flags & PREV_WHITE)
4739 result[out++] = ' ';
4740 }
4741
4742 result[out] = '\0';
4743 return result;
4744}
4745
1e013d2e
NB
4746/* Memory buffers. Changing these three constants can have a dramatic
4747 effect on performance. The values here are reasonable defaults,
4748 but might be tuned. If you adjust them, be sure to test across a
4749 range of uses of cpplib, including heavy nested function-like macro
4750 expansion. Also check the change in peak memory usage (NJAMD is a
4751 good tool for this). */
4752#define MIN_BUFF_SIZE 8000
87062813 4753#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
1e013d2e
NB
4754#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
4755 (MIN_EXTRA + ((BUFF)->limit - (BUFF)->cur) * 2)
417f3e3a 4756
87062813
NB
4757#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
4758 #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
4759#endif
4760
c9e7a609
NB
4761/* Create a new allocation buffer. Place the control block at the end
4762 of the buffer, so that buffer overflows will cause immediate chaos. */
b8af0ca5 4763static _cpp_buff *
6cf87ca4 4764new_buff (size_t len)
b8af0ca5
NB
4765{
4766 _cpp_buff *result;
ece54d54 4767 unsigned char *base;
b8af0ca5 4768
1e013d2e
NB
4769 if (len < MIN_BUFF_SIZE)
4770 len = MIN_BUFF_SIZE;
c70f6ed3 4771 len = CPP_ALIGN (len);
b8af0ca5 4772
ab78426a 4773#ifdef ENABLE_VALGRIND_WORKAROUNDS
1a80db97
JJ
4774 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4775 struct first. */
4776 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4777 base = XNEWVEC (unsigned char, len + slen);
4778 result = (_cpp_buff *) base;
4779 base += slen;
4780#else
c3f829c1 4781 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
b8af0ca5 4782 result = (_cpp_buff *) (base + len);
1a80db97 4783#endif
b8af0ca5
NB
4784 result->base = base;
4785 result->cur = base;
4786 result->limit = base + len;
4787 result->next = NULL;
4788 return result;
4789}
4790
4791/* Place a chain of unwanted allocation buffers on the free list. */
4792void
6cf87ca4 4793_cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
b8af0ca5
NB
4794{
4795 _cpp_buff *end = buff;
4796
4797 while (end->next)
4798 end = end->next;
4799 end->next = pfile->free_buffs;
4800 pfile->free_buffs = buff;
4801}
4802
4803/* Return a free buffer of size at least MIN_SIZE. */
4804_cpp_buff *
6cf87ca4 4805_cpp_get_buff (cpp_reader *pfile, size_t min_size)
b8af0ca5
NB
4806{
4807 _cpp_buff *result, **p;
4808
4809 for (p = &pfile->free_buffs;; p = &(*p)->next)
4810 {
6142088c 4811 size_t size;
1e013d2e
NB
4812
4813 if (*p == NULL)
b8af0ca5 4814 return new_buff (min_size);
1e013d2e
NB
4815 result = *p;
4816 size = result->limit - result->base;
4817 /* Return a buffer that's big enough, but don't waste one that's
4818 way too big. */
34f5271d 4819 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
b8af0ca5
NB
4820 break;
4821 }
4822
4823 *p = result->next;
4824 result->next = NULL;
4825 result->cur = result->base;
4826 return result;
4827}
4828
4fe9b91c 4829/* Creates a new buffer with enough space to hold the uncommitted
8c3b2693
NB
4830 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4831 the excess bytes to the new buffer. Chains the new buffer after
4832 BUFF, and returns the new buffer. */
b8af0ca5 4833_cpp_buff *
6cf87ca4 4834_cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
b8af0ca5 4835{
6142088c 4836 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
8c3b2693 4837 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
b8af0ca5 4838
8c3b2693
NB
4839 buff->next = new_buff;
4840 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4841 return new_buff;
4842}
4843
4fe9b91c 4844/* Creates a new buffer with enough space to hold the uncommitted
8c3b2693
NB
4845 remaining bytes of the buffer pointed to by BUFF, and at least
4846 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4847 Chains the new buffer before the buffer pointed to by BUFF, and
4848 updates the pointer to point to the new buffer. */
4849void
6cf87ca4 4850_cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
8c3b2693
NB
4851{
4852 _cpp_buff *new_buff, *old_buff = *pbuff;
4853 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4854
4855 new_buff = _cpp_get_buff (pfile, size);
4856 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4857 new_buff->next = old_buff;
4858 *pbuff = new_buff;
b8af0ca5
NB
4859}
4860
4861/* Free a chain of buffers starting at BUFF. */
4862void
5671bf27 4863_cpp_free_buff (_cpp_buff *buff)
b8af0ca5
NB
4864{
4865 _cpp_buff *next;
4866
4867 for (; buff; buff = next)
4868 {
4869 next = buff->next;
ab78426a 4870#ifdef ENABLE_VALGRIND_WORKAROUNDS
1a80db97
JJ
4871 free (buff);
4872#else
b8af0ca5 4873 free (buff->base);
1a80db97 4874#endif
b8af0ca5
NB
4875 }
4876}
417f3e3a 4877
ece54d54
NB
4878/* Allocate permanent, unaligned storage of length LEN. */
4879unsigned char *
6cf87ca4 4880_cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
ece54d54
NB
4881{
4882 _cpp_buff *buff = pfile->u_buff;
4883 unsigned char *result = buff->cur;
4884
4885 if (len > (size_t) (buff->limit - result))
4886 {
4887 buff = _cpp_get_buff (pfile, len);
4888 buff->next = pfile->u_buff;
4889 pfile->u_buff = buff;
4890 result = buff->cur;
4891 }
4892
4893 buff->cur = result + len;
4894 return result;
4895}
4896
87062813
NB
4897/* Allocate permanent, unaligned storage of length LEN from a_buff.
4898 That buffer is used for growing allocations when saving macro
4899 replacement lists in a #define, and when parsing an answer to an
4900 assertion in #assert, #unassert or #if (and therefore possibly
4901 whilst expanding macros). It therefore must not be used by any
4902 code that they might call: specifically the lexer and the guts of
4903 the macro expander.
4904
4905 All existing other uses clearly fit this restriction: storing
4906 registered pragmas during initialization. */
93c80368 4907unsigned char *
6cf87ca4 4908_cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3fef5b2b 4909{
8c3b2693
NB
4910 _cpp_buff *buff = pfile->a_buff;
4911 unsigned char *result = buff->cur;
3fef5b2b 4912
8c3b2693 4913 if (len > (size_t) (buff->limit - result))
3fef5b2b 4914 {
8c3b2693
NB
4915 buff = _cpp_get_buff (pfile, len);
4916 buff->next = pfile->a_buff;
4917 pfile->a_buff = buff;
4918 result = buff->cur;
3fef5b2b 4919 }
041c3194 4920
8c3b2693 4921 buff->cur = result + len;
93c80368 4922 return result;
041c3194 4923}
d8044160 4924
10f04917
NS
4925/* Commit or allocate storage from a buffer. */
4926
4927void *
4928_cpp_commit_buff (cpp_reader *pfile, size_t size)
4929{
4930 void *ptr = BUFF_FRONT (pfile->a_buff);
4931
4932 if (pfile->hash_table->alloc_subobject)
4933 {
4934 void *copy = pfile->hash_table->alloc_subobject (size);
4935 memcpy (copy, ptr, size);
4936 ptr = copy;
4937 }
4938 else
4939 BUFF_FRONT (pfile->a_buff) += size;
4940
4941 return ptr;
4942}
4943
d8044160
GK
4944/* Say which field of TOK is in use. */
4945
4946enum cpp_token_fld_kind
c26302d5 4947cpp_token_val_index (const cpp_token *tok)
d8044160
GK
4948{
4949 switch (TOKEN_SPELL (tok))
4950 {
4951 case SPELL_IDENT:
4952 return CPP_TOKEN_FLD_NODE;
4953 case SPELL_LITERAL:
4954 return CPP_TOKEN_FLD_STR;
aa508502 4955 case SPELL_OPERATOR:
3f23e487
AP
4956 /* Operands which were originally spelled as ident keep around
4957 the node for the exact spelling. */
4958 if (tok->flags & NAMED_OP)
4959 return CPP_TOKEN_FLD_NODE;
4960 else if (tok->type == CPP_PASTE)
9a0c6187 4961 return CPP_TOKEN_FLD_TOKEN_NO;
aa508502
JM
4962 else
4963 return CPP_TOKEN_FLD_NONE;
d8044160
GK
4964 case SPELL_NONE:
4965 if (tok->type == CPP_MACRO_ARG)
4966 return CPP_TOKEN_FLD_ARG_NO;
4967 else if (tok->type == CPP_PADDING)
4968 return CPP_TOKEN_FLD_SOURCE;
21b11495 4969 else if (tok->type == CPP_PRAGMA)
bc4071dd 4970 return CPP_TOKEN_FLD_PRAGMA;
191816a3 4971 /* fall through */
d8044160
GK
4972 default:
4973 return CPP_TOKEN_FLD_NONE;
4974 }
4975}
e3dfef44 4976
f3f6029d 4977/* All tokens lexed in R after calling this function will be forced to
620e594b 4978 have their location_t to be P, until
e3dfef44
GC
4979 cpp_stop_forcing_token_locations is called for R. */
4980
4981void
620e594b 4982cpp_force_token_locations (cpp_reader *r, location_t loc)
e3dfef44 4983{
f3f6029d 4984 r->forced_token_location = loc;
e3dfef44
GC
4985}
4986
4987/* Go back to assigning locations naturally for lexed tokens. */
4988
4989void
4990cpp_stop_forcing_token_locations (cpp_reader *r)
4991{
f3f6029d 4992 r->forced_token_location = 0;
e3dfef44 4993}
b224c376
NS
4994
/* PEEK points at a backslash.  While that backslash escapes an end of
   line ("\n", "\r" or "\r\n"), step past the pair, repeating for as
   long as another backslash immediately follows.  A step that would
   land at or beyond LIMIT is not taken.  Returns the resulting
   position, which is PEEK itself when nothing was skipped.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A lone carriage return, or a "\r\n" pair.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped end of line.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse and chain escaped newlines.  */
      if (*peek != '\\')
        return peek;
    }
}
5024
5025static const unsigned char *
5026do_peek_next (const unsigned char *peek, const unsigned char *limit)
5027{
5028 if (__builtin_expect (*peek == '\\', false))
5029 peek = do_peek_backslash (peek, limit);
5030 return peek;
5031}
5032
/* Step back one character from PEEK, never going before BOUND.
   Returns NULL when PEEK is already at BOUND.  Otherwise returns a
   pointer to the previous character; when that character is an end of
   line ("\n", "\r" or "\r\n") immediately preceded by a backslash,
   the escaped end of line is skipped and the search continues from
   the backslash.  */

static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* Nothing before the line ending to inspect.  */
      if (peek == bound)
        return peek;

      /* Offset from PEEK of the character preceding the line ending.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* Windows "\r\n": look before the carriage return too.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      /* An escaped line ending is invisible; keep going backwards
         from the backslash.  */
      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
5061
c9c3d5f2
NS
5062/* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5063 space. Otherwise return NULL. */
5064
5065static const unsigned char *
5066do_peek_ident (const char *match, const unsigned char *peek,
5067 const unsigned char *limit)
5068{
5069 for (; *++match; peek++)
5070 if (*peek != *match)
5071 {
5072 peek = do_peek_next (peek, limit);
5073 if (*peek != *match)
5074 return NULL;
5075 }
5076
5077 /* Must now not be looking at an identifier char. */
5078 peek = do_peek_next (peek, limit);
5079 if (ISIDNUM (*peek))
5080 return NULL;
5081
5082 /* Skip control-line whitespace. */
5083 ws:
5084 while (*peek == ' ' || *peek == '\t')
5085 peek++;
5086 if (__builtin_expect (*peek == '\\', false))
5087 {
5088 peek = do_peek_backslash (peek, limit);
5089 if (*peek != '\\')
5090 goto ws;
5091 }
5092
5093 return peek;
5094}
5095
5096/* Are we looking at a module control line starting as PEEK - 1? */
5097
5098static bool
5099do_peek_module (cpp_reader *pfile, unsigned char c,
5100 const unsigned char *peek, const unsigned char *limit)
5101{
5102 bool import = false;
5103
5104 if (__builtin_expect (c == 'e', false))
5105 {
5106 if (!((peek[0] == 'x' || peek[0] == '\\')
5107 && (peek = do_peek_ident ("export", peek, limit))))
5108 return false;
5109
5110 /* export, peek for import or module. No need to peek __import
5111 here. */
5112 if (peek[0] == 'i')
5113 {
5114 if (!((peek[1] == 'm' || peek[1] == '\\')
5115 && (peek = do_peek_ident ("import", peek + 1, limit))))
5116 return false;
5117 import = true;
5118 }
5119 else if (peek[0] == 'm')
5120 {
5121 if (!((peek[1] == 'o' || peek[1] == '\\')
5122 && (peek = do_peek_ident ("module", peek + 1, limit))))
5123 return false;
5124 }
5125 else
5126 return false;
5127 }
5128 else if (__builtin_expect (c == 'i', false))
5129 {
5130 if (!((peek[0] == 'm' || peek[0] == '\\')
5131 && (peek = do_peek_ident ("import", peek, limit))))
5132 return false;
5133 import = true;
5134 }
5135 else if (__builtin_expect (c == '_', false))
5136 {
5137 /* Needed for translated includes. */
5138 if (!((peek[0] == '_' || peek[0] == '\\')
5139 && (peek = do_peek_ident ("__import", peek, limit))))
5140 return false;
5141 import = true;
5142 }
5143 else if (__builtin_expect (c == 'm', false))
5144 {
5145 if (!((peek[0] == 'o' || peek[0] == '\\')
5146 && (peek = do_peek_ident ("module", peek, limit))))
5147 return false;
5148 }
5149 else
5150 return false;
5151
5152 /* Peek the next character to see if it's good enough. We'll be at
5153 the first non-whitespace char, including skipping an escaped
5154 newline. */
5155 /* ... import followed by identifier, ':', '<' or header-name
5156 preprocessing tokens, or module followed by identifier, ':' or
5157 ';' preprocessing tokens. */
5158 unsigned char p = *peek++;
5159
5160 /* A character literal is ... single quotes, ... optionally preceded
5161 by u8, u, U, or L */
5162 /* A string-literal is a ... double quotes, optionally prefixed by
5163 R, u8, u8R, u, uR, U, UR, L, or LR */
5164 if (p == 'u')
5165 {
5166 peek = do_peek_next (peek, limit);
5167 if (*peek == '8')
5168 {
5169 peek++;
5170 goto peek_u8;
5171 }
5172 goto peek_u;
5173 }
5174 else if (p == 'U' || p == 'L')
5175 {
5176 peek_u8:
5177 peek = do_peek_next (peek, limit);
5178 peek_u:
5179 if (*peek == '\"' || *peek == '\'')
5180 return false;
5181
5182 if (*peek == 'R')
5183 goto peek_R;
5184 /* Identifier. Ok. */
5185 }
5186 else if (p == 'R')
5187 {
5188 peek_R:
5189 if (CPP_OPTION (pfile, rliterals))
5190 {
5191 peek = do_peek_next (peek, limit);
5192 if (*peek == '\"')
5193 return false;
5194 }
5195 /* Identifier. Ok. */
5196 }
5197 else if ('Z' - 'A' == 25
5198 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5199 : ISIDST (p))
5200 {
5201 /* Identifier. Ok. */
5202 }
5203 else if (p == '<')
5204 {
5205 /* Maybe angle header, ok for import. Reject
5206 '<=', '<<' digraph:'<:'. */
5207 if (!import)
5208 return false;
5209 peek = do_peek_next (peek, limit);
5210 if (*peek == '=' || *peek == '<'
5211 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5212 return false;
5213 }
5214 else if (p == ';')
5215 {
5216 /* SEMICOLON, ok for module. */
5217 if (import)
5218 return false;
5219 }
5220 else if (p == '"')
5221 {
5222 /* STRING, ok for import. */
5223 if (!import)
5224 return false;
5225 }
5226 else if (p == ':')
5227 {
5228 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5229 peek = do_peek_next (peek, limit);
5230 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5231 return false;
5232 }
5233 else
5234 /* FIXME: Detect a unicode character, excluding those not
5235 permitted as the initial character. [lex.name]/1. I presume
5236 we need to check the \[uU] spellings, and directly using
5237 Unicode in say UTF8 form? Or perhaps we do the phase-1
5238 conversion of UTF8 to universal-character-names? */
5239 return false;
5240
5241 return true;
5242}
5243
b224c376
NS
5244/* Directives-only scanning. Somewhat more relaxed than correct
5245 parsing -- some ill-formed programs will not be rejected. */
5246
5247void
5248cpp_directive_only_process (cpp_reader *pfile,
5249 void *data,
5250 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5251{
c9c3d5f2
NS
5252 bool module_p = CPP_OPTION (pfile, module_directives);
5253
b224c376
NS
5254 do
5255 {
5256 restart:
5257 /* Buffer initialization, but no line cleaning. */
5258 cpp_buffer *buffer = pfile->buffer;
5259 buffer->cur_note = buffer->notes_used = 0;
5260 buffer->cur = buffer->line_base = buffer->next_line;
5261 buffer->need_line = false;
ac16f432 5262 /* Files always end in a newline or carriage return. We rely on this for
b224c376 5263 character peeking safety. */
ac16f432 5264 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
b224c376
NS
5265
5266 const unsigned char *base = buffer->cur;
5267 unsigned line_count = 0;
5268 const unsigned char *line_start = base;
5269
5270 bool bol = true;
5271 bool raw = false;
5272
5273 const unsigned char *lwm = base;
5274 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5275 pos < limit;)
5276 {
5277 unsigned char c = *pos++;
5278 /* This matches the switch in _cpp_lex_direct. */
5279 switch (c)
5280 {
5281 case ' ': case '\t': case '\f': case '\v':
5282 /* Whitespace, do nothing. */
5283 break;
5284
5285 case '\r': /* MAC line ending, or Windows \r\n */
5286 if (*pos == '\n')
5287 pos++;
5288 /* FALLTHROUGH */
5289
5290 case '\n':
5291 bol = true;
5292
5293 next_line:
5294 CPP_INCREMENT_LINE (pfile, 0);
5295 line_count++;
5296 line_start = pos;
5297 break;
5298
5299 case '\\':
5300 /* <backslash><newline> is removed, and doesn't undo any
5301 preceeding escape or whatnot. */
5302 if (*pos == '\n')
5303 {
5304 pos++;
5305 goto next_line;
5306 }
5307 else if (*pos == '\r')
5308 {
5309 if (pos[1] == '\n')
5310 pos++;
5311 pos++;
5312 goto next_line;
5313 }
5314 goto dflt;
5315
5316 case '#':
5317 if (bol)
5318 {
5319 /* Line directive. */
5320 if (pos - 1 > base && !pfile->state.skipping)
5321 cb (pfile, CPP_DO_print, data,
5322 line_count, base, pos - 1 - base);
5323
5324 /* Prep things for directive handling. */
5325 buffer->next_line = pos;
5326 buffer->need_line = true;
2a0225e4
NS
5327 bool ok = _cpp_get_fresh_line (pfile);
5328 gcc_checking_assert (ok);
b224c376
NS
5329
5330 /* Ensure proper column numbering for generated
5331 error messages. */
5332 buffer->line_base -= pos - line_start;
5333
5334 _cpp_handle_directive (pfile, line_start + 1 != pos);
5335
5336 /* Sanitize the line settings. Duplicate #include's can
5337 mess things up. */
5338 // FIXME: Necessary?
5339 pfile->line_table->highest_location
5340 = pfile->line_table->highest_line;
5341
5342 if (!pfile->state.skipping
5343 && pfile->buffer->next_line < pfile->buffer->rlimit)
5344 cb (pfile, CPP_DO_location, data,
5345 pfile->line_table->highest_line);
5346
5347 goto restart;
5348 }
5349 goto dflt;
5350
5351 case '/':
5352 {
5353 const unsigned char *peek = do_peek_next (pos, limit);
5354 if (!(*peek == '/' || *peek == '*'))
5355 goto dflt;
5356
5357 /* Line or block comment */
5358 bool is_block = *peek == '*';
5359 bool star = false;
5360 bool esc = false;
5361 location_t sloc
5362 = linemap_position_for_column (pfile->line_table,
5363 pos - line_start);
5364
5365 while (pos < limit)
5366 {
5367 char c = *pos++;
5368 switch (c)
5369 {
5370 case '\\':
5371 esc = true;
5372 break;
5373
5374 case '\r':
5375 if (*pos == '\n')
5376 pos++;
5377 /* FALLTHROUGH */
5378
5379 case '\n':
5380 {
5381 CPP_INCREMENT_LINE (pfile, 0);
5382 line_count++;
5383 line_start = pos;
5384 if (!esc && !is_block)
5385 {
5386 bol = true;
5387 goto done_comment;
5388 }
5389 }
5390 if (!esc)
5391 star = false;
5392 esc = false;
5393 break;
5394
5395 case '*':
049f0efe 5396 if (pos > peek)
b224c376
NS
5397 star = is_block;
5398 esc = false;
5399 break;
5400
5401 case '/':
5402 if (star)
5403 goto done_comment;
5404 /* FALLTHROUGH */
5405
5406 default:
5407 star = false;
5408 esc = false;
5409 break;
5410 }
5411 }
d15a2d26
JJ
5412 if (pos < limit || is_block)
5413 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5414 "unterminated comment");
b224c376
NS
5415 done_comment:
5416 lwm = pos;
5417 break;
5418 }
5419
5420 case '\'':
5421 if (!CPP_OPTION (pfile, digit_separators))
5422 goto delimited_string;
5423
5424 /* Possibly a number punctuator. */
5425 if (!ISIDNUM (*do_peek_next (pos, limit)))
5426 goto delimited_string;
5427
5428 goto quote_peek;
5429
5430 case '\"':
5431 if (!CPP_OPTION (pfile, rliterals))
5432 goto delimited_string;
5433
5434 quote_peek:
5435 {
5436 /* For ' see if it's a number punctuator
5437 \.?<digit>(<digit>|<identifier-nondigit>
5438 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5439 /* For " see if it's a raw string
5440 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5441 because that could be 0e+R. */
5442 const unsigned char *peek = pos - 1;
5443 bool quote_first = c == '"';
5444 bool quote_eight = false;
5445 bool maybe_number_start = false;
5446 bool want_number = false;
5447
5448 while ((peek = do_peek_prev (peek, lwm)))
5449 {
5450 unsigned char p = *peek;
5451 if (quote_first)
5452 {
5453 if (!raw)
5454 {
5455 if (p != 'R')
5456 break;
5457 raw = true;
5458 continue;
5459 }
5460
5461 quote_first = false;
5462 if (p == 'L' || p == 'U' || p == 'u')
5463 ;
5464 else if (p == '8')
5465 quote_eight = true;
5466 else
5467 goto second_raw;
5468 }
5469 else if (quote_eight)
5470 {
5471 if (p != 'u')
5472 {
5473 raw = false;
5474 break;
5475 }
5476 quote_eight = false;
5477 }
5478 else if (c == '"')
5479 {
5480 second_raw:;
5481 if (!want_number && ISIDNUM (p))
5482 {
5483 raw = false;
5484 break;
5485 }
5486 }
5487
5488 if (ISDIGIT (p))
5489 maybe_number_start = true;
5490 else if (p == '.')
5491 want_number = true;
5492 else if (ISIDNUM (p))
5493 maybe_number_start = false;
5494 else if (p == '+' || p == '-')
5495 {
5496 if (const unsigned char *peek_prev
5497 = do_peek_prev (peek, lwm))
5498 {
5499 p = *peek_prev;
5500 if (p == 'e' || p == 'E'
5501 || p == 'p' || p == 'P')
5502 {
5503 want_number = true;
5504 maybe_number_start = false;
5505 }
5506 else
5507 break;
5508 }
5509 else
5510 break;
5511 }
5512 else if (p == '\'' || p == '\"')
5513 {
5514 /* If this is lwm, this must be the end of a
5515 previous string. So this is a trailing
5516 literal type, (a) if those are allowed,
5517 and (b) maybe_start is false. Otherwise
5518 this must be a CPP_NUMBER because we've
5519 met another ', and we'd have checked that
5520 in its own right. */
5521 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5522 {
5523 if (!maybe_number_start && !want_number)
5524 /* Must be a literal type. */
5525 raw = false;
5526 }
5527 else if (p == '\''
5528 && CPP_OPTION (pfile, digit_separators))
5529 maybe_number_start = true;
5530 break;
5531 }
5532 else if (c == '\'')
5533 break;
5534 else if (!quote_first && !quote_eight)
5535 break;
5536 }
5537
5538 if (maybe_number_start)
5539 {
5540 if (c == '\'')
5541 /* A CPP NUMBER. */
5542 goto dflt;
5543 raw = false;
5544 }
5545
5546 goto delimited_string;
5547 }
5548
5549 delimited_string:
5550 {
5551 /* (Possibly raw) string or char literal. */
5552 unsigned char end = c;
5553 int delim_len = -1;
5554 const unsigned char *delim = NULL;
5555 location_t sloc = linemap_position_for_column (pfile->line_table,
5556 pos - line_start);
5557 int esc = 0;
5558
5559 if (raw)
5560 {
5561 /* There can be no line breaks in the delimiter. */
5562 delim = pos;
5563 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5564 {
5565 if (delim_len == 16)
5566 {
5567 cpp_error_with_line (pfile, CPP_DL_ERROR,
5568 sloc, 0,
5569 "raw string delimiter"
5570 " longer than %d"
5571 " characters",
5572 delim_len);
5573 raw = false;
5574 pos = delim;
5575 break;
5576 }
5577 if (strchr (") \\\t\v\f\n", c))
5578 {
5579 cpp_error_with_line (pfile, CPP_DL_ERROR,
5580 sloc, 0,
5581 "invalid character '%c'"
5582 " in raw string"
5583 " delimiter", c);
5584 raw = false;
5585 pos = delim;
5586 break;
5587 }
5588 if (pos >= limit)
5589 goto bad_string;
5590 }
5591 }
5592
5593 while (pos < limit)
5594 {
5595 char c = *pos++;
5596 switch (c)
5597 {
5598 case '\\':
5599 if (!raw)
5600 esc++;
5601 break;
5602
5603 case '\r':
5604 if (*pos == '\n')
5605 pos++;
5606 /* FALLTHROUGH */
5607
5608 case '\n':
5609 {
5610 CPP_INCREMENT_LINE (pfile, 0);
5611 line_count++;
5612 line_start = pos;
5613 }
5614 if (esc)
5615 esc--;
5616 break;
5617
5618 case ')':
5619 if (raw
5620 && pos + delim_len + 1 < limit
5621 && pos[delim_len] == end
5622 && !memcmp (delim, pos, delim_len))
5623 {
5624 pos += delim_len + 1;
5625 raw = false;
5626 goto done_string;
5627 }
5628 break;
5629
5630 default:
5631 if (!raw && !(esc & 1) && c == end)
5632 goto done_string;
5633 esc = 0;
5634 break;
5635 }
5636 }
5637 bad_string:
5638 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5639 "unterminated literal");
5640
5641 done_string:
5642 raw = false;
5643 lwm = pos - 1;
5644 }
5645 goto dflt;
5646
c9c3d5f2
NS
5647 case '_':
5648 case 'e':
5649 case 'i':
5650 case 'm':
5651 if (bol && module_p && !pfile->state.skipping
5652 && do_peek_module (pfile, c, pos, limit))
5653 {
5654 /* We've seen the start of a module control line.
5655 Start up the tokenizer. */
5656 pos--; /* Backup over the first character. */
5657
5658 /* Backup over whitespace to start of line. */
5659 while (pos > line_start
5660 && (pos[-1] == ' ' || pos[-1] == '\t'))
5661 pos--;
5662
5663 if (pos > base)
5664 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5665
5666 /* Prep things for directive handling. */
5667 buffer->next_line = pos;
5668 buffer->need_line = true;
5669
5670 /* Now get tokens until the PRAGMA_EOL. */
5671 do
5672 {
5673 location_t spelling;
5674 const cpp_token *tok
5675 = cpp_get_token_with_location (pfile, &spelling);
5676
5677 gcc_assert (pfile->state.in_deferred_pragma
5678 || tok->type == CPP_PRAGMA_EOL);
5679 cb (pfile, CPP_DO_token, data, tok, spelling);
5680 }
5681 while (pfile->state.in_deferred_pragma);
5682
5683 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5684 cb (pfile, CPP_DO_location, data,
5685 pfile->line_table->highest_line);
5686
5687 pfile->mi_valid = false;
5688 goto restart;
5689 }
5690 goto dflt;
5691
b224c376
NS
5692 default:
5693 dflt:
5694 bol = false;
5695 pfile->mi_valid = false;
5696 break;
5697 }
5698 }
5699
5700 if (buffer->rlimit > base && !pfile->state.skipping)
c6b664e2
JJ
5701 {
5702 const unsigned char *limit = buffer->rlimit;
5703 /* If the file was not newline terminated, add rlimit, which is
5704 guaranteed to point to a newline, to the end of our range. */
5705 if (limit[-1] != '\n')
5706 {
5707 limit++;
5708 CPP_INCREMENT_LINE (pfile, 0);
5709 line_count++;
5710 }
5711 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5712 }
b224c376
NS
5713
5714 _cpp_pop_buffer (pfile);
5715 }
5716 while (pfile->buffer);
5717}