]> git.ipfire.org Git - thirdparty/gcc.git/blame - libcpp/lex.cc
Handle failure to determine pointer provenance conservatively [PR104069].
[thirdparty/gcc.git] / libcpp / lex.cc
CommitLineData
45b966db 1/* CPP Library - lexical analysis.
7adcbafe 2 Copyright (C) 2000-2022 Free Software Foundation, Inc.
45b966db
ZW
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8This program is free software; you can redistribute it and/or modify it
9under the terms of the GNU General Public License as published by the
748086b7 10Free Software Foundation; either version 3, or (at your option) any
45b966db
ZW
11later version.
12
13This program is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
17
18You should have received a copy of the GNU General Public License
748086b7
JJ
19along with this program; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
45b966db
ZW
21
22#include "config.h"
23#include "system.h"
45b966db 24#include "cpplib.h"
4f4e53dd 25#include "internal.h"
45b966db 26
/* Category of a token type's spelling.  One of these values is stored
   for every token type in the token_spellings table.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT    = 1,
  SPELL_LITERAL  = 2,
  SPELL_NONE     = 3
};
34
93c80368 35struct token_spelling
f9a0e96c 36{
93c80368
NB
37 enum spell_type category;
38 const unsigned char *name;
f9a0e96c
ZW
39};
40
8206c799 41static const unsigned char *const digraph_spellings[] =
b6baa67d 42{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
93c80368 43
b6baa67d
KVH
44#define OP(e, s) { SPELL_OPERATOR, UC s },
45#define TK(e, s) { SPELL_ ## s, UC #e },
8206c799 46static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
93c80368
NB
47#undef OP
48#undef TK
49
50#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
f2d5f0cc 52
6cf87ca4
ZW
53static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54static int skip_line_comment (cpp_reader *);
55static void skip_whitespace (cpp_reader *, cppchar_t);
6cf87ca4
ZW
56static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
631d0d36 58static void store_comment (cpp_reader *, cpp_token *);
6cf87ca4
ZW
59static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 unsigned int, enum cpp_ttype);
61static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62static int name_p (cpp_reader *, const cpp_string *);
6cf87ca4
ZW
63static tokenrun *next_tokenrun (tokenrun *);
64
6cf87ca4 65static _cpp_buff *new_buff (size_t);
15dad1d9 66
9d10c9a9 67
041c3194 68/* Utility routine:
9e62c811 69
bfb9dc7f
ZW
70 Compares, the token TOKEN to the NUL-terminated string STRING.
71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
041c3194 72int
6cf87ca4 73cpp_ideq (const cpp_token *token, const char *string)
041c3194 74{
bfb9dc7f 75 if (token->type != CPP_NAME)
041c3194 76 return 0;
bfb9dc7f 77
9a0c6187 78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
15dad1d9 79}
1368ee70 80
26aea073
NB
81/* Record a note TYPE at byte POS into the current cleaned logical
82 line. */
87062813 83static void
6cf87ca4 84add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
0d9f234d 85{
26aea073
NB
86 if (buffer->notes_used == buffer->notes_cap)
87 {
88 buffer->notes_cap = buffer->notes_cap * 2 + 200;
c3f829c1
GDR
89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90 buffer->notes_cap);
26aea073 91 }
0d9f234d 92
26aea073
NB
93 buffer->notes[buffer->notes_used].pos = pos;
94 buffer->notes[buffer->notes_used].type = type;
95 buffer->notes_used++;
0d9f234d
NB
96}
97
246a2fcb
RH
98\f
99/* Fast path to find line special characters using optimized character
100 scanning algorithms. Anything complicated falls back to the slow
101 path below. Since this loop is very hot it's worth doing these kinds
102 of optimizations.
103
104 One of the paths through the ifdefs should provide
105
106 const uchar *search_line_fast (const uchar *s, const uchar *end);
107
108 Between S and END, search for \n, \r, \\, ?. Return a pointer to
109 the found character.
110
111 Note that the last character of the buffer is *always* a newline,
112 as forced by _cpp_convert_input. This fact can be used to avoid
113 explicitly looking for the end of the buffer. */
114
115/* Configure gives us an ifdef test. */
116#ifndef WORDS_BIGENDIAN
117#define WORDS_BIGENDIAN 0
118#endif
119
/* WORD_TYPE should be the widest integer that fits into a register.
   <stdint.h> offers no such type.  unsigned long is right for most
   hosts, but not for LLP64 ones, so when building with GCC ask the
   compiler for the real word mode instead.  */
#if defined (__GNUC__)
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The word-at-a-time code only handles 4- and 8-byte words.  Any other
   size yields a negative array bound here and so fails to compile.  */
typedef char check_word_type_size
  [(sizeof (word_type) == 4 || sizeof (word_type) == 8) ? 1 : -1];
134
135/* Return X with the first N bytes forced to values that won't match one
136 of the interesting characters. Note that NUL is not interesting. */
137
138static inline word_type
139acc_char_mask_misalign (word_type val, unsigned int n)
140{
141 word_type mask = -1;
142 if (WORDS_BIGENDIAN)
143 mask >>= n * 8;
144 else
145 mask <<= n * 8;
146 return val & mask;
147}
148
149/* Return X replicated to all byte positions within WORD_TYPE. */
150
151static inline word_type
152acc_char_replicate (uchar x)
153{
154 word_type ret;
155
156 ret = (x << 24) | (x << 16) | (x << 8) | x;
157 if (sizeof(word_type) == 8)
158 ret = (ret << 16 << 16) | ret;
159 return ret;
160}
161
162/* Return non-zero if some byte of VAL is (probably) C. */
163
164static inline word_type
165acc_char_cmp (word_type val, word_type c)
166{
167#if defined(__GNUC__) && defined(__alpha__)
168 /* We can get exact results using a compare-bytes instruction.
169 Get (val == c) via (0 >= (val ^ c)). */
170 return __builtin_alpha_cmpbge (0, val ^ c);
171#else
172 word_type magic = 0x7efefefeU;
173 if (sizeof(word_type) == 8)
174 magic = (magic << 16 << 16) | 0xfefefefeU;
175 magic |= 1;
176
177 val ^= c;
178 return ((val + magic) ^ ~val) & ~magic;
179#endif
180}
181
182/* Given the result of acc_char_cmp is non-zero, return the index of
183 the found character. If this was a false positive, return -1. */
184
185static inline int
186acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 word_type val ATTRIBUTE_UNUSED)
188{
189#if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190 /* The cmpbge instruction sets *bits* of the result corresponding to
191 matches in the bytes with no false positives. */
192 return __builtin_ctzl (cmp);
193#else
194 unsigned int i;
195
196 /* ??? It would be nice to force unrolling here,
197 and have all of these constants folded. */
198 for (i = 0; i < sizeof(word_type); ++i)
199 {
200 uchar c;
201 if (WORDS_BIGENDIAN)
202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203 else
204 c = (val >> i * 8) & 0xff;
205
206 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 return i;
208 }
209
210 return -1;
211#endif
212}
213
214/* A version of the fast scanner using bit fiddling techniques.
215
216 For 32-bit words, one would normally perform 16 comparisons and
217 16 branches. With this algorithm one performs 24 arithmetic
218 operations and one branch. Whether this is faster with a 32-bit
219 word size is going to be somewhat system dependent.
220
221 For 64-bit words, we eliminate twice the number of comparisons
222 and branches without increasing the number of arithmetic operations.
223 It's almost certainly going to be a win with 64-bit word size. */
224
225static const uchar * search_line_acc_char (const uchar *, const uchar *)
226 ATTRIBUTE_UNUSED;
227
228static const uchar *
229search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230{
231 const word_type repl_nl = acc_char_replicate ('\n');
232 const word_type repl_cr = acc_char_replicate ('\r');
233 const word_type repl_bs = acc_char_replicate ('\\');
234 const word_type repl_qm = acc_char_replicate ('?');
235
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
239
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, misalign);
246
247 /* Main loop. */
248 while (1)
249 {
250 t = acc_char_cmp (val, repl_nl);
251 t |= acc_char_cmp (val, repl_cr);
252 t |= acc_char_cmp (val, repl_bs);
253 t |= acc_char_cmp (val, repl_qm);
254
255 if (__builtin_expect (t != 0, 0))
256 {
257 int i = acc_char_index (t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
260 }
261
262 val = *++p;
263 }
264}
265
d9f069ab 266/* Disable on Solaris 2/x86 until the following problem can be properly
789d73cb
RO
267 autoconfed:
268
789d73cb
RO
269 The Solaris 10+ assembler tags objects with the instruction set
270 extensions used, so SSE4.2 executables cannot run on machines that
271 don't support that extension. */
272
1b6b13f3 273#if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
246a2fcb
RH
274
/* Each interesting character replicated 16 times, shared by the vector
   implementations.  Compatible vector types cannot be defined outside
   of a context with vector support, so the data is kept as raw,
   16-byte aligned characters.  Row order: '\n', '\r', '\\', '?'.  */
#define REPL16(c) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c }
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  REPL16 ('\n'),
  REPL16 ('\r'),
  REPL16 ('\\'),
  REPL16 ('?'),
};
#undef REPL16
289
290/* A version of the fast scanner using MMX vectorized byte compare insns.
291
292 This uses the PMOVMSKB instruction which was introduced with "MMX2",
ef230b38 293 which was packaged into SSE1; it is also present in the AMD MMX
246a2fcb
RH
294 extension. Mark the function as using "sse" so that we emit a real
295 "emms" instruction, rather than the 3dNOW "femms" instruction. */
296
297static const uchar *
298#ifndef __SSE__
299__attribute__((__target__("sse")))
300#endif
301search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
302{
303 typedef char v8qi __attribute__ ((__vector_size__ (8)));
304 typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
305
306 const v8qi repl_nl = *(const v8qi *)repl_chars[0];
307 const v8qi repl_cr = *(const v8qi *)repl_chars[1];
308 const v8qi repl_bs = *(const v8qi *)repl_chars[2];
309 const v8qi repl_qm = *(const v8qi *)repl_chars[3];
310
311 unsigned int misalign, found, mask;
312 const v8qi *p;
313 v8qi data, t, c;
314
315 /* Align the source pointer. While MMX doesn't generate unaligned data
316 faults, this allows us to safely scan to the end of the buffer without
317 reading beyond the end of the last page. */
318 misalign = (uintptr_t)s & 7;
319 p = (const v8qi *)((uintptr_t)s & -8);
320 data = *p;
321
322 /* Create a mask for the bytes that are valid within the first
323 16-byte block. The Idea here is that the AND with the mask
324 within the loop is "free", since we need some AND or TEST
325 insn in order to set the flags for the branch anyway. */
326 mask = -1u << misalign;
327
328 /* Main loop processing 8 bytes at a time. */
329 goto start;
330 do
331 {
332 data = *++p;
333 mask = -1;
334
335 start:
336 t = __builtin_ia32_pcmpeqb(data, repl_nl);
337 c = __builtin_ia32_pcmpeqb(data, repl_cr);
338 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
339 c = __builtin_ia32_pcmpeqb(data, repl_bs);
340 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341 c = __builtin_ia32_pcmpeqb(data, repl_qm);
342 t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343 found = __builtin_ia32_pmovmskb (t);
344 found &= mask;
345 }
346 while (!found);
347
348 __builtin_ia32_emms ();
349
350 /* FOUND contains 1 in bits for which we matched a relevant
351 character. Conversion to the byte index is trivial. */
352 found = __builtin_ctz(found);
353 return (const uchar *)p + found;
354}
355
356/* A version of the fast scanner using SSE2 vectorized byte compare insns. */
357
358static const uchar *
359#ifndef __SSE2__
360__attribute__((__target__("sse2")))
361#endif
362search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
363{
364 typedef char v16qi __attribute__ ((__vector_size__ (16)));
365
366 const v16qi repl_nl = *(const v16qi *)repl_chars[0];
367 const v16qi repl_cr = *(const v16qi *)repl_chars[1];
368 const v16qi repl_bs = *(const v16qi *)repl_chars[2];
369 const v16qi repl_qm = *(const v16qi *)repl_chars[3];
370
371 unsigned int misalign, found, mask;
372 const v16qi *p;
373 v16qi data, t;
374
375 /* Align the source pointer. */
376 misalign = (uintptr_t)s & 15;
377 p = (const v16qi *)((uintptr_t)s & -16);
378 data = *p;
379
380 /* Create a mask for the bytes that are valid within the first
381 16-byte block. The Idea here is that the AND with the mask
382 within the loop is "free", since we need some AND or TEST
383 insn in order to set the flags for the branch anyway. */
384 mask = -1u << misalign;
385
386 /* Main loop processing 16 bytes at a time. */
387 goto start;
388 do
389 {
390 data = *++p;
391 mask = -1;
392
393 start:
530b1d68 394 t = data == repl_nl;
395 t |= data == repl_cr;
396 t |= data == repl_bs;
397 t |= data == repl_qm;
246a2fcb
RH
398 found = __builtin_ia32_pmovmskb128 (t);
399 found &= mask;
400 }
401 while (!found);
402
403 /* FOUND contains 1 in bits for which we matched a relevant
404 character. Conversion to the byte index is trivial. */
405 found = __builtin_ctz(found);
406 return (const uchar *)p + found;
407}
408
6f173e52 409#ifdef HAVE_SSE4
246a2fcb
RH
410/* A version of the fast scanner using SSE 4.2 vectorized string insns. */
411
412static const uchar *
413#ifndef __SSE4_2__
414__attribute__((__target__("sse4.2")))
415#endif
416search_line_sse42 (const uchar *s, const uchar *end)
417{
418 typedef char v16qi __attribute__ ((__vector_size__ (16)));
419 static const v16qi search = { '\n', '\r', '?', '\\' };
420
421 uintptr_t si = (uintptr_t)s;
422 uintptr_t index;
423
424 /* Check for unaligned input. */
425 if (si & 15)
426 {
d35d1c0f
UB
427 v16qi sv;
428
246a2fcb
RH
429 if (__builtin_expect (end - s < 16, 0)
430 && __builtin_expect ((si & 0xfff) > 0xff0, 0))
431 {
432 /* There are less than 16 bytes left in the buffer, and less
433 than 16 bytes left on the page. Reading 16 bytes at this
434 point might generate a spurious page fault. Defer to the
435 SSE2 implementation, which already handles alignment. */
436 return search_line_sse2 (s, end);
437 }
438
439 /* ??? The builtin doesn't understand that the PCMPESTRI read from
440 memory need not be aligned. */
d35d1c0f
UB
441 sv = __builtin_ia32_loaddqu ((const char *) s);
442 index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
443
246a2fcb
RH
444 if (__builtin_expect (index < 16, 0))
445 goto found;
446
447 /* Advance the pointer to an aligned address. We will re-scan a
448 few bytes, but we no longer need care for reading past the
449 end of a page, since we're guaranteed a match. */
49445904 450 s = (const uchar *)((si + 15) & -16);
246a2fcb
RH
451 }
452
dc6bcf52
UB
453 /* Main loop, processing 16 bytes at a time. */
454#ifdef __GCC_ASM_FLAG_OUTPUTS__
455 while (1)
456 {
457 char f;
458
459 /* By using inline assembly instead of the builtin,
460 we can use the result, as well as the flags set. */
461 __asm ("%vpcmpestri\t$0, %2, %3"
462 : "=c"(index), "=@ccc"(f)
463 : "m"(*s), "x"(search), "a"(4), "d"(16));
464 if (f)
465 break;
466
467 s += 16;
468 }
469#else
470 s -= 16;
471 /* By doing the whole loop in inline assembly,
472 we can make proper use of the flags set. */
473 __asm ( ".balign 16\n"
246a2fcb 474 "0: add $16, %1\n"
dc6bcf52 475 " %vpcmpestri\t$0, (%1), %2\n"
246a2fcb
RH
476 " jnc 0b"
477 : "=&c"(index), "+r"(s)
478 : "x"(search), "a"(4), "d"(16));
dc6bcf52 479#endif
246a2fcb
RH
480
481 found:
482 return s + index;
483}
484
6f173e52
RH
485#else
486/* Work around out-dated assemblers without sse4 support. */
487#define search_line_sse42 search_line_sse2
488#endif
489
246a2fcb
RH
490/* Check the CPU capabilities. */
491
492#include "../gcc/config/i386/cpuid.h"
493
494typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
495static search_line_fast_type search_line_fast;
496
b0c084b7
JJ
497#define HAVE_init_vectorized_lexer 1
498static inline void
246a2fcb
RH
499init_vectorized_lexer (void)
500{
501 unsigned dummy, ecx = 0, edx = 0;
502 search_line_fast_type impl = search_line_acc_char;
503 int minimum = 0;
504
505#if defined(__SSE4_2__)
506 minimum = 3;
507#elif defined(__SSE2__)
508 minimum = 2;
ef230b38 509#elif defined(__SSE__)
246a2fcb
RH
510 minimum = 1;
511#endif
512
513 if (minimum == 3)
514 impl = search_line_sse42;
515 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
516 {
517 if (minimum == 3 || (ecx & bit_SSE4_2))
518 impl = search_line_sse42;
519 else if (minimum == 2 || (edx & bit_SSE2))
520 impl = search_line_sse2;
521 else if (minimum == 1 || (edx & bit_SSE))
522 impl = search_line_mmx;
523 }
524 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
525 {
5e70c0b5
UB
526 if (minimum == 1
527 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
246a2fcb
RH
528 impl = search_line_mmx;
529 }
530
531 search_line_fast = impl;
532}
533
d00b1b02 534#elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
246a2fcb 535
0ccaaab0
BS
536/* A vection of the fast scanner using AltiVec vectorized byte compares
537 and VSX unaligned loads (when VSX is available). This is otherwise
d00b1b02 538 the same as the AltiVec version. */
0ccaaab0 539
44d95244 540ATTRIBUTE_NO_SANITIZE_UNDEFINED
0ccaaab0
BS
541static const uchar *
542search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
543{
544 typedef __attribute__((altivec(vector))) unsigned char vc;
545
546 const vc repl_nl = {
547 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
548 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
549 };
550 const vc repl_cr = {
551 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
552 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
553 };
554 const vc repl_bs = {
555 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
556 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
557 };
558 const vc repl_qm = {
559 '?', '?', '?', '?', '?', '?', '?', '?',
560 '?', '?', '?', '?', '?', '?', '?', '?',
561 };
562 const vc zero = { 0 };
563
564 vc data, t;
565
566 /* Main loop processing 16 bytes at a time. */
567 do
568 {
569 vc m_nl, m_cr, m_bs, m_qm;
570
a3a821c9 571 data = __builtin_vec_vsx_ld (0, s);
0ccaaab0
BS
572 s += 16;
573
574 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
575 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
576 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
577 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
578 t = (m_nl | m_cr) | (m_bs | m_qm);
579
580 /* T now contains 0xff in bytes for which we matched one of the relevant
581 characters. We want to exit the loop if any byte in T is non-zero.
582 Below is the expansion of vec_any_ne(t, zero). */
583 }
584 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
585
586 /* Restore s to to point to the 16 bytes we just processed. */
587 s -= 16;
588
589 {
590#define N (sizeof(vc) / sizeof(long))
591
592 union {
593 vc v;
594 /* Statically assert that N is 2 or 4. */
595 unsigned long l[(N == 2 || N == 4) ? N : -1];
596 } u;
597 unsigned long l, i = 0;
598
599 u.v = t;
600
601 /* Find the first word of T that is non-zero. */
602 switch (N)
603 {
604 case 4:
605 l = u.l[i++];
606 if (l != 0)
607 break;
608 s += sizeof(unsigned long);
609 l = u.l[i++];
610 if (l != 0)
611 break;
612 s += sizeof(unsigned long);
191816a3 613 /* FALLTHRU */
0ccaaab0
BS
614 case 2:
615 l = u.l[i++];
616 if (l != 0)
617 break;
618 s += sizeof(unsigned long);
619 l = u.l[i];
620 }
621
622 /* L now contains 0xff in bytes for which we matched one of the
623 relevant characters. We can find the byte index by finding
624 its bit index and dividing by 8. */
625#ifdef __BIG_ENDIAN__
626 l = __builtin_clzl(l) >> 3;
627#else
628 l = __builtin_ctzl(l) >> 3;
629#endif
630 return s + l;
631
632#undef N
633 }
634}
635
636#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
637
638/* A vection of the fast scanner using AltiVec vectorized byte compares.
639 This cannot be used for little endian because vec_lvsl/lvsr are
640 deprecated for little endian and the code won't work properly. */
246a2fcb
RH
641/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
642 so we can't compile this function without -maltivec on the command line
643 (or implied by some other switch). */
644
645static const uchar *
646search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
647{
648 typedef __attribute__((altivec(vector))) unsigned char vc;
649
650 const vc repl_nl = {
651 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
652 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
653 };
654 const vc repl_cr = {
655 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
656 '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
657 };
658 const vc repl_bs = {
659 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
660 '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
661 };
662 const vc repl_qm = {
663 '?', '?', '?', '?', '?', '?', '?', '?',
664 '?', '?', '?', '?', '?', '?', '?', '?',
665 };
666 const vc ones = {
667 -1, -1, -1, -1, -1, -1, -1, -1,
668 -1, -1, -1, -1, -1, -1, -1, -1,
669 };
670 const vc zero = { 0 };
671
672 vc data, mask, t;
673
674 /* Altivec loads automatically mask addresses with -16. This lets us
675 issue the first load as early as possible. */
676 data = __builtin_vec_ld(0, (const vc *)s);
677
678 /* Discard bytes before the beginning of the buffer. Do this by
679 beginning with all ones and shifting in zeros according to the
680 mis-alignment. The LVSR instruction pulls the exact shift we
681 want from the address. */
682 mask = __builtin_vec_lvsr(0, s);
683 mask = __builtin_vec_perm(zero, ones, mask);
684 data &= mask;
685
686 /* While altivec loads mask addresses, we still need to align S so
687 that the offset we compute at the end is correct. */
688 s = (const uchar *)((uintptr_t)s & -16);
689
690 /* Main loop processing 16 bytes at a time. */
691 goto start;
692 do
693 {
694 vc m_nl, m_cr, m_bs, m_qm;
695
696 s += 16;
697 data = __builtin_vec_ld(0, (const vc *)s);
698
699 start:
700 m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
701 m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
702 m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
703 m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
704 t = (m_nl | m_cr) | (m_bs | m_qm);
705
706 /* T now contains 0xff in bytes for which we matched one of the relevant
707 characters. We want to exit the loop if any byte in T is non-zero.
708 Below is the expansion of vec_any_ne(t, zero). */
709 }
710 while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
711
712 {
713#define N (sizeof(vc) / sizeof(long))
714
246a2fcb
RH
715 union {
716 vc v;
53a103d3
DS
717 /* Statically assert that N is 2 or 4. */
718 unsigned long l[(N == 2 || N == 4) ? N : -1];
246a2fcb
RH
719 } u;
720 unsigned long l, i = 0;
721
722 u.v = t;
723
724 /* Find the first word of T that is non-zero. */
725 switch (N)
726 {
727 case 4:
728 l = u.l[i++];
729 if (l != 0)
730 break;
731 s += sizeof(unsigned long);
732 l = u.l[i++];
733 if (l != 0)
734 break;
735 s += sizeof(unsigned long);
67ef83c6 736 /* FALLTHROUGH */
246a2fcb
RH
737 case 2:
738 l = u.l[i++];
739 if (l != 0)
740 break;
741 s += sizeof(unsigned long);
742 l = u.l[i];
743 }
744
745 /* L now contains 0xff in bytes for which we matched one of the
746 relevant characters. We can find the byte index by finding
747 its bit index and dividing by 8. */
748 l = __builtin_clzl(l) >> 3;
749 return s + l;
750
751#undef N
752 }
753}
754
a6ac871c
RE
755#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
756#include "arm_neon.h"
757
758/* This doesn't have to be the exact page size, but no system may use
759 a size smaller than this. ARMv8 requires a minimum page size of
760 4k. The impact of being conservative here is a small number of
761 cases will take the slightly slower entry path into the main
762 loop. */
763
764#define AARCH64_MIN_PAGE_SIZE 4096
765
766static const uchar *
767search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
768{
769 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
770 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
771 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
772 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
773 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
774
35c4515b 775#ifdef __ARM_BIG_ENDIAN
a6ac871c
RE
776 const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
777#else
778 const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
779#endif
780
781 unsigned int found;
782 const uint8_t *p;
783 uint8x16_t data;
784 uint8x16_t t;
785 uint16x8_t m;
786 uint8x16_t u, v, w;
787
788 /* Align the source pointer. */
789 p = (const uint8_t *)((uintptr_t)s & -16);
790
791 /* Assuming random string start positions, with a 4k page size we'll take
792 the slow path about 0.37% of the time. */
793 if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
794 - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
795 < 16, 0))
796 {
797 /* Slow path: the string starts near a possible page boundary. */
798 uint32_t misalign, mask;
799
800 misalign = (uintptr_t)s & 15;
801 mask = (-1u << misalign) & 0xffff;
802 data = vld1q_u8 (p);
803 t = vceqq_u8 (data, repl_nl);
804 u = vceqq_u8 (data, repl_cr);
805 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
806 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
807 t = vorrq_u8 (v, w);
808 t = vandq_u8 (t, xmask);
809 m = vpaddlq_u8 (t);
810 m = vshlq_u16 (m, shift);
811 found = vaddvq_u16 (m);
812 found &= mask;
813 if (found)
814 return (const uchar*)p + __builtin_ctz (found);
815 }
816 else
817 {
818 data = vld1q_u8 ((const uint8_t *) s);
819 t = vceqq_u8 (data, repl_nl);
820 u = vceqq_u8 (data, repl_cr);
821 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
822 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
823 t = vorrq_u8 (v, w);
8c00ae24 824 if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
a6ac871c
RE
825 goto done;
826 }
827
828 do
829 {
830 p += 16;
831 data = vld1q_u8 (p);
832 t = vceqq_u8 (data, repl_nl);
833 u = vceqq_u8 (data, repl_cr);
834 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
835 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
836 t = vorrq_u8 (v, w);
837 } while (!vpaddd_u64 ((uint64x2_t)t));
838
839done:
840 /* Now that we've found the terminating substring, work out precisely where
841 we need to stop. */
842 t = vandq_u8 (t, xmask);
843 m = vpaddlq_u8 (t);
844 m = vshlq_u16 (m, shift);
845 found = vaddvq_u16 (m);
846 return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
847 + __builtin_ctz (found));
848}
849
95d0610c 850#elif defined (__ARM_NEON)
e75b54a2
RE
851#include "arm_neon.h"
852
853static const uchar *
854search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
855{
856 const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
857 const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
858 const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
859 const uint8x16_t repl_qm = vdupq_n_u8 ('?');
860 const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
861
862 unsigned int misalign, found, mask;
863 const uint8_t *p;
864 uint8x16_t data;
865
866 /* Align the source pointer. */
867 misalign = (uintptr_t)s & 15;
868 p = (const uint8_t *)((uintptr_t)s & -16);
869 data = vld1q_u8 (p);
870
871 /* Create a mask for the bytes that are valid within the first
872 16-byte block. The Idea here is that the AND with the mask
873 within the loop is "free", since we need some AND or TEST
874 insn in order to set the flags for the branch anyway. */
875 mask = (-1u << misalign) & 0xffff;
876
877 /* Main loop, processing 16 bytes at a time. */
878 goto start;
879
880 do
881 {
882 uint8x8_t l;
883 uint16x4_t m;
884 uint32x2_t n;
885 uint8x16_t t, u, v, w;
886
887 p += 16;
888 data = vld1q_u8 (p);
889 mask = 0xffff;
890
891 start:
892 t = vceqq_u8 (data, repl_nl);
893 u = vceqq_u8 (data, repl_cr);
894 v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
895 w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
896 t = vandq_u8 (vorrq_u8 (v, w), xmask);
897 l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
898 m = vpaddl_u8 (l);
899 n = vpaddl_u16 (m);
900
901 found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
902 vshr_n_u64 ((uint64x1_t) n, 24)), 0);
903 found &= mask;
904 }
905 while (!found);
906
907 /* FOUND contains 1 in bits for which we matched a relevant
908 character. Conversion to the byte index is trivial. */
909 found = __builtin_ctz (found);
910 return (const uchar *)p + found;
911}
912
246a2fcb
RH
913#else
914
5764ee3c 915/* We only have one accelerated alternative. Use a direct call so that
246a2fcb
RH
916 we encourage inlining. */
917
918#define search_line_fast search_line_acc_char
919
920#endif
921
b0c084b7
JJ
/* Initialize the lexer if needed.  This does something only in
   configurations that define HAVE_init_vectorized_lexer, where it runs
   init_vectorized_lexer.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
931
26aea073
NB
932/* Returns with a logical line that contains no escaped newlines or
933 trigraphs. This is a time-critical inner loop. */
934void
6cf87ca4 935_cpp_clean_line (cpp_reader *pfile)
45b966db 936{
26aea073
NB
937 cpp_buffer *buffer;
938 const uchar *s;
939 uchar c, *d, *p;
87062813 940
26aea073
NB
941 buffer = pfile->buffer;
942 buffer->cur_note = buffer->notes_used = 0;
943 buffer->cur = buffer->line_base = buffer->next_line;
944 buffer->need_line = false;
246a2fcb 945 s = buffer->next_line;
87062813 946
26aea073 947 if (!buffer->from_stage3)
45b966db 948 {
7af45bd4
ILT
949 const uchar *pbackslash = NULL;
950
246a2fcb 951 /* Fast path. This is the common case of an un-escaped line with
d08dcf87
ZW
952 no trigraphs. The primary win here is by not writing any
953 data back to memory until we have to. */
246a2fcb 954 while (1)
d08dcf87 955 {
246a2fcb
RH
956 /* Perform an optimized search for \n, \r, \\, ?. */
957 s = search_line_fast (s, buffer->rlimit);
d08dcf87 958
246a2fcb
RH
959 c = *s;
960 if (c == '\\')
961 {
962 /* Record the location of the backslash and continue. */
963 pbackslash = s++;
d08dcf87 964 }
246a2fcb 965 else if (__builtin_expect (c == '?', 0))
d08dcf87 966 {
246a2fcb
RH
967 if (__builtin_expect (s[1] == '?', false)
968 && _cpp_trigraph_map[s[2]])
d08dcf87 969 {
246a2fcb
RH
970 /* Have a trigraph. We may or may not have to convert
971 it. Add a line note regardless, for -Wtrigraphs. */
972 add_line_note (buffer, s, s[2]);
973 if (CPP_OPTION (pfile, trigraphs))
974 {
975 /* We do, and that means we have to switch to the
976 slow path. */
977 d = (uchar *) s;
978 *d = _cpp_trigraph_map[s[2]];
979 s += 2;
980 goto slow_path;
981 }
d08dcf87 982 }
246a2fcb
RH
983 /* Not a trigraph. Continue on fast-path. */
984 s++;
d08dcf87 985 }
246a2fcb
RH
986 else
987 break;
d08dcf87
ZW
988 }
989
246a2fcb
RH
990 /* This must be \r or \n. We're either done, or we'll be forced
991 to write back to the buffer and continue on the slow path. */
992 d = (uchar *) s;
993
994 if (__builtin_expect (s == buffer->rlimit, false))
995 goto done;
996
997 /* DOS line ending? */
998 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
999 {
1000 s++;
1001 if (s == buffer->rlimit)
1002 goto done;
1003 }
1004
1005 if (__builtin_expect (pbackslash == NULL, true))
1006 goto done;
1007
1008 /* Check for escaped newline. */
1009 p = d;
1010 while (is_nvspace (p[-1]))
1011 p--;
1012 if (p - 1 != pbackslash)
1013 goto done;
1014
1015 /* Have an escaped newline; process it and proceed to
1016 the slow path. */
1017 add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018 d = p - 2;
1019 buffer->next_line = p - 1;
26aea073 1020
246a2fcb
RH
1021 slow_path:
1022 while (1)
4a5b68a2 1023 {
26aea073
NB
1024 c = *++s;
1025 *++d = c;
1026
1027 if (c == '\n' || c == '\r')
1028 {
246a2fcb 1029 /* Handle DOS line endings. */
26aea073
NB
1030 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031 s++;
1032 if (s == buffer->rlimit)
1033 break;
1034
1035 /* Escaped? */
1036 p = d;
1037 while (p != buffer->next_line && is_nvspace (p[-1]))
1038 p--;
1039 if (p == buffer->next_line || p[-1] != '\\')
1040 break;
1041
41c32c98 1042 add_line_note (buffer, p - 1, p != d ? ' ': '\\');
26aea073
NB
1043 d = p - 2;
1044 buffer->next_line = p - 1;
1045 }
1046 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047 {
1048 /* Add a note regardless, for the benefit of -Wtrigraphs. */
41c32c98 1049 add_line_note (buffer, d, s[2]);
26aea073
NB
1050 if (CPP_OPTION (pfile, trigraphs))
1051 {
1052 *d = _cpp_trigraph_map[s[2]];
1053 s += 2;
1054 }
1055 }
4a5b68a2 1056 }
45b966db 1057 }
26aea073
NB
1058 else
1059 {
246a2fcb 1060 while (*s != '\n' && *s != '\r')
26aea073 1061 s++;
26aea073
NB
1062 d = (uchar *) s;
1063
1064 /* Handle DOS line endings. */
082a7b23 1065 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
26aea073
NB
1066 s++;
1067 }
0d9f234d 1068
d08dcf87 1069 done:
26aea073 1070 *d = '\n';
41c32c98
NB
1071 /* A sentinel note that should never be processed. */
1072 add_line_note (buffer, d + 1, '\n');
26aea073 1073 buffer->next_line = s + 1;
45b966db
ZW
1074}
1075
a8eb6044
NB
1076/* Return true if the trigraph indicated by NOTE should be warned
1077 about in a comment. */
1078static bool
6cf87ca4 1079warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
a8eb6044
NB
1080{
1081 const uchar *p;
1082
1083 /* Within comments we don't warn about trigraphs, unless the
1084 trigraph forms an escaped newline, as that may change
6356f892 1085 behavior. */
a8eb6044
NB
1086 if (note->type != '/')
1087 return false;
1088
1089 /* If -trigraphs, then this was an escaped newline iff the next note
1090 is coincident. */
1091 if (CPP_OPTION (pfile, trigraphs))
1092 return note[1].pos == note->pos;
1093
1094 /* Otherwise, see if this forms an escaped newline. */
1095 p = note->pos + 3;
1096 while (is_nvspace (*p))
1097 p++;
1098
1099 /* There might have been escaped newlines between the trigraph and the
1100 newline we found. Hence the position test. */
1101 return (*p == '\n' && p < note[1].pos);
1102}
1103
26aea073
NB
1104/* Process the notes created by add_line_note as far as the current
1105 location. */
1106void
6cf87ca4 1107_cpp_process_line_notes (cpp_reader *pfile, int in_comment)
45b966db 1108{
29401c30
NB
1109 cpp_buffer *buffer = pfile->buffer;
1110
26aea073 1111 for (;;)
041c3194 1112 {
26aea073
NB
1113 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114 unsigned int col;
a5c3cccd 1115
26aea073
NB
1116 if (note->pos > buffer->cur)
1117 break;
a5c3cccd 1118
26aea073
NB
1119 buffer->cur_note++;
1120 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
4d6baafa 1121
41c32c98 1122 if (note->type == '\\' || note->type == ' ')
26aea073 1123 {
41c32c98 1124 if (note->type == ' ' && !in_comment)
500bee0a 1125 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
26aea073 1126 "backslash and newline separated by space");
41c32c98 1127
26aea073 1128 if (buffer->next_line > buffer->rlimit)
87062813 1129 {
500bee0a 1130 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
26aea073
NB
1131 "backslash-newline at end of file");
1132 /* Prevent "no newline at end of file" warning. */
1133 buffer->next_line = buffer->rlimit;
87062813 1134 }
26aea073
NB
1135
1136 buffer->line_base = note->pos;
12f9df4e 1137 CPP_INCREMENT_LINE (pfile, 0);
0d9f234d 1138 }
41c32c98
NB
1139 else if (_cpp_trigraph_map[note->type])
1140 {
a8eb6044
NB
1141 if (CPP_OPTION (pfile, warn_trigraphs)
1142 && (!in_comment || warn_in_comment (pfile, note)))
41c32c98
NB
1143 {
1144 if (CPP_OPTION (pfile, trigraphs))
87cf0651
SB
1145 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146 pfile->line_table->highest_line, col,
1147 "trigraph ??%c converted to %c",
1148 note->type,
1149 (int) _cpp_trigraph_map[note->type]);
41c32c98 1150 else
905bd7b5 1151 {
87cf0651
SB
1152 cpp_warning_with_line
1153 (pfile, CPP_W_TRIGRAPHS,
1154 pfile->line_table->highest_line, col,
905bd7b5
GK
1155 "trigraph ??%c ignored, use -trigraphs to enable",
1156 note->type);
1157 }
41c32c98
NB
1158 }
1159 }
00a81b8b
JM
1160 else if (note->type == 0)
1161 /* Already processed in lex_raw_string. */;
41c32c98
NB
1162 else
1163 abort ();
041c3194 1164 }
45b966db
ZW
1165}
1166
51c50026
MP
1167namespace bidi {
1168 enum class kind {
1169 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1170 };
1171
1172 /* All the UTF-8 encodings of bidi characters start with E2. */
1173 constexpr uchar utf8_start = 0xe2;
1174
bef32d4a
DM
1175 struct context
1176 {
1177 context () {}
1178 context (location_t loc, kind k, bool pdf, bool ucn)
1179 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1180 {
1181 }
1182
1183 kind get_pop_kind () const
1184 {
1185 return m_pdf ? kind::PDF : kind::PDI;
1186 }
1187 bool ucn_p () const
1188 {
1189 return m_ucn;
1190 }
1191
1192 location_t m_loc;
1193 kind m_kind;
1194 unsigned m_pdf : 1;
1195 unsigned m_ucn : 1;
1196 };
1197
51c50026
MP
1198 /* A vector holding currently open bidi contexts. We use a char for
1199 each context, its LSB is 1 if it represents a PDF context, 0 if it
1200 represents a PDI context. The next bit is 1 if this context was open
1201 by a bidi character written as a UCN, and 0 when it was UTF-8. */
bef32d4a 1202 semi_embedded_vec <context, 16> vec;
51c50026
MP
1203
1204 /* Close the whole comment/identifier/string literal/character constant
1205 context. */
1206 void on_close ()
1207 {
1208 vec.truncate (0);
1209 }
1210
1211 /* Pop the last element in the vector. */
1212 void pop ()
1213 {
1214 unsigned int len = vec.count ();
1215 gcc_checking_assert (len > 0);
1216 vec.truncate (len - 1);
1217 }
1218
bef32d4a
DM
1219 /* Return the pop kind of the context of the Ith element. */
1220 kind pop_kind_at (unsigned int i)
51c50026 1221 {
bef32d4a 1222 return vec[i].get_pop_kind ();
51c50026
MP
1223 }
1224
bef32d4a 1225 /* Return the pop kind of the context that is currently opened. */
51c50026
MP
1226 kind current_ctx ()
1227 {
1228 unsigned int len = vec.count ();
1229 if (len == 0)
1230 return kind::NONE;
bef32d4a 1231 return vec[len - 1].get_pop_kind ();
51c50026
MP
1232 }
1233
1234 /* Return true if the current context comes from a UCN origin, that is,
1235 the bidi char which started this bidi context was written as a UCN. */
1236 bool current_ctx_ucn_p ()
1237 {
1238 unsigned int len = vec.count ();
1239 gcc_checking_assert (len > 0);
bef32d4a 1240 return vec[len - 1].m_ucn;
51c50026
MP
1241 }
1242
bef32d4a
DM
1243 location_t current_ctx_loc ()
1244 {
1245 unsigned int len = vec.count ();
1246 gcc_checking_assert (len > 0);
1247 return vec[len - 1].m_loc;
1248 }
1249
1250 /* We've read a bidi char, update the current vector as necessary.
1251 LOC is only valid when K is not kind::NONE. */
1252 void on_char (kind k, bool ucn_p, location_t loc)
51c50026
MP
1253 {
1254 switch (k)
1255 {
1256 case kind::LRE:
1257 case kind::RLE:
1258 case kind::LRO:
1259 case kind::RLO:
bef32d4a 1260 vec.push (context (loc, k, true, ucn_p));
51c50026
MP
1261 break;
1262 case kind::LRI:
1263 case kind::RLI:
1264 case kind::FSI:
bef32d4a 1265 vec.push (context (loc, k, false, ucn_p));
51c50026
MP
1266 break;
1267 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1268 whose scope has not yet been terminated. */
1269 case kind::PDF:
1270 if (current_ctx () == kind::PDF)
1271 pop ();
1272 break;
1273 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1274 scope has not yet been terminated, as well as the scopes of
1275 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1276 yet been terminated. */
1277 case kind::PDI:
1278 for (int i = vec.count () - 1; i >= 0; --i)
bef32d4a 1279 if (pop_kind_at (i) == kind::PDI)
51c50026
MP
1280 {
1281 vec.truncate (i);
1282 break;
1283 }
1284 break;
1285 case kind::LTR:
1286 case kind::RTL:
1287 /* These aren't popped by a PDF/PDI. */
1288 break;
630686f9 1289 ATTR_LIKELY case kind::NONE:
51c50026
MP
1290 break;
1291 default:
1292 abort ();
1293 }
1294 }
1295
1296 /* Return a descriptive string for K. */
1297 const char *to_str (kind k)
1298 {
1299 switch (k)
1300 {
1301 case kind::LRE:
1302 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1303 case kind::RLE:
1304 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1305 case kind::LRO:
1306 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1307 case kind::RLO:
1308 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1309 case kind::LRI:
1310 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1311 case kind::RLI:
1312 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1313 case kind::FSI:
1314 return "U+2068 (FIRST STRONG ISOLATE)";
1315 case kind::PDF:
1316 return "U+202C (POP DIRECTIONAL FORMATTING)";
1317 case kind::PDI:
1318 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1319 case kind::LTR:
1320 return "U+200E (LEFT-TO-RIGHT MARK)";
1321 case kind::RTL:
1322 return "U+200F (RIGHT-TO-LEFT MARK)";
1323 default:
1324 abort ();
1325 }
1326 }
1327}
1328
bef32d4a
DM
1329/* Get location_t for the range of bytes [START, START + NUM_BYTES)
1330 within the current line in FILE, with the caret at START. */
1331
1332static location_t
1333get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1334 const unsigned char *const start,
1335 size_t num_bytes)
1336{
1337 gcc_checking_assert (num_bytes > 0);
1338
1339 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1340 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1341 whereas linemap_position_for_column is 1-based. */
1342
1343 /* Get 0-based offsets within the line. */
1344 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1345 size_t end_offset = start_offset + num_bytes - 1;
1346
1347 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1348 location_t start_loc = linemap_position_for_column (pfile->line_table,
1349 start_offset + 1);
1350 location_t end_loc = linemap_position_for_column (pfile->line_table,
1351 end_offset + 1);
1352
1353 if (start_loc == end_loc)
1354 return start_loc;
1355
1356 source_range src_range;
1357 src_range.m_start = start_loc;
1358 src_range.m_finish = end_loc;
1359 location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
1360 start_loc,
1361 src_range,
1362 NULL);
1363 return combined_loc;
1364}
1365
51c50026
MP
1366/* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1367
1368static bidi::kind
bef32d4a 1369get_bidi_utf8_1 (const unsigned char *const p)
51c50026
MP
1370{
1371 gcc_checking_assert (p[0] == bidi::utf8_start);
1372
1373 if (p[1] == 0x80)
1374 switch (p[2])
1375 {
1376 case 0xaa:
1377 return bidi::kind::LRE;
1378 case 0xab:
1379 return bidi::kind::RLE;
1380 case 0xac:
1381 return bidi::kind::PDF;
1382 case 0xad:
1383 return bidi::kind::LRO;
1384 case 0xae:
1385 return bidi::kind::RLO;
1386 case 0x8e:
1387 return bidi::kind::LTR;
1388 case 0x8f:
1389 return bidi::kind::RTL;
1390 default:
1391 break;
1392 }
1393 else if (p[1] == 0x81)
1394 switch (p[2])
1395 {
1396 case 0xa6:
1397 return bidi::kind::LRI;
1398 case 0xa7:
1399 return bidi::kind::RLI;
1400 case 0xa8:
1401 return bidi::kind::FSI;
1402 case 0xa9:
1403 return bidi::kind::PDI;
1404 default:
1405 break;
1406 }
1407
1408 return bidi::kind::NONE;
1409}
1410
bef32d4a
DM
1411/* Parse a sequence of 3 bytes starting with P and return its bidi code.
1412 If the kind is not NONE, write the location to *OUT.*/
1413
1414static bidi::kind
1415get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1416{
1417 bidi::kind result = get_bidi_utf8_1 (p);
1418 if (result != bidi::kind::NONE)
1419 {
1420 /* We have a sequence of 3 bytes starting at P. */
1421 *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
1422 }
1423 return result;
1424}
1425
51c50026
MP
1426/* Parse a UCN where P points just past \u or \U and return its bidi code. */
1427
1428static bidi::kind
bef32d4a 1429get_bidi_ucn_1 (const unsigned char *p, bool is_U)
51c50026
MP
1430{
1431 /* 6.4.3 Universal Character Names
1432 \u hex-quad
1433 \U hex-quad hex-quad
1434 where \unnnn means \U0000nnnn. */
1435
1436 if (is_U)
1437 {
1438 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1439 return bidi::kind::NONE;
1440 /* Skip 4B so we can treat \u and \U the same below. */
1441 p += 4;
1442 }
1443
1444 /* All code points we are looking for start with 20xx. */
1445 if (p[0] != '2' || p[1] != '0')
1446 return bidi::kind::NONE;
1447 else if (p[2] == '2')
1448 switch (p[3])
1449 {
1450 case 'a':
1451 case 'A':
1452 return bidi::kind::LRE;
1453 case 'b':
1454 case 'B':
1455 return bidi::kind::RLE;
1456 case 'c':
1457 case 'C':
1458 return bidi::kind::PDF;
1459 case 'd':
1460 case 'D':
1461 return bidi::kind::LRO;
1462 case 'e':
1463 case 'E':
1464 return bidi::kind::RLO;
1465 default:
1466 break;
1467 }
1468 else if (p[2] == '6')
1469 switch (p[3])
1470 {
1471 case '6':
1472 return bidi::kind::LRI;
1473 case '7':
1474 return bidi::kind::RLI;
1475 case '8':
1476 return bidi::kind::FSI;
1477 case '9':
1478 return bidi::kind::PDI;
1479 default:
1480 break;
1481 }
1482 else if (p[2] == '0')
1483 switch (p[3])
1484 {
1485 case 'e':
1486 case 'E':
1487 return bidi::kind::LTR;
1488 case 'f':
1489 case 'F':
1490 return bidi::kind::RTL;
1491 default:
1492 break;
1493 }
1494
1495 return bidi::kind::NONE;
1496}
1497
bef32d4a
DM
1498/* Parse a UCN where P points just past \u or \U and return its bidi code.
1499 If the kind is not NONE, write the location to *OUT.*/
1500
1501static bidi::kind
1502get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1503 location_t *out)
1504{
1505 bidi::kind result = get_bidi_ucn_1 (p, is_U);
1506 if (result != bidi::kind::NONE)
1507 {
1508 const unsigned char *start = p - 2;
1509 size_t num_bytes = 2 + (is_U ? 8 : 4);
1510 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1511 }
1512 return result;
1513}
1514
1515/* Subclass of rich_location for reporting on unpaired UTF-8
1516 bidirectional control character(s).
1517 Escape the source lines on output, and show all unclosed
1518 bidi context, labelling everything. */
1519
1520class unpaired_bidi_rich_location : public rich_location
1521{
1522 public:
1523 class custom_range_label : public range_label
1524 {
1525 public:
1526 label_text get_text (unsigned range_idx) const FINAL OVERRIDE
1527 {
1528 /* range 0 is the primary location; each subsequent range i + 1
1529 is for bidi::vec[i]. */
1530 if (range_idx > 0)
1531 {
1532 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1533 return label_text::borrow (bidi::to_str (ctxt.m_kind));
1534 }
1535 else
1536 return label_text::borrow (_("end of bidirectional context"));
1537 }
1538 };
1539
1540 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1541 : rich_location (pfile->line_table, loc, &m_custom_label)
1542 {
1543 set_escape_on_output (true);
1544 for (unsigned i = 0; i < bidi::vec.count (); i++)
1545 add_range (bidi::vec[i].m_loc,
1546 SHOW_RANGE_WITHOUT_CARET,
1547 &m_custom_label);
1548 }
1549
1550 private:
1551 custom_range_label m_custom_label;
1552};
1553
51c50026
MP
1554/* We're closing a bidi context, that is, we've encountered a newline,
1555 are closing a C-style comment, or are at the end of a string literal,
1556 character constant, or identifier. Warn if this context was not
1557 properly terminated by a PDI or PDF. P points to the last character
1558 in this context. */
1559
1560static void
1561maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1562{
1563 if (CPP_OPTION (pfile, cpp_warn_bidirectional) == bidirectional_unpaired
1564 && bidi::vec.count () > 0)
1565 {
1566 const location_t loc
1567 = linemap_position_for_column (pfile->line_table,
1568 CPP_BUF_COLUMN (pfile->buffer, p));
bef32d4a
DM
1569 unpaired_bidi_rich_location rich_loc (pfile, loc);
1570 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1571 forms of a diagnostic, so fake it for now. */
1572 if (bidi::vec.count () > 1)
1573 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1574 "unpaired UTF-8 bidirectional control characters "
1575 "detected");
1576 else
1577 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1578 "unpaired UTF-8 bidirectional control character "
1579 "detected");
51c50026
MP
1580 }
1581 /* We're done with this context. */
1582 bidi::on_close ();
1583}
1584
1585/* We're at the beginning or in the middle of an identifier/comment/string
1586 literal/character constant. Warn if we've encountered a bidi character.
bef32d4a
DM
1587 KIND says which bidi control character it was; UCN_P is true iff this bidi
1588 control character was written as a UCN. LOC is the location of the
1589 character, but is only valid if KIND != bidi::kind::NONE. */
51c50026
MP
1590
1591static void
bef32d4a
DM
1592maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1593 bool ucn_p, location_t loc)
51c50026
MP
1594{
1595 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1596 return;
1597
1598 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1599
1600 if (warn_bidi != bidirectional_none)
1601 {
1a7f2c07
DM
1602 rich_location rich_loc (pfile->line_table, loc);
1603 rich_loc.set_escape_on_output (true);
1604
51c50026
MP
1605 /* It seems excessive to warn about a PDI/PDF that is closing
1606 an opened context because we've already warned about the
1607 opening character. Except warn when we have a UCN x UTF-8
1608 mismatch. */
1609 if (kind == bidi::current_ctx ())
1610 {
1611 if (warn_bidi == bidirectional_unpaired
1612 && bidi::current_ctx_ucn_p () != ucn_p)
bef32d4a
DM
1613 {
1614 rich_loc.add_range (bidi::current_ctx_loc ());
1615 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1616 "UTF-8 vs UCN mismatch when closing "
1617 "a context by \"%s\"", bidi::to_str (kind));
1618 }
51c50026
MP
1619 }
1620 else if (warn_bidi == bidirectional_any)
1621 {
1622 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1a7f2c07
DM
1623 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1624 "\"%s\" is closing an unopened context",
1625 bidi::to_str (kind));
51c50026 1626 else
1a7f2c07
DM
1627 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
1628 "found problematic Unicode character \"%s\"",
1629 bidi::to_str (kind));
51c50026
MP
1630 }
1631 }
1632 /* We're done with this context. */
bef32d4a 1633 bidi::on_char (kind, ucn_p, loc);
51c50026
MP
1634}
1635
0d9f234d
NB
1636/* Skip a C-style block comment. We find the end of the comment by
1637 seeing if an asterisk is before every '/' we encounter. Returns
6f572ac2
NB
1638 nonzero if comment terminated by EOF, zero otherwise.
1639
1640 Buffer->cur points to the initial asterisk of the comment. */
26aea073 1641bool
6cf87ca4 1642_cpp_skip_block_comment (cpp_reader *pfile)
45b966db 1643{
041c3194 1644 cpp_buffer *buffer = pfile->buffer;
d08dcf87
ZW
1645 const uchar *cur = buffer->cur;
1646 uchar c;
51c50026 1647 const bool warn_bidi_p = pfile->warn_bidi_p ();
0d9f234d 1648
d08dcf87
ZW
1649 cur++;
1650 if (*cur == '/')
1651 cur++;
0d9f234d 1652
26aea073
NB
1653 for (;;)
1654 {
0d9f234d
NB
1655 /* People like decorating comments with '*', so check for '/'
1656 instead for efficiency. */
d08dcf87
ZW
1657 c = *cur++;
1658
041c3194 1659 if (c == '/')
45b966db 1660 {
d08dcf87 1661 if (cur[-2] == '*')
51c50026
MP
1662 {
1663 if (warn_bidi_p)
1664 maybe_warn_bidi_on_close (pfile, cur);
1665 break;
1666 }
041c3194 1667
0d9f234d 1668 /* Warn about potential nested comments, but not if the '/'
a1f300c0 1669 comes immediately before the true comment delimiter.
041c3194 1670 Don't bother to get it right across escaped newlines. */
0d9f234d 1671 if (CPP_OPTION (pfile, warn_comments)
d08dcf87
ZW
1672 && cur[0] == '*' && cur[1] != '/')
1673 {
1674 buffer->cur = cur;
87cf0651
SB
1675 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1676 pfile->line_table->highest_line,
1677 CPP_BUF_COL (buffer),
1678 "\"/*\" within comment");
d08dcf87 1679 }
45b966db 1680 }
26aea073
NB
1681 else if (c == '\n')
1682 {
12f9df4e 1683 unsigned int cols;
d08dcf87 1684 buffer->cur = cur - 1;
51c50026
MP
1685 if (warn_bidi_p)
1686 maybe_warn_bidi_on_close (pfile, cur);
26aea073
NB
1687 _cpp_process_line_notes (pfile, true);
1688 if (buffer->next_line >= buffer->rlimit)
1689 return true;
1690 _cpp_clean_line (pfile);
12f9df4e
PB
1691
1692 cols = buffer->next_line - buffer->line_base;
1693 CPP_INCREMENT_LINE (pfile, cols);
1694
d08dcf87 1695 cur = buffer->cur;
26aea073 1696 }
51c50026
MP
1697 /* If this is a beginning of a UTF-8 encoding, it might be
1698 a bidirectional control character. */
1699 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
1700 {
bef32d4a
DM
1701 location_t loc;
1702 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
1703 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
51c50026 1704 }
45b966db 1705 }
041c3194 1706
d08dcf87 1707 buffer->cur = cur;
a8eb6044 1708 _cpp_process_line_notes (pfile, true);
26aea073 1709 return false;
45b966db
ZW
1710}
1711
480709cc 1712/* Skip a C++ line comment, leaving buffer->cur pointing to the
da7d8304 1713 terminating newline. Handles escaped newlines. Returns nonzero
480709cc 1714 if a multiline comment. */
041c3194 1715static int
6cf87ca4 1716skip_line_comment (cpp_reader *pfile)
45b966db 1717{
cbcff6df 1718 cpp_buffer *buffer = pfile->buffer;
620e594b 1719 location_t orig_line = pfile->line_table->highest_line;
51c50026 1720 const bool warn_bidi_p = pfile->warn_bidi_p ();
041c3194 1721
51c50026
MP
1722 if (!warn_bidi_p)
1723 while (*buffer->cur != '\n')
1724 buffer->cur++;
1725 else
1726 {
1727 while (*buffer->cur != '\n'
1728 && *buffer->cur != bidi::utf8_start)
1729 buffer->cur++;
1730 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1731 {
1732 while (*buffer->cur != '\n')
1733 {
1734 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1735 {
bef32d4a
DM
1736 location_t loc;
1737 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1738 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
51c50026
MP
1739 }
1740 buffer->cur++;
1741 }
1742 maybe_warn_bidi_on_close (pfile, buffer->cur);
1743 }
1744 }
480709cc 1745
26aea073 1746 _cpp_process_line_notes (pfile, true);
500bee0a 1747 return orig_line != pfile->line_table->highest_line;
041c3194 1748}
45b966db 1749
26aea073 1750/* Skips whitespace, saving the next non-whitespace character. */
52fadca8 1751static void
6cf87ca4 1752skip_whitespace (cpp_reader *pfile, cppchar_t c)
041c3194
ZW
1753{
1754 cpp_buffer *buffer = pfile->buffer;
f7d151fb 1755 bool saw_NUL = false;
45b966db 1756
0d9f234d 1757 do
041c3194 1758 {
91fcd158 1759 /* Horizontal space always OK. */
26aea073 1760 if (c == ' ' || c == '\t')
0d9f234d 1761 ;
0d9f234d 1762 /* Just \f \v or \0 left. */
91fcd158 1763 else if (c == '\0')
f7d151fb 1764 saw_NUL = true;
93c80368 1765 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
500bee0a 1766 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
ebef4e8c
NB
1767 CPP_BUF_COL (buffer),
1768 "%s in preprocessing directive",
1769 c == '\f' ? "form feed" : "vertical tab");
0d9f234d 1770
0d9f234d 1771 c = *buffer->cur++;
45b966db 1772 }
ec5c56db 1773 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
0d9f234d
NB
1774 while (is_nvspace (c));
1775
f7d151fb 1776 if (saw_NUL)
bd5e882c
DM
1777 {
1778 encoding_rich_location rich_loc (pfile);
1779 cpp_error_at (pfile, CPP_DL_WARNING, &rich_loc,
1780 "null character(s) ignored");
1781 }
f7d151fb 1782
480709cc 1783 buffer->cur--;
041c3194 1784}
45b966db 1785
93c80368
NB
1786/* See if the characters of a number token are valid in a name (no
1787 '.', '+' or '-'). */
1788static int
6cf87ca4 1789name_p (cpp_reader *pfile, const cpp_string *string)
93c80368
NB
1790{
1791 unsigned int i;
1792
1793 for (i = 0; i < string->len; i++)
1794 if (!is_idchar (string->text[i]))
1795 return 0;
1796
df383483 1797 return 1;
93c80368
NB
1798}
1799
50668cf6
GK
1800/* After parsing an identifier or other sequence, produce a warning about
1801 sequences not in NFC/NFKC. */
1802static void
1803warn_about_normalization (cpp_reader *pfile,
1804 const cpp_token *token,
1805 const struct normalize_state *s)
1806{
1807 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1808 && !pfile->state.skipping)
1809 {
bd5e882c
DM
1810 location_t loc = token->src_loc;
1811
1812 /* If possible, create a location range for the token. */
1813 if (loc >= RESERVED_LOCATION_COUNT
1814 && token->type != CPP_EOF
1815 /* There must be no line notes to process. */
1816 && (!(pfile->buffer->cur
1817 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
1818 && !pfile->overlaid_buffer)))
1819 {
1820 source_range tok_range;
1821 tok_range.m_start = loc;
1822 tok_range.m_finish
1823 = linemap_position_for_column (pfile->line_table,
1824 CPP_BUF_COLUMN (pfile->buffer,
1825 pfile->buffer->cur));
1826 loc = COMBINE_LOCATION_DATA (pfile->line_table,
1827 loc, tok_range, NULL);
1828 }
1829
1830 encoding_rich_location rich_loc (pfile, loc);
1831
50668cf6
GK
1832 /* Make sure that the token is printed using UCNs, even
1833 if we'd otherwise happily print UTF-8. */
c3f829c1 1834 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
50668cf6
GK
1835 size_t sz;
1836
1837 sz = cpp_spell_token (pfile, token, buf, false) - buf;
1838 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
bd5e882c
DM
1839 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1840 "`%.*s' is not in NFKC", (int) sz, buf);
c264208e 1841 else if (CPP_OPTION (pfile, cplusplus))
bd5e882c 1842 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
c4d6dcac 1843 "`%.*s' is not in NFC", (int) sz, buf);
50668cf6 1844 else
bd5e882c
DM
1845 cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
1846 "`%.*s' is not in NFC", (int) sz, buf);
55e7f907 1847 free (buf);
50668cf6
GK
1848 }
1849}
1850
7d112d66
LH
1851static const cppchar_t utf8_signifier = 0xC0;
1852
1853/* Returns TRUE if the sequence starting at buffer->cur is valid in
1613e52b 1854 an identifier. FIRST is TRUE if this starts an identifier. */
51c50026 1855
bced6edf 1856static bool
50668cf6
GK
1857forms_identifier_p (cpp_reader *pfile, int first,
1858 struct normalize_state *state)
bced6edf 1859{
1613e52b 1860 cpp_buffer *buffer = pfile->buffer;
51c50026 1861 const bool warn_bidi_p = pfile->warn_bidi_p ();
1613e52b
NB
1862
1863 if (*buffer->cur == '$')
1864 {
1865 if (!CPP_OPTION (pfile, dollars_in_ident))
1866 return false;
1867
1868 buffer->cur++;
78b8811a 1869 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1613e52b 1870 {
78b8811a 1871 CPP_OPTION (pfile, warn_dollars) = 0;
0527bc4e 1872 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1613e52b
NB
1873 }
1874
1875 return true;
1876 }
bced6edf 1877
7d112d66
LH
1878 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
1879 if (CPP_OPTION (pfile, extended_identifiers))
bced6edf 1880 {
fbb22910 1881 cppchar_t s;
7d112d66
LH
1882 if (*buffer->cur >= utf8_signifier)
1883 {
51c50026
MP
1884 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
1885 && warn_bidi_p)
1886 {
bef32d4a
DM
1887 location_t loc;
1888 bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
1889 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
51c50026 1890 }
7d112d66
LH
1891 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1892 state, &s))
1893 return true;
1894 }
1895 else if (*buffer->cur == '\\'
1896 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1897 {
1898 buffer->cur += 2;
51c50026
MP
1899 if (warn_bidi_p)
1900 {
bef32d4a
DM
1901 location_t loc;
1902 bidi::kind kind = get_bidi_ucn (pfile,
1903 buffer->cur,
1904 buffer->cur[-1] == 'U',
1905 &loc);
1906 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
51c50026 1907 }
7d112d66
LH
1908 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1909 state, &s, NULL, NULL))
1910 return true;
1911 buffer->cur -= 2;
1912 }
bced6edf 1913 }
bced6edf 1914
1613e52b 1915 return false;
bced6edf
NB
1916}
1917
fb771b9d
TT
1918/* Helper function to issue error about improper __VA_OPT__ use. */
1919static void
1920maybe_va_opt_error (cpp_reader *pfile)
1921{
1922 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1923 {
1924 /* __VA_OPT__ should not be accepted at all, but allow it in
1925 system headers. */
bf425849 1926 if (!_cpp_in_system_header (pfile))
fb771b9d 1927 cpp_error (pfile, CPP_DL_PEDWARN,
b04445d4 1928 "__VA_OPT__ is not available until C++20");
fb771b9d
TT
1929 }
1930 else if (!pfile->state.va_args_ok)
1931 {
1932 /* __VA_OPT__ should only appear in the replacement list of a
1933 variadic macro. */
1934 cpp_error (pfile, CPP_DL_PEDWARN,
1935 "__VA_OPT__ can only appear in the expansion"
b04445d4 1936 " of a C++20 variadic macro");
fb771b9d
TT
1937 }
1938}
1939
17e7cb85
KT
1940/* Helper function to get the cpp_hashnode of the identifier BASE. */
1941static cpp_hashnode *
1942lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1943{
1944 cpp_hashnode *result;
1945 const uchar *cur;
1946 unsigned int len;
1947 unsigned int hash = HT_HASHSTEP (0, *base);
1948
1949 cur = base + 1;
1950 while (ISIDNUM (*cur))
1951 {
1952 hash = HT_HASHSTEP (hash, *cur);
1953 cur++;
1954 }
1955 len = cur - base;
1956 hash = HT_HASHFINISH (hash, len);
1957 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1958 base, len, hash, HT_ALLOC));
1959
1960 /* Rarely, identifiers require diagnostics when lexed. */
1961 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1962 && !pfile->state.skipping, 0))
1963 {
1964 /* It is allowed to poison the same identifier twice. */
1965 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1966 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1967 NODE_NAME (result));
1968
1969 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1970 replacement list of a variadic macro. */
1971 if (result == pfile->spec_nodes.n__VA_ARGS__
1972 && !pfile->state.va_args_ok)
3976796b
ESR
1973 {
1974 if (CPP_OPTION (pfile, cplusplus))
1975 cpp_error (pfile, CPP_DL_PEDWARN,
1976 "__VA_ARGS__ can only appear in the expansion"
1977 " of a C++11 variadic macro");
1978 else
1979 cpp_error (pfile, CPP_DL_PEDWARN,
1980 "__VA_ARGS__ can only appear in the expansion"
1981 " of a C99 variadic macro");
1982 }
17e7cb85 1983
fb771b9d
TT
1984 if (result == pfile->spec_nodes.n__VA_OPT__)
1985 maybe_va_opt_error (pfile);
1986
17e7cb85
KT
1987 /* For -Wc++-compat, warn about use of C++ named operators. */
1988 if (result->flags & NODE_WARN_OPERATOR)
87cf0651
SB
1989 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1990 "identifier \"%s\" is a special operator name in C++",
1991 NODE_NAME (result));
17e7cb85
KT
1992 }
1993
1994 return result;
1995}
1996
1997/* Get the cpp_hashnode of an identifier specified by NAME in
1998 the current cpp_reader object. If none is found, NULL is returned. */
1999cpp_hashnode *
2000_cpp_lex_identifier (cpp_reader *pfile, const char *name)
2001{
2002 cpp_hashnode *result;
2003 result = lex_identifier_intern (pfile, (uchar *) name);
2004 return result;
2005}
2006
bced6edf 2007/* Lex an identifier starting at BUFFER->CUR - 1. */
0d9f234d 2008static cpp_hashnode *
50668cf6 2009lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
be5ffc59 2010 struct normalize_state *nst, cpp_hashnode **spelling)
45b966db 2011{
93c80368 2012 cpp_hashnode *result;
47e20491 2013 const uchar *cur;
c6e83800
ZW
2014 unsigned int len;
2015 unsigned int hash = HT_HASHSTEP (0, *base);
51c50026 2016 const bool warn_bidi_p = pfile->warn_bidi_p ();
2c3fcba6 2017
c6e83800 2018 cur = pfile->buffer->cur;
47e20491 2019 if (! starts_ucn)
d3f4ff8b
JM
2020 {
2021 while (ISIDNUM (*cur))
2022 {
2023 hash = HT_HASHSTEP (hash, *cur);
2024 cur++;
2025 }
2026 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2027 }
47e20491 2028 pfile->buffer->cur = cur;
50668cf6 2029 if (starts_ucn || forms_identifier_p (pfile, false, nst))
10cf9bde 2030 {
7d112d66
LH
2031 /* Slower version for identifiers containing UCNs
2032 or extended chars (including $). */
47e20491
GK
2033 do {
2034 while (ISIDNUM (*pfile->buffer->cur))
50668cf6 2035 {
d3f4ff8b 2036 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
50668cf6 2037 pfile->buffer->cur++;
50668cf6
GK
2038 }
2039 } while (forms_identifier_p (pfile, false, nst));
51c50026
MP
2040 if (warn_bidi_p)
2041 maybe_warn_bidi_on_close (pfile, pfile->buffer->cur);
47e20491
GK
2042 result = _cpp_interpret_identifier (pfile, base,
2043 pfile->buffer->cur - base);
be5ffc59 2044 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2c3fcba6 2045 }
47e20491
GK
2046 else
2047 {
2048 len = cur - base;
2049 hash = HT_HASHFINISH (hash, len);
bced6edf 2050
2bf41bf0
TT
2051 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2052 base, len, hash, HT_ALLOC));
be5ffc59 2053 *spelling = result;
47e20491 2054 }
2c3fcba6 2055
bced6edf 2056 /* Rarely, identifiers require diagnostics when lexed. */
2c3fcba6
ZW
2057 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
2058 && !pfile->state.skipping, 0))
2059 {
2060 /* It is allowed to poison the same identifier twice. */
2061 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
0527bc4e 2062 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
2c3fcba6
ZW
2063 NODE_NAME (result));
2064
2065 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2066 replacement list of a variadic macro. */
2067 if (result == pfile->spec_nodes.n__VA_ARGS__
2068 && !pfile->state.va_args_ok)
3976796b
ESR
2069 {
2070 if (CPP_OPTION (pfile, cplusplus))
2071 cpp_error (pfile, CPP_DL_PEDWARN,
2072 "__VA_ARGS__ can only appear in the expansion"
2073 " of a C++11 variadic macro");
2074 else
2075 cpp_error (pfile, CPP_DL_PEDWARN,
2076 "__VA_ARGS__ can only appear in the expansion"
2077 " of a C99 variadic macro");
2078 }
3d8b2a98 2079
fb771b9d
TT
2080 /* __VA_OPT__ should only appear in the replacement list of a
2081 variadic macro. */
2082 if (result == pfile->spec_nodes.n__VA_OPT__)
2083 maybe_va_opt_error (pfile);
2084
3d8b2a98
ILT
2085 /* For -Wc++-compat, warn about use of C++ named operators. */
2086 if (result->flags & NODE_WARN_OPERATOR)
87cf0651
SB
2087 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2088 "identifier \"%s\" is a special operator name in C++",
2089 NODE_NAME (result));
2c3fcba6
ZW
2090 }
2091
2092 return result;
2093}
2094
bced6edf 2095/* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
45b966db 2096static void
50668cf6
GK
2097lex_number (cpp_reader *pfile, cpp_string *number,
2098 struct normalize_state *nst)
45b966db 2099{
562a5c27 2100 const uchar *cur;
bced6edf
NB
2101 const uchar *base;
2102 uchar *dest;
45b966db 2103
bced6edf
NB
2104 base = pfile->buffer->cur - 1;
2105 do
041c3194 2106 {
8f51cf38 2107 const uchar *adj_digit_sep = NULL;
bced6edf 2108 cur = pfile->buffer->cur;
0d9f234d 2109
bced6edf 2110 /* N.B. ISIDNUM does not include $. */
8f51cf38
JM
2111 while (ISIDNUM (*cur)
2112 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2113 || DIGIT_SEP (*cur)
2114 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
50668cf6 2115 {
d3f4ff8b 2116 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
8f51cf38
JM
2117 /* Adjacent digit separators do not form part of the pp-number syntax.
2118 However, they can safely be diagnosed here as an error, since '' is
2119 not a valid preprocessing token. */
2120 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2121 adj_digit_sep = cur;
50668cf6 2122 cur++;
50668cf6 2123 }
a5858a3d
ESR
2124 /* A number can't end with a digit separator. */
2125 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2126 --cur;
8f51cf38
JM
2127 if (adj_digit_sep && adj_digit_sep < cur)
2128 cpp_error (pfile, CPP_DL_ERROR, "adjacent digit separators");
45b966db 2129
10cf9bde 2130 pfile->buffer->cur = cur;
45b966db 2131 }
50668cf6 2132 while (forms_identifier_p (pfile, false, nst));
93c80368 2133
bced6edf
NB
2134 number->len = cur - base;
2135 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2136 memcpy (dest, base, number->len);
2137 dest[number->len] = '\0';
2138 number->text = dest;
93c80368
NB
2139}
2140
6338b358
NB
2141/* Create a token of type TYPE with a literal spelling. */
2142static void
6cf87ca4
ZW
2143create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2144 unsigned int len, enum cpp_ttype type)
6338b358 2145{
6338b358
NB
2146 token->type = type;
2147 token->val.str.len = len;
13f93cf5
NS
2148 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2149}
2150
2151const uchar *
2152cpp_alloc_token_string (cpp_reader *pfile,
2153 const unsigned char *ptr, unsigned len)
2154{
2155 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2156
2157 dest[len] = 0;
2158 memcpy (dest, ptr, len);
2159 return dest;
6338b358
NB
2160}
2161
ed63c387
NS
2162/* A pair of raw buffer pointers. The currently open one is [1], the
2163 first one is [0]. Used for string literal lexing. */
2164struct lit_accum {
2165 _cpp_buff *first;
2166 _cpp_buff *last;
2167 const uchar *rpos;
2168 size_t accum;
2169
2170 lit_accum ()
2171 : first (NULL), last (NULL), rpos (0), accum (0)
2172 {
2173 }
2174
2175 void append (cpp_reader *, const uchar *, size_t);
2176
2177 void read_begin (cpp_reader *);
2178 bool reading_p () const
2179 {
2180 return rpos != NULL;
2181 }
2182 char read_char ()
2183 {
2184 char c = *rpos++;
2185 if (rpos == BUFF_FRONT (last))
2186 rpos = NULL;
2187 return c;
2188 }
2189};
2190
00a81b8b
JM
2191/* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2192 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2193
ed63c387
NS
2194void
2195lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
00a81b8b 2196{
ed63c387
NS
2197 if (!last)
2198 /* Starting. */
2199 first = last = _cpp_get_buff (pfile, len);
2200 else if (len > BUFF_ROOM (last))
00a81b8b 2201 {
ed63c387
NS
2202 /* There is insufficient room in the buffer. Copy what we can,
2203 and then either extend or create a new one. */
2204 size_t room = BUFF_ROOM (last);
2205 memcpy (BUFF_FRONT (last), base, room);
2206 BUFF_FRONT (last) += room;
00a81b8b
JM
2207 base += room;
2208 len -= room;
ed63c387 2209 accum += room;
00a81b8b 2210
ed63c387
NS
2211 gcc_checking_assert (!rpos);
2212
2213 last = _cpp_append_extend_buff (pfile, last, len);
2214 }
00a81b8b 2215
ed63c387
NS
2216 memcpy (BUFF_FRONT (last), base, len);
2217 BUFF_FRONT (last) += len;
2218 accum += len;
00a81b8b
JM
2219}
2220
ed63c387
NS
2221void
2222lit_accum::read_begin (cpp_reader *pfile)
2223{
2224 /* We never accumulate more than 4 chars to read. */
2225 if (BUFF_ROOM (last) < 4)
2226
2227 last = _cpp_append_extend_buff (pfile, last, 4);
2228 rpos = BUFF_FRONT (last);
2229}
c865f923
ESR
2230
2231/* Returns true if a macro has been defined.
2232 This might not work if compile with -save-temps,
2233 or preprocess separately from compilation. */
2234
2235static bool
2236is_macro(cpp_reader *pfile, const uchar *base)
2237{
2238 const uchar *cur = base;
2239 if (! ISIDST (*cur))
2240 return false;
2241 unsigned int hash = HT_HASHSTEP (0, *cur);
2242 ++cur;
2243 while (ISIDNUM (*cur))
2244 {
2245 hash = HT_HASHSTEP (hash, *cur);
2246 ++cur;
2247 }
2248 hash = HT_HASHFINISH (hash, cur - base);
2249
2250 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2251 base, cur - base, hash, HT_NO_INSERT));
2252
3f6677f4 2253 return result && cpp_macro_p (result);
c865f923
ESR
2254}
2255
b44f8ad8
JW
2256/* Returns true if a literal suffix does not have the expected form
2257 and is defined as a macro. */
2258
2259static bool
2260is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
2261{
2262 /* User-defined literals outside of namespace std must start with a single
2263 underscore, so assume anything of that form really is a UDL suffix.
2264 We don't need to worry about UDLs defined inside namespace std because
2265 their names are reserved, so cannot be used as macro names in valid
2266 programs. */
2267 if (base[0] == '_' && base[1] != '_')
2268 return false;
2269 return is_macro (pfile, base);
2270}
c865f923 2271
ed63c387
NS
2272/* Lexes a raw string. The stored string contains the spelling,
2273 including double quotes, delimiter string, '(' and ')', any leading
2274 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2275 the type of the literal, or CPP_OTHER if it was not properly
2276 terminated.
2277
2278 BASE is the start of the token. Updates pfile->buffer->cur to just
2279 after the lexed string.
2c6e3f55
JJ
2280
2281 The spelling is NUL-terminated, but it is not guaranteed that this
2282 is the first NUL since embedded NULs are preserved. */
2283
/* If no further line can be fetched before the literal is closed, TOKEN
   is turned into a CPP_EOF token instead, an "unterminated raw string"
   error is issued and the buffer is popped. */
2284static void
ed63c387 2285lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2c6e3f55 2286{
ed63c387 2287 const uchar *pos = base;
51c50026 2288 const bool warn_bidi_p = pfile->warn_bidi_p ();
ed63c387
NS
2289
2290 /* 'tis a pity this information isn't passed down from the lexer's
2291 initial categorization of the token. */
2292 enum cpp_ttype type = CPP_STRING;
2293
2294 if (*pos == 'L')
2295 {
2296 type = CPP_WSTRING;
2297 pos++;
2298 }
2299 else if (*pos == 'U')
2300 {
2301 type = CPP_STRING32;
2302 pos++;
2303 }
2304 else if (*pos == 'u')
2305 {
2306 if (pos[1] == '8')
2307 {
2308 type = CPP_UTF8STRING;
2309 pos++;
2310 }
2311 else
2312 type = CPP_STRING16;
2313 pos++;
2314 }
2315
2316 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2317 pos += 2;
2318
00a81b8b 2319 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2c6e3f55 2320
ed63c387
NS
2321 /* Skip notes before the ". */
2322 while (note->pos < pos)
2323 ++note;
2324
2325 lit_accum accum;
2326
/* PREFIX holds what must follow a ')' to close the literal: the
   delimiter characters (at most 16) followed by the '"' that is stored
   when the opening '(' is seen. */
2327 uchar prefix[17];
2328 unsigned prefix_len = 0;
/* PHASE_PREFIX: still collecting the delimiter. PHASE_NONE: inside the
   string body (also used for failure-mode lexing). Values from
   PHASE_SUFFIX upwards index the next character of PREFIX expected
   after a ')'. */
2329 enum Phase
2330 {
2331 PHASE_PREFIX = -2,
2332 PHASE_NONE = -1,
2333 PHASE_SUFFIX = 0
2334 } phase = PHASE_PREFIX;
2335
8cf88735
JJ
2336 for (;;)
2337 {
ed63c387 2338 gcc_checking_assert (note->pos >= pos);
00a81b8b 2339
ed63c387
NS
2340 /* Undo any escaped newlines and trigraphs. */
2341 if (!accum.reading_p () && note->pos == pos)
2342 switch (note->type)
2343 {
2344 case '\\':
2345 case ' ':
2346 /* Restore backslash followed by newline. */
2347 accum.append (pfile, base, pos - base);
2348 base = pos;
2349 accum.read_begin (pfile);
2350 accum.append (pfile, UC"\\", 1);
2351
2352 after_backslash:
2353 if (note->type == ' ')
2354 /* GNU backslash whitespace newline extension. FIXME
2355 could be any sequence of non-vertical space. When we
2356 can properly restore any such sequence, we should
2357 mark this note as handled so _cpp_process_line_notes
2358 doesn't warn. */
2359 accum.append (pfile, UC" ", 1);
2360
2361 accum.append (pfile, UC"\n", 1);
2362 note++;
2363 break;
00a81b8b 2364
ed63c387
NS
2365 case '\n':
2366 /* This can happen for ??/<NEWLINE> when trigraphs are not
2367 being interpreted. */
2368 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2369 note->type = 0;
2370 note++;
2371 break;
00a81b8b 2372
ed63c387
NS
2373 default:
2374 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2375
2376 /* Don't warn about this trigraph in
2377 _cpp_process_line_notes, since trigraphs show up as
2378 trigraphs in raw strings. */
2379 uchar type = note->type;
2380 note->type = 0;
2381
2382 if (CPP_OPTION (pfile, trigraphs))
2383 {
2384 accum.append (pfile, base, pos - base);
2385 base = pos;
2386 accum.read_begin (pfile);
2387 accum.append (pfile, UC"??", 2);
2388 accum.append (pfile, &type, 1);
2389
2390 /* ??/ followed by newline gets two line notes, one for
2391 the trigraph and one for the backslash/newline. */
2392 if (type == '/' && note[1].pos == pos)
2393 {
2394 note++;
2395 gcc_assert (note->type == '\\' || note->type == ' ');
2396 goto after_backslash;
2397 }
2398 /* Skip the replacement character. */
2399 base = ++pos;
2400 }
2401
2402 note++;
2403 break;
2404 }
2405
2406 /* Now get a char to process. Either from an expanded note, or
2407 from the line buffer. */
2408 bool read_note = accum.reading_p ();
2409 char c = read_note ? accum.read_char () : *pos++;
2c6e3f55 2410
ed63c387 2411 if (phase == PHASE_PREFIX)
2c6e3f55 2412 {
ed63c387 2413 if (c == '(')
8cf88735 2414 {
ed63c387
NS
2415 /* Done. */
2416 phase = PHASE_NONE;
2417 prefix[prefix_len++] = '"';
2418 }
2419 else if (prefix_len < 16
2420 /* Prefix chars are any of the basic character set,
2421 [lex.charset] except for '
2422 ()\\\t\v\f\n'. Optimized for a contiguous
2423 alphabet. */
2424 /* Unlike a switch, this collapses down to one or
2425 two shift and bitmask operations on an ASCII
2426 system, with an outlier or two. */
2427 && (('Z' - 'A' == 25
2428 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2429 : ISIDST (c))
2430 || (c >= '0' && c <= '9')
2431 || c == '_' || c == '{' || c == '}'
2432 || c == '[' || c == ']' || c == '#'
2433 || c == '<' || c == '>' || c == '%'
2434 || c == ':' || c == ';' || c == '.' || c == '?'
2435 || c == '*' || c == '+' || c == '-' || c == '/'
2436 || c == '^' || c == '&' || c == '|' || c == '~'
2437 || c == '!' || c == '=' || c == ','
2438 || c == '"' || c == '\''))
2439 prefix[prefix_len++] = c;
2440 else
2441 {
2442 /* Something is wrong. */
2443 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2444 if (prefix_len == 16)
2445 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2446 col, "raw string delimiter longer "
2447 "than 16 characters");
2448 else if (c == '\n')
2449 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2450 col, "invalid new-line in raw "
2451 "string delimiter");
2452 else
2453 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2454 col, "invalid character '%c' in "
2455 "raw string delimiter", c);
2456 type = CPP_OTHER;
2457 phase = PHASE_NONE;
2458 /* Continue until we get a close quote, that's probably
2459 the best failure mode. */
2460 prefix_len = 0;
8cf88735 2461 }
ae49af94
JJ
2462 if (c != '\n')
2463 continue;
8cf88735 2464 }
ed63c387
NS
2465
/* A ')' was seen earlier: check C against the next expected character
   of PREFIX. A mismatch drops back to the body, a complete match ends
   the literal. */
2466 if (phase != PHASE_NONE)
8cf88735 2467 {
ed63c387
NS
2468 if (prefix[phase] != c)
2469 phase = PHASE_NONE;
2470 else if (unsigned (phase + 1) == prefix_len)
8cf88735 2471 break;
ed63c387
NS
2472 else
2473 {
2474 phase = Phase (phase + 1);
2475 continue;
2476 }
2c6e3f55 2477 }
ed63c387
NS
2478
2479 if (!prefix_len && c == '"')
2480 /* Failure mode lexing. */
2481 goto out;
2482 else if (prefix_len && c == ')')
2483 phase = PHASE_SUFFIX;
/* A newline taken from the line buffer (not from an expanded note):
   flush the text up to and including it into ACCUM and continue with a
   fresh line. */
2484 else if (!read_note && c == '\n')
2c6e3f55 2485 {
ed63c387
NS
2486 pos--;
2487 pfile->buffer->cur = pos;
2c6e3f55 2488 if (pfile->state.in_directive
d5e48350
JJ
2489 || (pfile->state.parsing_args
2490 && pfile->buffer->next_line >= pfile->buffer->rlimit))
2c6e3f55 2491 {
2c6e3f55
JJ
2492 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2493 "unterminated raw string");
ed63c387
NS
2494 type = CPP_OTHER;
2495 goto out;
2c6e3f55
JJ
2496 }
2497
ed63c387 2498 accum.append (pfile, base, pos - base + 1);
2a0225e4
NS
2499 _cpp_process_line_notes (pfile, false);
2500
2501 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2c6e3f55
JJ
2502 CPP_INCREMENT_LINE (pfile, 0);
2503 pfile->buffer->need_line = true;
2504
2505 if (!_cpp_get_fresh_line (pfile))
2506 {
ed63c387 2507 /* We ran out of file and failed to get a line. */
620e594b 2508 location_t src_loc = token->src_loc;
2c6e3f55
JJ
2509 token->type = CPP_EOF;
2510 /* Tell the compiler the line number of the EOF token. */
2511 token->src_loc = pfile->line_table->highest_line;
2512 token->flags = BOL;
ed63c387
NS
2513 if (accum.first)
2514 _cpp_release_buff (pfile, accum.first);
2c6e3f55
JJ
2515 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2516 "unterminated raw string");
2a0225e4
NS
2517 /* Now pop the buffer that _cpp_get_fresh_line did not. */
2518 _cpp_pop_buffer (pfile);
2c6e3f55
JJ
2519 return;
2520 }
2521
ed63c387 2522 pos = base = pfile->buffer->cur;
00a81b8b 2523 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2c6e3f55 2524 }
51c50026
MP
2525 else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
2526 && warn_bidi_p)
bef32d4a
DM
2527 {
2528 location_t loc;
2529 bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
2530 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2531 }
2c6e3f55
JJ
2532 }
2533
51c50026
MP
2534 if (warn_bidi_p)
2535 maybe_warn_bidi_on_close (pfile, pos);
2536
3ce4f9e4
ESR
2537 if (CPP_OPTION (pfile, user_literals))
2538 {
c865f923
ESR
2539 /* If a string format macro, say from inttypes.h, is placed touching
2540 a string literal it could be parsed as a C++11 user-defined string
b44f8ad8 2541 literal thus breaking the program. */
ed63c387 2542 if (is_macro_not_literal_suffix (pfile, pos))
7f5f5f98 2543 {
112448b4 2544 /* Raise a warning, but do not consume subsequent tokens. */
7aee8646 2545 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
7f5f5f98
OW
2546 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2547 token->src_loc, 0,
2548 "invalid suffix on literal; C++11 requires "
c865f923 2549 "a space between literal and string macro");
7f5f5f98 2550 }
3ce4f9e4 2551 /* Grab user defined literal suffix. */
ed63c387 2552 else if (ISIDST (*pos))
3ce4f9e4
ESR
2553 {
2554 type = cpp_userdef_string_add_type (type);
ed63c387 2555 ++pos;
7f5f5f98 2556
ed63c387
NS
2557 while (ISIDNUM (*pos))
2558 ++pos;
3ce4f9e4 2559 }
3ce4f9e4
ESR
2560 }
2561
/* Build the token spelling. If nothing was accumulated the whole
   spelling is still contiguous at BASE; otherwise concatenate the
   accumulated buffers followed by the unflushed tail at BASE. */
ed63c387
NS
2562 out:
2563 pfile->buffer->cur = pos;
2564 if (!accum.accum)
2565 create_literal (pfile, token, base, pos - base, type);
2c6e3f55
JJ
2566 else
2567 {
ed63c387
NS
2568 size_t extra_len = pos - base;
2569 uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
2c6e3f55
JJ
2570
2571 token->type = type;
ed63c387 2572 token->val.str.len = accum.accum + extra_len;
2c6e3f55 2573 token->val.str.text = dest;
ed63c387 2574 for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
2c6e3f55 2575 {
ed63c387
NS
2576 size_t len = BUFF_FRONT (buf) - buf->base;
2577 memcpy (dest, buf->base, len);
2578 dest += len;
2c6e3f55 2579 }
ed63c387
NS
2580 _cpp_release_buff (pfile, accum.first);
2581 memcpy (dest, base, extra_len);
2582 dest[extra_len] = '\0';
2c6e3f55
JJ
2583 }
2584}
2585
bced6edf 2586/* Lexes a string, character constant, or angle-bracketed header file
6338b358 2587 name. The stored string contains the spelling, including opening
2c6e3f55
JJ
2588 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2589 'R' modifier. It returns the type of the literal, or CPP_OTHER
2590 if it was not properly terminated, or CPP_LESS for an unterminated
2591 header name which must be relexed as normal tokens.
6338b358
NB
2592
2593 The spelling is NUL-terminated, but it is not guaranteed that this
2594 is the first NUL since embedded NULs are preserved. */
041c3194 2595static void
6cf87ca4 2596lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
45b966db 2597{
6338b358
NB
2598 bool saw_NUL = false;
2599 const uchar *cur;
bced6edf 2600 cppchar_t terminator;
6338b358
NB
2601 enum cpp_ttype type;
2602
2603 cur = base;
2604 terminator = *cur++;
2c6e3f55 2605 if (terminator == 'L' || terminator == 'U')
6338b358 2606 terminator = *cur++;
2c6e3f55
JJ
2607 else if (terminator == 'u')
2608 {
2609 terminator = *cur++;
2610 if (terminator == '8')
2611 terminator = *cur++;
2612 }
2613 if (terminator == 'R')
2614 {
ed63c387 2615 lex_raw_string (pfile, token, base);
2c6e3f55
JJ
2616 return;
2617 }
2618 if (terminator == '"')
b6baa67d
KVH
2619 type = (*base == 'L' ? CPP_WSTRING :
2620 *base == 'U' ? CPP_STRING32 :
2c6e3f55
JJ
2621 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2622 : CPP_STRING);
6338b358 2623 else if (terminator == '\'')
b6baa67d
KVH
2624 type = (*base == 'L' ? CPP_WCHAR :
2625 *base == 'U' ? CPP_CHAR32 :
fe95b036
ESR
2626 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2627 : CPP_CHAR);
6338b358
NB
2628 else
2629 terminator = '>', type = CPP_HEADER_NAME;
93c80368 2630
51c50026 2631 const bool warn_bidi_p = pfile->warn_bidi_p ();
0d9f234d 2632 for (;;)
45b966db 2633 {
6338b358 2634 cppchar_t c = *cur++;
7868b4a2 2635
6f572ac2 2636 /* In #include-style directives, terminators are not escapable. */
6338b358 2637 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
51c50026
MP
2638 {
2639 if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
2640 {
bef32d4a
DM
2641 location_t loc;
2642 bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
2643 &loc);
2644 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
51c50026
MP
2645 }
2646 cur++;
2647 }
6338b358 2648 else if (c == terminator)
51c50026
MP
2649 {
2650 if (warn_bidi_p)
2651 maybe_warn_bidi_on_close (pfile, cur - 1);
2652 break;
2653 }
6338b358 2654 else if (c == '\n')
0d9f234d 2655 {
6338b358 2656 cur--;
4bb09c26
JM
2657 /* Unmatched quotes always yield undefined behavior, but
2658 greedy lexing means that what appears to be an unterminated
2659 header name may actually be a legitimate sequence of tokens. */
2660 if (terminator == '>')
2661 {
2662 token->type = CPP_LESS;
2663 return;
2664 }
6338b358
NB
2665 type = CPP_OTHER;
2666 break;
45b966db 2667 }
6338b358
NB
2668 else if (c == '\0')
2669 saw_NUL = true;
51c50026
MP
2670 else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
2671 {
bef32d4a
DM
2672 location_t loc;
2673 bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
2674 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
51c50026 2675 }
45b966db
ZW
2676 }
2677
6338b358 2678 if (saw_NUL && !pfile->state.skipping)
0527bc4e
JDA
2679 cpp_error (pfile, CPP_DL_WARNING,
2680 "null character(s) preserved in literal");
45b966db 2681
c663e301
JM
2682 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2683 cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2684 (int) terminator);
2685
3ce4f9e4
ESR
2686 if (CPP_OPTION (pfile, user_literals))
2687 {
c865f923
ESR
2688 /* If a string format macro, say from inttypes.h, is placed touching
2689 a string literal it could be parsed as a C++11 user-defined string
b44f8ad8
JW
2690 literal thus breaking the program. */
2691 if (is_macro_not_literal_suffix (pfile, cur))
7f5f5f98 2692 {
112448b4 2693 /* Raise a warning, but do not consume subsequent tokens. */
7aee8646 2694 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
7f5f5f98
OW
2695 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2696 token->src_loc, 0,
2697 "invalid suffix on literal; C++11 requires "
c865f923 2698 "a space between literal and string macro");
7f5f5f98 2699 }
3ce4f9e4 2700 /* Grab user defined literal suffix. */
561f7fc7 2701 else if (ISIDST (*cur))
3ce4f9e4
ESR
2702 {
2703 type = cpp_userdef_char_add_type (type);
2704 type = cpp_userdef_string_add_type (type);
2705 ++cur;
7f5f5f98
OW
2706
2707 while (ISIDNUM (*cur))
2708 ++cur;
3ce4f9e4 2709 }
3ce4f9e4 2710 }
fe191308
JM
2711 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2712 && is_macro (pfile, cur)
2713 && !pfile->state.skipping)
2714 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2715 token->src_loc, 0, "C++11 requires a space "
2716 "between string literal and macro");
3ce4f9e4 2717
6338b358
NB
2718 pfile->buffer->cur = cur;
2719 create_literal (pfile, token, base, cur - base, type);
0d9f234d 2720}
041c3194 2721
631d0d36
MG
2722/* Return the comment table. The client may not make any assumption
2723 about the ordering of the table. */
2724cpp_comment_table *
2725cpp_get_comments (cpp_reader *pfile)
2726{
2727 return &pfile->comments;
2728}
2729
2730/* Append a comment to the end of the comment table. */
2731static void
2732store_comment (cpp_reader *pfile, cpp_token *token)
2733{
2734 int len;
2735
2736 if (pfile->comments.allocated == 0)
2737 {
2738 pfile->comments.allocated = 256;
2739 pfile->comments.entries = (cpp_comment *) xmalloc
2740 (pfile->comments.allocated * sizeof (cpp_comment));
2741 }
2742
2743 if (pfile->comments.count == pfile->comments.allocated)
2744 {
2745 pfile->comments.allocated *= 2;
2746 pfile->comments.entries = (cpp_comment *) xrealloc
2747 (pfile->comments.entries,
2748 pfile->comments.allocated * sizeof (cpp_comment));
2749 }
2750
2751 len = token->val.str.len;
2752
2753 /* Copy comment. Note, token may not be NULL terminated. */
2754 pfile->comments.entries[pfile->comments.count].comment =
2755 (char *) xmalloc (sizeof (char) * (len + 1));
2756 memcpy (pfile->comments.entries[pfile->comments.count].comment,
2757 token->val.str.text, len);
2758 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2759
2760 /* Set source location. */
2761 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2762
2763 /* Increment the count of entries in the comment table. */
2764 pfile->comments.count++;
2765}
2766
93c80368 2767/* The stored comment includes the comment start and any terminator. */
9e62c811 2768static void
6cf87ca4
ZW
2769save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2770 cppchar_t type)
9e62c811 2771{
041c3194 2772 unsigned char *buffer;
651a20b5 2773 unsigned int len, clen, i;
df383483 2774
1c6d33ef 2775 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
480709cc 2776
3542203b
NB
2777 /* C++ comments probably (not definitely) have moved past a new
2778 line, which we don't want to save in the comment. */
480709cc 2779 if (is_vspace (pfile->buffer->cur[-1]))
3542203b 2780 len--;
477cdac7 2781
651a20b5
KT
2782 /* If we are currently in a directive or in argument parsing, then
2783 we need to store all C++ comments as C comments internally, and
2784 so we need to allocate a little extra space in that case.
477cdac7
JT
2785
2786 Note that the only time we encounter a directive here is
2787 when we are saving comments in a "#define". */
651a20b5
KT
2788 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2789 && type == '/') ? len + 2 : len;
477cdac7
JT
2790
2791 buffer = _cpp_unaligned_alloc (pfile, clen);
df383483 2792
041c3194 2793 token->type = CPP_COMMENT;
477cdac7 2794 token->val.str.len = clen;
0d9f234d 2795 token->val.str.text = buffer;
45b966db 2796
1c6d33ef
NB
2797 buffer[0] = '/';
2798 memcpy (buffer + 1, from, len - 1);
477cdac7 2799
1eeeb6a4 2800 /* Finish conversion to a C comment, if necessary. */
651a20b5 2801 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
477cdac7
JT
2802 {
2803 buffer[1] = '*';
2804 buffer[clen - 2] = '*';
2805 buffer[clen - 1] = '/';
651a20b5
KT
2806 /* As there can be in a C++ comments illegal sequences for C comments
2807 we need to filter them out. */
2808 for (i = 2; i < (clen - 2); i++)
2809 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2810 buffer[i] = '|';
477cdac7 2811 }
631d0d36
MG
2812
2813 /* Finally store this comment for use by clients of libcpp. */
2814 store_comment (pfile, token);
0d9f234d 2815}
45b966db 2816
81fea426
MP
2817/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2818 comment. */
2819
2820static bool
2821fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2822{
2823 const unsigned char *from = comment_start + 1;
70f6d5e1
JJ
2824
2825 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2826 {
2827 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2828 don't recognize any comments. The latter only checks attributes,
2829 the former doesn't warn. */
2830 case 0:
2831 default:
2832 return false;
2833 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2834 content it has. */
2835 case 1:
2836 return true;
2837 case 2:
2838 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2839 .*falls?[ \t-]*thr(u|ough).* regex. */
2840 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2841 from++)
2842 {
2843 /* Is there anything like strpbrk with upper boundary, or
2844 memchr looking for 2 characters rather than just one? */
2845 if (from[0] != 'f' && from[0] != 'F')
2846 continue;
2847 if (from[1] != 'a' && from[1] != 'A')
2848 continue;
2849 if (from[2] != 'l' && from[2] != 'L')
2850 continue;
2851 if (from[3] != 'l' && from[3] != 'L')
2852 continue;
2853 from += sizeof "fall" - 1;
2854 if (from[0] == 's' || from[0] == 'S')
2855 from++;
2856 while (*from == ' ' || *from == '\t' || *from == '-')
2857 from++;
2858 if (from[0] != 't' && from[0] != 'T')
2859 continue;
2860 if (from[1] != 'h' && from[1] != 'H')
2861 continue;
2862 if (from[2] != 'r' && from[2] != 'R')
2863 continue;
2864 if (from[3] == 'u' || from[3] == 'U')
2865 return true;
2866 if (from[3] != 'o' && from[3] != 'O')
2867 continue;
2868 if (from[4] != 'u' && from[4] != 'U')
2869 continue;
2870 if (from[5] != 'g' && from[5] != 'G')
2871 continue;
2872 if (from[6] != 'h' && from[6] != 'H')
2873 continue;
2874 return true;
2875 }
2876 return false;
2877 case 3:
2878 case 4:
2879 break;
2880 }
2881
81fea426
MP
2882 /* Whole comment contents:
2883 -fallthrough
2884 @fallthrough@
2885 */
2886 if (*from == '-' || *from == '@')
2887 {
2888 size_t len = sizeof "fallthrough" - 1;
2889 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2890 return false;
2891 if (memcmp (from + 1, "fallthrough", len))
2892 return false;
2893 if (*from == '@')
2894 {
2895 if (from[len + 1] != '@')
2896 return false;
2897 len++;
2898 }
2899 from += 1 + len;
2900 }
2901 /* Whole comment contents (regex):
70f6d5e1 2902 lint -fallthrough[ \t]*
81b02905
JJ
2903 */
2904 else if (*from == 'l')
2905 {
2906 size_t len = sizeof "int -fallthrough" - 1;
2907 if ((size_t) (pfile->buffer->cur - from - 1) < len)
2908 return false;
2909 if (memcmp (from + 1, "int -fallthrough", len))
70f6d5e1 2910 return false;
81b02905 2911 from += 1 + len;
70f6d5e1
JJ
2912 while (*from == ' ' || *from == '\t')
2913 from++;
2914 }
2915 /* Whole comment contents (regex):
2916 [ \t]*FALLTHR(U|OUGH)[ \t]*
2917 */
2918 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2919 {
2920 while (*from == ' ' || *from == '\t')
2921 from++;
2922 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
2923 return false;
2924 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2925 return false;
2926 from += sizeof "FALLTHR" - 1;
2927 if (*from == 'U')
2928 from++;
2929 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
2930 return false;
2931 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2932 return false;
2933 else
2934 from += sizeof "OUGH" - 1;
2935 while (*from == ' ' || *from == '\t')
2936 from++;
81b02905
JJ
2937 }
2938 /* Whole comment contents (regex):
ee19ef45
JJ
2939 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2940 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2941 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
81fea426
MP
2942 */
2943 else
2944 {
81b02905 2945 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
81fea426
MP
2946 from++;
2947 unsigned char f = *from;
81b02905
JJ
2948 bool all_upper = false;
2949 if (f == 'E' || f == 'e')
70f6d5e1
JJ
2950 {
2951 if ((size_t) (pfile->buffer->cur - from)
81b02905
JJ
2952 < sizeof "else fallthru" - 1)
2953 return false;
ee19ef45 2954 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
81b02905 2955 all_upper = true;
ee19ef45
JJ
2956 else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2957 return false;
2958 from += sizeof "else" - 1;
2959 if (*from == ',')
2960 from++;
70f6d5e1 2961 if (*from != ' ')
ee19ef45
JJ
2962 return false;
2963 from++;
2964 if (all_upper && *from == 'f')
81b02905 2965 return false;
81b02905
JJ
2966 if (f == 'e' && *from == 'F')
2967 return false;
2968 f = *from;
70f6d5e1 2969 }
81b02905 2970 else if (f == 'I' || f == 'i')
70f6d5e1
JJ
2971 {
2972 if ((size_t) (pfile->buffer->cur - from)
81b02905
JJ
2973 < sizeof "intentional fallthru" - 1)
2974 return false;
2975 if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2976 sizeof "NTENTIONAL" - 1) == 0)
2977 all_upper = true;
2978 else if (memcmp (from + 1, "ntentional",
2979 sizeof "ntentional" - 1))
2980 return false;
2981 from += sizeof "intentional" - 1;
2982 if (*from == ' ')
2983 {
2984 from++;
2985 if (all_upper && *from == 'f')
2986 return false;
2987 }
2988 else if (all_upper)
2989 {
2990 if (memcmp (from, "LY F", sizeof "LY F" - 1))
2991 return false;
2992 from += sizeof "LY " - 1;
2993 }
2994 else
2995 {
2996 if (memcmp (from, "ly ", sizeof "ly " - 1))
2997 return false;
2998 from += sizeof "ly " - 1;
2999 }
3000 if (f == 'i' && *from == 'F')
3001 return false;
3002 f = *from;
70f6d5e1 3003 }
81fea426
MP
3004 if (f != 'F' && f != 'f')
3005 return false;
7bad794a 3006 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
81fea426 3007 return false;
81fea426
MP
3008 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
3009 all_upper = true;
81b02905
JJ
3010 else if (all_upper)
3011 return false;
81fea426
MP
3012 else if (memcmp (from + 1, "all", sizeof "all" - 1))
3013 return false;
7bad794a
JJ
3014 from += sizeof "fall" - 1;
3015 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3016 from += 2;
3017 else if (*from == ' ' || *from == '-')
3018 from++;
3019 else if (*from != (all_upper ? 'T' : 't'))
81fea426 3020 return false;
81fea426
MP
3021 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3022 return false;
7bad794a 3023 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
81fea426
MP
3024 return false;
3025 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
3026 {
7bad794a 3027 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
81fea426
MP
3028 return false;
3029 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
3030 sizeof "hrough" - 1))
3031 return false;
3032 from += sizeof "through" - 1;
3033 }
3034 else
3035 from += sizeof "thru" - 1;
81b02905 3036 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
81fea426 3037 from++;
81b02905
JJ
3038 if (*from == '-')
3039 {
3040 from++;
3041 if (*comment_start == '*')
3042 {
3043 do
3044 {
3045 while (*from && *from != '*'
3046 && *from != '\n' && *from != '\r')
3047 from++;
3048 if (*from != '*' || from[1] == '/')
3049 break;
3050 from++;
3051 }
3052 while (1);
3053 }
3054 else
3055 while (*from && *from != '\n' && *from != '\r')
3056 from++;
3057 }
81fea426
MP
3058 }
3059 /* C block comment. */
3060 if (*comment_start == '*')
3061 {
3062 if (*from != '*' || from[1] != '/')
3063 return false;
3064 }
3065 /* C++ line comment. */
3066 else if (*from != '\n')
3067 return false;
3068
3069 return true;
3070}
3071
5fddcffc
NB
3072/* Allocate COUNT tokens for RUN. */
3073void
6cf87ca4 3074_cpp_init_tokenrun (tokenrun *run, unsigned int count)
5fddcffc 3075{
72bb2c39 3076 run->base = XNEWVEC (cpp_token, count);
5fddcffc
NB
3077 run->limit = run->base + count;
3078 run->next = NULL;
3079}
3080
3081/* Returns the next tokenrun, or creates one if there is none. */
3082static tokenrun *
6cf87ca4 3083next_tokenrun (tokenrun *run)
5fddcffc
NB
3084{
3085 if (run->next == NULL)
3086 {
72bb2c39 3087 run->next = XNEW (tokenrun);
bdcbe496 3088 run->next->prev = run;
5fddcffc
NB
3089 _cpp_init_tokenrun (run->next, 250);
3090 }
3091
3092 return run->next;
3093}
3094
ad2305ad 3095/* Return the number of not yet processed token in a given
92582b75
TT
3096 context. */
3097int
ad2305ad 3098_cpp_remaining_tokens_num_in_context (cpp_context *context)
92582b75 3099{
92582b75 3100 if (context->tokens_kind == TOKENS_KIND_DIRECT)
cbbcf655 3101 return (LAST (context).token - FIRST (context).token);
92582b75
TT
3102 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3103 || context->tokens_kind == TOKENS_KIND_EXTENDED)
cbbcf655 3104 return (LAST (context).ptoken - FIRST (context).ptoken);
92582b75
TT
3105 else
3106 abort ();
3107}
3108
ad2305ad
DS
3109/* Returns the token present at index INDEX in a given context. If
3110 INDEX is zero, the next token to be processed is returned. */
92582b75 3111static const cpp_token*
ad2305ad 3112_cpp_token_from_context_at (cpp_context *context, int index)
92582b75 3113{
92582b75
TT
3114 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3115 return &(FIRST (context).token[index]);
3116 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3117 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3118 return FIRST (context).ptoken[index];
3119 else
3120 abort ();
3121}
3122
5950c3c9
BE
3123/* Look ahead in the input stream. */
3124const cpp_token *
3125cpp_peek_token (cpp_reader *pfile, int index)
3126{
3127 cpp_context *context = pfile->context;
3128 const cpp_token *peektok;
3129 int count;
3130
3131 /* First, scan through any pending cpp_context objects. */
3132 while (context->prev)
3133 {
ad2305ad 3134 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
5950c3c9
BE
3135
3136 if (index < (int) sz)
ad2305ad 3137 return _cpp_token_from_context_at (context, index);
5950c3c9
BE
3138 index -= (int) sz;
3139 context = context->prev;
3140 }
3141
3142 /* We will have to read some new tokens after all (and do so
3143 without invalidating preceding tokens). */
3144 count = index;
3145 pfile->keep_tokens++;
3146
b8cd77f4
JJ
3147 /* For peeked tokens temporarily disable line_change reporting,
3148 until the tokens are parsed for real. */
3149 void (*line_change) (cpp_reader *, const cpp_token *, int)
3150 = pfile->cb.line_change;
3151 pfile->cb.line_change = NULL;
3152
5950c3c9
BE
3153 do
3154 {
3155 peektok = _cpp_lex_token (pfile);
3156 if (peektok->type == CPP_EOF)
e4b33ee5
JJ
3157 {
3158 index--;
3159 break;
3160 }
8bd9a00f
NS
3161 else if (peektok->type == CPP_PRAGMA)
3162 {
3163 /* Don't peek past a pragma. */
3164 if (peektok == &pfile->directive_result)
3165 /* Save the pragma in the buffer. */
3166 *pfile->cur_token++ = *peektok;
3167 index--;
3168 break;
3169 }
5950c3c9
BE
3170 }
3171 while (index--);
3172
e4b33ee5 3173 _cpp_backup_tokens_direct (pfile, count - index);
5950c3c9 3174 pfile->keep_tokens--;
b8cd77f4 3175 pfile->cb.line_change = line_change;
5950c3c9
BE
3176
3177 return peektok;
3178}
3179
4ed5bcfb
NB
3180/* Allocate a single token that is invalidated at the same time as the
3181 rest of the tokens on the line. Has its line and col set to the
3182 same as the last lexed token, so that diagnostics appear in the
3183 right place. */
3184cpp_token *
6cf87ca4 3185_cpp_temp_token (cpp_reader *pfile)
4ed5bcfb
NB
3186{
3187 cpp_token *old, *result;
5950c3c9
BE
3188 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3189 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
4ed5bcfb
NB
3190
3191 old = pfile->cur_token - 1;
5950c3c9
BE
3192 /* Any pre-existing lookaheads must not be clobbered. */
3193 if (la)
3194 {
3195 if (sz <= la)
3196 {
3197 tokenrun *next = next_tokenrun (pfile->cur_run);
3198
3199 if (sz < la)
3200 memmove (next->base + 1, next->base,
3201 (la - sz) * sizeof (cpp_token));
3202
3203 next->base[0] = pfile->cur_run->limit[-1];
3204 }
3205
3206 if (sz > 1)
3207 memmove (pfile->cur_token + 1, pfile->cur_token,
3208 MIN (la, sz - 1) * sizeof (cpp_token));
3209 }
3210
3211 if (!sz && pfile->cur_token == pfile->cur_run->limit)
4ed5bcfb
NB
3212 {
3213 pfile->cur_run = next_tokenrun (pfile->cur_run);
3214 pfile->cur_token = pfile->cur_run->base;
3215 }
3216
3217 result = pfile->cur_token++;
12f9df4e 3218 result->src_loc = old->src_loc;
4ed5bcfb
NB
3219 return result;
3220}
3221
c9c3d5f2
NS
3222/* We're at the beginning of a logical line (so not in
3223 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3224 if we should enter deferred_pragma mode to tokenize the rest of the
3225 line as a module control-line. */
3226
3227static void
3228cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3229{
3230 unsigned backup = 0; /* Tokens we peeked. */
3231 cpp_hashnode *node = result->val.node.node;
3232 cpp_token *peek = result;
3233 cpp_token *keyword = peek;
3234 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3235 int header_count = 0;
3236
3237 /* Make sure the incoming state is as we expect it. This way we
3238 can restore it using constants. */
3239 gcc_checking_assert (!pfile->state.in_deferred_pragma
3240 && !pfile->state.skipping
3241 && !pfile->state.parsing_args
3242 && !pfile->state.angled_headers
3243 && (pfile->state.save_comments
3244 == !CPP_OPTION (pfile, discard_comments)));
3245
3246 /* Enter directives mode sufficiently for peeking. We don't have
3247 to actually set in_directive. */
3248 pfile->state.in_deferred_pragma = true;
3249
3250 /* These two fields are needed to process tokenization in deferred
3251 pragma mode. They are not used outside deferred pragma mode or
3252 directives mode. */
3253 pfile->state.pragma_allow_expansion = true;
3254 pfile->directive_line = result->src_loc;
3255
3256 /* Saving comments is incompatible with directives mode. */
3257 pfile->state.save_comments = 0;
3258
3259 if (node == n_modules[spec_nodes::M_EXPORT][0])
3260 {
3261 peek = _cpp_lex_direct (pfile);
3262 keyword = peek;
3263 backup++;
3264 if (keyword->type != CPP_NAME)
3265 goto not_module;
3266 node = keyword->val.node.node;
3267 if (!(node->flags & NODE_MODULE))
3268 goto not_module;
3269 }
3270
3271 if (node == n_modules[spec_nodes::M__IMPORT][0])
3272 /* __import */
3273 header_count = backup + 2 + 16;
3274 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3275 /* import */
3276 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3277 else if (node == n_modules[spec_nodes::M_MODULE][0])
3278 ; /* module */
3279 else
3280 goto not_module;
3281
3282 /* We've seen [export] {module|import|__import}. Check the next token. */
3283 if (header_count)
3284 /* After '{,__}import' a header name may appear. */
3285 pfile->state.angled_headers = true;
3286 peek = _cpp_lex_direct (pfile);
3287 backup++;
3288
3289 /* ... import followed by identifier, ':', '<' or
3290 header-name preprocessing tokens, or module
3291 followed by cpp-identifier, ':' or ';' preprocessing
3292 tokens. C++ keywords are not yet relevant. */
3293 if (peek->type == CPP_NAME
3294 || peek->type == CPP_COLON
3295 || (header_count
3296 ? (peek->type == CPP_LESS
3297 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3298 || peek->type == CPP_HEADER_NAME)
3299 : peek->type == CPP_SEMICOLON))
3300 {
3301 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3302 if (!pfile->state.pragma_allow_expansion)
3303 pfile->state.prevent_expansion++;
3304
3305 if (!header_count && linemap_included_from
3306 (LINEMAPS_LAST_ORDINARY_MAP (pfile->line_table)))
3307 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3308 "module control-line cannot be in included file");
3309
3310 /* The first one or two tokens cannot be macro names. */
3311 for (int ix = backup; ix--;)
3312 {
3313 cpp_token *tok = ix ? keyword : result;
3314 cpp_hashnode *node = tok->val.node.node;
3315
3316 /* Don't attempt to expand the token. */
3317 tok->flags |= NO_EXPAND;
3318 if (_cpp_defined_macro_p (node)
13f93cf5 3319 && _cpp_maybe_notify_macro_use (pfile, node, tok->src_loc)
c9c3d5f2
NS
3320 && !cpp_fun_like_macro_p (node))
3321 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3322 "module control-line \"%s\" cannot be"
3323 " an object-like macro",
3324 NODE_NAME (node));
3325 }
3326
3327 /* Map to underbar variants. */
3328 keyword->val.node.node = n_modules[header_count
3329 ? spec_nodes::M_IMPORT
3330 : spec_nodes::M_MODULE][1];
3331 if (backup != 1)
3332 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3333
3334 /* Maybe tell the tokenizer we expect a header-name down the
3335 road. */
3336 pfile->state.directive_file_token = header_count;
3337 }
3338 else
3339 {
3340 not_module:
3341 /* Drop out of directive mode. */
3342 /* We aaserted save_comments had this value upon entry. */
3343 pfile->state.save_comments
3344 = !CPP_OPTION (pfile, discard_comments);
3345 pfile->state.in_deferred_pragma = false;
3346 /* Do not let this remain on. */
3347 pfile->state.angled_headers = false;
3348 }
3349
3350 /* In either case we want to backup the peeked tokens. */
3351 if (backup)
3352 {
3353 /* If we saw EOL, we should drop it, because this isn't a module
3354 control-line after all. */
3355 bool eol = peek->type == CPP_PRAGMA_EOL;
3356 if (!eol || backup > 1)
3357 {
3358 /* Put put the peeked tokens back */
3359 _cpp_backup_tokens_direct (pfile, backup);
3360 /* But if the last one was an EOL, forget it. */
3361 if (eol)
3362 pfile->lookaheads--;
3363 }
3364 }
3365}
3366
14baae01
NB
3367/* Lex a token into RESULT (external interface). Takes care of issues
3368 like directive handling, token lookahead, multiple include
a1f300c0 3369 optimization and skipping. */
345894b4 3370const cpp_token *
6cf87ca4 3371_cpp_lex_token (cpp_reader *pfile)
5fddcffc 3372{
bdcbe496 3373 cpp_token *result;
5fddcffc 3374
bdcbe496 3375 for (;;)
5fddcffc 3376 {
bdcbe496 3377 if (pfile->cur_token == pfile->cur_run->limit)
5fddcffc 3378 {
bdcbe496
NB
3379 pfile->cur_run = next_tokenrun (pfile->cur_run);
3380 pfile->cur_token = pfile->cur_run->base;
5fddcffc 3381 }
ee380365
TT
3382 /* We assume that the current token is somewhere in the current
3383 run. */
3384 if (pfile->cur_token < pfile->cur_run->base
3385 || pfile->cur_token >= pfile->cur_run->limit)
3386 abort ();
5fddcffc 3387
bdcbe496 3388 if (pfile->lookaheads)
14baae01
NB
3389 {
3390 pfile->lookaheads--;
3391 result = pfile->cur_token++;
3392 }
bdcbe496 3393 else
14baae01 3394 result = _cpp_lex_direct (pfile);
bdcbe496
NB
3395
3396 if (result->flags & BOL)
5fddcffc 3397 {
bdcbe496
NB
3398 /* Is this a directive. If _cpp_handle_directive returns
3399 false, it is an assembler #. */
3400 if (result->type == CPP_HASH
e808ec9c
NB
3401 /* 6.10.3 p 11: Directives in a list of macro arguments
3402 gives undefined behavior. This implementation
3403 handles the directive as normal. */
bc4071dd 3404 && pfile->state.parsing_args != 1)
21b11495 3405 {
bc4071dd 3406 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
21b11495 3407 {
bc4071dd
RH
3408 if (pfile->directive_result.type == CPP_PADDING)
3409 continue;
21b11495 3410 result = &pfile->directive_result;
21b11495
ZW
3411 }
3412 }
bc4071dd
RH
3413 else if (pfile->state.in_deferred_pragma)
3414 result = &pfile->directive_result;
c9c3d5f2
NS
3415 else if (result->type == CPP_NAME
3416 && (result->val.node.node->flags & NODE_MODULE)
3417 && !pfile->state.skipping
3418 /* Unlike regular directives, we do not deal with
3419 tokenizing module directives as macro arguments.
3420 That's not permitted. */
3421 && !pfile->state.parsing_args)
3422 {
3423 /* P1857. Before macro expansion, At start of logical
3424 line ... */
3425 /* We don't have to consider lookaheads at this point. */
3426 gcc_checking_assert (!pfile->lookaheads);
3427
3428 cpp_maybe_module_directive (pfile, result);
3429 }
21b11495 3430
97293897 3431 if (pfile->cb.line_change && !pfile->state.skipping)
6cf87ca4 3432 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
5fddcffc 3433 }
5fddcffc 3434
bdcbe496 3435 /* We don't skip tokens in directives. */
bc4071dd 3436 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
bdcbe496 3437 break;
5fddcffc 3438
bdcbe496 3439 /* Outside a directive, invalidate controlling macros. At file
14baae01 3440 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
6356f892 3441 get here and MI optimization works. */
5fddcffc 3442 pfile->mi_valid = false;
bdcbe496
NB
3443
3444 if (!pfile->state.skipping || result->type == CPP_EOF)
3445 break;
5fddcffc
NB
3446 }
3447
345894b4 3448 return result;
5fddcffc
NB
3449}
3450
26aea073
NB
3451/* Returns true if a fresh line has been loaded. */
3452bool
6cf87ca4 3453_cpp_get_fresh_line (cpp_reader *pfile)
004cb263 3454{
26aea073
NB
3455 /* We can't get a new line until we leave the current directive. */
3456 if (pfile->state.in_directive)
3457 return false;
df383483 3458
26aea073 3459 for (;;)
1a76916c 3460 {
26aea073 3461 cpp_buffer *buffer = pfile->buffer;
1a76916c 3462
26aea073
NB
3463 if (!buffer->need_line)
3464 return true;
3465
3466 if (buffer->next_line < buffer->rlimit)
004cb263 3467 {
26aea073
NB
3468 _cpp_clean_line (pfile);
3469 return true;
3470 }
004cb263 3471
26aea073
NB
3472 /* First, get out of parsing arguments state. */
3473 if (pfile->state.parsing_args)
3474 return false;
3475
3476 /* End of buffer. Non-empty files should end in a newline. */
3477 if (buffer->buf != buffer->rlimit
3478 && buffer->next_line > buffer->rlimit
3479 && !buffer->from_stage3)
3480 {
ed0e74e0 3481 /* Clip to buffer size. */
26aea073 3482 buffer->next_line = buffer->rlimit;
26aea073 3483 }
22234f56 3484
2a0225e4
NS
3485 if (buffer->prev && !buffer->return_at_eof)
3486 _cpp_pop_buffer (pfile);
3487 else
3488 {
3489 /* End of translation. Do not pop the buffer yet. Increment
3490 line number so that the EOF token is on a line of its own
3491 (_cpp_lex_direct doesn't increment in that case, because
3492 it's hard for it to distinguish this special case). */
3493 CPP_INCREMENT_LINE (pfile, 0);
3494 return false;
3495 }
26aea073 3496 }
004cb263
NB
3497}
3498
6f572ac2
NB
/* Helper for lexing two-character operators.  Expects `result' (the
   token being built) and `buffer' (the buffer being read) to be in
   scope at the point of use.  Sets result->type to ELSE_TYPE; if the
   next character in the buffer equals CHAR, consumes it and sets
   result->type to THEN_TYPE instead.  All parameters are
   parenthesized so that arbitrary expressions, for instance a
   conditional expression, can be passed safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)	\
  do						\
    {						\
      result->type = (ELSE_TYPE);		\
      if (*buffer->cur == (CHAR))		\
        {					\
          buffer->cur++;			\
          result->type = (THEN_TYPE);		\
        }					\
    }						\
  while (0)
480709cc 3507
14baae01
NB
3508/* Lex a token into pfile->cur_token, which is also incremented, to
3509 get diagnostics pointing to the correct location.
3510
3511 Does not handle issues such as token lookahead, multiple-include
f1ba665b 3512 optimization, directives, skipping etc. This function is only
14baae01
NB
3513 suitable for use by _cpp_lex_token, and in special cases like
3514 lex_expansion_token which doesn't care for any of these issues.
3515
3516 When meeting a newline, returns CPP_EOF if parsing a directive,
3517 otherwise returns to the start of the token buffer if permissible.
3518 Returns the location of the lexed token. */
3519cpp_token *
6cf87ca4 3520_cpp_lex_direct (cpp_reader *pfile)
45b966db 3521{
0d9f234d 3522 cppchar_t c;
adb84b42 3523 cpp_buffer *buffer;
7bad794a
JJ
3524 const unsigned char *comment_start;
3525 bool fallthrough_comment = false;
14baae01 3526 cpp_token *result = pfile->cur_token++;
9ec7291f 3527
5fddcffc 3528 fresh_line:
26aea073 3529 result->flags = 0;
2be570f9 3530 buffer = pfile->buffer;
a506c55c 3531 if (buffer->need_line)
26aea073 3532 {
55dfce4d
JJ
3533 if (pfile->state.in_deferred_pragma)
3534 {
3535 /* This can happen in cases like:
3536 #define loop(x) whatever
3537 #pragma omp loop
3538 where when trying to expand loop we need to peek
3539 next token after loop, but aren't still in_deferred_pragma
3540 mode but are in in_directive mode, so buffer->need_line
3541 is set, a CPP_EOF is peeked. */
3542 result->type = CPP_PRAGMA_EOL;
3543 pfile->state.in_deferred_pragma = false;
3544 if (!pfile->state.pragma_allow_expansion)
3545 pfile->state.prevent_expansion--;
3546 return result;
3547 }
26aea073
NB
3548 if (!_cpp_get_fresh_line (pfile))
3549 {
3550 result->type = CPP_EOF;
dbcc6b15
NS
3551 /* Not a real EOF in a directive or arg parsing -- we refuse
3552 to advance to the next file now, and will once we're out
3553 of those modes. */
3554 if (!pfile->state.in_directive && !pfile->state.parsing_args)
9ff7868d
NB
3555 {
3556 /* Tell the compiler the line number of the EOF token. */
500bee0a 3557 result->src_loc = pfile->line_table->highest_line;
9ff7868d 3558 result->flags = BOL;
2a0225e4
NS
3559 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3560 _cpp_pop_buffer (pfile);
9ff7868d 3561 }
26aea073
NB
3562 return result;
3563 }
81fea426 3564 if (buffer != pfile->buffer)
7bad794a 3565 fallthrough_comment = false;
26aea073
NB
3566 if (!pfile->keep_tokens)
3567 {
3568 pfile->cur_run = &pfile->base_run;
3569 result = pfile->base_run.base;
3570 pfile->cur_token = result + 1;
3571 }
3572 result->flags = BOL;
3573 if (pfile->state.parsing_args == 2)
3574 result->flags |= PREV_WHITE;
3575 }
a506c55c 3576 buffer = pfile->buffer;
5fddcffc 3577 update_tokens_line:
500bee0a 3578 result->src_loc = pfile->line_table->highest_line;
041c3194 3579
5fddcffc 3580 skipped_white:
26aea073
NB
3581 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3582 && !pfile->overlaid_buffer)
3583 {
3584 _cpp_process_line_notes (pfile, false);
500bee0a 3585 result->src_loc = pfile->line_table->highest_line;
26aea073 3586 }
480709cc 3587 c = *buffer->cur++;
12f9df4e 3588
f3f6029d
NS
3589 if (pfile->forced_token_location)
3590 result->src_loc = pfile->forced_token_location;
e3dfef44
GC
3591 else
3592 result->src_loc = linemap_position_for_column (pfile->line_table,
3593 CPP_BUF_COLUMN (buffer, buffer->cur));
5fddcffc 3594
0d9f234d 3595 switch (c)
45b966db 3596 {
4d6baafa
NB
3597 case ' ': case '\t': case '\f': case '\v': case '\0':
3598 result->flags |= PREV_WHITE;
26aea073
NB
3599 skip_whitespace (pfile, c);
3600 goto skipped_white;
0d9f234d 3601
26aea073 3602 case '\n':
056f95ec
NS
3603 /* Increment the line, unless this is the last line ... */
3604 if (buffer->cur < buffer->rlimit
3605 /* ... or this is a #include, (where _cpp_stack_file needs to
3606 unwind by one line) ... */
3607 || (pfile->state.in_directive > 1
3608 /* ... except traditional-cpp increments this elsewhere. */
3609 && !CPP_OPTION (pfile, traditional)))
12f9df4e 3610 CPP_INCREMENT_LINE (pfile, 0);
26aea073 3611 buffer->need_line = true;
8bd9a00f
NS
3612 if (pfile->state.in_deferred_pragma)
3613 {
3614 /* Produce the PRAGMA_EOL on this line. File reading
3615 ensures there is always a \n at end of the buffer, thus
3616 in a deferred pragma we always see CPP_PRAGMA_EOL before
3617 any CPP_EOF. */
3618 result->type = CPP_PRAGMA_EOL;
3619 result->flags &= ~PREV_WHITE;
3620 pfile->state.in_deferred_pragma = false;
3621 if (!pfile->state.pragma_allow_expansion)
3622 pfile->state.prevent_expansion--;
3623 return result;
3624 }
26aea073 3625 goto fresh_line;
46d07497 3626
0d9f234d
NB
3627 case '0': case '1': case '2': case '3': case '4':
3628 case '5': case '6': case '7': case '8': case '9':
50668cf6
GK
3629 {
3630 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3631 result->type = CPP_NUMBER;
3632 lex_number (pfile, &result->val.str, &nst);
3633 warn_about_normalization (pfile, result, &nst);
3634 break;
3635 }
46d07497 3636
0abc6a6a 3637 case 'L':
b6baa67d
KVH
3638 case 'u':
3639 case 'U':
2c6e3f55
JJ
3640 case 'R':
3641 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
3642 wide strings or raw strings. */
a48e3dd1
JM
3643 if (c == 'L' || CPP_OPTION (pfile, rliterals)
3644 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
bced6edf 3645 {
2c6e3f55
JJ
3646 if ((*buffer->cur == '\'' && c != 'R')
3647 || *buffer->cur == '"'
3648 || (*buffer->cur == 'R'
3649 && c != 'R'
3650 && buffer->cur[1] == '"'
a48e3dd1 3651 && CPP_OPTION (pfile, rliterals))
2c6e3f55
JJ
3652 || (*buffer->cur == '8'
3653 && c == 'u'
fe95b036
ESR
3654 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
3655 && CPP_OPTION (pfile, utf8_char_literals)))
a48e3dd1
JM
3656 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
3657 && CPP_OPTION (pfile, rliterals)))))
b6baa67d
KVH
3658 {
3659 lex_string (pfile, result, buffer->cur - 1);
3660 break;
3661 }
bced6edf 3662 }
df383483 3663 /* Fall through. */
0abc6a6a 3664
0d9f234d
NB
3665 case '_':
3666 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
3667 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
3668 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
b6baa67d 3669 case 's': case 't': case 'v': case 'w': case 'x':
0d9f234d
NB
3670 case 'y': case 'z':
3671 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
0abc6a6a 3672 case 'G': case 'H': case 'I': case 'J': case 'K':
2c6e3f55 3673 case 'M': case 'N': case 'O': case 'P': case 'Q':
b6baa67d 3674 case 'S': case 'T': case 'V': case 'W': case 'X':
0d9f234d
NB
3675 case 'Y': case 'Z':
3676 result->type = CPP_NAME;
50668cf6
GK
3677 {
3678 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
9a0c6187 3679 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
be5ffc59
JM
3680 &nst,
3681 &result->val.node.spelling);
50668cf6
GK
3682 warn_about_normalization (pfile, result, &nst);
3683 }
0d9f234d 3684
0d9f234d 3685 /* Convert named operators to their proper types. */
9a0c6187 3686 if (result->val.node.node->flags & NODE_OPERATOR)
0d9f234d
NB
3687 {
3688 result->flags |= NAMED_OP;
9a0c6187 3689 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
0d9f234d 3690 }
81fea426
MP
3691
3692 /* Signal FALLTHROUGH comment followed by another token. */
7bad794a 3693 if (fallthrough_comment)
81fea426 3694 result->flags |= PREV_FALLTHROUGH;
0d9f234d
NB
3695 break;
3696
3697 case '\'':
3698 case '"':
6338b358 3699 lex_string (pfile, result, buffer->cur - 1);
0d9f234d 3700 break;
041c3194 3701
0d9f234d 3702 case '/':
1c6d33ef
NB
3703 /* A potential block or line comment. */
3704 comment_start = buffer->cur;
6f572ac2
NB
3705 c = *buffer->cur;
3706
1c6d33ef
NB
3707 if (c == '*')
3708 {
26aea073 3709 if (_cpp_skip_block_comment (pfile))
0527bc4e 3710 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
0d9f234d 3711 }
909eb89c 3712 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
0d9f234d 3713 {
909eb89c 3714 /* Don't warn for system headers. */
bf425849 3715 if (_cpp_in_system_header (pfile))
909eb89c 3716 ;
f3bede71 3717 /* Warn about comments if pedantically GNUC89, and not
bdb05a7b 3718 in system headers. */
909eb89c
MP
3719 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
3720 && CPP_PEDANTIC (pfile)
3721 && ! buffer->warned_cplusplus_comments)
041c3194 3722 {
0c86a39d
JJ
3723 if (cpp_error (pfile, CPP_DL_PEDWARN,
3724 "C++ style comments are not allowed in ISO C90"))
3725 cpp_error (pfile, CPP_DL_NOTE,
3726 "(this will be reported only once per input file)");
1c6d33ef
NB
3727 buffer->warned_cplusplus_comments = 1;
3728 }
f3bede71 3729 /* Or if specifically desired via -Wc90-c99-compat. */
177cce46 3730 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
dd3ff077 3731 && ! CPP_OPTION (pfile, cplusplus)
f3bede71
MP
3732 && ! buffer->warned_cplusplus_comments)
3733 {
0c86a39d
JJ
3734 if (cpp_error (pfile, CPP_DL_WARNING,
3735 "C++ style comments are incompatible with C90"))
3736 cpp_error (pfile, CPP_DL_NOTE,
3737 "(this will be reported only once per input file)");
f3bede71
MP
3738 buffer->warned_cplusplus_comments = 1;
3739 }
909eb89c
MP
3740 /* In C89/C94, C++ style comments are forbidden. */
3741 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
3742 || CPP_OPTION (pfile, lang) == CLK_STDC94))
3743 {
3744 /* But don't be confused about valid code such as
3745 - // immediately followed by *,
3746 - // in a preprocessing directive,
3747 - // in an #if 0 block. */
3748 if (buffer->cur[1] == '*'
3749 || pfile->state.in_directive
3750 || pfile->state.skipping)
3751 {
3752 result->type = CPP_DIV;
3753 break;
3754 }
3755 else if (! buffer->warned_cplusplus_comments)
3756 {
0c86a39d
JJ
3757 if (cpp_error (pfile, CPP_DL_ERROR,
3758 "C++ style comments are not allowed in "
3759 "ISO C90"))
3760 cpp_error (pfile, CPP_DL_NOTE,
3761 "(this will be reported only once per input "
3762 "file)");
909eb89c
MP
3763 buffer->warned_cplusplus_comments = 1;
3764 }
3765 }
01ef6563 3766 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
87cf0651 3767 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
1c6d33ef 3768 }
480709cc
NB
3769 else if (c == '=')
3770 {
6f572ac2 3771 buffer->cur++;
480709cc
NB
3772 result->type = CPP_DIV_EQ;
3773 break;
3774 }
3775 else
3776 {
480709cc
NB
3777 result->type = CPP_DIV;
3778 break;
3779 }
0d9f234d 3780
7bad794a
JJ
3781 if (fallthrough_comment_p (pfile, comment_start))
3782 fallthrough_comment = true;
3783
05945a1b
DM
3784 if (pfile->cb.comment)
3785 {
3786 size_t len = pfile->buffer->cur - comment_start;
3787 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
3788 len + 1);
3789 }
3790
1c6d33ef
NB
3791 if (!pfile->state.save_comments)
3792 {
3793 result->flags |= PREV_WHITE;
5fddcffc 3794 goto update_tokens_line;
0d9f234d 3795 }
1c6d33ef 3796
7bad794a 3797 if (fallthrough_comment)
81fea426
MP
3798 result->flags |= PREV_FALLTHROUGH;
3799
1c6d33ef 3800 /* Save the comment as a token in its own right. */
477cdac7 3801 save_comment (pfile, result, comment_start, c);
bdcbe496 3802 break;
0d9f234d
NB
3803
3804 case '<':
3805 if (pfile->state.angled_headers)
3806 {
6338b358 3807 lex_string (pfile, result, buffer->cur - 1);
4bb09c26
JM
3808 if (result->type != CPP_LESS)
3809 break;
0d9f234d 3810 }
45b966db 3811
6f572ac2
NB
3812 result->type = CPP_LESS;
3813 if (*buffer->cur == '=')
b7689b96
JM
3814 {
3815 buffer->cur++, result->type = CPP_LESS_EQ;
3816 if (*buffer->cur == '>'
3817 && CPP_OPTION (pfile, cplusplus)
b04445d4 3818 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
b7689b96
JM
3819 buffer->cur++, result->type = CPP_SPACESHIP;
3820 }
6f572ac2 3821 else if (*buffer->cur == '<')
0d9f234d 3822 {
6f572ac2
NB
3823 buffer->cur++;
3824 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
0d9f234d 3825 }
6f572ac2 3826 else if (CPP_OPTION (pfile, digraphs))
480709cc 3827 {
6f572ac2
NB
3828 if (*buffer->cur == ':')
3829 {
1582c677
PC
3830 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
3831 three characters are <:: and the subsequent character
3832 is neither : nor >, the < is treated as a preprocessor
3833 token by itself". */
3834 if (CPP_OPTION (pfile, cplusplus)
61949153
PC
3835 && CPP_OPTION (pfile, lang) != CLK_CXX98
3836 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
1582c677
PC
3837 && buffer->cur[1] == ':'
3838 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
3839 break;
3840
6f572ac2
NB
3841 buffer->cur++;
3842 result->flags |= DIGRAPH;
3843 result->type = CPP_OPEN_SQUARE;
3844 }
3845 else if (*buffer->cur == '%')
3846 {
3847 buffer->cur++;
3848 result->flags |= DIGRAPH;
3849 result->type = CPP_OPEN_BRACE;
3850 }
480709cc 3851 }
0d9f234d
NB
3852 break;
3853
3854 case '>':
6f572ac2
NB
3855 result->type = CPP_GREATER;
3856 if (*buffer->cur == '=')
3857 buffer->cur++, result->type = CPP_GREATER_EQ;
3858 else if (*buffer->cur == '>')
0d9f234d 3859 {
6f572ac2
NB
3860 buffer->cur++;
3861 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
3862 }
0d9f234d
NB
3863 break;
3864
cbcff6df 3865 case '%':
6f572ac2
NB
3866 result->type = CPP_MOD;
3867 if (*buffer->cur == '=')
3868 buffer->cur++, result->type = CPP_MOD_EQ;
3869 else if (CPP_OPTION (pfile, digraphs))
480709cc 3870 {
6f572ac2 3871 if (*buffer->cur == ':')
480709cc 3872 {
6f572ac2
NB
3873 buffer->cur++;
3874 result->flags |= DIGRAPH;
3875 result->type = CPP_HASH;
3876 if (*buffer->cur == '%' && buffer->cur[1] == ':')
9a0c6187 3877 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
6f572ac2
NB
3878 }
3879 else if (*buffer->cur == '>')
3880 {
3881 buffer->cur++;
3882 result->flags |= DIGRAPH;
3883 result->type = CPP_CLOSE_BRACE;
480709cc 3884 }
480709cc 3885 }
0d9f234d
NB
3886 break;
3887
cbcff6df 3888 case '.':
480709cc 3889 result->type = CPP_DOT;
6f572ac2 3890 if (ISDIGIT (*buffer->cur))
480709cc 3891 {
50668cf6 3892 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
480709cc 3893 result->type = CPP_NUMBER;
50668cf6
GK
3894 lex_number (pfile, &result->val.str, &nst);
3895 warn_about_normalization (pfile, result, &nst);
480709cc 3896 }
6f572ac2
NB
3897 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
3898 buffer->cur += 2, result->type = CPP_ELLIPSIS;
3899 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3900 buffer->cur++, result->type = CPP_DOT_STAR;
0d9f234d 3901 break;
45b966db 3902
0d9f234d 3903 case '+':
6f572ac2
NB
3904 result->type = CPP_PLUS;
3905 if (*buffer->cur == '+')
3906 buffer->cur++, result->type = CPP_PLUS_PLUS;
3907 else if (*buffer->cur == '=')
3908 buffer->cur++, result->type = CPP_PLUS_EQ;
0d9f234d 3909 break;
04e3ec78 3910
0d9f234d 3911 case '-':
6f572ac2
NB
3912 result->type = CPP_MINUS;
3913 if (*buffer->cur == '>')
0d9f234d 3914 {
6f572ac2 3915 buffer->cur++;
480709cc 3916 result->type = CPP_DEREF;
6f572ac2
NB
3917 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
3918 buffer->cur++, result->type = CPP_DEREF_STAR;
480709cc 3919 }
6f572ac2
NB
3920 else if (*buffer->cur == '-')
3921 buffer->cur++, result->type = CPP_MINUS_MINUS;
3922 else if (*buffer->cur == '=')
3923 buffer->cur++, result->type = CPP_MINUS_EQ;
0d9f234d 3924 break;
45b966db 3925
0d9f234d 3926 case '&':
6f572ac2
NB
3927 result->type = CPP_AND;
3928 if (*buffer->cur == '&')
3929 buffer->cur++, result->type = CPP_AND_AND;
3930 else if (*buffer->cur == '=')
3931 buffer->cur++, result->type = CPP_AND_EQ;
0d9f234d 3932 break;
df383483 3933
0d9f234d 3934 case '|':
6f572ac2
NB
3935 result->type = CPP_OR;
3936 if (*buffer->cur == '|')
3937 buffer->cur++, result->type = CPP_OR_OR;
3938 else if (*buffer->cur == '=')
3939 buffer->cur++, result->type = CPP_OR_EQ;
0d9f234d 3940 break;
45b966db 3941
0d9f234d 3942 case ':':
6f572ac2 3943 result->type = CPP_COLON;
93313b94 3944 if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
6f572ac2
NB
3945 buffer->cur++, result->type = CPP_SCOPE;
3946 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
0d9f234d 3947 {
6f572ac2 3948 buffer->cur++;
0d9f234d 3949 result->flags |= DIGRAPH;
480709cc
NB
3950 result->type = CPP_CLOSE_SQUARE;
3951 }
0d9f234d 3952 break;
45b966db 3953
480709cc
NB
3954 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
3955 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
3956 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
3957 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
9a0c6187 3958 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
480709cc 3959
26aea073 3960 case '?': result->type = CPP_QUERY; break;
0d9f234d
NB
3961 case '~': result->type = CPP_COMPL; break;
3962 case ',': result->type = CPP_COMMA; break;
3963 case '(': result->type = CPP_OPEN_PAREN; break;
3964 case ')': result->type = CPP_CLOSE_PAREN; break;
3965 case '[': result->type = CPP_OPEN_SQUARE; break;
3966 case ']': result->type = CPP_CLOSE_SQUARE; break;
3967 case '{': result->type = CPP_OPEN_BRACE; break;
3968 case '}': result->type = CPP_CLOSE_BRACE; break;
3969 case ';': result->type = CPP_SEMICOLON; break;
3970
40f03658 3971 /* @ is a punctuator in Objective-C. */
cc937581 3972 case '@': result->type = CPP_ATSIGN; break;
0d9f234d 3973
7d112d66 3974 default:
1613e52b
NB
3975 {
3976 const uchar *base = --buffer->cur;
0abc6a6a 3977
7d112d66
LH
3978 /* Check for an extended identifier ($ or UCN or UTF-8). */
3979 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
50668cf6 3980 if (forms_identifier_p (pfile, true, &nst))
1613e52b
NB
3981 {
3982 result->type = CPP_NAME;
be5ffc59
JM
3983 result->val.node.node = lex_identifier (pfile, base, true, &nst,
3984 &result->val.node.spelling);
50668cf6 3985 warn_about_normalization (pfile, result, &nst);
1613e52b
NB
3986 break;
3987 }
7d112d66
LH
3988
3989 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
3990 single token. */
1613e52b 3991 buffer->cur++;
7d112d66
LH
3992 if (c >= utf8_signifier)
3993 {
3994 const uchar *pstr = base;
3995 cppchar_t s;
3996 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
3997 buffer->cur = pstr;
3998 }
3999 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
4000 break;
1067694a 4001 }
1613e52b 4002
0d9f234d 4003 }
bdcbe496 4004
a3998c2f
DM
4005 /* Potentially convert the location of the token to a range. */
4006 if (result->src_loc >= RESERVED_LOCATION_COUNT
4007 && result->type != CPP_EOF)
4008 {
4009 /* Ensure that any line notes are processed, so that we have the
4010 correct physical line/column for the end-point of the token even
4011 when a logical line is split via one or more backslashes. */
4012 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4013 && !pfile->overlaid_buffer)
4014 _cpp_process_line_notes (pfile, false);
4015
4016 source_range tok_range;
4017 tok_range.m_start = result->src_loc;
4018 tok_range.m_finish
4019 = linemap_position_for_column (pfile->line_table,
4020 CPP_BUF_COLUMN (buffer, buffer->cur));
4021
4022 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
4023 result->src_loc,
4024 tok_range, NULL);
4025 }
ebedc9a3 4026
bdcbe496 4027 return result;
0d9f234d
NB
4028}
4029
59325650
NB
4030/* An upper bound on the number of bytes needed to spell TOKEN.
4031 Does not include preceding whitespace. */
93c80368 4032unsigned int
6cf87ca4 4033cpp_token_len (const cpp_token *token)
0d9f234d 4034{
93c80368 4035 unsigned int len;
6d2c2047 4036
93c80368 4037 switch (TOKEN_SPELL (token))
041c3194 4038 {
cc955282 4039 default: len = 6; break;
6338b358 4040 case SPELL_LITERAL: len = token->val.str.len; break;
9a0c6187 4041 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
041c3194 4042 }
59325650
NB
4043
4044 return len;
6d2c2047
ZW
4045}
4046
47e20491
GK
/* Decode the single UTF-8 sequence starting at NAME and write it to
   BUFFER as a \U escape followed by eight lowercase hex digits.
   Exactly 10 bytes are written to BUFFER; no terminating NUL is
   added.  Return the number of bytes consumed from NAME.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  int k;
  unsigned lead;
  unsigned long code_point;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  for (lead = *name; lead & 0x80; lead <<= 1)
    seq_len++;

  /* Payload bits of the lead byte, then six bits per continuation.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      name++;

      /* Ill-formed UTF-8: continuation bytes must look like 10xxxxxx.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (*name & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
4080
cfc93532
MLI
4081/* Given a token TYPE corresponding to a digraph, return a pointer to
4082 the spelling of the digraph. */
4083static const unsigned char *
4084cpp_digraph2name (enum cpp_ttype type)
4085{
4086 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4087}
47e20491 4088
be5ffc59
JM
4089/* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4090 The buffer must already contain the enough space to hold the
4091 token's spelling. Returns a pointer to the character after the
4092 last character written. */
4093unsigned char *
4094_cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4095{
4096 size_t i;
4097 const unsigned char *name = NODE_NAME (ident);
4098
4099 for (i = 0; i < NODE_LEN (ident); i++)
4100 if (name[i] & ~0x7F)
4101 {
4102 i += utf8_to_ucn (buffer, name + i) - 1;
4103 buffer += 10;
4104 }
4105 else
4106 *buffer++ = name[i];
4107
4108 return buffer;
4109}
4110
041c3194 4111/* Write the spelling of a token TOKEN to BUFFER. The buffer must
cf00a885 4112 already contain the enough space to hold the token's spelling.
6cf87ca4 4113 Returns a pointer to the character after the last character written.
47e20491 4114 FORSTRING is true if this is to be the spelling after translation
be5ffc59
JM
4115 phase 1 (with the original spelling of extended identifiers), false
4116 if extended identifiers should always be written using UCNs (there is
4117 no option for always writing them in the internal UTF-8 form).
6cf87ca4 4118 FIXME: Would be nice if we didn't need the PFILE argument. */
93c80368 4119unsigned char *
6cf87ca4 4120cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
47e20491 4121 unsigned char *buffer, bool forstring)
041c3194 4122{
96be6998 4123 switch (TOKEN_SPELL (token))
041c3194
ZW
4124 {
4125 case SPELL_OPERATOR:
4126 {
4127 const unsigned char *spelling;
4128 unsigned char c;
d6d5f795 4129
041c3194 4130 if (token->flags & DIGRAPH)
cfc93532 4131 spelling = cpp_digraph2name (token->type);
92936ecf
ZW
4132 else if (token->flags & NAMED_OP)
4133 goto spell_ident;
041c3194 4134 else
96be6998 4135 spelling = TOKEN_NAME (token);
df383483 4136
041c3194
ZW
4137 while ((c = *spelling++) != '\0')
4138 *buffer++ = c;
4139 }
4140 break;
d6d5f795 4141
47ad4138 4142 spell_ident:
041c3194 4143 case SPELL_IDENT:
47e20491
GK
4144 if (forstring)
4145 {
be5ffc59
JM
4146 memcpy (buffer, NODE_NAME (token->val.node.spelling),
4147 NODE_LEN (token->val.node.spelling));
4148 buffer += NODE_LEN (token->val.node.spelling);
47e20491
GK
4149 }
4150 else
be5ffc59 4151 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
041c3194 4152 break;
d6d5f795 4153
6338b358 4154 case SPELL_LITERAL:
47ad4138
ZW
4155 memcpy (buffer, token->val.str.text, token->val.str.len);
4156 buffer += token->val.str.len;
4157 break;
4158
041c3194 4159 case SPELL_NONE:
0527bc4e
JDA
4160 cpp_error (pfile, CPP_DL_ICE,
4161 "unspellable token %s", TOKEN_NAME (token));
041c3194
ZW
4162 break;
4163 }
d6d5f795 4164
041c3194
ZW
4165 return buffer;
4166}
d6d5f795 4167
5d8ebbd8
NB
4168/* Returns TOKEN spelt as a null-terminated string. The string is
4169 freed when the reader is destroyed. Useful for diagnostics. */
93c80368 4170unsigned char *
6cf87ca4 4171cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
59325650
NB
4172{
4173 unsigned int len = cpp_token_len (token) + 1;
ece54d54 4174 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
c5a04734 4175
47e20491 4176 end = cpp_spell_token (pfile, token, start, false);
93c80368 4177 end[0] = '\0';
c5a04734 4178
93c80368
NB
4179 return start;
4180}
c5a04734 4181
cfc93532
MLI
4182/* Returns a pointer to a string which spells the token defined by
4183 TYPE and FLAGS. Used by C front ends, which really should move to
4184 using cpp_token_as_text. */
93c80368 4185const char *
cfc93532 4186cpp_type2name (enum cpp_ttype type, unsigned char flags)
93c80368 4187{
cfc93532
MLI
4188 if (flags & DIGRAPH)
4189 return (const char *) cpp_digraph2name (type);
4190 else if (flags & NAMED_OP)
4191 return cpp_named_operator2name (type);
4192
93c80368
NB
4193 return (const char *) token_spellings[type].name;
4194}
c5a04734 4195
4ed5bcfb
NB
4196/* Writes the spelling of token to FP, without any preceding space.
4197 Separated from cpp_spell_token for efficiency - to avoid stdio
4198 double-buffering. */
93c80368 4199void
6cf87ca4 4200cpp_output_token (const cpp_token *token, FILE *fp)
93c80368 4201{
93c80368 4202 switch (TOKEN_SPELL (token))
c5a04734 4203 {
93c80368
NB
4204 case SPELL_OPERATOR:
4205 {
4206 const unsigned char *spelling;
3b681e9d 4207 int c;
c5a04734 4208
93c80368 4209 if (token->flags & DIGRAPH)
cfc93532 4210 spelling = cpp_digraph2name (token->type);
93c80368
NB
4211 else if (token->flags & NAMED_OP)
4212 goto spell_ident;
4213 else
4214 spelling = TOKEN_NAME (token);
041c3194 4215
3b681e9d
ZW
4216 c = *spelling;
4217 do
4218 putc (c, fp);
4219 while ((c = *++spelling) != '\0');
93c80368
NB
4220 }
4221 break;
041c3194 4222
93c80368
NB
4223 spell_ident:
4224 case SPELL_IDENT:
47e20491
GK
4225 {
4226 size_t i;
9a0c6187 4227 const unsigned char * name = NODE_NAME (token->val.node.node);
47e20491 4228
9a0c6187 4229 for (i = 0; i < NODE_LEN (token->val.node.node); i++)
47e20491
GK
4230 if (name[i] & ~0x7F)
4231 {
4232 unsigned char buffer[10];
4233 i += utf8_to_ucn (buffer, name + i) - 1;
4234 fwrite (buffer, 1, 10, fp);
4235 }
4236 else
9a0c6187 4237 fputc (NODE_NAME (token->val.node.node)[i], fp);
47e20491
GK
4238 }
4239 break;
041c3194 4240
6338b358 4241 case SPELL_LITERAL:
c9c3d5f2
NS
4242 if (token->type == CPP_HEADER_NAME)
4243 fputc ('"', fp);
47ad4138 4244 fwrite (token->val.str.text, 1, token->val.str.len, fp);
c9c3d5f2
NS
4245 if (token->type == CPP_HEADER_NAME)
4246 fputc ('"', fp);
47ad4138
ZW
4247 break;
4248
93c80368
NB
4249 case SPELL_NONE:
4250 /* An error, most probably. */
4251 break;
041c3194 4252 }
c5a04734
ZW
4253}
4254
93c80368
NB
4255/* Compare two tokens. */
4256int
6cf87ca4 4257_cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
c5a04734 4258{
93c80368
NB
4259 if (a->type == b->type && a->flags == b->flags)
4260 switch (TOKEN_SPELL (a))
4261 {
4262 default: /* Keep compiler happy. */
4263 case SPELL_OPERATOR:
9a0c6187 4264 /* token_no is used to track where multiple consecutive ##
aa508502 4265 tokens were originally located. */
9a0c6187 4266 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
93c80368 4267 case SPELL_NONE:
9a0c6187 4268 return (a->type != CPP_MACRO_ARG
be5ffc59
JM
4269 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4270 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
93c80368 4271 case SPELL_IDENT:
be5ffc59
JM
4272 return (a->val.node.node == b->val.node.node
4273 && a->val.node.spelling == b->val.node.spelling);
6338b358 4274 case SPELL_LITERAL:
93c80368
NB
4275 return (a->val.str.len == b->val.str.len
4276 && !memcmp (a->val.str.text, b->val.str.text,
4277 a->val.str.len));
4278 }
c5a04734 4279
041c3194
ZW
4280 return 0;
4281}
4282
93c80368
NB
4283/* Returns nonzero if a space should be inserted to avoid an
4284 accidental token paste for output. For simplicity, it is
4285 conservative, and occasionally advises a space where one is not
4286 needed, e.g. "." and ".2". */
93c80368 4287int
6cf87ca4
ZW
4288cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4289 const cpp_token *token2)
c5a04734 4290{
93c80368
NB
4291 enum cpp_ttype a = token1->type, b = token2->type;
4292 cppchar_t c;
c5a04734 4293
93c80368
NB
4294 if (token1->flags & NAMED_OP)
4295 a = CPP_NAME;
4296 if (token2->flags & NAMED_OP)
4297 b = CPP_NAME;
c5a04734 4298
93c80368
NB
4299 c = EOF;
4300 if (token2->flags & DIGRAPH)
37b8524c 4301 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
93c80368
NB
4302 else if (token_spellings[b].category == SPELL_OPERATOR)
4303 c = token_spellings[b].name[0];
c5a04734 4304
93c80368 4305 /* Quickly get everything that can paste with an '='. */
37b8524c 4306 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
93c80368 4307 return 1;
c5a04734 4308
93c80368 4309 switch (a)
c5a04734 4310 {
b52dbbf8
SE
4311 case CPP_GREATER: return c == '>';
4312 case CPP_LESS: return c == '<' || c == '%' || c == ':';
93c80368
NB
4313 case CPP_PLUS: return c == '+';
4314 case CPP_MINUS: return c == '-' || c == '>';
4315 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4316 case CPP_MOD: return c == ':' || c == '>';
4317 case CPP_AND: return c == '&';
4318 case CPP_OR: return c == '|';
4319 case CPP_COLON: return c == ':' || c == '>';
4320 case CPP_DEREF: return c == '*';
26ec42ee 4321 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
93c80368 4322 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
170c850e 4323 case CPP_PRAGMA:
93c80368
NB
4324 case CPP_NAME: return ((b == CPP_NUMBER
4325 && name_p (pfile, &token2->val.str))
4326 || b == CPP_NAME
4327 || b == CPP_CHAR || b == CPP_STRING); /* L */
4328 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
3e3fdf3d 4329 || b == CPP_CHAR
93c80368 4330 || c == '.' || c == '+' || c == '-');
1613e52b 4331 /* UCNs */
1067694a
NB
4332 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4333 && b == CPP_NAME)
1613e52b 4334 || (CPP_OPTION (pfile, objc)
1067694a 4335 && token1->val.str.text[0] == '@'
1613e52b 4336 && (b == CPP_NAME || b == CPP_STRING)));
b7689b96 4337 case CPP_LESS_EQ: return c == '>';
87e356ba
JJ
4338 case CPP_STRING:
4339 case CPP_WSTRING:
4340 case CPP_UTF8STRING:
4341 case CPP_STRING16:
4342 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4343 && (b == CPP_NAME
4344 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4345 && ISIDST (token2->val.str.text[0]))));
4346
93c80368 4347 default: break;
c5a04734 4348 }
c5a04734 4349
417f3e3a 4350 return 0;
c5a04734
ZW
4351}
4352
93c80368 4353/* Output all the remaining tokens on the current line, and a newline
4ed5bcfb
NB
4354 character, to FP. Leading whitespace is removed. If there are
4355 macros, special token padding is not performed. */
c5a04734 4356void
6cf87ca4 4357cpp_output_line (cpp_reader *pfile, FILE *fp)
c5a04734 4358{
4ed5bcfb 4359 const cpp_token *token;
96be6998 4360
4ed5bcfb
NB
4361 token = cpp_get_token (pfile);
4362 while (token->type != CPP_EOF)
96be6998 4363 {
4ed5bcfb
NB
4364 cpp_output_token (token, fp);
4365 token = cpp_get_token (pfile);
4366 if (token->flags & PREV_WHITE)
4367 putc (' ', fp);
96be6998
ZW
4368 }
4369
93c80368 4370 putc ('\n', fp);
041c3194 4371}
c5a04734 4372
5d6342eb
TT
4373/* Return a string representation of all the remaining tokens on the
4374 current line. The result is allocated using xmalloc and must be
4375 freed by the caller. */
4376unsigned char *
4377cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4378{
4379 const cpp_token *token;
4380 unsigned int out = dir_name ? ustrlen (dir_name) : 0;
4381 unsigned int alloced = 120 + out;
4382 unsigned char *result = (unsigned char *) xmalloc (alloced);
4383
4384 /* If DIR_NAME is empty, there are no initial contents. */
4385 if (dir_name)
4386 {
4387 sprintf ((char *) result, "#%s ", dir_name);
4388 out += 2;
4389 }
4390
4391 token = cpp_get_token (pfile);
4392 while (token->type != CPP_EOF)
4393 {
4394 unsigned char *last;
4395 /* Include room for a possible space and the terminating nul. */
4396 unsigned int len = cpp_token_len (token) + 2;
4397
4398 if (out + len > alloced)
4399 {
4400 alloced *= 2;
4401 if (out + len > alloced)
4402 alloced = out + len;
4403 result = (unsigned char *) xrealloc (result, alloced);
4404 }
4405
4406 last = cpp_spell_token (pfile, token, &result[out], 0);
4407 out = last - result;
4408
4409 token = cpp_get_token (pfile);
4410 if (token->flags & PREV_WHITE)
4411 result[out++] = ' ';
4412 }
4413
4414 result[out] = '\0';
4415 return result;
4416}
4417
1e013d2e
NB
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be reused to satisfy a request for MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the bytes remaining between
   BUFF's cur and limit.  Every macro argument is parenthesized so
   that arbitrary expressions may be passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
 #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4432
c9e7a609
NB
4433/* Create a new allocation buffer. Place the control block at the end
4434 of the buffer, so that buffer overflows will cause immediate chaos. */
b8af0ca5 4435static _cpp_buff *
6cf87ca4 4436new_buff (size_t len)
b8af0ca5
NB
4437{
4438 _cpp_buff *result;
ece54d54 4439 unsigned char *base;
b8af0ca5 4440
1e013d2e
NB
4441 if (len < MIN_BUFF_SIZE)
4442 len = MIN_BUFF_SIZE;
c70f6ed3 4443 len = CPP_ALIGN (len);
b8af0ca5 4444
ceb17928 4445#ifdef ENABLE_VALGRIND_ANNOTATIONS
1a80db97
JJ
4446 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4447 struct first. */
4448 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4449 base = XNEWVEC (unsigned char, len + slen);
4450 result = (_cpp_buff *) base;
4451 base += slen;
4452#else
c3f829c1 4453 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
b8af0ca5 4454 result = (_cpp_buff *) (base + len);
1a80db97 4455#endif
b8af0ca5
NB
4456 result->base = base;
4457 result->cur = base;
4458 result->limit = base + len;
4459 result->next = NULL;
4460 return result;
4461}
4462
4463/* Place a chain of unwanted allocation buffers on the free list. */
4464void
6cf87ca4 4465_cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
b8af0ca5
NB
4466{
4467 _cpp_buff *end = buff;
4468
4469 while (end->next)
4470 end = end->next;
4471 end->next = pfile->free_buffs;
4472 pfile->free_buffs = buff;
4473}
4474
4475/* Return a free buffer of size at least MIN_SIZE. */
4476_cpp_buff *
6cf87ca4 4477_cpp_get_buff (cpp_reader *pfile, size_t min_size)
b8af0ca5
NB
4478{
4479 _cpp_buff *result, **p;
4480
4481 for (p = &pfile->free_buffs;; p = &(*p)->next)
4482 {
6142088c 4483 size_t size;
1e013d2e
NB
4484
4485 if (*p == NULL)
b8af0ca5 4486 return new_buff (min_size);
1e013d2e
NB
4487 result = *p;
4488 size = result->limit - result->base;
4489 /* Return a buffer that's big enough, but don't waste one that's
4490 way too big. */
34f5271d 4491 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
b8af0ca5
NB
4492 break;
4493 }
4494
4495 *p = result->next;
4496 result->next = NULL;
4497 result->cur = result->base;
4498 return result;
4499}
4500
4fe9b91c 4501/* Creates a new buffer with enough space to hold the uncommitted
8c3b2693
NB
4502 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4503 the excess bytes to the new buffer. Chains the new buffer after
4504 BUFF, and returns the new buffer. */
b8af0ca5 4505_cpp_buff *
6cf87ca4 4506_cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
b8af0ca5 4507{
6142088c 4508 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
8c3b2693 4509 _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
b8af0ca5 4510
8c3b2693
NB
4511 buff->next = new_buff;
4512 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
4513 return new_buff;
4514}
4515
4fe9b91c 4516/* Creates a new buffer with enough space to hold the uncommitted
8c3b2693
NB
4517 remaining bytes of the buffer pointed to by BUFF, and at least
4518 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4519 Chains the new buffer before the buffer pointed to by BUFF, and
4520 updates the pointer to point to the new buffer. */
4521void
6cf87ca4 4522_cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
8c3b2693
NB
4523{
4524 _cpp_buff *new_buff, *old_buff = *pbuff;
4525 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4526
4527 new_buff = _cpp_get_buff (pfile, size);
4528 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
4529 new_buff->next = old_buff;
4530 *pbuff = new_buff;
b8af0ca5
NB
4531}
4532
4533/* Free a chain of buffers starting at BUFF. */
4534void
5671bf27 4535_cpp_free_buff (_cpp_buff *buff)
b8af0ca5
NB
4536{
4537 _cpp_buff *next;
4538
4539 for (; buff; buff = next)
4540 {
4541 next = buff->next;
ceb17928 4542#ifdef ENABLE_VALGRIND_ANNOTATIONS
1a80db97
JJ
4543 free (buff);
4544#else
b8af0ca5 4545 free (buff->base);
1a80db97 4546#endif
b8af0ca5
NB
4547 }
4548}
417f3e3a 4549
ece54d54
NB
4550/* Allocate permanent, unaligned storage of length LEN. */
4551unsigned char *
6cf87ca4 4552_cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
ece54d54
NB
4553{
4554 _cpp_buff *buff = pfile->u_buff;
4555 unsigned char *result = buff->cur;
4556
4557 if (len > (size_t) (buff->limit - result))
4558 {
4559 buff = _cpp_get_buff (pfile, len);
4560 buff->next = pfile->u_buff;
4561 pfile->u_buff = buff;
4562 result = buff->cur;
4563 }
4564
4565 buff->cur = result + len;
4566 return result;
4567}
4568
87062813
NB
4569/* Allocate permanent, unaligned storage of length LEN from a_buff.
4570 That buffer is used for growing allocations when saving macro
4571 replacement lists in a #define, and when parsing an answer to an
4572 assertion in #assert, #unassert or #if (and therefore possibly
4573 whilst expanding macros). It therefore must not be used by any
4574 code that they might call: specifically the lexer and the guts of
4575 the macro expander.
4576
4577 All existing other uses clearly fit this restriction: storing
4578 registered pragmas during initialization. */
93c80368 4579unsigned char *
6cf87ca4 4580_cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3fef5b2b 4581{
8c3b2693
NB
4582 _cpp_buff *buff = pfile->a_buff;
4583 unsigned char *result = buff->cur;
3fef5b2b 4584
8c3b2693 4585 if (len > (size_t) (buff->limit - result))
3fef5b2b 4586 {
8c3b2693
NB
4587 buff = _cpp_get_buff (pfile, len);
4588 buff->next = pfile->a_buff;
4589 pfile->a_buff = buff;
4590 result = buff->cur;
3fef5b2b 4591 }
041c3194 4592
8c3b2693 4593 buff->cur = result + len;
93c80368 4594 return result;
041c3194 4595}
d8044160 4596
10f04917
NS
4597/* Commit or allocate storage from a buffer. */
4598
4599void *
4600_cpp_commit_buff (cpp_reader *pfile, size_t size)
4601{
4602 void *ptr = BUFF_FRONT (pfile->a_buff);
4603
4604 if (pfile->hash_table->alloc_subobject)
4605 {
4606 void *copy = pfile->hash_table->alloc_subobject (size);
4607 memcpy (copy, ptr, size);
4608 ptr = copy;
4609 }
4610 else
4611 BUFF_FRONT (pfile->a_buff) += size;
4612
4613 return ptr;
4614}
4615
d8044160
GK
4616/* Say which field of TOK is in use. */
4617
4618enum cpp_token_fld_kind
c26302d5 4619cpp_token_val_index (const cpp_token *tok)
d8044160
GK
4620{
4621 switch (TOKEN_SPELL (tok))
4622 {
4623 case SPELL_IDENT:
4624 return CPP_TOKEN_FLD_NODE;
4625 case SPELL_LITERAL:
4626 return CPP_TOKEN_FLD_STR;
aa508502 4627 case SPELL_OPERATOR:
3f23e487
AP
4628 /* Operands which were originally spelled as ident keep around
4629 the node for the exact spelling. */
4630 if (tok->flags & NAMED_OP)
4631 return CPP_TOKEN_FLD_NODE;
4632 else if (tok->type == CPP_PASTE)
9a0c6187 4633 return CPP_TOKEN_FLD_TOKEN_NO;
aa508502
JM
4634 else
4635 return CPP_TOKEN_FLD_NONE;
d8044160
GK
4636 case SPELL_NONE:
4637 if (tok->type == CPP_MACRO_ARG)
4638 return CPP_TOKEN_FLD_ARG_NO;
4639 else if (tok->type == CPP_PADDING)
4640 return CPP_TOKEN_FLD_SOURCE;
21b11495 4641 else if (tok->type == CPP_PRAGMA)
bc4071dd 4642 return CPP_TOKEN_FLD_PRAGMA;
191816a3 4643 /* fall through */
d8044160
GK
4644 default:
4645 return CPP_TOKEN_FLD_NONE;
4646 }
4647}
e3dfef44 4648
f3f6029d 4649/* All tokens lexed in R after calling this function will be forced to
620e594b 4650 have their location_t to be P, until
e3dfef44
GC
4651 cpp_stop_forcing_token_locations is called for R. */
4652
4653void
620e594b 4654cpp_force_token_locations (cpp_reader *r, location_t loc)
e3dfef44 4655{
f3f6029d 4656 r->forced_token_location = loc;
e3dfef44
GC
4657}
4658
4659/* Go back to assigning locations naturally for lexed tokens. */
4660
4661void
4662cpp_stop_forcing_token_locations (cpp_reader *r)
4663{
f3f6029d 4664 r->forced_token_location = 0;
e3dfef44 4665}
b224c376
NS
4666
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r" or "\r\n"), return the position just past the line ending,
   repeating for any further escaped line endings found there.  If
   the backslash escapes nothing, or the position past the line
   ending would not be below LIMIT, return the position of the last
   backslash examined without advancing.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  const unsigned char *after;

  if (__builtin_expect (peek[1] == '\n', true))
    after = peek + 2;
  else if (__builtin_expect (peek[1] == '\r', false))
    /* Treat "\r\n" as a single line ending.  */
    after = peek + (peek[2] == '\n' ? 3 : 2);
  else
    return peek;

  if (!__builtin_expect (after < limit, true))
    return peek;

  if (*after == '\\')
    /* The user might be perverse.  */
    return do_peek_backslash (after, limit);

  return after;
}
4696
4697static const unsigned char *
4698do_peek_next (const unsigned char *peek, const unsigned char *limit)
4699{
4700 if (__builtin_expect (*peek == '\\', false))
4701 peek = do_peek_backslash (peek, limit);
4702 return peek;
4703}
4704
/* Step back one character from PEEK without moving before BOUND and
   return the new position, or NULL if PEEK is already at BOUND.

   If the character stepped onto is a line ending ('\n', '\r', or the
   '\n' of a "\r\n" pair) that is immediately preceded by a backslash,
   the escaped line ending is skipped too: the step is retried from
   the backslash, so the result is the character before it (or NULL
   when the backslash sits at BOUND).  A line ending that is not
   escaped, or whose preceding bytes would lie before BOUND, is
   returned as is.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;

  /* Carriage return, not the letter 'r': a lone CR can end a line,
     while an ordinary 'r' after a backslash is not a line splice.  */
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      if (peek == bound)
        return peek;

      /* Offset of the byte that would be the escaping backslash.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* "\r\n": look one byte further back, if there is one.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }

  return peek;
}
4733
c9c3d5f2
NS
4734/* If PEEK[-1] is identifier MATCH, scan past it and trailing white
4735 space. Otherwise return NULL. */
4736
4737static const unsigned char *
4738do_peek_ident (const char *match, const unsigned char *peek,
4739 const unsigned char *limit)
4740{
4741 for (; *++match; peek++)
4742 if (*peek != *match)
4743 {
4744 peek = do_peek_next (peek, limit);
4745 if (*peek != *match)
4746 return NULL;
4747 }
4748
4749 /* Must now not be looking at an identifier char. */
4750 peek = do_peek_next (peek, limit);
4751 if (ISIDNUM (*peek))
4752 return NULL;
4753
4754 /* Skip control-line whitespace. */
4755 ws:
4756 while (*peek == ' ' || *peek == '\t')
4757 peek++;
4758 if (__builtin_expect (*peek == '\\', false))
4759 {
4760 peek = do_peek_backslash (peek, limit);
4761 if (*peek != '\\')
4762 goto ws;
4763 }
4764
4765 return peek;
4766}
4767
4768/* Are we looking at a module control line starting as PEEK - 1? */
/* C is the character at PEEK - 1 and PEEK points at the text that
   follows it; LIMIT is handed to the do_peek_* helpers as the bound
   for looking through escaped newlines.  Return true when the text
   spells "import", "__import" or "module", optionally preceded by
   "export" ("export __import" is not considered), and the first
   character after the keyword and its trailing white space can begin
   a token accepted below.  Return false in every other case.  */
4769
4770static bool
4771do_peek_module (cpp_reader *pfile, unsigned char c,
4772 const unsigned char *peek, const unsigned char *limit)
4773{
/* IMPORT records whether the keyword found was an import form rather
   than "module"; it selects which following tokens are accepted.  */
4774 bool import = false;
4775
/* In each branch the second character is tested cheaply first (it may
   also be a backslash starting an escaped newline) before the full
   comparison in do_peek_ident, which also advances PEEK past the
   keyword and trailing white space.  */
4776 if (__builtin_expect (c == 'e', false))
4777 {
4778 if (!((peek[0] == 'x' || peek[0] == '\\')
4779 && (peek = do_peek_ident ("export", peek, limit))))
4780 return false;
4781
4782 /* export, peek for import or module. No need to peek __import
4783 here. */
4784 if (peek[0] == 'i')
4785 {
4786 if (!((peek[1] == 'm' || peek[1] == '\\')
4787 && (peek = do_peek_ident ("import", peek + 1, limit))))
4788 return false;
4789 import = true;
4790 }
4791 else if (peek[0] == 'm')
4792 {
4793 if (!((peek[1] == 'o' || peek[1] == '\\')
4794 && (peek = do_peek_ident ("module", peek + 1, limit))))
4795 return false;
4796 }
4797 else
4798 return false;
4799 }
4800 else if (__builtin_expect (c == 'i', false))
4801 {
4802 if (!((peek[0] == 'm' || peek[0] == '\\')
4803 && (peek = do_peek_ident ("import", peek, limit))))
4804 return false;
4805 import = true;
4806 }
4807 else if (__builtin_expect (c == '_', false))
4808 {
4809 /* Needed for translated includes. */
4810 if (!((peek[0] == '_' || peek[0] == '\\')
4811 && (peek = do_peek_ident ("__import", peek, limit))))
4812 return false;
4813 import = true;
4814 }
4815 else if (__builtin_expect (c == 'm', false))
4816 {
4817 if (!((peek[0] == 'o' || peek[0] == '\\')
4818 && (peek = do_peek_ident ("module", peek, limit))))
4819 return false;
4820 }
4821 else
4822 return false;
4823
4824 /* Peek the next character to see if it's good enough. We'll be at
4825 the first non-whitespace char, including skipping an escaped
4826 newline. */
4827 /* ... import followed by identifier, ':', '<' or header-name
4828 preprocessing tokens, or module followed by identifier, ':' or
4829 ';' preprocessing tokens. */
4830 unsigned char p = *peek++;
4831
4832 /* A character literal is ... single quotes, ... optionally preceded
4833 by u8, u, U, or L */
4834 /* A string-literal is a ... double quotes, optionally prefixed by
4835 R, u8, u8R, u, uR, U, UR, L, or LR */
/* The three branches below share code through the peek_u8, peek_u and
   peek_R labels: a literal prefix followed by a quote means this is a
   character or string literal, so the line is rejected; otherwise the
   prefix letters are treated as the start of an identifier.  */
4836 if (p == 'u')
4837 {
4838 peek = do_peek_next (peek, limit);
4839 if (*peek == '8')
4840 {
4841 peek++;
4842 goto peek_u8;
4843 }
4844 goto peek_u;
4845 }
4846 else if (p == 'U' || p == 'L')
4847 {
4848 peek_u8:
4849 peek = do_peek_next (peek, limit);
4850 peek_u:
4851 if (*peek == '\"' || *peek == '\'')
4852 return false;
4853
4854 if (*peek == 'R')
4855 goto peek_R;
4856 /* Identifier. Ok. */
4857 }
4858 else if (p == 'R')
4859 {
4860 peek_R:
/* An R prefix only introduces a raw string when the rliterals option
   is enabled; otherwise it is just an identifier character.  */
4861 if (CPP_OPTION (pfile, rliterals))
4862 {
4863 peek = do_peek_next (peek, limit);
4864 if (*peek == '\"')
4865 return false;
4866 }
4867 /* Identifier. Ok. */
4868 }
/* The constant test 'Z' - 'A' == 25 picks explicit range comparisons
   over ISIDST; presumably it detects a character set with contiguous
   letters - confirm.  */
4869 else if ('Z' - 'A' == 25
4870 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
4871 : ISIDST (p))
4872 {
4873 /* Identifier. Ok. */
4874 }
4875 else if (p == '<')
4876 {
4877 /* Maybe angle header, ok for import. Reject
4878 '<=', '<<' digraph:'<:'. */
4879 if (!import)
4880 return false;
4881 peek = do_peek_next (peek, limit);
4882 if (*peek == '=' || *peek == '<'
4883 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
4884 return false;
4885 }
4886 else if (p == ';')
4887 {
4888 /* SEMICOLON, ok for module. */
4889 if (import)
4890 return false;
4891 }
4892 else if (p == '"')
4893 {
4894 /* STRING, ok for import. */
4895 if (!import)
4896 return false;
4897 }
4898 else if (p == ':')
4899 {
4900 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
4901 peek = do_peek_next (peek, limit);
4902 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
4903 return false;
4904 }
4905 else
4906 /* FIXME: Detect a unicode character, excluding those not
4907 permitted as the initial character. [lex.name]/1. I presume
4908 we need to check the \[uU] spellings, and directly using
4909 Unicode in say UTF8 form? Or perhaps we do the phase-1
4910 conversion of UTF8 to universal-character-names? */
4911 return false;
4912
4913 return true;
4914}
4915
b224c376
NS
4916/* Directives-only scanning. Somewhat more relaxed than correct
4917 parsing -- some ill-formed programs will not be rejected. */
/* Scan each buffer on PFILE's buffer stack from buffer->next_line up
   to buffer->rlimit, popping a buffer once it has been consumed, until
   pfile->buffer is null.  Only comments, string and character
   literals, '#' directives and (when the module_directives option is
   set) module control lines are recognised.

   CB is invoked with PFILE, a task code, DATA and task-specific
   arguments:
     CPP_DO_print     line_count, base, length -- a run of text that
                      was scanned but not otherwise processed, and the
                      number of line ends counted in this buffer pass;
     CPP_DO_location  pfile->line_table->highest_line -- after a
                      directive or module control line, when the
                      current buffer still has text left;
     CPP_DO_token     tok, spelling -- each token of a module control
                      line, up to and including the CPP_PRAGMA_EOL.  */
4918
4919void
4920cpp_directive_only_process (cpp_reader *pfile,
4921 void *data,
4922 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
4923{
c9c3d5f2
NS
4924 bool module_p = CPP_OPTION (pfile, module_directives);
4925
b224c376
NS
4926 do
4927 {
4928 restart:
4929 /* Buffer initialization, but no line cleaning. */
4930 cpp_buffer *buffer = pfile->buffer;
4931 buffer->cur_note = buffer->notes_used = 0;
4932 buffer->cur = buffer->line_base = buffer->next_line;
4933 buffer->need_line = false;
ac16f432 4934 /* Files always end in a newline or carriage return. We rely on this for
b224c376 4935 character peeking safety. */
ac16f432 4936 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
b224c376
NS
4937
/* BASE is the start of the text not yet handed to CB, LINE_COUNT the
   number of line ends seen since BASE, and LINE_START the first
   character of the current physical line. */
4938 const unsigned char *base = buffer->cur;
4939 unsigned line_count = 0;
4940 const unsigned char *line_start = base;
4941
/* BOL is set at each line end and cleared by the first character that
   reaches dflt, so it is still true after leading whitespace and
   comments.  RAW is set by quote_peek when the opening double quote
   was preceded by an R prefix. */
4942 bool bol = true;
4943 bool raw = false;
4944
/* Low-water mark: the backwards scan in quote_peek passes this to
   do_peek_prev as its lower bound.  It is moved forward after every
   comment and literal. */
4945 const unsigned char *lwm = base;
4946 for (const unsigned char *pos = base, *limit = buffer->rlimit;
4947 pos < limit;)
4948 {
4949 unsigned char c = *pos++;
4950 /* This matches the switch in _cpp_lex_direct. */
4951 switch (c)
4952 {
4953 case ' ': case '\t': case '\f': case '\v':
4954 /* Whitespace, do nothing. */
4955 break;
4956
4957 case '\r': /* MAC line ending, or Windows \r\n */
4958 if (*pos == '\n')
4959 pos++;
4960 /* FALLTHROUGH */
4961
4962 case '\n':
4963 bol = true;
4964
4965 next_line:
4966 CPP_INCREMENT_LINE (pfile, 0);
4967 line_count++;
4968 line_start = pos;
4969 break;
4970
4971 case '\\':
4972 /* <backslash><newline> is removed, and doesn't undo any
4973 preceding escape or whatnot. */
4974 if (*pos == '\n')
4975 {
4976 pos++;
4977 goto next_line;
4978 }
4979 else if (*pos == '\r')
4980 {
4981 if (pos[1] == '\n')
4982 pos++;
4983 pos++;
4984 goto next_line;
4985 }
4986 goto dflt;
4987
4988 case '#':
4989 if (bol)
4990 {
4991 /* Line directive. */
/* Hand over everything before the '#' first. */
4992 if (pos - 1 > base && !pfile->state.skipping)
4993 cb (pfile, CPP_DO_print, data,
4994 line_count, base, pos - 1 - base);
4995
4996 /* Prep things for directive handling. */
4997 buffer->next_line = pos;
4998 buffer->need_line = true;
2a0225e4
NS
4999 bool ok = _cpp_get_fresh_line (pfile);
5000 gcc_checking_assert (ok);
b224c376
NS
5001
5002 /* Ensure proper column numbering for generated
5003 error messages. */
5004 buffer->line_base -= pos - line_start;
5005
/* The second argument is true when the '#' was not the first
   character of its line. */
5006 _cpp_handle_directive (pfile, line_start + 1 != pos);
5007
5008 /* Sanitize the line settings. Duplicate #include's can
5009 mess things up. */
5010 // FIXME: Necessary?
5011 pfile->line_table->highest_location
5012 = pfile->line_table->highest_line;
5013
5014 if (!pfile->state.skipping
5015 && pfile->buffer->next_line < pfile->buffer->rlimit)
5016 cb (pfile, CPP_DO_location, data,
5017 pfile->line_table->highest_line);
5018
/* Start over from whatever pfile->buffer and its next_line are now. */
5019 goto restart;
5020 }
5021 goto dflt;
5022
5023 case '/':
5024 {
5025 const unsigned char *peek = do_peek_next (pos, limit);
5026 if (!(*peek == '/' || *peek == '*'))
5027 goto dflt;
5028
5029 /* Line or block comment */
/* STAR: the previous character was a '*' inside a block comment, so a
   '/' now ends it.  ESC: the previous character was a backslash, so a
   line end that follows neither ends a line comment nor clears STAR. */
5030 bool is_block = *peek == '*';
5031 bool star = false;
5032 bool esc = false;
5033 location_t sloc
5034 = linemap_position_for_column (pfile->line_table,
5035 pos - line_start);
5036
5037 while (pos < limit)
5038 {
5039 char c = *pos++;
5040 switch (c)
5041 {
5042 case '\\':
5043 esc = true;
5044 break;
5045
5046 case '\r':
5047 if (*pos == '\n')
5048 pos++;
5049 /* FALLTHROUGH */
5050
5051 case '\n':
5052 {
5053 CPP_INCREMENT_LINE (pfile, 0);
5054 line_count++;
5055 line_start = pos;
5056 if (!esc && !is_block)
5057 {
5058 bol = true;
5059 goto done_comment;
5060 }
5061 }
5062 if (!esc)
5063 star = false;
5064 esc = false;
5065 break;
5066
5067 case '*':
/* POS has already been stepped past the '*' just read, so the
   character's own address is pos - 1.  PEEK is the '*' that opened a
   block comment; it must not arm the terminator, or an opener
   followed directly by '/' would be taken as a whole comment. */
049f0efe 5068 if (pos - 1 > peek)
b224c376
NS
5069 star = is_block;
5070 esc = false;
5071 break;
5072
5073 case '/':
5074 if (star)
5075 goto done_comment;
5076 /* FALLTHROUGH */
5077
5078 default:
5079 star = false;
5080 esc = false;
5081 break;
5082 }
5083 }
/* The loop above only falls out with pos >= limit, so in practice
   this reports block comments that run into the end of the buffer; a
   line comment ending there is accepted silently. */
d15a2d26
JJ
5084 if (pos < limit || is_block)
5085 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5086 "unterminated comment");
b224c376
NS
5087 done_comment:
5088 lwm = pos;
5089 break;
5090 }
5091
5092 case '\'':
5093 if (!CPP_OPTION (pfile, digit_separators))
5094 goto delimited_string;
5095
5096 /* Possibly a number punctuator. */
5097 if (!ISIDNUM (*do_peek_next (pos, limit)))
5098 goto delimited_string;
5099
5100 goto quote_peek;
5101
5102 case '\"':
5103 if (!CPP_OPTION (pfile, rliterals))
5104 goto delimited_string;
5105
5106 quote_peek:
5107 {
5108 /* For ' see if it's a number punctuator
5109 \.?<digit>(<digit>|<identifier-nondigit>
5110 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5111 /* For " see if it's a raw string
5112 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5113 because that could be 0e+R. */
5114 const unsigned char *peek = pos - 1;
5115 bool quote_first = c == '"';
5116 bool quote_eight = false;
5117 bool maybe_number_start = false;
5118 bool want_number = false;
5119
/* Walk backwards from the quote, bounded below by lwm.  The loop ends
   when do_peek_prev yields a null pointer or a break is taken. */
5120 while ((peek = do_peek_prev (peek, lwm)))
5121 {
5122 unsigned char p = *peek;
5123 if (quote_first)
5124 {
5125 if (!raw)
5126 {
5127 if (p != 'R')
5128 break;
5129 raw = true;
5130 continue;
5131 }
5132
5133 quote_first = false;
5134 if (p == 'L' || p == 'U' || p == 'u')
5135 ;
5136 else if (p == '8')
5137 quote_eight = true;
5138 else
5139 goto second_raw;
5140 }
5141 else if (quote_eight)
5142 {
5143 if (p != 'u')
5144 {
5145 raw = false;
5146 break;
5147 }
5148 quote_eight = false;
5149 }
5150 else if (c == '"')
5151 {
5152 second_raw:;
5153 if (!want_number && ISIDNUM (p))
5154 {
5155 raw = false;
5156 break;
5157 }
5158 }
5159
5160 if (ISDIGIT (p))
5161 maybe_number_start = true;
5162 else if (p == '.')
5163 want_number = true;
5164 else if (ISIDNUM (p))
5165 maybe_number_start = false;
5166 else if (p == '+' || p == '-')
5167 {
5168 if (const unsigned char *peek_prev
5169 = do_peek_prev (peek, lwm))
5170 {
5171 p = *peek_prev;
5172 if (p == 'e' || p == 'E'
5173 || p == 'p' || p == 'P')
5174 {
5175 want_number = true;
5176 maybe_number_start = false;
5177 }
5178 else
5179 break;
5180 }
5181 else
5182 break;
5183 }
5184 else if (p == '\'' || p == '\"')
5185 {
5186 /* If this is lwm, this must be the end of a
5187 previous string. So this is a trailing
5188 literal type, (a) if those are allowed,
5189 and (b) maybe_number_start is false. Otherwise
5190 this must be a CPP_NUMBER because we've
5191 met another ', and we'd have checked that
5192 in its own right. */
5193 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5194 {
5195 if (!maybe_number_start && !want_number)
5196 /* Must be a literal type. */
5197 raw = false;
5198 }
5199 else if (p == '\''
5200 && CPP_OPTION (pfile, digit_separators))
5201 maybe_number_start = true;
5202 break;
5203 }
5204 else if (c == '\'')
5205 break;
5206 else if (!quote_first && !quote_eight)
5207 break;
5208 }
5209
5210 if (maybe_number_start)
5211 {
5212 if (c == '\'')
5213 /* A CPP NUMBER. */
5214 goto dflt;
5215 raw = false;
5216 }
5217
5218 goto delimited_string;
5219 }
5220
5221 delimited_string:
5222 {
5223 /* (Possibly raw) string or char literal. */
/* END is the quote character that opened the literal.  ESC counts the
   backslashes seen since the last ordinary character; it is only
   maintained for non-raw literals. */
5224 unsigned char end = c;
5225 int delim_len = -1;
5226 const unsigned char *delim = NULL;
5227 location_t sloc = linemap_position_for_column (pfile->line_table,
5228 pos - line_start);
5229 int esc = 0;
5230
5231 if (raw)
5232 {
5233 /* There can be no line breaks in the delimiter. */
/* On a bad delimiter, report it, rewind to just after the opening
   quote and scan the literal as an ordinary string instead. */
5234 delim = pos;
5235 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5236 {
5237 if (delim_len == 16)
5238 {
5239 cpp_error_with_line (pfile, CPP_DL_ERROR,
5240 sloc, 0,
5241 "raw string delimiter"
5242 " longer than %d"
5243 " characters",
5244 delim_len);
5245 raw = false;
5246 pos = delim;
5247 break;
5248 }
5249 if (strchr (") \\\t\v\f\n", c))
5250 {
5251 cpp_error_with_line (pfile, CPP_DL_ERROR,
5252 sloc, 0,
5253 "invalid character '%c'"
5254 " in raw string"
5255 " delimiter", c);
5256 raw = false;
5257 pos = delim;
5258 break;
5259 }
5260 if (pos >= limit)
5261 goto bad_string;
5262 }
5263 }
5264
5265 while (pos < limit)
5266 {
5267 char c = *pos++;
5268 switch (c)
5269 {
5270 case '\\':
5271 if (!raw)
5272 esc++;
5273 break;
5274
5275 case '\r':
5276 if (*pos == '\n')
5277 pos++;
5278 /* FALLTHROUGH */
5279
5280 case '\n':
5281 {
5282 CPP_INCREMENT_LINE (pfile, 0);
5283 line_count++;
5284 line_start = pos;
5285 }
/* A line end uses up one pending backslash. */
5286 if (esc)
5287 esc--;
5288 break;
5289
5290 case ')':
/* A raw string ends at ')' followed by the delimiter and then the
   closing quote. */
5291 if (raw
5292 && pos + delim_len + 1 < limit
5293 && pos[delim_len] == end
5294 && !memcmp (delim, pos, delim_len))
5295 {
5296 pos += delim_len + 1;
5297 raw = false;
5298 goto done_string;
5299 }
5300 break;
5301
5302 default:
/* An odd backslash count means this quote is escaped. */
5303 if (!raw && !(esc & 1) && c == end)
5304 goto done_string;
5305 esc = 0;
5306 break;
5307 }
5308 }
5309 bad_string:
5310 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5311 "unterminated literal");
5312
5313 done_string:
/* Leave lwm on the last character consumed, which is the closing
   quote when the literal was terminated. */
5314 raw = false;
5315 lwm = pos - 1;
5316 }
5317 goto dflt;
5318
c9c3d5f2
NS
5319 case '_':
5320 case 'e':
5321 case 'i':
5322 case 'm':
5323 if (bol && module_p && !pfile->state.skipping
5324 && do_peek_module (pfile, c, pos, limit))
5325 {
5326 /* We've seen the start of a module control line.
5327 Start up the tokenizer. */
5328 pos--; /* Backup over the first character. */
5329
5330 /* Backup over whitespace to start of line. */
5331 while (pos > line_start
5332 && (pos[-1] == ' ' || pos[-1] == '\t'))
5333 pos--;
5334
5335 if (pos > base)
5336 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5337
5338 /* Prep things for directive handling. */
5339 buffer->next_line = pos;
5340 buffer->need_line = true;
5341
5342 /* Now get tokens until the PRAGMA_EOL. */
5343 do
5344 {
5345 location_t spelling;
5346 const cpp_token *tok
5347 = cpp_get_token_with_location (pfile, &spelling);
5348
5349 gcc_assert (pfile->state.in_deferred_pragma
5350 || tok->type == CPP_PRAGMA_EOL);
5351 cb (pfile, CPP_DO_token, data, tok, spelling);
5352 }
5353 while (pfile->state.in_deferred_pragma);
5354
5355 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5356 cb (pfile, CPP_DO_location, data,
5357 pfile->line_table->highest_line);
5358
5359 pfile->mi_valid = false;
5360 goto restart;
5361 }
5362 goto dflt;
5363
b224c376
NS
5364 default:
5365 dflt:
5366 bol = false;
5367 pfile->mi_valid = false;
5368 break;
5369 }
5370 }
5371
/* End of buffer: hand over whatever text is still pending. */
5372 if (buffer->rlimit > base && !pfile->state.skipping)
c6b664e2
JJ
5373 {
5374 const unsigned char *limit = buffer->rlimit;
5375 /* If the file was not newline terminated, add rlimit, which is
5376 guaranteed to point to a newline, to the end of our range. */
5377 if (limit[-1] != '\n')
5378 {
5379 limit++;
5380 CPP_INCREMENT_LINE (pfile, 0);
5381 line_count++;
5382 }
5383 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5384 }
b224c376
NS
5385
5386 _cpp_pop_buffer (pfile);
5387 }
5388 while (pfile->buffer);
5389}