libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2020 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
   27 enum spell_type
      /* Spelling categories recorded in token_spellings: OP() table entries
         are SPELL_OPERATOR, TK() entries name one of the SPELL_* values.  */
   28 {
   29   SPELL_OPERATOR = 0,
   30   SPELL_IDENT,
   31   SPELL_LITERAL,
   32   SPELL_NONE
   33 };
  34
   35 struct token_spelling
      /* Element type of the token_spellings table, which is indexed by
         token type.  */
   36 {
   37   enum spell_type category;   /* How this token type is spelled.  */
   38   const unsigned char *name;  /* Spelling text stored for this type.  */
   39 };
  40
   41 static const unsigned char *const digraph_spellings[] =
      /* The six digraph spellings.  NOTE(review): the index mapping is set by
         this table's users, which are outside this excerpt -- confirm.  */
   42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
   44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
   45 #define TK(e, s) { SPELL_ ## s,    UC #e },
      /* Expand TTYPE_TABLE with the temporary OP/TK definitions above so that
         every token type gets one { category, name } entry: OP entries are
         SPELL_OPERATOR with the string S as their name, TK entries take the
         category SPELL_<S> and the stringified enumerator E as their name.  */
   46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
   47 #undef OP
   48 #undef TK
   49
      /* Fetch the spelling category / spelling text recorded for the type of
         the token that TOKEN points to.  */
   50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
   51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
   53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
   54 static int skip_line_comment (cpp_reader *);
   55 static void skip_whitespace (cpp_reader *, cppchar_t);
   56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
   57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
   58 static void store_comment (cpp_reader *, cpp_token *);
   59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
   60                             unsigned int, enum cpp_ttype);
   61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
   62 static int name_p (cpp_reader *, const cpp_string *);
   63 static tokenrun *next_tokenrun (tokenrun *);
   64
      /* All prototypes in this group are forward declarations of static
         (file-local) helpers.  */
   65 static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
  115 /* Configure gives us an ifdef test.  Supply a value of 0 when the macro
         is not defined so that it can also be used in ordinary C conditions
         such as `if (WORDS_BIGENDIAN)' and in `#if' expressions below.  */
  116 #ifndef WORDS_BIGENDIAN
  117 #define WORDS_BIGENDIAN 0
  118 #endif
 119
  120 /* We'd like the largest integer that fits into a register.  There's nothing
  121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
  122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
  123    can get the "real" word size.

         word_type is the unit in which the acc_char_* helpers and
         search_line_acc_char below examine the input buffer.  */
  124 #ifdef __GNUC__
  125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
  126 #else
  127 typedef unsigned long word_type;
  128 #endif
 129
  130 /* The code below is only expecting sizes 4 or 8.
  131    Die at compile-time if this expectation is violated.  The array bound
         below evaluates to 1 when the size is acceptable and to -1, an
         invalid array size, otherwise.  */
  132 typedef char check_word_type_size
  133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
  162 /* Return non-zero if some byte of VAL is (probably) C.  The callers in
         this file pass for C a character replicated into every byte by
         acc_char_replicate.  */
  163
  164 static inline word_type
  165 acc_char_cmp (word_type val, word_type c)
  166 {
  167 #if defined(__GNUC__) && defined(__alpha__)
  168   /* We can get exact results using a compare-bytes instruction.
  169      Get (val == c) via (0 >= (val ^ c)).  */
  170   return __builtin_alpha_cmpbge (0, val ^ c);
  171 #else
      /* MAGIC is 0x7efefeff for 4-byte words and 0x7efefefefefefeff for
         8-byte words: all bits set except the lowest bit of every byte but
         the first, and the topmost bit of the word.  */
  172   word_type magic = 0x7efefefeU;
  173   if (sizeof(word_type) == 8)
  174     magic = (magic << 16 << 16) | 0xfefefefeU;
  175   magic |= 1;
  176
      /* After the XOR, bytes of VAL equal to the corresponding byte of C are
         zero.  In the expression below, each bit that is clear in MAGIC
         comes out set exactly when the addition did not carry into it.  A
         zero byte always blocks that carry; some non-zero bytes block it
         too, which is why a non-zero result only means "probably".  */
  177   val ^= c;
  178   return ((val + magic) ^ ~val) & ~magic;
  179 #endif
  180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
  214 /* A version of the fast scanner using bit fiddling techniques.
  215
  216    For 32-bit words, one would normally perform 16 comparisons and
  217    16 branches.  With this algorithm one performs 24 arithmetic
  218    operations and one branch.  Whether this is faster with a 32-bit
  219    word size is going to be somewhat system dependent.
  220
  221    For 64-bit words, we eliminate twice the number of comparisons
  222    and branches without increasing the number of arithmetic operations.
  223    It's almost certainly going to be a win with 64-bit word size.

         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused: the loop has no bounds check and exits only on a match.  */
  224
  225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
  226   ATTRIBUTE_UNUSED;
  227
  228 static const uchar *
  229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  230 {
  231   const word_type repl_nl = acc_char_replicate ('\n');
  232   const word_type repl_cr = acc_char_replicate ('\r');
  233   const word_type repl_bs = acc_char_replicate ('\\');
  234   const word_type repl_qm = acc_char_replicate ('?');
  235
  236   unsigned int misalign;
  237   const word_type *p;
  238   word_type val, t;
  239
  240   /* Align the buffer.  Mask out any bytes from before the beginning.
         The first load starts at the word boundary at or below S, so it may
         include up to sizeof(word_type) - 1 bytes that precede S.  */
  241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
  242   val = *p;
  243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
  244   if (misalign)
  245     val = acc_char_mask_misalign (val, misalign);
  246
  247   /* Main loop.  */
  248   while (1)
  249     {
          /* T is non-zero if VAL probably contains one of the four
             characters.  */
  250       t  = acc_char_cmp (val, repl_nl);
  251       t |= acc_char_cmp (val, repl_cr);
  252       t |= acc_char_cmp (val, repl_bs);
  253       t |= acc_char_cmp (val, repl_qm);
  254
  255       if (__builtin_expect (t != 0, 0))
  256         {
              /* acc_char_index returns -1 for a false positive, in which
                 case scanning simply continues with the next word.  */
  257           int i = acc_char_index (t, val);
  258           if (i >= 0)
  259             return (const uchar *)p + i;
  260         }
  261
  262       val = *++p;
  263     }
  264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
  275 /* Replicated character data to be shared between implementations.
  276    Recall that outside of a context with vector support we can't
  277    define compatible vector types, therefore these are all defined
  278    in terms of raw characters.

         Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'.  The
         SSE2 scanner loads a whole 16-byte row as one vector; the MMX
         scanner loads only the first 8 bytes of a row.  */
  279 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  280   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  281     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  282   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  283     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  284   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  285     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  286   { '?', '?', '?', '?', '?', '?', '?', '?',
  287     '?', '?', '?', '?', '?', '?', '?', '?' },
  288 };
 289
  290 /* A version of the fast scanner using MMX vectorized byte compare insns.
  291
  292    This uses the PMOVMSKB instruction which was introduced with "MMX2",
  293    which was packaged into SSE1; it is also present in the AMD MMX
  294    extension.  Mark the function as using "sse" so that we emit a real
  295    "emms" instruction, rather than the 3dNOW "femms" instruction.

         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused: the loop exits only when a match is found.  */
  296
  297 static const uchar *
  298 #ifndef __SSE__
  299 __attribute__((__target__("sse")))
  300 #endif
  301 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  302 {
  303   typedef char v8qi __attribute__ ((__vector_size__ (8)));
  304   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
  305
      /* Only the first 8 bytes of each 16-byte repl_chars row are used.  */
  306   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  307   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  308   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  309   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
  310
  311   unsigned int misalign, found, mask;
  312   const v8qi *p;
  313   v8qi data, t, c;
  314
  315   /* Align the source pointer.  While MMX doesn't generate unaligned data
  316      faults, this allows us to safely scan to the end of the buffer without
  317      reading beyond the end of the last page.  */
  318   misalign = (uintptr_t)s & 7;
  319   p = (const v8qi *)((uintptr_t)s & -8);
  320   data = *p;
  321
  322   /* Create a mask for the bytes that are valid within the first
  323      8-byte block, i.e. clear the low MISALIGN bits so that matches in
         bytes before S are ignored.  The idea here is that the AND with the
  324      mask within the loop is "free", since we need some AND or TEST
  325      insn in order to set the flags for the branch anyway.  */
  326   mask = -1u << misalign;
  327
  328   /* Main loop processing 8 bytes at a time.  The first block has already
         been loaded, so jump past the load at the top of the loop.  */
  329   goto start;
  330   do
  331     {
  332       data = *++p;
  333       mask = -1;
  334
  335     start:
          /* OR together the byte-wise equality results for all four
             characters, then collect one bit per byte into FOUND.  */
  336       t = __builtin_ia32_pcmpeqb(data, repl_nl);
  337       c = __builtin_ia32_pcmpeqb(data, repl_cr);
  338       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  339       c = __builtin_ia32_pcmpeqb(data, repl_bs);
  340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  341       c = __builtin_ia32_pcmpeqb(data, repl_qm);
  342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
  343       found = __builtin_ia32_pmovmskb (t);
  344       found &= mask;
  345     }
  346   while (!found);
  347
  348   __builtin_ia32_emms ();
  349
  350   /* FOUND contains 1 in bits for which we matched a relevant
  351      character.  Conversion to the byte index is trivial.  */
  352   found = __builtin_ctz(found);
  353   return (const uchar *)p + found;
  354 }
 355
  356 /* A version of the fast scanner using SSE2 vectorized byte compare insns.
         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused: the loop exits only when a match is found.  */
  357
  358 static const uchar *
  359 #ifndef __SSE2__
  360 __attribute__((__target__("sse2")))
  361 #endif
  362 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  363 {
  364   typedef char v16qi __attribute__ ((__vector_size__ (16)));
  365
  366   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  367   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  368   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  369   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
  370
  371   unsigned int misalign, found, mask;
  372   const v16qi *p;
  373   v16qi data, t;
  374
  375   /* Align the source pointer.  The first load therefore starts at the
         16-byte boundary at or below S.  */
  376   misalign = (uintptr_t)s & 15;
  377   p = (const v16qi *)((uintptr_t)s & -16);
  378   data = *p;
  379
  380   /* Create a mask for the bytes that are valid within the first
  381      16-byte block, i.e. clear the low MISALIGN bits so that matches in
         bytes before S are ignored.  The idea here is that the AND with the
  382      mask within the loop is "free", since we need some AND or TEST
  383      insn in order to set the flags for the branch anyway.  */
  384   mask = -1u << misalign;
  385
  386   /* Main loop processing 16 bytes at a time.  The first block has
         already been loaded, so jump past the load at the top of the loop.  */
  387   goto start;
  388   do
  389     {
  390       data = *++p;
  391       mask = -1;
  392
  393     start:
          /* OR together the byte-wise equality results for all four
             characters, then collect one bit per byte into FOUND.  */
  394       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
  395       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
  396       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
  397       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
  398       found = __builtin_ia32_pmovmskb128 (t);
  399       found &= mask;
  400     }
  401   while (!found);
  402
  403   /* FOUND contains 1 in bits for which we matched a relevant
  404      character.  Conversion to the byte index is trivial.  */
  405   found = __builtin_ctz(found);
  406   return (const uchar *)p + found;
  407 }
 408
  409 #ifdef HAVE_SSE4
  410 /* A version of the fast scanner using SSE 4.2 vectorized string insns.
         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is used only to decide whether an unaligned 16-byte read at S is
         safe.  */
  411
  412 static const uchar *
  413 #ifndef __SSE4_2__
  414 __attribute__((__target__("sse4.2")))
  415 #endif
  416 search_line_sse42 (const uchar *s, const uchar *end)
  417 {
  418   typedef char v16qi __attribute__ ((__vector_size__ (16)));
      /* The four characters searched for; PCMPESTRI is given an explicit
         length of 4 for this operand below.  */
  419   static const v16qi search = { '\n', '\r', '?', '\\' };
  420
  421   uintptr_t si = (uintptr_t)s;
  422   uintptr_t index;
  423
  424   /* Check for unaligned input.  */
  425   if (si & 15)
  426     {
  427       v16qi sv;
  428
  429       if (__builtin_expect (end - s < 16, 0)
  430           && __builtin_expect ((si & 0xfff) > 0xff0, 0))
  431         {
  432           /* There are fewer than 16 bytes left in the buffer, and fewer
  433              than 16 bytes left on the page.  Reading 16 bytes at this
  434              point might generate a spurious page fault.  Defer to the
  435              SSE2 implementation, which already handles alignment.  */
  436           return search_line_sse2 (s, end);
  437         }
  438
  439       /* ??? The builtin doesn't understand that the PCMPESTRI read from
  440          memory need not be aligned.  */
  441       sv = __builtin_ia32_loaddqu ((const char *) s);
  442       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
  443
          /* An index below 16 is the offset of a match within SV.  */
  444       if (__builtin_expect (index < 16, 0))
  445         goto found;
  446
  447       /* Advance the pointer to an aligned address.  We will re-scan a
  448          few bytes, but we no longer need care for reading past the
  449          end of a page, since we're guaranteed a match.  */
  450       s = (const uchar *)((si + 15) & -16);
  451     }
  452
  453   /* Main loop, processing 16 bytes at a time.  */
  454 #ifdef __GCC_ASM_FLAG_OUTPUTS__
  455   while (1)
  456     {
  457       char f;
  458
  459       /* By using inline assembly instead of the builtin,
  460          we can use the result, as well as the flags set.  INDEX comes
             back in ECX and F receives the carry flag; the loop stops as
             soon as F is set.  */
  461       __asm ("%vpcmpestri\t$0, %2, %3"
  462              : "=c"(index), "=@ccc"(f)
  463              : "m"(*s), "x"(search), "a"(4), "d"(16));
  464       if (f)
  465         break;
  466
  467       s += 16;
  468     }
  469 #else
      /* Pre-decrement S because the asm loop adds 16 before each compare.  */
  470   s -= 16;
  471   /* By doing the whole loop in inline assembly,
  472      we can make proper use of the flags set.  The loop repeats while
         the carry flag is clear (jnc).  */
  473   __asm (      ".balign 16\n"
  474         "0:     add $16, %1\n"
  475         "       %vpcmpestri\t$0, (%1), %2\n"
  476         "       jnc 0b"
  477         : "=&c"(index), "+r"(s)
  478         : "x"(search), "a"(4), "d"(16));
  479 #endif
  480
  481  found:
  482   return s + index;
  483 }
  484
  485 #else
  486 /* Work around out-dated assemblers without sse4 support.  */
  487 #define search_line_sse42 search_line_sse2
  488 #endif
 489
 490 /* Check the CPU capabilities.  */
 491
 492 #include "../gcc/config/i386/cpuid.h"
 493
  494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
      /* The scanner implementation in use; assigned by
         init_vectorized_lexer below.  */
  495 static search_line_fast_type search_line_fast;
  496
  497 #define HAVE_init_vectorized_lexer 1
      /* Choose the search_line_fast implementation from the instruction sets
         guaranteed at compile time and those reported by CPUID at run
         time.  */
  498 static inline void
  499 init_vectorized_lexer (void)
  500 {
  501   unsigned dummy, ecx = 0, edx = 0;
  502   search_line_fast_type impl = search_line_acc_char;
  503   int minimum = 0;
  504
      /* MINIMUM records the level the compiler was told it may assume:
         3 = SSE4.2, 2 = SSE2, 1 = SSE, 0 = none of them.  */
  505 #if defined(__SSE4_2__)
  506   minimum = 3;
  507 #elif defined(__SSE2__)
  508   minimum = 2;
  509 #elif defined(__SSE__)
  510   minimum = 1;
  511 #endif
  512
  513   if (minimum == 3)
  514     impl = search_line_sse42;
  515   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
  516     {
          /* MINIMUM == 3 was already handled above, so only the ECX test can
             select SSE4.2 here.  */
  517       if (minimum == 3 || (ecx & bit_SSE4_2))
  518         impl = search_line_sse42;
  519       else if (minimum == 2 || (edx & bit_SSE2))
  520         impl = search_line_sse2;
  521       else if (minimum == 1 || (edx & bit_SSE))
  522         impl = search_line_mmx;
  523     }
      /* CPUID leaf 1 was unavailable: try the extended leaf, and use the MMX
         scanner if it reports both MMXEXT and CMOV.  */
  524   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
  525     {
  526       if (minimum == 1
  527           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
  528         impl = search_line_mmx;
  529     }
  530
  531   search_line_fast = impl;
  532 }
 533
 534 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
 535
  536 /* A version of the fast scanner using AltiVec vectorized byte compares
  537    and VSX unaligned loads (when VSX is available).  This is otherwise
  538    the same as the pre-GCC 5 version.

         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused: the loop exits only when a match is found.  */
  539
  540 ATTRIBUTE_NO_SANITIZE_UNDEFINED
  541 static const uchar *
  542 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  543 {
  544   typedef __attribute__((altivec(vector))) unsigned char vc;
  545
  546   const vc repl_nl = {
  547     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  548     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  549   };
  550   const vc repl_cr = {
  551     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  552     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  553   };
  554   const vc repl_bs = {
  555     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  556     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  557   };
  558   const vc repl_qm = {
  559     '?', '?', '?', '?', '?', '?', '?', '?',
  560     '?', '?', '?', '?', '?', '?', '?', '?',
  561   };
  562   const vc zero = { 0 };
  563
  564   vc data, t;
  565
  566   /* Main loop processing 16 bytes at a time.  The load starts at S
         itself, so no bytes before S are examined and no masking is
         needed.  */
  567   do
  568     {
  569       vc m_nl, m_cr, m_bs, m_qm;
  570
  571       data = __builtin_vec_vsx_ld (0, s);
  572       s += 16;
  573
  574       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
  575       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
  576       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
  577       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
  578       t = (m_nl | m_cr) | (m_bs | m_qm);
  579
  580       /* T now contains 0xff in bytes for which we matched one of the relevant
  581          characters.  We want to exit the loop if any byte in T is non-zero.
  582          Below is the expansion of vec_any_ne(t, zero).  */
  583     }
  584   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  585
  586   /* Restore s to point to the 16 bytes we just processed.  */
  587   s -= 16;
  588
  589   {
  590 #define N  (sizeof(vc) / sizeof(long))
  591
  592     union {
  593       vc v;
  594       /* Statically assert that N is 2 or 4.  */
  595       unsigned long l[(N == 2 || N == 4) ? N : -1];
  596     } u;
  597     unsigned long l, i = 0;
  598
  599     u.v = t;
  600
  601     /* Find the first word of T that is non-zero.  S is advanced past
           every all-zero word that is skipped.  */
  602     switch (N)
  603       {
  604       case 4:
  605         l = u.l[i++];
  606         if (l != 0)
  607           break;
  608         s += sizeof(unsigned long);
  609         l = u.l[i++];
  610         if (l != 0)
  611           break;
  612         s += sizeof(unsigned long);
  613         /* FALLTHRU */
  614       case 2:
  615         l = u.l[i++];
  616         if (l != 0)
  617           break;
  618         s += sizeof(unsigned long);
  619         l = u.l[i];
  620       }
  621
  622     /* L now contains 0xff in bytes for which we matched one of the
  623        relevant characters.  We can find the byte index by finding
  624        its bit index and dividing by 8.  The first byte in memory is the
           most significant one on big-endian hosts (count leading zeros)
           and the least significant one otherwise (count trailing
           zeros).  */
  625 #ifdef __BIG_ENDIAN__
  626     l = __builtin_clzl(l) >> 3;
  627 #else
  628     l = __builtin_ctzl(l) >> 3;
  629 #endif
  630     return s + l;
  631
  632 #undef N
  633   }
  634 }
 635
 636 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
 637
  638 /* A version of the fast scanner using AltiVec vectorized byte compares.
  639    This cannot be used for little endian because vec_lvsl/lvsr are
  640    deprecated for little endian and the code won't work properly.

         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused: the loop exits only when a match is found.  */
  641 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
  642    so we can't compile this function without -maltivec on the command line
  643    (or implied by some other switch).  */
  644
  645 static const uchar *
  646 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  647 {
  648   typedef __attribute__((altivec(vector))) unsigned char vc;
  649
  650   const vc repl_nl = {
  651     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
  652     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  653   };
  654   const vc repl_cr = {
  655     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
  656     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  657   };
  658   const vc repl_bs = {
  659     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
  660     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  661   };
  662   const vc repl_qm = {
  663     '?', '?', '?', '?', '?', '?', '?', '?',
  664     '?', '?', '?', '?', '?', '?', '?', '?',
  665   };
  666   const vc ones = {
  667     -1, -1, -1, -1, -1, -1, -1, -1,
  668     -1, -1, -1, -1, -1, -1, -1, -1,
  669   };
  670   const vc zero = { 0 };
  671
  672   vc data, mask, t;
  673
  674   /* Altivec loads automatically mask addresses with -16.  This lets us
  675      issue the first load as early as possible.  */
  676   data = __builtin_vec_ld(0, (const vc *)s);
  677
  678   /* Discard bytes before the beginning of the buffer.  Do this by
  679      beginning with all ones and shifting in zeros according to the
  680      mis-alignment.  The LVSR instruction pulls the exact shift we
  681      want from the address.  */
  682   mask = __builtin_vec_lvsr(0, s);
  683   mask = __builtin_vec_perm(zero, ones, mask);
  684   data &= mask;
  685
  686   /* While altivec loads mask addresses, we still need to align S so
  687      that the offset we compute at the end is correct.  */
  688   s = (const uchar *)((uintptr_t)s & -16);
  689
  690   /* Main loop processing 16 bytes at a time.  The first block has
         already been loaded and masked, so jump past the load at the top of
         the loop.  */
  691   goto start;
  692   do
  693     {
  694       vc m_nl, m_cr, m_bs, m_qm;
  695
  696       s += 16;
  697       data = __builtin_vec_ld(0, (const vc *)s);
  698
  699     start:
  700       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
  701       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
  702       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
  703       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
  704       t = (m_nl | m_cr) | (m_bs | m_qm);
  705
  706       /* T now contains 0xff in bytes for which we matched one of the relevant
  707          characters.  We want to exit the loop if any byte in T is non-zero.
  708          Below is the expansion of vec_any_ne(t, zero).  */
  709     }
  710   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
  711
  712   {
  713 #define N  (sizeof(vc) / sizeof(long))
  714
  715     union {
  716       vc v;
  717       /* Statically assert that N is 2 or 4.  */
  718       unsigned long l[(N == 2 || N == 4) ? N : -1];
  719     } u;
  720     unsigned long l, i = 0;
  721
  722     u.v = t;
  723
  724     /* Find the first word of T that is non-zero.  S is advanced past
           every all-zero word that is skipped.  */
  725     switch (N)
  726       {
  727       case 4:
  728         l = u.l[i++];
  729         if (l != 0)
  730           break;
  731         s += sizeof(unsigned long);
  732         l = u.l[i++];
  733         if (l != 0)
  734           break;
  735         s += sizeof(unsigned long);
  736         /* FALLTHROUGH */
  737       case 2:
  738         l = u.l[i++];
  739         if (l != 0)
  740           break;
  741         s += sizeof(unsigned long);
  742         l = u.l[i];
  743       }
  744
  745     /* L now contains 0xff in bytes for which we matched one of the
  746        relevant characters.  We can find the byte index by finding
  747        its bit index and dividing by 8.  This variant is big-endian
           only, so the first byte in memory is the most significant one and
           leading zeros are counted.  */
  748     l = __builtin_clzl(l) >> 3;
  749     return s + l;
  750
  751 #undef N
  752   }
  753 }
 754
 755 #elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
 756 #include "arm_neon.h"
 757
  758 /* This doesn't have to be the exact page size, but no system may use
  759    a size smaller than this.  ARMv8 requires a minimum page size of
  760    4k.  The impact of being conservative here is a small number of
  761    cases will take the slightly slower entry path into the main
  762    loop.  */
  763
  764 #define AARCH64_MIN_PAGE_SIZE 4096
  765
      /* AArch64 NEON version of the fast scanner.  Returns a pointer to the
         first \n, \r, \\ or ? at or after S.  END is unused: the main loop
         exits only when a match is found.  */
  766 static const uchar *
  767 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  768 {
  769   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  770   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  771   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  772   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
      /* One distinct bit per byte within each 8-byte half; used to turn the
         0xff/0x00 per-byte match vector into a bit mask.  */
  773   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
  774
      /* Shift counts that move the partial masks of the half holding bytes
         8..15 up by 8 bits before everything is summed.  */
  775 #ifdef __ARM_BIG_ENDIAN
  776   const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
  777 #else
  778   const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
  779 #endif
  780
  781   unsigned int found;
  782   const uint8_t *p;
  783   uint8x16_t data;
  784   uint8x16_t t;
  785   uint16x8_t m;
  786   uint8x16_t u, v, w;
  787
  788   /* Align the source pointer.  */
  789   p = (const uint8_t *)((uintptr_t)s & -16);
  790
  791   /* Assuming random string start positions, with a 4k page size we'll take
  792      the slow path about 0.37% of the time.  */
  793   if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
  794                          - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
  795                         < 16, 0))
  796     {
  797       /* Slow path: the string starts near a possible page boundary.
             Load the aligned block containing S and ignore matches in the
             bytes that precede S.  */
  798       uint32_t misalign, mask;
  799
  800       misalign = (uintptr_t)s & 15;
  801       mask = (-1u << misalign) & 0xffff;
  802       data = vld1q_u8 (p);
  803       t = vceqq_u8 (data, repl_nl);
  804       u = vceqq_u8 (data, repl_cr);
  805       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  806       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  807       t = vorrq_u8 (v, w);
  808       t = vandq_u8 (t, xmask);
  809       m = vpaddlq_u8 (t);
  810       m = vshlq_u16 (m, shift);
  811       found = vaddvq_u16 (m);
  812       found &= mask;
  813       if (found)
  814         return (const uchar*)p + __builtin_ctz (found);
  815     }
  816   else
  817     {
          /* Fast path: an unaligned 16-byte load starting at S itself.  */
  818       data = vld1q_u8 ((const uint8_t *) s);
  819       t = vceqq_u8 (data, repl_nl);
  820       u = vceqq_u8 (data, repl_cr);
  821       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  822       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  823       t = vorrq_u8 (v, w);
  824       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
  825         goto done;
  826     }
  827
      /* Main loop: continue with aligned 16-byte blocks after P.  When S
         was unaligned this re-examines a few bytes that were already
         scanned above.  */
  828   do
  829     {
  830       p += 16;
  831       data = vld1q_u8 (p);
  832       t = vceqq_u8 (data, repl_nl);
  833       u = vceqq_u8 (data, repl_cr);
  834       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  835       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  836       t = vorrq_u8 (v, w);
  837     } while (!vpaddd_u64 ((uint64x2_t)t));
  838
  839 done:
  840   /* Now that we've found the terminating substring, work out precisely where
  841      we need to stop.  When we arrive here straight from the fast path, T
         describes the block loaded at S and P has not advanced past S, so
         the offset is applied to S; otherwise it is applied to P.  */
  842   t = vandq_u8 (t, xmask);
  843   m = vpaddlq_u8 (t);
  844   m = vshlq_u16 (m, shift);
  845   found = vaddvq_u16 (m);
  846   return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
  847           + __builtin_ctz (found));
  848 }
 849
 850 #elif defined (__ARM_NEON)
 851 #include "arm_neon.h"
 852
  853 static const uchar *
  854 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
  855 {
      /* NEON version of the fast scanner for the non-64-bit ARM case.
         Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
         is unused: the loop exits only when a match is found.  */
  856   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  857   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  858   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  859   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
      /* One distinct bit per byte within each 8-byte half; used to turn the
         0xff/0x00 per-byte match vector into a bit mask.  */
  860   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
  861
  862   unsigned int misalign, found, mask;
  863   const uint8_t *p;
  864   uint8x16_t data;
  865
  866   /* Align the source pointer.  */
  867   misalign = (uintptr_t)s & 15;
  868   p = (const uint8_t *)((uintptr_t)s & -16);
  869   data = vld1q_u8 (p);
  870
  871   /* Create a mask for the bytes that are valid within the first
  872      16-byte block, i.e. clear the low MISALIGN bits so that matches in
         bytes before S are ignored.  The idea here is that the AND with the
  873      mask within the loop is "free", since we need some AND or TEST
  874      insn in order to set the flags for the branch anyway.  */
  875   mask = (-1u << misalign) & 0xffff;
  876
  877   /* Main loop, processing 16 bytes at a time.  The first block has
         already been loaded, so jump past the load at the top of the loop.  */
  878   goto start;
  879
  880   do
  881     {
  882       uint8x8_t l;
  883       uint16x4_t m;
  884       uint32x2_t n;
  885       uint8x16_t t, u, v, w;
  886
  887       p += 16;
  888       data = vld1q_u8 (p);
  889       mask = 0xffff;
  890
  891     start:
  892       t = vceqq_u8 (data, repl_nl);
  893       u = vceqq_u8 (data, repl_cr);
  894       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  895       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
  896       t = vandq_u8 (vorrq_u8 (v, w), xmask);
          /* Pairwise additions fold the per-byte bits into one 8-bit mask
             per 8-byte half, held in the two lanes of N.  */
  897       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
  898       m = vpaddl_u8 (l);
  899       n = vpaddl_u16 (m);
  900
          /* Combine both halves into a single 16-bit mask in lane 0.  */
  901       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
  902               vshr_n_u64 ((uint64x1_t) n, 24)), 0);
  903       found &= mask;
  904     }
  905   while (!found);
  906
  907   /* FOUND contains 1 in bits for which we matched a relevant
  908      character.  Conversion to the byte index is trivial.  */
  909   found = __builtin_ctz (found);
  910   return (const uchar *)p + found;
  911 }
 912
 913 #else
 914
 915 /* Fallback when none of the implementations selected by the conditionals
 916    above applies: we only have one accelerated alternative.  Alias
        search_line_fast to search_line_acc_char with a macro so that the
        call is direct, which encourages inlining.  */
 917
 918 #define search_line_fast  search_line_acc_char
 919
 920 #endif
 921
 922 /* Initialize the lexer if needed.  This calls init_vectorized_lexer
        when HAVE_init_vectorized_lexer is defined and does nothing
        otherwise.  */
 923
 924 void
 925 _cpp_init_lexer (void)
 926 {
 927 #ifdef HAVE_init_vectorized_lexer
 928   init_vectorized_lexer ();
 929 #endif
 930 }
 931
 932 /* Returns with a logical line that contains no escaped newlines or
 933    trigraphs.  This is a time-critical inner loop.

        The line starts at buffer->next_line.  On entry the note list is
        reset, buffer->cur and buffer->line_base are set to the start of the
        line and buffer->need_line is cleared.  On exit a '\n' has been
        stored where the cleaned line ends (D), a sentinel note has been
        added just after it, and buffer->next_line points one past the last
        source character consumed (S).  D can be before S when escaped
        newlines or trigraphs were spliced out.

        Buffers with from_stage3 set are only scanned for the end of line;
        no splicing or trigraph handling is done for them.  */
 934 void
 935 _cpp_clean_line (cpp_reader *pfile)
 936 {
 937   cpp_buffer *buffer;
       /* S is the read position, D the write position, P a scratch
          pointer used when looking backwards for a backslash.  */
 938   const uchar *s;
 939   uchar c, *d, *p;
 940
 941   buffer = pfile->buffer;
 942   buffer->cur_note = buffer->notes_used = 0;
 943   buffer->cur = buffer->line_base = buffer->next_line;
 944   buffer->need_line = false;
 945   s = buffer->next_line;
 946
 947   if (!buffer->from_stage3)
 948     {
           /* Most recent backslash seen on the fast path, if any.  */
 949       const uchar *pbackslash = NULL;
 950
 951       /* Fast path.  This is the common case of an un-escaped line with
 952          no trigraphs.  The primary win here is by not writing any
 953          data back to memory until we have to.  */
 954       while (1)
 955         {
 956           /* Perform an optimized search for \n, \r, \\, ?.  */
 957           s = search_line_fast (s, buffer->rlimit);
 958
 959           c = *s;
 960           if (c == '\\')
 961             {
 962               /* Record the location of the backslash and continue.  */
 963               pbackslash = s++;
 964             }
 965           else if (__builtin_expect (c == '?', 0))
 966             {
 967               if (__builtin_expect (s[1] == '?', false)
 968                    && _cpp_trigraph_map[s[2]])
 969                 {
 970                   /* Have a trigraph.  We may or may not have to convert
 971                      it.  Add a line note regardless, for -Wtrigraphs.  */
 972                   add_line_note (buffer, s, s[2]);
 973                   if (CPP_OPTION (pfile, trigraphs))
 974                     {
 975                       /* We do, and that means we have to switch to the
 976                          slow path.  The first '?' is overwritten with the
                              replacement character and S is left on the last
                              character of the trigraph; the slow path
                              pre-increments both pointers before copying.  */
 977                       d = (uchar *) s;
 978                       *d = _cpp_trigraph_map[s[2]];
 979                       s += 2;
 980                       goto slow_path;
 981                     }
 982                 }
 983               /* Not a trigraph, or one we are not converting.  Continue
                      on fast-path.  */
 984               s++;
 985             }
 986           else
 987             break;
 988         }
 989
 990       /* This must be \r or \n.  We're either done, or we'll be forced
 991          to write back to the buffer and continue on the slow path.  */
 992       d = (uchar *) s;
 993
 994       if (__builtin_expect (s == buffer->rlimit, false))
 995         goto done;
 996
 997       /* DOS line ending? */
 998       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
 999         {
1000           s++;
1001           if (s == buffer->rlimit)
1002             goto done;
1003         }
1004
1005       if (__builtin_expect (pbackslash == NULL, true))
1006         goto done;
1007
1008       /* Check for escaped newline.  */
           /* Walk back over non-vertical whitespace; the newline is escaped
              only if the character before that run is the backslash we
              recorded.  */
1009       p = d;
1010       while (is_nvspace (p[-1]))
1011         p--;
1012       if (p - 1 != pbackslash)
1013         goto done;
1014
1015       /* Have an escaped newline; process it and proceed to
1016          the slow path.  */
           /* The note type is ' ' when whitespace separated the backslash
              from the newline, '\\' otherwise.  D is parked one before the
              backslash because the slow path pre-increments it, so the next
              character copied lands on the backslash itself.  next_line is
              reused as the lower bound for the slow path's backwards scan.  */
1017       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
1018       d = p - 2;
1019       buffer->next_line = p - 1;
1020
1021     slow_path:
           /* Copy characters from S down to D one at a time, splicing out
              escaped newlines and converting trigraphs as we go.  */
1022       while (1)
1023         {
1024           c = *++s;
1025           *++d = c;
1026
1027           if (c == '\n' || c == '\r')
1028             {
1029               /* Handle DOS line endings.  */
1030               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
1031                 s++;
1032               if (s == buffer->rlimit)
1033                 break;
1034
1035               /* Escaped?  */
1036               p = d;
1037               while (p != buffer->next_line && is_nvspace (p[-1]))
1038                 p--;
1039               if (p == buffer->next_line || p[-1] != '\\')
1040                 break;
1041
1042               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
1043               d = p - 2;
1044               buffer->next_line = p - 1;
1045             }
1046           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
1047             {
1048               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
1049               add_line_note (buffer, d, s[2]);
1050               if (CPP_OPTION (pfile, trigraphs))
1051                 {
1052                   *d = _cpp_trigraph_map[s[2]];
1053                   s += 2;
1054                 }
1055             }
1056         }
1057     }
1058   else
1059     {
           /* No cleaning is done for this buffer: just find the end of the
              line.  */
1060       while (*s != '\n' && *s != '\r')
1061         s++;
1062       d = (uchar *) s;
1063
1064       /* Handle DOS line endings.  */
1065       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
1066         s++;
1067     }
1068
1069  done:
       /* Terminate the cleaned line with a plain newline, whatever the
          original terminator was.  */
1070   *d = '\n';
1071   /* A sentinel note that should never be processed.  */
1072   add_line_note (buffer, d + 1, '\n');
1073   buffer->next_line = s + 1;
1074 }
1075
1076 /* Return true if the trigraph indicated by NOTE should be warned
1077    about in a comment.  */
1078 static bool
1079 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1080 {
1081   const uchar *p;
1082
1083   /* Within comments we don't warn about trigraphs, unless the
1084      trigraph forms an escaped newline, as that may change
1085      behavior.  */
1086   if (note->type != '/')
1087     return false;
1088
1089   /* If -trigraphs, then this was an escaped newline iff the next note
1090      is coincident.  */
1091   if (CPP_OPTION (pfile, trigraphs))
1092     return note[1].pos == note->pos;
1093
1094   /* Otherwise, see if this forms an escaped newline.  */
1095   p = note->pos + 3;
1096   while (is_nvspace (*p))
1097     p++;
1098
1099   /* There might have been escaped newlines between the trigraph and the
1100      newline we found.  Hence the position test.  */
1101   return (*p == '\n' && p < note[1].pos);
1102 }
1103
1104 /* Process the notes created by add_line_note as far as the current
1105    location.  */
1106 void
1107 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1108 {
1109   cpp_buffer *buffer = pfile->buffer;
1110
1111   for (;;)
1112     {
1113       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1114       unsigned int col;
1115
1116       if (note->pos > buffer->cur)
1117         break;
1118
1119       buffer->cur_note++;
1120       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1121
1122       if (note->type == '\\' || note->type == ' ')
1123         {
1124           if (note->type == ' ' && !in_comment)
1125             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1126                                  "backslash and newline separated by space");
1127
1128           if (buffer->next_line > buffer->rlimit)
1129             {
1130               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1131                                    "backslash-newline at end of file");
1132               /* Prevent "no newline at end of file" warning.  */
1133               buffer->next_line = buffer->rlimit;
1134             }
1135
1136           buffer->line_base = note->pos;
1137           CPP_INCREMENT_LINE (pfile, 0);
1138         }
1139       else if (_cpp_trigraph_map[note->type])
1140         {
1141           if (CPP_OPTION (pfile, warn_trigraphs)
1142               && (!in_comment || warn_in_comment (pfile, note)))
1143             {
1144               if (CPP_OPTION (pfile, trigraphs))
1145                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1146                                        pfile->line_table->highest_line, col,
1147                                        "trigraph ??%c converted to %c",
1148                                        note->type,
1149                                        (int) _cpp_trigraph_map[note->type]);
1150               else
1151                 {
1152                   cpp_warning_with_line
1153                     (pfile, CPP_W_TRIGRAPHS,
1154                      pfile->line_table->highest_line, col,
1155                      "trigraph ??%c ignored, use -trigraphs to enable",
1156                      note->type);
1157                 }
1158             }
1159         }
1160       else if (note->type == 0)
1161         /* Already processed in lex_raw_string.  */;
1162       else
1163         abort ();
1164     }
1165 }
1166
1167 /* Skip a C-style block comment.  We find the end of the comment by
1168    seeing if an asterisk is before every '/' we encounter.  Returns
1169    nonzero if comment terminated by EOF, zero otherwise.
1170
1171    Buffer->cur points to the initial asterisk of the comment.  */
1172 bool
1173 _cpp_skip_block_comment (cpp_reader *pfile)
1174 {
1175   cpp_buffer *buffer = pfile->buffer;
1176   const uchar *cur = buffer->cur;
1177   uchar c;
1178
1179   cur++;
1180   if (*cur == '/')
1181     cur++;
1182
1183   for (;;)
1184     {
1185       /* People like decorating comments with '*', so check for '/'
1186          instead for efficiency.  */
1187       c = *cur++;
1188
1189       if (c == '/')
1190         {
1191           if (cur[-2] == '*')
1192             break;
1193
1194           /* Warn about potential nested comments, but not if the '/'
1195              comes immediately before the true comment delimiter.
1196              Don't bother to get it right across escaped newlines.  */
1197           if (CPP_OPTION (pfile, warn_comments)
1198               && cur[0] == '*' && cur[1] != '/')
1199             {
1200               buffer->cur = cur;
1201               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1202                                      pfile->line_table->highest_line,
1203                                      CPP_BUF_COL (buffer),
1204                                      "\"/*\" within comment");
1205             }
1206         }
1207       else if (c == '\n')
1208         {
1209           unsigned int cols;
1210           buffer->cur = cur - 1;
1211           _cpp_process_line_notes (pfile, true);
1212           if (buffer->next_line >= buffer->rlimit)
1213             return true;
1214           _cpp_clean_line (pfile);
1215
1216           cols = buffer->next_line - buffer->line_base;
1217           CPP_INCREMENT_LINE (pfile, cols);
1218
1219           cur = buffer->cur;
1220         }
1221     }
1222
1223   buffer->cur = cur;
1224   _cpp_process_line_notes (pfile, true);
1225   return false;
1226 }
1227
1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1229    terminating newline.  Handles escaped newlines.  Returns nonzero
1230    if a multiline comment.  */
1231 static int
1232 skip_line_comment (cpp_reader *pfile)
1233 {
1234   cpp_buffer *buffer = pfile->buffer;
1235   location_t orig_line = pfile->line_table->highest_line;
1236
1237   while (*buffer->cur != '\n')
1238     buffer->cur++;
1239
1240   _cpp_process_line_notes (pfile, true);
1241   return orig_line != pfile->line_table->highest_line;
1242 }
1243
1244 /* Skips whitespace, saving the next non-whitespace character.  */
1245 static void
1246 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1247 {
1248   cpp_buffer *buffer = pfile->buffer;
1249   bool saw_NUL = false;
1250
1251   do
1252     {
1253       /* Horizontal space always OK.  */
1254       if (c == ' ' || c == '\t')
1255         ;
1256       /* Just \f \v or \0 left.  */
1257       else if (c == '\0')
1258         saw_NUL = true;
1259       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1260         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1261                              CPP_BUF_COL (buffer),
1262                              "%s in preprocessing directive",
1263                              c == '\f' ? "form feed" : "vertical tab");
1264
1265       c = *buffer->cur++;
1266     }
1267   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1268   while (is_nvspace (c));
1269
1270   if (saw_NUL)
1271     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1272
1273   buffer->cur--;
1274 }
1275
1276 /* See if the characters of a number token are valid in a name (no
1277    '.', '+' or '-').  */
1278 static int
1279 name_p (cpp_reader *pfile, const cpp_string *string)
1280 {
1281   unsigned int i;
1282
1283   for (i = 0; i < string->len; i++)
1284     if (!is_idchar (string->text[i]))
1285       return 0;
1286
1287   return 1;
1288 }
1289
1290 /* After parsing an identifier or other sequence, produce a warning about
1291    sequences not in NFC/NFKC.  */
1292 static void
1293 warn_about_normalization (cpp_reader *pfile,
1294                           const cpp_token *token,
1295                           const struct normalize_state *s)
1296 {
1297   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1298       && !pfile->state.skipping)
1299     {
1300       /* Make sure that the token is printed using UCNs, even
1301          if we'd otherwise happily print UTF-8.  */
1302       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1303       size_t sz;
1304
1305       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1306       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1307         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1308                                "`%.*s' is not in NFKC", (int) sz, buf);
1309       else
1310         cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1311                                "`%.*s' is not in NFC", (int) sz, buf);
1312       free (buf);
1313     }
1314 }
1315
1316 static const cppchar_t utf8_signifier = 0xC0;
1317
1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in
1319    an identifier.  FIRST is TRUE if this starts an identifier.  */
1320 static bool
1321 forms_identifier_p (cpp_reader *pfile, int first,
1322                     struct normalize_state *state)
1323 {
1324   cpp_buffer *buffer = pfile->buffer;
1325
1326   if (*buffer->cur == '$')
1327     {
1328       if (!CPP_OPTION (pfile, dollars_in_ident))
1329         return false;
1330
1331       buffer->cur++;
1332       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1333         {
1334           CPP_OPTION (pfile, warn_dollars) = 0;
1335           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1336         }
1337
1338       return true;
1339     }
1340
1341   /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
1342   if (CPP_OPTION (pfile, extended_identifiers))
1343     {
1344       cppchar_t s;
1345       if (*buffer->cur >= utf8_signifier)
1346         {
1347           if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1348                                state, &s))
1349             return true;
1350         }
1351       else if (*buffer->cur == '\\'
1352                && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1353         {
1354           buffer->cur += 2;
1355           if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1356                               state, &s, NULL, NULL))
1357             return true;
1358           buffer->cur -= 2;
1359         }
1360     }
1361
1362   return false;
1363 }
1364
1365 /* Helper function to issue error about improper __VA_OPT__ use.  */
1366 static void
1367 maybe_va_opt_error (cpp_reader *pfile)
1368 {
1369   if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
1370     {
1371       /* __VA_OPT__ should not be accepted at all, but allow it in
1372          system headers.  */
1373       if (!cpp_in_system_header (pfile))
1374         cpp_error (pfile, CPP_DL_PEDWARN,
1375                    "__VA_OPT__ is not available until C++20");
1376     }
1377   else if (!pfile->state.va_args_ok)
1378     {
1379       /* __VA_OPT__ should only appear in the replacement list of a
1380          variadic macro.  */
1381       cpp_error (pfile, CPP_DL_PEDWARN,
1382                  "__VA_OPT__ can only appear in the expansion"
1383                  " of a C++20 variadic macro");
1384     }
1385 }
1386
1387 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1388 static cpp_hashnode *
1389 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1390 {
1391   cpp_hashnode *result;
1392   const uchar *cur;
1393   unsigned int len;
1394   unsigned int hash = HT_HASHSTEP (0, *base);
1395
1396   cur = base + 1;
1397   while (ISIDNUM (*cur))
1398     {
1399       hash = HT_HASHSTEP (hash, *cur);
1400       cur++;
1401     }
1402   len = cur - base;
1403   hash = HT_HASHFINISH (hash, len);
1404   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1405                                               base, len, hash, HT_ALLOC));
1406
1407   /* Rarely, identifiers require diagnostics when lexed.  */
1408   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1409                         && !pfile->state.skipping, 0))
1410     {
1411       /* It is allowed to poison the same identifier twice.  */
1412       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1413         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1414                    NODE_NAME (result));
1415
1416       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1417          replacement list of a variadic macro.  */
1418       if (result == pfile->spec_nodes.n__VA_ARGS__
1419           && !pfile->state.va_args_ok)
1420         {
1421           if (CPP_OPTION (pfile, cplusplus))
1422             cpp_error (pfile, CPP_DL_PEDWARN,
1423                        "__VA_ARGS__ can only appear in the expansion"
1424                        " of a C++11 variadic macro");
1425           else
1426             cpp_error (pfile, CPP_DL_PEDWARN,
1427                        "__VA_ARGS__ can only appear in the expansion"
1428                        " of a C99 variadic macro");
1429         }
1430
1431       if (result == pfile->spec_nodes.n__VA_OPT__)
1432         maybe_va_opt_error (pfile);
1433
1434       /* For -Wc++-compat, warn about use of C++ named operators.  */
1435       if (result->flags & NODE_WARN_OPERATOR)
1436         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1437                      "identifier \"%s\" is a special operator name in C++",
1438                      NODE_NAME (result));
1439     }
1440
1441   return result;
1442 }
1443
1444 /* Get the cpp_hashnode of an identifier specified by NAME in
1445    the current cpp_reader object.  If none is found, NULL is returned.  */
1446 cpp_hashnode *
1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1448 {
1449   cpp_hashnode *result;
1450   result = lex_identifier_intern (pfile, (uchar *) name);
1451   return result;
1452 }
1453
1454 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1455 static cpp_hashnode *
1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1457                 struct normalize_state *nst, cpp_hashnode **spelling)
1458 {
1459   cpp_hashnode *result;
1460   const uchar *cur;
1461   unsigned int len;
1462   unsigned int hash = HT_HASHSTEP (0, *base);
1463
1464   cur = pfile->buffer->cur;
1465   if (! starts_ucn)
1466     {
1467       while (ISIDNUM (*cur))
1468         {
1469           hash = HT_HASHSTEP (hash, *cur);
1470           cur++;
1471         }
1472       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1473     }
1474   pfile->buffer->cur = cur;
1475   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1476     {
1477       /* Slower version for identifiers containing UCNs
1478          or extended chars (including $).  */
1479       do {
1480         while (ISIDNUM (*pfile->buffer->cur))
1481           {
1482             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1483             pfile->buffer->cur++;
1484           }
1485       } while (forms_identifier_p (pfile, false, nst));
1486       result = _cpp_interpret_identifier (pfile, base,
1487                                           pfile->buffer->cur - base);
1488       *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
1489     }
1490   else
1491     {
1492       len = cur - base;
1493       hash = HT_HASHFINISH (hash, len);
1494
1495       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1496                                                   base, len, hash, HT_ALLOC));
1497       *spelling = result;
1498     }
1499
1500   /* Rarely, identifiers require diagnostics when lexed.  */
1501   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1502                         && !pfile->state.skipping, 0))
1503     {
1504       /* It is allowed to poison the same identifier twice.  */
1505       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1506         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1507                    NODE_NAME (result));
1508
1509       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1510          replacement list of a variadic macro.  */
1511       if (result == pfile->spec_nodes.n__VA_ARGS__
1512           && !pfile->state.va_args_ok)
1513         {
1514           if (CPP_OPTION (pfile, cplusplus))
1515             cpp_error (pfile, CPP_DL_PEDWARN,
1516                        "__VA_ARGS__ can only appear in the expansion"
1517                        " of a C++11 variadic macro");
1518           else
1519             cpp_error (pfile, CPP_DL_PEDWARN,
1520                        "__VA_ARGS__ can only appear in the expansion"
1521                        " of a C99 variadic macro");
1522         }
1523
1524       /* __VA_OPT__ should only appear in the replacement list of a
1525          variadic macro.  */
1526       if (result == pfile->spec_nodes.n__VA_OPT__)
1527         maybe_va_opt_error (pfile);
1528
1529       /* For -Wc++-compat, warn about use of C++ named operators.  */
1530       if (result->flags & NODE_WARN_OPERATOR)
1531         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1532                      "identifier \"%s\" is a special operator name in C++",
1533                      NODE_NAME (result));
1534     }
1535
1536   return result;
1537 }
1538
1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1540 static void
1541 lex_number (cpp_reader *pfile, cpp_string *number,
1542             struct normalize_state *nst)
1543 {
1544   const uchar *cur;
1545   const uchar *base;
1546   uchar *dest;
1547
1548   base = pfile->buffer->cur - 1;
1549   do
1550     {
1551       cur = pfile->buffer->cur;
1552
1553       /* N.B. ISIDNUM does not include $.  */
1554       while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur)
1555              || VALID_SIGN (*cur, cur[-1]))
1556         {
1557           NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
1558           cur++;
1559         }
1560       /* A number can't end with a digit separator.  */
1561       while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
1562         --cur;
1563
1564       pfile->buffer->cur = cur;
1565     }
1566   while (forms_identifier_p (pfile, false, nst));
1567
1568   number->len = cur - base;
1569   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1570   memcpy (dest, base, number->len);
1571   dest[number->len] = '\0';
1572   number->text = dest;
1573 }
1574
1575 /* Create a token of type TYPE with a literal spelling.  */
1576 static void
1577 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1578                 unsigned int len, enum cpp_ttype type)
1579 {
1580   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1581
1582   memcpy (dest, base, len);
1583   dest[len] = '\0';
1584   token->type = type;
1585   token->val.str.len = len;
1586   token->val.str.text = dest;
1587 }
1588
1589 /* A pair of raw buffer pointers.  The currently open one is [1], the
1590    first one is [0].  Used for string literal lexing.  */
1591 struct lit_accum {
1592   _cpp_buff *first;
1593   _cpp_buff *last;
1594   const uchar *rpos;
1595   size_t accum;
1596
1597   lit_accum ()
1598     : first (NULL), last (NULL), rpos (0), accum (0)
1599   {
1600   }
1601
1602   void append (cpp_reader *, const uchar *, size_t);
1603
1604   void read_begin (cpp_reader *);
1605   bool reading_p () const
1606   {
1607     return rpos != NULL;
1608   }
1609   char read_char ()
1610   {
1611     char c = *rpos++;
1612     if (rpos == BUFF_FRONT (last))
1613       rpos = NULL;
1614     return c;
1615   }
1616 };
1617
1618 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1619    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1620
1621 void
1622 lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
1623 {
1624   if (!last)
1625     /* Starting.  */
1626     first = last = _cpp_get_buff (pfile, len);
1627   else if (len > BUFF_ROOM (last))
1628     {
1629       /* There is insufficient room in the buffer.  Copy what we can,
1630          and then either extend or create a new one.  */
1631       size_t room = BUFF_ROOM (last);
1632       memcpy (BUFF_FRONT (last), base, room);
1633       BUFF_FRONT (last) += room;
1634       base += room;
1635       len -= room;
1636       accum += room;
1637
1638       gcc_checking_assert (!rpos);
1639
1640       last = _cpp_append_extend_buff (pfile, last, len);
1641     }
1642
1643   memcpy (BUFF_FRONT (last), base, len);
1644   BUFF_FRONT (last) += len;
1645   accum += len;
1646 }
1647
1648 void
1649 lit_accum::read_begin (cpp_reader *pfile)
1650 {
1651   /* We never accumulate more than 4 chars to read.  */
1652   if (BUFF_ROOM (last) < 4)
1653
1654     last = _cpp_append_extend_buff (pfile, last, 4);
1655   rpos = BUFF_FRONT (last);
1656 }
1657
1658 /* Returns true if a macro has been defined.
1659    This might not work if compile with -save-temps,
1660    or preprocess separately from compilation.  */
1661
1662 static bool
1663 is_macro(cpp_reader *pfile, const uchar *base)
1664 {
1665   const uchar *cur = base;
1666   if (! ISIDST (*cur))
1667     return false;
1668   unsigned int hash = HT_HASHSTEP (0, *cur);
1669   ++cur;
1670   while (ISIDNUM (*cur))
1671     {
1672       hash = HT_HASHSTEP (hash, *cur);
1673       ++cur;
1674     }
1675   hash = HT_HASHFINISH (hash, cur - base);
1676
1677   cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1678                                         base, cur - base, hash, HT_NO_INSERT));
1679
1680   return result && cpp_macro_p (result);
1681 }
1682
1683 /* Returns true if a literal suffix does not have the expected form
1684    and is defined as a macro.  */
1685
1686 static bool
1687 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base)
1688 {
1689   /* User-defined literals outside of namespace std must start with a single
1690      underscore, so assume anything of that form really is a UDL suffix.
1691      We don't need to worry about UDLs defined inside namespace std because
1692      their names are reserved, so cannot be used as macro names in valid
1693      programs.  */
1694   if (base[0] == '_' && base[1] != '_')
1695     return false;
1696   return is_macro (pfile, base);
1697 }
1698
1699 /* Lexes a raw string.  The stored string contains the spelling,
1700    including double quotes, delimiter string, '(' and ')', any leading
1701    'L', 'u', 'U' or 'u8' and 'R' modifier.  The created token contains
1702    the type of the literal, or CPP_OTHER if it was not properly
1703    terminated.
1704
1705    BASE is the start of the token.  Updates pfile->buffer->cur to just
1706    after the lexed string.
1707
1708    The spelling is NUL-terminated, but it is not guaranteed that this
1709    is the first NUL since embedded NULs are preserved.  */
1710
1711 static void
1712 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1713 {
1714   const uchar *pos = base;
1715
1716   /* 'tis a pity this information isn't passed down from the lexer's
1717      initial categorization of the token.  */
1718   enum cpp_ttype type = CPP_STRING;
1719
1720   if (*pos == 'L')
1721     {
1722       type = CPP_WSTRING;
1723       pos++;
1724     }
1725   else if (*pos == 'U')
1726     {
1727       type = CPP_STRING32;
1728       pos++;
1729     }
1730   else if (*pos == 'u')
1731     {
1732       if (pos[1] == '8')
1733         {
1734           type = CPP_UTF8STRING;
1735           pos++;
1736         }
1737       else
1738         type = CPP_STRING16;
1739       pos++;
1740     }
1741
1742   gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
1743   pos += 2;
1744
1745   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1746
1747   /* Skip notes before the ".  */
1748   while (note->pos < pos)
1749     ++note;
1750
1751   lit_accum accum;
1752
1753   uchar prefix[17];
1754   unsigned prefix_len = 0;
1755   enum Phase
1756   {
1757    PHASE_PREFIX = -2,
1758    PHASE_NONE = -1,
1759    PHASE_SUFFIX = 0
1760   } phase = PHASE_PREFIX;
1761
1762   for (;;)
1763     {
1764       gcc_checking_assert (note->pos >= pos);
1765
1766       /* Undo any escaped newlines and trigraphs.  */
1767       if (!accum.reading_p () && note->pos == pos)
1768         switch (note->type)
1769           {
1770           case '\\':
1771           case ' ':
1772             /* Restore backslash followed by newline.  */
1773             accum.append (pfile, base, pos - base);
1774             base = pos;
1775             accum.read_begin (pfile);
1776             accum.append (pfile, UC"\\", 1);
1777
1778           after_backslash:
1779             if (note->type == ' ')
1780               /* GNU backslash whitespace newline extension.  FIXME
1781                  could be any sequence of non-vertical space.  When we
1782                  can properly restore any such sequence, we should
1783                  mark this note as handled so _cpp_process_line_notes
1784                  doesn't warn.  */
1785               accum.append (pfile, UC" ", 1);
1786
1787             accum.append (pfile, UC"\n", 1);
1788             note++;
1789             break;
1790
1791           case '\n':
1792             /* This can happen for ??/<NEWLINE> when trigraphs are not
1793                being interpretted.  */
1794             gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
1795             note->type = 0;
1796             note++;
1797             break;
1798
1799           default:
1800             gcc_checking_assert (_cpp_trigraph_map[note->type]);
1801
1802             /* Don't warn about this trigraph in
1803                _cpp_process_line_notes, since trigraphs show up as
1804                trigraphs in raw strings.  */
1805             uchar type = note->type;
1806             note->type = 0;
1807
1808             if (CPP_OPTION (pfile, trigraphs))
1809               {
1810                 accum.append (pfile, base, pos - base);
1811                 base = pos;
1812                 accum.read_begin (pfile);
1813                 accum.append (pfile, UC"??", 2);
1814                 accum.append (pfile, &type, 1);
1815
1816                 /* ??/ followed by newline gets two line notes, one for
1817                    the trigraph and one for the backslash/newline.  */
1818                 if (type == '/' && note[1].pos == pos)
1819                   {
1820                     note++;
1821                     gcc_assert (note->type == '\\' || note->type == ' ');
1822                     goto after_backslash;
1823                   }
1824                 /* Skip the replacement character.  */
1825                 base = ++pos;
1826               }
1827
1828             note++;
1829             break;
1830           }
1831
1832       /* Now get a char to process.  Either from an expanded note, or
1833          from the line buffer.  */
1834       bool read_note = accum.reading_p ();
1835       char c = read_note ? accum.read_char () : *pos++;
1836
1837       if (phase == PHASE_PREFIX)
1838         {
1839           if (c == '(')
1840             {
1841               /* Done.  */
1842               phase = PHASE_NONE;
1843               prefix[prefix_len++] = '"';
1844             }
1845           else if (prefix_len < 16
1846                    /* Prefix chars are any of the basic character set,
1847                       [lex.charset] except for '
1848                       ()\\\t\v\f\n'. Optimized for a contiguous
1849                       alphabet.  */
1850                    /* Unlike a switch, this collapses down to one or
1851                       two shift and bitmask operations on an ASCII
1852                       system, with an outlier or two.   */
1853                    && (('Z' - 'A' == 25
1854                         ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
1855                         : ISIDST (c))
1856                        || (c >= '0' && c <= '9')
1857                        || c == '_' || c == '{' || c == '}'
1858                        || c == '[' || c == ']' || c == '#'
1859                        || c == '<' || c == '>' || c == '%'
1860                        || c == ':' || c == ';' || c == '.' || c == '?'
1861                        || c == '*' || c == '+' || c == '-' || c == '/'
1862                        || c == '^' || c == '&' || c == '|' || c == '~'
1863                        || c == '!' || c == '=' || c == ','
1864                        || c == '"' || c == '\''))
1865             prefix[prefix_len++] = c;
1866           else
1867             {
1868               /* Something is wrong.  */
1869               int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
1870               if (prefix_len == 16)
1871                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1872                                      col, "raw string delimiter longer "
1873                                      "than 16 characters");
1874               else if (c == '\n')
1875                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1876                                      col, "invalid new-line in raw "
1877                                      "string delimiter");
1878               else
1879                 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1880                                      col, "invalid character '%c' in "
1881                                      "raw string delimiter", c);
1882               type = CPP_OTHER;
1883               phase = PHASE_NONE;
1884               /* Continue until we get a close quote, that's probably
1885                  the best failure mode.  */
1886               prefix_len = 0;
1887             }
1888           continue;
1889         }
1890
1891       if (phase != PHASE_NONE)
1892         {
1893           if (prefix[phase] != c)
1894             phase = PHASE_NONE;
1895           else if (unsigned (phase + 1) == prefix_len)
1896             break;
1897           else
1898             {
1899               phase = Phase (phase + 1);
1900               continue;
1901             }
1902         }
1903
1904       if (!prefix_len && c == '"')
1905         /* Failure mode lexing.  */
1906         goto out;
1907       else if (prefix_len && c == ')')
1908         phase = PHASE_SUFFIX;
1909       else if (!read_note && c == '\n')
1910         {
1911           pos--;
1912           pfile->buffer->cur = pos;
1913           if (pfile->state.in_directive
1914               || (pfile->state.parsing_args
1915                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1916             {
1917               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1918                                    "unterminated raw string");
1919               type = CPP_OTHER;
1920               goto out;
1921             }
1922
1923           accum.append (pfile, base, pos - base + 1);
1924           _cpp_process_line_notes (pfile, false);
1925
1926           if (pfile->buffer->next_line < pfile->buffer->rlimit)
1927             CPP_INCREMENT_LINE (pfile, 0);
1928           pfile->buffer->need_line = true;
1929
1930           if (!_cpp_get_fresh_line (pfile))
1931             {
1932               /* We ran out of file and failed to get a line.  */
1933               location_t src_loc = token->src_loc;
1934               token->type = CPP_EOF;
1935               /* Tell the compiler the line number of the EOF token.  */
1936               token->src_loc = pfile->line_table->highest_line;
1937               token->flags = BOL;
1938               if (accum.first)
1939                 _cpp_release_buff (pfile, accum.first);
1940               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1941                                    "unterminated raw string");
1942               /* Now pop the buffer that _cpp_get_fresh_line did not.  */
1943               _cpp_pop_buffer (pfile);
1944               return;
1945             }
1946
1947           pos = base = pfile->buffer->cur;
1948           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1949         }
1950     }
1951
1952   if (CPP_OPTION (pfile, user_literals))
1953     {
1954       /* If a string format macro, say from inttypes.h, is placed touching
1955          a string literal it could be parsed as a C++11 user-defined string
1956          literal thus breaking the program.  */
1957       if (is_macro_not_literal_suffix (pfile, pos))
1958         {
1959           /* Raise a warning, but do not consume subsequent tokens.  */
1960           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1961             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1962                                    token->src_loc, 0,
1963                                    "invalid suffix on literal; C++11 requires "
1964                                    "a space between literal and string macro");
1965         }
1966       /* Grab user defined literal suffix.  */
1967       else if (ISIDST (*pos))
1968         {
1969           type = cpp_userdef_string_add_type (type);
1970           ++pos;
1971
1972           while (ISIDNUM (*pos))
1973             ++pos;
1974         }
1975     }
1976
1977  out:
1978   pfile->buffer->cur = pos;
1979   if (!accum.accum)
1980     create_literal (pfile, token, base, pos - base, type);
1981   else
1982     {
1983       size_t extra_len = pos - base;
1984       uchar *dest = _cpp_unaligned_alloc (pfile, accum.accum + extra_len + 1);
1985
1986       token->type = type;
1987       token->val.str.len = accum.accum + extra_len;
1988       token->val.str.text = dest;
1989       for (_cpp_buff *buf = accum.first; buf; buf = buf->next)
1990         {
1991           size_t len = BUFF_FRONT (buf) - buf->base;
1992           memcpy (dest, buf->base, len);
1993           dest += len;
1994         }
1995       _cpp_release_buff (pfile, accum.first);
1996       memcpy (dest, base, extra_len);
1997       dest[extra_len] = '\0';
1998     }
1999 }
2000
2001 /* Lexes a string, character constant, or angle-bracketed header file
2002    name.  The stored string contains the spelling, including opening
2003    quote and any leading 'L', 'u', 'U' or 'u8' and optional
2004    'R' modifier.  It returns the type of the literal, or CPP_OTHER
2005    if it was not properly terminated, or CPP_LESS for an unterminated
2006    header name which must be relexed as normal tokens.
2007
2008    The spelling is NUL-terminated, but it is not guaranteed that this
2009    is the first NUL since embedded NULs are preserved.  */
2010 static void
2011 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2012 {
2013   bool saw_NUL = false;
2014   const uchar *cur;
2015   cppchar_t terminator;
2016   enum cpp_ttype type;
2017
2018   cur = base;
2019   terminator = *cur++;
2020   if (terminator == 'L' || terminator == 'U')
2021     terminator = *cur++;
2022   else if (terminator == 'u')
2023     {
2024       terminator = *cur++;
2025       if (terminator == '8')
2026         terminator = *cur++;
2027     }
2028   if (terminator == 'R')
2029     {
2030       lex_raw_string (pfile, token, base);
2031       return;
2032     }
2033   if (terminator == '"')
2034     type = (*base == 'L' ? CPP_WSTRING :
2035             *base == 'U' ? CPP_STRING32 :
2036             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2037                          : CPP_STRING);
2038   else if (terminator == '\'')
2039     type = (*base == 'L' ? CPP_WCHAR :
2040             *base == 'U' ? CPP_CHAR32 :
2041             *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2042                          : CPP_CHAR);
2043   else
2044     terminator = '>', type = CPP_HEADER_NAME;
2045
2046   for (;;)
2047     {
2048       cppchar_t c = *cur++;
2049
2050       /* In #include-style directives, terminators are not escapable.  */
2051       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2052         cur++;
2053       else if (c == terminator)
2054         break;
2055       else if (c == '\n')
2056         {
2057           cur--;
2058           /* Unmatched quotes always yield undefined behavior, but
2059              greedy lexing means that what appears to be an unterminated
2060              header name may actually be a legitimate sequence of tokens.  */
2061           if (terminator == '>')
2062             {
2063               token->type = CPP_LESS;
2064               return;
2065             }
2066           type = CPP_OTHER;
2067           break;
2068         }
2069       else if (c == '\0')
2070         saw_NUL = true;
2071     }
2072
2073   if (saw_NUL && !pfile->state.skipping)
2074     cpp_error (pfile, CPP_DL_WARNING,
2075                "null character(s) preserved in literal");
2076
2077   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2078     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
2079                (int) terminator);
2080
2081   if (CPP_OPTION (pfile, user_literals))
2082     {
2083       /* If a string format macro, say from inttypes.h, is placed touching
2084          a string literal it could be parsed as a C++11 user-defined string
2085          literal thus breaking the program.  */
2086       if (is_macro_not_literal_suffix (pfile, cur))
2087         {
2088           /* Raise a warning, but do not consume subsequent tokens.  */
2089           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2090             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
2091                                    token->src_loc, 0,
2092                                    "invalid suffix on literal; C++11 requires "
2093                                    "a space between literal and string macro");
2094         }
2095       /* Grab user defined literal suffix.  */
2096       else if (ISIDST (*cur))
2097         {
2098           type = cpp_userdef_char_add_type (type);
2099           type = cpp_userdef_string_add_type (type);
2100           ++cur;
2101
2102           while (ISIDNUM (*cur))
2103             ++cur;
2104         }
2105     }
2106   else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2107            && is_macro (pfile, cur)
2108            && !pfile->state.skipping)
2109     cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2110                            token->src_loc, 0, "C++11 requires a space "
2111                            "between string literal and macro");
2112
2113   pfile->buffer->cur = cur;
2114   create_literal (pfile, token, base, cur - base, type);
2115 }
2116
2117 /* Return the comment table. The client may not make any assumption
2118    about the ordering of the table.  */
2119 cpp_comment_table *
2120 cpp_get_comments (cpp_reader *pfile)
2121 {
2122   return &pfile->comments;
2123 }
2124
2125 /* Append a comment to the end of the comment table. */
2126 static void
2127 store_comment (cpp_reader *pfile, cpp_token *token)
2128 {
2129   int len;
2130
2131   if (pfile->comments.allocated == 0)
2132     {
2133       pfile->comments.allocated = 256;
2134       pfile->comments.entries = (cpp_comment *) xmalloc
2135         (pfile->comments.allocated * sizeof (cpp_comment));
2136     }
2137
2138   if (pfile->comments.count == pfile->comments.allocated)
2139     {
2140       pfile->comments.allocated *= 2;
2141       pfile->comments.entries = (cpp_comment *) xrealloc
2142         (pfile->comments.entries,
2143          pfile->comments.allocated * sizeof (cpp_comment));
2144     }
2145
2146   len = token->val.str.len;
2147
2148   /* Copy comment. Note, token may not be NULL terminated. */
2149   pfile->comments.entries[pfile->comments.count].comment =
2150     (char *) xmalloc (sizeof (char) * (len + 1));
2151   memcpy (pfile->comments.entries[pfile->comments.count].comment,
2152           token->val.str.text, len);
2153   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
2154
2155   /* Set source location. */
2156   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
2157
2158   /* Increment the count of entries in the comment table. */
2159   pfile->comments.count++;
2160 }
2161
2162 /* The stored comment includes the comment start and any terminator.  */
2163 static void
2164 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
2165               cppchar_t type)
2166 {
2167   unsigned char *buffer;
2168   unsigned int len, clen, i;
2169
2170   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
2171
2172   /* C++ comments probably (not definitely) have moved past a new
2173      line, which we don't want to save in the comment.  */
2174   if (is_vspace (pfile->buffer->cur[-1]))
2175     len--;
2176
2177   /* If we are currently in a directive or in argument parsing, then
2178      we need to store all C++ comments as C comments internally, and
2179      so we need to allocate a little extra space in that case.
2180
2181      Note that the only time we encounter a directive here is
2182      when we are saving comments in a "#define".  */
2183   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
2184           && type == '/') ? len + 2 : len;
2185
2186   buffer = _cpp_unaligned_alloc (pfile, clen);
2187
2188   token->type = CPP_COMMENT;
2189   token->val.str.len = clen;
2190   token->val.str.text = buffer;
2191
2192   buffer[0] = '/';
2193   memcpy (buffer + 1, from, len - 1);
2194
2195   /* Finish conversion to a C comment, if necessary.  */
2196   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
2197     {
2198       buffer[1] = '*';
2199       buffer[clen - 2] = '*';
2200       buffer[clen - 1] = '/';
2201       /* As there can be in a C++ comments illegal sequences for C comments
2202          we need to filter them out.  */
2203       for (i = 2; i < (clen - 2); i++)
2204         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
2205           buffer[i] = '|';
2206     }
2207
2208   /* Finally store this comment for use by clients of libcpp. */
2209   store_comment (pfile, token);
2210 }
2211
2212 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
2213    comment.  */
2214
2215 static bool
2216 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
2217 {
2218   const unsigned char *from = comment_start + 1;
2219
2220   switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
2221     {
2222       /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
2223          don't recognize any comments.  The latter only checks attributes,
2224          the former doesn't warn.  */
2225     case 0:
2226     default:
2227       return false;
2228       /* -Wimplicit-fallthrough=1 considers any comment, no matter what
2229          content it has.  */
2230     case 1:
2231       return true;
2232     case 2:
2233       /* -Wimplicit-fallthrough=2 looks for (case insensitive)
2234          .*falls?[ \t-]*thr(u|ough).* regex.  */
2235       for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
2236            from++)
2237         {
2238           /* Is there anything like strpbrk with upper boundary, or
2239              memchr looking for 2 characters rather than just one?  */
2240           if (from[0] != 'f' && from[0] != 'F')
2241             continue;
2242           if (from[1] != 'a' && from[1] != 'A')
2243             continue;
2244           if (from[2] != 'l' && from[2] != 'L')
2245             continue;
2246           if (from[3] != 'l' && from[3] != 'L')
2247             continue;
2248           from += sizeof "fall" - 1;
2249           if (from[0] == 's' || from[0] == 'S')
2250             from++;
2251           while (*from == ' ' || *from == '\t' || *from == '-')
2252             from++;
2253           if (from[0] != 't' && from[0] != 'T')
2254             continue;
2255           if (from[1] != 'h' && from[1] != 'H')
2256             continue;
2257           if (from[2] != 'r' && from[2] != 'R')
2258             continue;
2259           if (from[3] == 'u' || from[3] == 'U')
2260             return true;
2261           if (from[3] != 'o' && from[3] != 'O')
2262             continue;
2263           if (from[4] != 'u' && from[4] != 'U')
2264             continue;
2265           if (from[5] != 'g' && from[5] != 'G')
2266             continue;
2267           if (from[6] != 'h' && from[6] != 'H')
2268             continue;
2269           return true;
2270         }
2271       return false;
2272     case 3:
2273     case 4:
2274       break;
2275     }
2276
2277   /* Whole comment contents:
2278      -fallthrough
2279      @fallthrough@
2280    */
2281   if (*from == '-' || *from == '@')
2282     {
2283       size_t len = sizeof "fallthrough" - 1;
2284       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2285         return false;
2286       if (memcmp (from + 1, "fallthrough", len))
2287         return false;
2288       if (*from == '@')
2289         {
2290           if (from[len + 1] != '@')
2291             return false;
2292           len++;
2293         }
2294       from += 1 + len;
2295     }
2296   /* Whole comment contents (regex):
2297      lint -fallthrough[ \t]*
2298    */
2299   else if (*from == 'l')
2300     {
2301       size_t len = sizeof "int -fallthrough" - 1;
2302       if ((size_t) (pfile->buffer->cur - from - 1) < len)
2303         return false;
2304       if (memcmp (from + 1, "int -fallthrough", len))
2305         return false;
2306       from += 1 + len;
2307       while (*from == ' ' || *from == '\t')
2308         from++;
2309     }
2310   /* Whole comment contents (regex):
2311      [ \t]*FALLTHR(U|OUGH)[ \t]*
2312    */
2313   else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
2314     {
2315       while (*from == ' ' || *from == '\t')
2316         from++;
2317       if ((size_t) (pfile->buffer->cur - from)  < sizeof "FALLTHRU" - 1)
2318         return false;
2319       if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1))
2320         return false;
2321       from += sizeof "FALLTHR" - 1;
2322       if (*from == 'U')
2323         from++;
2324       else if ((size_t) (pfile->buffer->cur - from)  < sizeof "OUGH" - 1)
2325         return false;
2326       else if (memcmp (from, "OUGH", sizeof "OUGH" - 1))
2327         return false;
2328       else
2329         from += sizeof "OUGH" - 1;
2330       while (*from == ' ' || *from == '\t')
2331         from++;
2332     }
2333   /* Whole comment contents (regex):
2334      [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
2335      [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
2336      [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
2337    */
2338   else
2339     {
2340       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2341         from++;
2342       unsigned char f = *from;
2343       bool all_upper = false;
2344       if (f == 'E' || f == 'e')
2345         {
2346           if ((size_t) (pfile->buffer->cur - from)
2347               < sizeof "else fallthru" - 1)
2348             return false;
2349           if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0)
2350             all_upper = true;
2351           else if (memcmp (from + 1, "lse", sizeof "lse" - 1))
2352             return false;
2353           from += sizeof "else" - 1;
2354           if (*from == ',')
2355             from++;
2356           if (*from != ' ')
2357             return false;
2358           from++;
2359           if (all_upper && *from == 'f')
2360             return false;
2361           if (f == 'e' && *from == 'F')
2362             return false;
2363           f = *from;
2364         }
2365       else if (f == 'I' || f == 'i')
2366         {
2367           if ((size_t) (pfile->buffer->cur - from)
2368               < sizeof "intentional fallthru" - 1)
2369             return false;
2370           if (f == 'I' && memcmp (from + 1, "NTENTIONAL",
2371                                   sizeof "NTENTIONAL" - 1) == 0)
2372             all_upper = true;
2373           else if (memcmp (from + 1, "ntentional",
2374                            sizeof "ntentional" - 1))
2375             return false;
2376           from += sizeof "intentional" - 1;
2377           if (*from == ' ')
2378             {
2379               from++;
2380               if (all_upper && *from == 'f')
2381                 return false;
2382             }
2383           else if (all_upper)
2384             {
2385               if (memcmp (from, "LY F", sizeof "LY F" - 1))
2386                 return false;
2387               from += sizeof "LY " - 1;
2388             }
2389           else
2390             {
2391               if (memcmp (from, "ly ", sizeof "ly " - 1))
2392                 return false;
2393               from += sizeof "ly " - 1;
2394             }
2395           if (f == 'i' && *from == 'F')
2396             return false;
2397           f = *from;
2398         }
2399       if (f != 'F' && f != 'f')
2400         return false;
2401       if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
2402         return false;
2403       if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0)
2404         all_upper = true;
2405       else if (all_upper)
2406         return false;
2407       else if (memcmp (from + 1, "all", sizeof "all" - 1))
2408         return false;
2409       from += sizeof "fall" - 1;
2410       if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
2411         from += 2;
2412       else if (*from == ' ' || *from == '-')
2413         from++;
2414       else if (*from != (all_upper ? 'T' : 't'))
2415         return false;
2416       if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
2417         return false;
2418       if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
2419         return false;
2420       if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1))
2421         {
2422           if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
2423             return false;
2424           if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough",
2425                       sizeof "hrough" - 1))
2426             return false;
2427           from += sizeof "through" - 1;
2428         }
2429       else
2430         from += sizeof "thru" - 1;
2431       while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
2432         from++;
2433       if (*from == '-')
2434         {
2435           from++;
2436           if (*comment_start == '*')
2437             {
2438               do
2439                 {
2440                   while (*from && *from != '*'
2441                          && *from != '\n' && *from != '\r')
2442                     from++;
2443                   if (*from != '*' || from[1] == '/')
2444                     break;
2445                   from++;
2446                 }
2447               while (1);
2448             }
2449           else
2450             while (*from && *from != '\n' && *from != '\r')
2451               from++;
2452         }
2453     }
2454   /* C block comment.  */
2455   if (*comment_start == '*')
2456     {
2457       if (*from != '*' || from[1] != '/')
2458         return false;
2459     }
2460   /* C++ line comment.  */
2461   else if (*from != '\n')
2462     return false;
2463
2464   return true;
2465 }
2466
2467 /* Allocate COUNT tokens for RUN.  */
2468 void
2469 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
2470 {
2471   run->base = XNEWVEC (cpp_token, count);
2472   run->limit = run->base + count;
2473   run->next = NULL;
2474 }
2475
2476 /* Returns the next tokenrun, or creates one if there is none.  */
2477 static tokenrun *
2478 next_tokenrun (tokenrun *run)
2479 {
2480   if (run->next == NULL)
2481     {
2482       run->next = XNEW (tokenrun);
2483       run->next->prev = run;
2484       _cpp_init_tokenrun (run->next, 250);
2485     }
2486
2487   return run->next;
2488 }
2489
2490 /* Return the number of not yet processed token in a given
2491    context.  */
2492 int
2493 _cpp_remaining_tokens_num_in_context (cpp_context *context)
2494 {
2495   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2496     return (LAST (context).token - FIRST (context).token);
2497   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2498            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2499     return (LAST (context).ptoken - FIRST (context).ptoken);
2500   else
2501       abort ();
2502 }
2503
2504 /* Returns the token present at index INDEX in a given context.  If
2505    INDEX is zero, the next token to be processed is returned.  */
2506 static const cpp_token*
2507 _cpp_token_from_context_at (cpp_context *context, int index)
2508 {
2509   if (context->tokens_kind == TOKENS_KIND_DIRECT)
2510     return &(FIRST (context).token[index]);
2511   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
2512            || context->tokens_kind == TOKENS_KIND_EXTENDED)
2513     return FIRST (context).ptoken[index];
2514  else
2515    abort ();
2516 }
2517
2518 /* Look ahead in the input stream.  */
2519 const cpp_token *
2520 cpp_peek_token (cpp_reader *pfile, int index)
2521 {
2522   cpp_context *context = pfile->context;
2523   const cpp_token *peektok;
2524   int count;
2525
2526   /* First, scan through any pending cpp_context objects.  */
2527   while (context->prev)
2528     {
2529       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
2530
2531       if (index < (int) sz)
2532         return _cpp_token_from_context_at (context, index);
2533       index -= (int) sz;
2534       context = context->prev;
2535     }
2536
2537   /* We will have to read some new tokens after all (and do so
2538      without invalidating preceding tokens).  */
2539   count = index;
2540   pfile->keep_tokens++;
2541
2542   /* For peeked tokens temporarily disable line_change reporting,
2543      until the tokens are parsed for real.  */
2544   void (*line_change) (cpp_reader *, const cpp_token *, int)
2545     = pfile->cb.line_change;
2546   pfile->cb.line_change = NULL;
2547
2548   do
2549     {
2550       peektok = _cpp_lex_token (pfile);
2551       if (peektok->type == CPP_EOF)
2552         {
2553           index--;
2554           break;
2555         }
2556     }
2557   while (index--);
2558
2559   _cpp_backup_tokens_direct (pfile, count - index);
2560   pfile->keep_tokens--;
2561   pfile->cb.line_change = line_change;
2562
2563   return peektok;
2564 }
2565
2566 /* Allocate a single token that is invalidated at the same time as the
2567    rest of the tokens on the line.  Has its line and col set to the
2568    same as the last lexed token, so that diagnostics appear in the
2569    right place.  */
2570 cpp_token *
2571 _cpp_temp_token (cpp_reader *pfile)
2572 {
2573   cpp_token *old, *result;
2574   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2575   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2576
2577   old = pfile->cur_token - 1;
2578   /* Any pre-existing lookaheads must not be clobbered.  */
2579   if (la)
2580     {
2581       if (sz <= la)
2582         {
2583           tokenrun *next = next_tokenrun (pfile->cur_run);
2584
2585           if (sz < la)
2586             memmove (next->base + 1, next->base,
2587                      (la - sz) * sizeof (cpp_token));
2588
2589           next->base[0] = pfile->cur_run->limit[-1];
2590         }
2591
2592       if (sz > 1)
2593         memmove (pfile->cur_token + 1, pfile->cur_token,
2594                  MIN (la, sz - 1) * sizeof (cpp_token));
2595     }
2596
2597   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2598     {
2599       pfile->cur_run = next_tokenrun (pfile->cur_run);
2600       pfile->cur_token = pfile->cur_run->base;
2601     }
2602
2603   result = pfile->cur_token++;
2604   result->src_loc = old->src_loc;
2605   return result;
2606 }
2607
2608 /* Lex a token into RESULT (external interface).  Takes care of issues
2609    like directive handling, token lookahead, multiple include
2610    optimization and skipping.  */
2611 const cpp_token *
2612 _cpp_lex_token (cpp_reader *pfile)
2613 {
2614   cpp_token *result;
2615
2616   for (;;)
2617     {
2618       if (pfile->cur_token == pfile->cur_run->limit)
2619         {
2620           pfile->cur_run = next_tokenrun (pfile->cur_run);
2621           pfile->cur_token = pfile->cur_run->base;
2622         }
2623       /* We assume that the current token is somewhere in the current
2624          run.  */
2625       if (pfile->cur_token < pfile->cur_run->base
2626           || pfile->cur_token >= pfile->cur_run->limit)
2627         abort ();
2628
2629       if (pfile->lookaheads)
2630         {
2631           pfile->lookaheads--;
2632           result = pfile->cur_token++;
2633         }
2634       else
2635         result = _cpp_lex_direct (pfile);
2636
2637       if (result->flags & BOL)
2638         {
2639           /* Is this a directive.  If _cpp_handle_directive returns
2640              false, it is an assembler #.  */
2641           if (result->type == CPP_HASH
2642               /* 6.10.3 p 11: Directives in a list of macro arguments
2643                  gives undefined behavior.  This implementation
2644                  handles the directive as normal.  */
2645               && pfile->state.parsing_args != 1)
2646             {
2647               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2648                 {
2649                   if (pfile->directive_result.type == CPP_PADDING)
2650                     continue;
2651                   result = &pfile->directive_result;
2652                 }
2653             }
2654           else if (pfile->state.in_deferred_pragma)
2655             result = &pfile->directive_result;
2656
2657           if (pfile->cb.line_change && !pfile->state.skipping)
2658             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2659         }
2660
2661       /* We don't skip tokens in directives.  */
2662       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2663         break;
2664
2665       /* Outside a directive, invalidate controlling macros.  At file
2666          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2667          get here and MI optimization works.  */
2668       pfile->mi_valid = false;
2669
2670       if (!pfile->state.skipping || result->type == CPP_EOF)
2671         break;
2672     }
2673
2674   return result;
2675 }
2676
2677 /* Returns true if a fresh line has been loaded.  */
2678 bool
2679 _cpp_get_fresh_line (cpp_reader *pfile)
2680 {
2681   /* We can't get a new line until we leave the current directive.  */
2682   if (pfile->state.in_directive)
2683     return false;
2684
2685   for (;;)
2686     {
2687       cpp_buffer *buffer = pfile->buffer;
2688
2689       if (!buffer->need_line)
2690         return true;
2691
2692       if (buffer->next_line < buffer->rlimit)
2693         {
2694           _cpp_clean_line (pfile);
2695           return true;
2696         }
2697
2698       /* First, get out of parsing arguments state.  */
2699       if (pfile->state.parsing_args)
2700         return false;
2701
2702       /* End of buffer.  Non-empty files should end in a newline.  */
2703       if (buffer->buf != buffer->rlimit
2704           && buffer->next_line > buffer->rlimit
2705           && !buffer->from_stage3)
2706         {
2707           /* Clip to buffer size.  */
2708           buffer->next_line = buffer->rlimit;
2709         }
2710
2711       if (buffer->prev && !buffer->return_at_eof)
2712         _cpp_pop_buffer (pfile);
2713       else
2714         {
2715           /* End of translation.  Do not pop the buffer yet. Increment
2716              line number so that the EOF token is on a line of its own
2717              (_cpp_lex_direct doesn't increment in that case, because
2718              it's hard for it to distinguish this special case). */
2719           CPP_INCREMENT_LINE (pfile, 0);
2720           return false;
2721         }
2722     }
2723 }
2724
/* Helper for two-character operators.  Expects `buffer' and `result'
   to be in scope at the point of use.  If the next unread character
   is CHAR, consume it and give the token THEN_TYPE; otherwise leave
   the input alone and give the token ELSE_TYPE.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == (CHAR))                       \
        {                                               \
          buffer->cur++;                                \
          result->type = (THEN_TYPE);                   \
        }                                               \
      else                                              \
        result->type = (ELSE_TYPE);                     \
    }                                                   \
  while (0)
2733
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token; the token's src_loc field
   carries its location.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Set once a comment accepted by fallthrough_comment_p has been
     skipped; it makes a following identifier token (or the saved
     comment token itself) carry PREV_FALLTHROUGH.  */
  bool fallthrough_comment = false;
  /* Claim the current token slot and advance cur_token past it.  */
  cpp_token *result = pfile->cur_token++;

  /* Entered on the first pass and again after every newline.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The end of a deferred pragma's line is reported as a
         CPP_PRAGMA_EOL token and ends the deferred-pragma state.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line could be loaded: hand back CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
              /* Now pop the buffer that _cpp_get_fresh_line did not.  */
              _cpp_pop_buffer (pfile);
            }
          return result;
        }
      /* _cpp_get_fresh_line may have moved on to a different buffer;
         a pending fallthrough comment does not carry across.  */
      if (buffer != pfile->buffer)
        fallthrough_comment = false;
      /* Unless tokens must be kept, rewind to the first slot of the
         base token run and reuse it for this token.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      /* First token of a new line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* Re-read the buffer pointer: it may differ from the one fetched at
     fresh_line.  */
  buffer = pfile->buffer;
  /* Re-entered after a skipped comment.  */
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after a run of whitespace has been skipped.  */
 skipped_white:
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  /* Fetch the character that starts the token; buffer->cur now points
     one past it, so cases below use buffer->cur - 1 for the start.  */
  c = *buffer->cur++;

  if (pfile->forced_token_location)
    result->src_loc = pfile->forced_token_location;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
                                          CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
      /* Horizontal whitespace; a NUL byte is handled here as well.  */
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* Increment the line, unless this is the last line ...  */
      if (buffer->cur < buffer->rlimit
          /* ... or this is a #include, (where _cpp_stack_file needs to
             unwind by one line) ...  */
          || (pfile->state.in_directive > 1
              /* ... except traditional-cpp increments this elsewhere.  */
              && !CPP_OPTION (pfile, traditional)))
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
          || (c != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          /* Accept: prefix + ' (not after R), prefix + ", prefix + R"
             (raw literals enabled, prefix not R itself), and u8
             followed by ", by ' (utf8_char_literals enabled) or by R"
             (raw literals enabled).  */
          if ((*buffer->cur == '\'' && c != 'R')
              || *buffer->cur == '"'
              || (*buffer->cur == 'R'
                  && c != 'R'
                  && buffer->cur[1] == '"'
                  && CPP_OPTION (pfile, rliterals))
              || (*buffer->cur == '8'
                  && c == 'u'
                  && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
                                && CPP_OPTION (pfile, utf8_char_literals)))
                      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
                          && CPP_OPTION (pfile, rliterals)))))
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Not a literal prefix after all: lex it as an identifier.  */
      /* Fall through.  */

      /* The gaps in the lists below are 'u', 'U', 'L' and 'R', which
         have their own labels above.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
                                                &nst,
                                                &result->val.node.spelling);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node.node->directive_index;
        }

      /* Signal FALLTHROUGH comment followed by another token.  */
      if (fallthrough_comment)
        result->flags |= PREV_FALLTHROUGH;
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      /* comment_start points just past the leading '/'; C becomes
         the character after it.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && ! CPP_OPTION (pfile, traditional))
        {
          /* Don't warn for system headers.  */
          if (cpp_in_system_header (pfile))
            ;
          /* Warn about comments if pedantically GNUC89, and not
             in system headers.  */
          else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
                   && CPP_PEDANTIC (pfile)
                   && ! buffer->warned_cplusplus_comments)
            {
              if (cpp_error (pfile, CPP_DL_PEDWARN,
                             "C++ style comments are not allowed in ISO C90"))
                cpp_error (pfile, CPP_DL_NOTE,
                           "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }
          /* Or if specifically desired via -Wc90-c99-compat.  */
          else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
                   && ! CPP_OPTION (pfile, cplusplus)
                   && ! buffer->warned_cplusplus_comments)
            {
              if (cpp_error (pfile, CPP_DL_WARNING,
                             "C++ style comments are incompatible with C90"))
                cpp_error (pfile, CPP_DL_NOTE,
                           "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }
          /* In C89/C94, C++ style comments are forbidden.  */
          else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
                    || CPP_OPTION (pfile, lang) == CLK_STDC94))
            {
              /* But don't be confused about valid code such as
                 - // immediately followed by *,
                 - // in a preprocessing directive,
                 - // in an #if 0 block.  */
              if (buffer->cur[1] == '*'
                  || pfile->state.in_directive
                  || pfile->state.skipping)
                {
                  /* Only the first '/' is consumed; it becomes a
                     division token.  */
                  result->type = CPP_DIV;
                  break;
                }
              else if (! buffer->warned_cplusplus_comments)
                {
                  if (cpp_error (pfile, CPP_DL_ERROR,
                                 "C++ style comments are not allowed in "
                                 "ISO C90"))
                    cpp_error (pfile, CPP_DL_NOTE,
                               "(this will be reported only once per input "
                               "file)");
                  buffer->warned_cplusplus_comments = 1;
                }
            }
          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only reached after a block or line comment was skipped.  */
      if (fallthrough_comment_p (pfile, comment_start))
        fallthrough_comment = true;

      if (pfile->cb.comment)
        {
          /* comment_start - 1 and len + 1 take in the leading '/'.  */
          size_t len = pfile->buffer->cur - comment_start;
          pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
                             len + 1);
        }

      /* Comments that are not saved count as whitespace; go on to lex
         the token that follows.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      if (fallthrough_comment)
        result->flags |= PREV_FALLTHROUGH;

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      if (pfile->state.angled_headers)
        {
          /* Try for a <header-name>; if lex_string leaves the type as
             CPP_LESS, carry on with the ordinary '<' handling.  */
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        {
          buffer->cur++, result->type = CPP_LESS_EQ;
          /* "<=>" is only formed for C++ with lang >= CLK_GNUCXX20.  */
          if (*buffer->cur == '>'
              && CPP_OPTION (pfile, cplusplus)
              && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
            buffer->cur++, result->type = CPP_SPACESHIP;
        }
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
                 three characters are <:: and the subsequent character
                 is neither : nor >, the < is treated as a preprocessor
                 token by itself".  */
              if (CPP_OPTION (pfile, cplusplus)
                  && CPP_OPTION (pfile, lang) != CLK_CXX98
                  && CPP_OPTION (pfile, lang) != CLK_GNUCXX
                  && buffer->cur[1] == ':'
                  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
                break;

              /* "<:" is the digraph spelling of '['.  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              /* "<%" is the digraph spelling of '{'.  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* "%:" is the digraph spelling of '#', and "%:%:" of
                 "##".  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
            }
          else if (*buffer->cur == '>')
            {
              /* "%>" is the digraph spelling of '}'.  */
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A '.' followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          /* "->*" is only formed when lexing C++.  */
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, scope))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          /* ":>" is the digraph spelling of ']'.  */
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    default:
      {
        /* Step back so that buffer->cur points at C again.  */
        const uchar *base = --buffer->cur;

        /* Check for an extended identifier ($ or UCN or UTF-8).  */
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, base, true, &nst,
                                                    &result->val.node.spelling);
            warn_about_normalization (pfile, result, &nst);
            break;
          }

        /* Otherwise this will form a CPP_OTHER token.  Parse valid UTF-8 as a
           single token.  */
        buffer->cur++;
        /* NOTE(review): utf8_signifier is defined outside this chunk;
           presumably the smallest byte value that can begin a
           multibyte UTF-8 sequence -- confirm.  */
        if (c >= utf8_signifier)
          {
            const uchar *pstr = base;
            cppchar_t s;
            if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s))
              buffer->cur = pstr;
          }
        create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER);
        break;
      }

    }

  /* Potentially convert the location of the token to a range.  */
  if (result->src_loc >= RESERVED_LOCATION_COUNT
      && result->type != CPP_EOF)
    {
      /* Ensure that any line notes are processed, so that we have the
         correct physical line/column for the end-point of the token even
         when a logical line is split via one or more backslashes.  */
      if (buffer->cur >= buffer->notes[buffer->cur_note].pos
          && !pfile->overlaid_buffer)
        _cpp_process_line_notes (pfile, false);

      /* The range starts at the location recorded before the switch
         and finishes at the column of the current read position.  */
      source_range tok_range;
      tok_range.m_start = result->src_loc;
      tok_range.m_finish
        = linemap_position_for_column (pfile->line_table,
                                       CPP_BUF_COLUMN (buffer, buffer->cur));

      result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table,
                                               result->src_loc,
                                               tok_range, NULL);
    }

  return result;
}
3232
3233 /* An upper bound on the number of bytes needed to spell TOKEN.
3234    Does not include preceding whitespace.  */
3235 unsigned int
3236 cpp_token_len (const cpp_token *token)
3237 {
3238   unsigned int len;
3239
3240   switch (TOKEN_SPELL (token))
3241     {
3242     default:            len = 6;                                break;
3243     case SPELL_LITERAL: len = token->val.str.len;               break;
3244     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
3245     }
3246
3247   return len;
3248 }
3249
/* Decode the UTF-8 sequence starting at NAME and store it in BUFFER
   as a \U escape with eight lower-case hex digits (always 10 bytes,
   no terminating NUL).  Returns the number of bytes of NAME that made
   up the sequence.  Aborts on a malformed continuation byte.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned int lead = *name;
  unsigned long code_point;
  size_t seq_len = 0;
  size_t k;
  int shift;

  /* The number of leading one bits in the first byte is the length
     of the sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* The payload bits of the first byte sit below the length marker.  */
  code_point = *name & (0x7F >> seq_len);

  /* Each continuation byte supplies six more bits.  */
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char trail = *++name;

      code_point = (code_point << 6) | (trail & 0x3F);

      /* A continuation byte must look like 10xxxxxx.  */
      if ((trail & 0xC0) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
3283
3284 /* Given a token TYPE corresponding to a digraph, return a pointer to
3285    the spelling of the digraph.  */
3286 static const unsigned char *
3287 cpp_digraph2name (enum cpp_ttype type)
3288 {
3289   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
3290 }
3291
3292 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
3293    The buffer must already contain the enough space to hold the
3294    token's spelling.  Returns a pointer to the character after the
3295    last character written.  */
3296 unsigned char *
3297 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
3298 {
3299   size_t i;
3300   const unsigned char *name = NODE_NAME (ident);
3301
3302   for (i = 0; i < NODE_LEN (ident); i++)
3303     if (name[i] & ~0x7F)
3304       {
3305         i += utf8_to_ucn (buffer, name + i) - 1;
3306         buffer += 10;
3307       }
3308     else
3309       *buffer++ = name[i];
3310
3311   return buffer;
3312 }
3313
3314 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
3315    already contain the enough space to hold the token's spelling.
3316    Returns a pointer to the character after the last character written.
3317    FORSTRING is true if this is to be the spelling after translation
3318    phase 1 (with the original spelling of extended identifiers), false
3319    if extended identifiers should always be written using UCNs (there is
3320    no option for always writing them in the internal UTF-8 form).
3321    FIXME: Would be nice if we didn't need the PFILE argument.  */
3322 unsigned char *
3323 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
3324                  unsigned char *buffer, bool forstring)
3325 {
3326   switch (TOKEN_SPELL (token))
3327     {
3328     case SPELL_OPERATOR:
3329       {
3330         const unsigned char *spelling;
3331         unsigned char c;
3332
3333         if (token->flags & DIGRAPH)
3334           spelling = cpp_digraph2name (token->type);
3335         else if (token->flags & NAMED_OP)
3336           goto spell_ident;
3337         else
3338           spelling = TOKEN_NAME (token);
3339
3340         while ((c = *spelling++) != '\0')
3341           *buffer++ = c;
3342       }
3343       break;
3344
3345     spell_ident:
3346     case SPELL_IDENT:
3347       if (forstring)
3348         {
3349           memcpy (buffer, NODE_NAME (token->val.node.spelling),
3350                   NODE_LEN (token->val.node.spelling));
3351           buffer += NODE_LEN (token->val.node.spelling);
3352         }
3353       else
3354         buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node);
3355       break;
3356
3357     case SPELL_LITERAL:
3358       memcpy (buffer, token->val.str.text, token->val.str.len);
3359       buffer += token->val.str.len;
3360       break;
3361
3362     case SPELL_NONE:
3363       cpp_error (pfile, CPP_DL_ICE,
3364                  "unspellable token %s", TOKEN_NAME (token));
3365       break;
3366     }
3367
3368   return buffer;
3369 }
3370
3371 /* Returns TOKEN spelt as a null-terminated string.  The string is
3372    freed when the reader is destroyed.  Useful for diagnostics.  */
3373 unsigned char *
3374 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
3375 {
3376   unsigned int len = cpp_token_len (token) + 1;
3377   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
3378
3379   end = cpp_spell_token (pfile, token, start, false);
3380   end[0] = '\0';
3381
3382   return start;
3383 }
3384
3385 /* Returns a pointer to a string which spells the token defined by
3386    TYPE and FLAGS.  Used by C front ends, which really should move to
3387    using cpp_token_as_text.  */
3388 const char *
3389 cpp_type2name (enum cpp_ttype type, unsigned char flags)
3390 {
3391   if (flags & DIGRAPH)
3392     return (const char *) cpp_digraph2name (type);
3393   else if (flags & NAMED_OP)
3394     return cpp_named_operator2name (type);
3395
3396   return (const char *) token_spellings[type].name;
3397 }
3398
3399 /* Writes the spelling of token to FP, without any preceding space.
3400    Separated from cpp_spell_token for efficiency - to avoid stdio
3401    double-buffering.  */
3402 void
3403 cpp_output_token (const cpp_token *token, FILE *fp)
3404 {
3405   switch (TOKEN_SPELL (token))
3406     {
3407     case SPELL_OPERATOR:
3408       {
3409         const unsigned char *spelling;
3410         int c;
3411
3412         if (token->flags & DIGRAPH)
3413           spelling = cpp_digraph2name (token->type);
3414         else if (token->flags & NAMED_OP)
3415           goto spell_ident;
3416         else
3417           spelling = TOKEN_NAME (token);
3418
3419         c = *spelling;
3420         do
3421           putc (c, fp);
3422         while ((c = *++spelling) != '\0');
3423       }
3424       break;
3425
3426     spell_ident:
3427     case SPELL_IDENT:
3428       {
3429         size_t i;
3430         const unsigned char * name = NODE_NAME (token->val.node.node);
3431
3432         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
3433           if (name[i] & ~0x7F)
3434             {
3435               unsigned char buffer[10];
3436               i += utf8_to_ucn (buffer, name + i) - 1;
3437               fwrite (buffer, 1, 10, fp);
3438             }
3439           else
3440             fputc (NODE_NAME (token->val.node.node)[i], fp);
3441       }
3442       break;
3443
3444     case SPELL_LITERAL:
3445       fwrite (token->val.str.text, 1, token->val.str.len, fp);
3446       break;
3447
3448     case SPELL_NONE:
3449       /* An error, most probably.  */
3450       break;
3451     }
3452 }
3453
3454 /* Compare two tokens.  */
3455 int
3456 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
3457 {
3458   if (a->type == b->type && a->flags == b->flags)
3459     switch (TOKEN_SPELL (a))
3460       {
3461       default:                  /* Keep compiler happy.  */
3462       case SPELL_OPERATOR:
3463         /* token_no is used to track where multiple consecutive ##
3464            tokens were originally located.  */
3465         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
3466       case SPELL_NONE:
3467         return (a->type != CPP_MACRO_ARG
3468                 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
3469                     && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
3470       case SPELL_IDENT:
3471         return (a->val.node.node == b->val.node.node
3472                 && a->val.node.spelling == b->val.node.spelling);
3473       case SPELL_LITERAL:
3474         return (a->val.str.len == b->val.str.len
3475                 && !memcmp (a->val.str.text, b->val.str.text,
3476                             a->val.str.len));
3477       }
3478
3479   return 0;
3480 }
3481
3482 /* Returns nonzero if a space should be inserted to avoid an
3483    accidental token paste for output.  For simplicity, it is
3484    conservative, and occasionally advises a space where one is not
3485    needed, e.g. "." and ".2".  */
3486 int
3487 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
3488                  const cpp_token *token2)
3489 {
3490   enum cpp_ttype a = token1->type, b = token2->type;
3491   cppchar_t c;
3492
3493   if (token1->flags & NAMED_OP)
3494     a = CPP_NAME;
3495   if (token2->flags & NAMED_OP)
3496     b = CPP_NAME;
3497
3498   c = EOF;
3499   if (token2->flags & DIGRAPH)
3500     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
3501   else if (token_spellings[b].category == SPELL_OPERATOR)
3502     c = token_spellings[b].name[0];
3503
3504   /* Quickly get everything that can paste with an '='.  */
3505   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
3506     return 1;
3507
3508   switch (a)
3509     {
3510     case CPP_GREATER:   return c == '>';
3511     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
3512     case CPP_PLUS:      return c == '+';
3513     case CPP_MINUS:     return c == '-' || c == '>';
3514     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
3515     case CPP_MOD:       return c == ':' || c == '>';
3516     case CPP_AND:       return c == '&';
3517     case CPP_OR:        return c == '|';
3518     case CPP_COLON:     return c == ':' || c == '>';
3519     case CPP_DEREF:     return c == '*';
3520     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
3521     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
3522     case CPP_NAME:      return ((b == CPP_NUMBER
3523                                  && name_p (pfile, &token2->val.str))
3524                                 || b == CPP_NAME
3525                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
3526     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
3527                                 || c == '.' || c == '+' || c == '-');
3528                                       /* UCNs */
3529     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
3530                                  && b == CPP_NAME)
3531                                 || (CPP_OPTION (pfile, objc)
3532                                     && token1->val.str.text[0] == '@'
3533                                     && (b == CPP_NAME || b == CPP_STRING)));
3534     case CPP_LESS_EQ:   return c == '>';
3535     case CPP_STRING:
3536     case CPP_WSTRING:
3537     case CPP_UTF8STRING:
3538     case CPP_STRING16:
3539     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
3540                                 && (b == CPP_NAME
3541                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
3542                                         && ISIDST (token2->val.str.text[0]))));
3543
3544     default:            break;
3545     }
3546
3547   return 0;
3548 }
3549
3550 /* Output all the remaining tokens on the current line, and a newline
3551    character, to FP.  Leading whitespace is removed.  If there are
3552    macros, special token padding is not performed.  */
3553 void
3554 cpp_output_line (cpp_reader *pfile, FILE *fp)
3555 {
3556   const cpp_token *token;
3557
3558   token = cpp_get_token (pfile);
3559   while (token->type != CPP_EOF)
3560     {
3561       cpp_output_token (token, fp);
3562       token = cpp_get_token (pfile);
3563       if (token->flags & PREV_WHITE)
3564         putc (' ', fp);
3565     }
3566
3567   putc ('\n', fp);
3568 }
3569
3570 /* Return a string representation of all the remaining tokens on the
3571    current line.  The result is allocated using xmalloc and must be
3572    freed by the caller.  */
3573 unsigned char *
3574 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
3575 {
3576   const cpp_token *token;
3577   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
3578   unsigned int alloced = 120 + out;
3579   unsigned char *result = (unsigned char *) xmalloc (alloced);
3580
3581   /* If DIR_NAME is empty, there are no initial contents.  */
3582   if (dir_name)
3583     {
3584       sprintf ((char *) result, "#%s ", dir_name);
3585       out += 2;
3586     }
3587
3588   token = cpp_get_token (pfile);
3589   while (token->type != CPP_EOF)
3590     {
3591       unsigned char *last;
3592       /* Include room for a possible space and the terminating nul.  */
3593       unsigned int len = cpp_token_len (token) + 2;
3594
3595       if (out + len > alloced)
3596         {
3597           alloced *= 2;
3598           if (out + len > alloced)
3599             alloced = out + len;
3600           result = (unsigned char *) xrealloc (result, alloced);
3601         }
3602
3603       last = cpp_spell_token (pfile, token, &result[out], 0);
3604       out = last - result;
3605
3606       token = cpp_get_token (pfile);
3607       if (token->flags & PREV_WHITE)
3608         result[out++] = ' ';
3609     }
3610
3611   result[out] = '\0';
3612   return result;
3613 }
3614
3615 /* Memory buffers.  Changing these three constants can have a dramatic
3616    effect on performance.  The values here are reasonable defaults,
3617    but might be tuned.  If you adjust them, be sure to test across a
3618    range of uses of cpplib, including heavy nested function-like macro
3619    expansion.  Also check the change in peak memory usage (NJAMD is a
3620    good tool for this).  */
/* Smallest buffer that new_buff will ever allocate.  */
#define MIN_BUFF_SIZE 8000
/* Largest free buffer considered acceptable for a request of
   MIN_SIZE bytes.  */
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
/* Size to request when extending BUFF: MIN_EXTRA plus twice the room
   left in BUFF.  MIN_EXTRA is parenthesized so that an argument
   containing a low-precedence operator still expands as one operand.  */
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
3629
3630 /* Create a new allocation buffer.  Place the control block at the end
3631    of the buffer, so that buffer overflows will cause immediate chaos.  */
3632 static _cpp_buff *
3633 new_buff (size_t len)
3634 {
3635   _cpp_buff *result;
3636   unsigned char *base;
3637
3638   if (len < MIN_BUFF_SIZE)
3639     len = MIN_BUFF_SIZE;
3640   len = CPP_ALIGN (len);
3641
3642 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3643   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
3644      struct first.  */
3645   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
3646   base = XNEWVEC (unsigned char, len + slen);
3647   result = (_cpp_buff *) base;
3648   base += slen;
3649 #else
3650   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
3651   result = (_cpp_buff *) (base + len);
3652 #endif
3653   result->base = base;
3654   result->cur = base;
3655   result->limit = base + len;
3656   result->next = NULL;
3657   return result;
3658 }
3659
3660 /* Place a chain of unwanted allocation buffers on the free list.  */
3661 void
3662 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3663 {
3664   _cpp_buff *end = buff;
3665
3666   while (end->next)
3667     end = end->next;
3668   end->next = pfile->free_buffs;
3669   pfile->free_buffs = buff;
3670 }
3671
3672 /* Return a free buffer of size at least MIN_SIZE.  */
3673 _cpp_buff *
3674 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3675 {
3676   _cpp_buff *result, **p;
3677
3678   for (p = &pfile->free_buffs;; p = &(*p)->next)
3679     {
3680       size_t size;
3681
3682       if (*p == NULL)
3683         return new_buff (min_size);
3684       result = *p;
3685       size = result->limit - result->base;
3686       /* Return a buffer that's big enough, but don't waste one that's
3687          way too big.  */
3688       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3689         break;
3690     }
3691
3692   *p = result->next;
3693   result->next = NULL;
3694   result->cur = result->base;
3695   return result;
3696 }
3697
3698 /* Creates a new buffer with enough space to hold the uncommitted
3699    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3700    the excess bytes to the new buffer.  Chains the new buffer after
3701    BUFF, and returns the new buffer.  */
3702 _cpp_buff *
3703 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3704 {
3705   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3706   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3707
3708   buff->next = new_buff;
3709   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3710   return new_buff;
3711 }
3712
3713 /* Creates a new buffer with enough space to hold the uncommitted
3714    remaining bytes of the buffer pointed to by BUFF, and at least
3715    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3716    Chains the new buffer before the buffer pointed to by BUFF, and
3717    updates the pointer to point to the new buffer.  */
3718 void
3719 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3720 {
3721   _cpp_buff *new_buff, *old_buff = *pbuff;
3722   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3723
3724   new_buff = _cpp_get_buff (pfile, size);
3725   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3726   new_buff->next = old_buff;
3727   *pbuff = new_buff;
3728 }
3729
3730 /* Free a chain of buffers starting at BUFF.  */
3731 void
3732 _cpp_free_buff (_cpp_buff *buff)
3733 {
3734   _cpp_buff *next;
3735
3736   for (; buff; buff = next)
3737     {
3738       next = buff->next;
3739 #ifdef ENABLE_VALGRIND_ANNOTATIONS
3740       free (buff);
3741 #else
3742       free (buff->base);
3743 #endif
3744     }
3745 }
3746
3747 /* Allocate permanent, unaligned storage of length LEN.  */
3748 unsigned char *
3749 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3750 {
3751   _cpp_buff *buff = pfile->u_buff;
3752   unsigned char *result = buff->cur;
3753
3754   if (len > (size_t) (buff->limit - result))
3755     {
3756       buff = _cpp_get_buff (pfile, len);
3757       buff->next = pfile->u_buff;
3758       pfile->u_buff = buff;
3759       result = buff->cur;
3760     }
3761
3762   buff->cur = result + len;
3763   return result;
3764 }
3765
3766 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3767    That buffer is used for growing allocations when saving macro
3768    replacement lists in a #define, and when parsing an answer to an
3769    assertion in #assert, #unassert or #if (and therefore possibly
3770    whilst expanding macros).  It therefore must not be used by any
3771    code that they might call: specifically the lexer and the guts of
3772    the macro expander.
3773
3774    All existing other uses clearly fit this restriction: storing
3775    registered pragmas during initialization.  */
3776 unsigned char *
3777 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3778 {
3779   _cpp_buff *buff = pfile->a_buff;
3780   unsigned char *result = buff->cur;
3781
3782   if (len > (size_t) (buff->limit - result))
3783     {
3784       buff = _cpp_get_buff (pfile, len);
3785       buff->next = pfile->a_buff;
3786       pfile->a_buff = buff;
3787       result = buff->cur;
3788     }
3789
3790   buff->cur = result + len;
3791   return result;
3792 }
3793
3794 /* Commit or allocate storage from a buffer.  */
3795
3796 void *
3797 _cpp_commit_buff (cpp_reader *pfile, size_t size)
3798 {
3799   void *ptr = BUFF_FRONT (pfile->a_buff);
3800
3801   if (pfile->hash_table->alloc_subobject)
3802     {
3803       void *copy = pfile->hash_table->alloc_subobject (size);
3804       memcpy (copy, ptr, size);
3805       ptr = copy;
3806     }
3807   else
3808     BUFF_FRONT (pfile->a_buff) += size;
3809
3810   return ptr;
3811 }
3812
3813 /* Say which field of TOK is in use.  */
3814
3815 enum cpp_token_fld_kind
3816 cpp_token_val_index (const cpp_token *tok)
3817 {
3818   switch (TOKEN_SPELL (tok))
3819     {
3820     case SPELL_IDENT:
3821       return CPP_TOKEN_FLD_NODE;
3822     case SPELL_LITERAL:
3823       return CPP_TOKEN_FLD_STR;
3824     case SPELL_OPERATOR:
3825       /* Operands which were originally spelled as ident keep around
3826          the node for the exact spelling.  */
3827       if (tok->flags & NAMED_OP)
3828         return CPP_TOKEN_FLD_NODE;
3829       else if (tok->type == CPP_PASTE)
3830         return CPP_TOKEN_FLD_TOKEN_NO;
3831       else
3832         return CPP_TOKEN_FLD_NONE;
3833     case SPELL_NONE:
3834       if (tok->type == CPP_MACRO_ARG)
3835         return CPP_TOKEN_FLD_ARG_NO;
3836       else if (tok->type == CPP_PADDING)
3837         return CPP_TOKEN_FLD_SOURCE;
3838       else if (tok->type == CPP_PRAGMA)
3839         return CPP_TOKEN_FLD_PRAGMA;
3840       /* fall through */
3841     default:
3842       return CPP_TOKEN_FLD_NONE;
3843     }
3844 }
3845
3846 /* All tokens lexed in R after calling this function will be forced to
3847    have their location_t to be LOC, until
3848    cpp_stop_forcing_token_locations is called for R.  This function
        itself only records LOC in R's forced_token_location field.  */
3849
3850 void
3851 cpp_force_token_locations (cpp_reader *r, location_t loc)
3852 {
3853   r->forced_token_location = loc;
3854 }
3855
3856 /* Go back to assigning locations naturally for lexed tokens in R,
        by resetting R's forced_token_location field to 0.  */
3857
3858 void
3859 cpp_stop_forcing_token_locations (cpp_reader *r)
3860 {
3861   r->forced_token_location = 0;
3862 }
3863
/* PEEK points at a backslash.  While that backslash escapes a line
   ending ("\n", "\r" or "\r\n"), step past the backslash and the line
   ending; repeat if the character reached is again a backslash.  A
   step is only taken when the position after the line ending is
   strictly before LIMIT; otherwise PEEK is not advanced.  Returns the
   resulting position.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A lone CR, or a CR-LF pair.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped line ending.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      /* The user might be perverse and chain escaped newlines.  */
      if (*peek != '\\')
        return peek;
    }
}
3893
/* Return the position to examine for the character at PEEK: if PEEK
   points at a backslash, whatever do_peek_backslash yields for it,
   otherwise PEEK itself.  */
static const unsigned char *
do_peek_next (const unsigned char *peek, const unsigned char *limit)
{
  return (__builtin_expect (*peek == '\\', false)
          ? do_peek_backslash (peek, limit)
          : peek);
}
3901
/* Step backwards from PEEK and return a pointer to the previous
   character, never looking before BOUND.  Returns NULL when PEEK is
   already at BOUND.  If the previous character is a line ending
   ("\n", "\r" or the "\n" of a "\r\n" pair) that is immediately
   preceded by a backslash, the escaped line ending is skipped and the
   search continues from the backslash.  An unescaped line ending is
   returned as-is.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  if (peek == bound)
    return NULL;

  unsigned char c = *--peek;
  if (__builtin_expect (c == '\n', false)
      || __builtin_expect (c == '\r', false))
    {
      /* Nothing precedes the line ending, so it cannot be escaped.  */
      if (peek == bound)
        return peek;

      /* IX is the offset from PEEK of the character that would have
         to be a backslash for this line ending to be escaped.  */
      int ix = -1;
      if (c == '\n' && peek[ix] == '\r')
        {
          /* CR-LF pair: look before the CR, if there is anything.  */
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] == '\\')
        return do_peek_prev (peek + ix, bound);

      return peek;
    }
  else
    return peek;
}
3930
3931 /* Directives-only scanning.  Somewhat more relaxed than correct
3932    parsing -- some ill-formed programs will not be rejected.

        Scans the current buffer of PFILE character by character, and
        keeps going until _cpp_pop_buffer leaves PFILE with no buffer.
        Comments, character literals and string literals (including raw
        strings when the rliterals option is set) are consumed by inner
        loops.  A '#' seen while BOL is set is handed to
        _cpp_handle_directive.

        CB is invoked, unless PFILE is skipping, with:
          CPP_DO_print     -- the line count, start pointer and byte
                              length of the text scanned before a
                              directive or before the end of the buffer;
          CPP_DO_location  -- the line table's highest_line, after a
                              directive when the buffer has text left.
        DATA is passed through to CB unchanged.  */
3933
3934 void
3935 cpp_directive_only_process (cpp_reader *pfile,
3936                             void *data,
3937                             void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
3938 {
3939   do
3940     {
3941     restart:
3942       /* Buffer initialization, but no line cleaning. */
3943       cpp_buffer *buffer = pfile->buffer;
3944       buffer->cur_note = buffer->notes_used = 0;
3945       buffer->cur = buffer->line_base = buffer->next_line;
3946       buffer->need_line = false;
3947       /* Files always end in a newline.  We rely on this for
3948          character peeking safety.  */
3949       gcc_assert (buffer->rlimit[-1] == '\n');
3950
3951       const unsigned char *base = buffer->cur;
3952       unsigned line_count = 0;
3953       const unsigned char *line_start = base;
3954
            /* BOL is set at each unescaped line ending and cleared at the
               dflt label; whitespace, comments and escaped newlines leave
               it unchanged.  RAW is set by the quote_peek logic when an
               'R' precedes a '"'.  */
3955       bool bol = true;
3956       bool raw = false;
3957
            /* Low-water mark: the bound given to do_peek_prev, so backward
               peeking never looks before it.  It is advanced after each
               comment and each literal.  */
3958       const unsigned char *lwm = base;
3959       for (const unsigned char *pos = base, *limit = buffer->rlimit;
3960            pos < limit;)
3961         {
3962           unsigned char c = *pos++;
3963           /* This matches the switch in _cpp_lex_direct.  */
3964           switch (c)
3965             {
3966             case ' ': case '\t': case '\f': case '\v':
3967               /* Whitespace, do nothing.  */
3968               break;
3969
3970             case '\r': /* MAC line ending, or Windows \r\n  */
3971               if (*pos == '\n')
3972                 pos++;
3973               /* FALLTHROUGH */
3974
3975             case '\n':
3976               bol = true;
3977
3978             next_line:
3979               CPP_INCREMENT_LINE (pfile, 0);
3980               line_count++;
3981               line_start = pos;
3982               break;
3983
3984             case '\\':
3985               /* <backslash><newline> is removed, and doesn't undo any
3986                  preceding escape or whatnot.  */
3987               if (*pos == '\n')
3988                 {
3989                   pos++;
3990                   goto next_line;
3991                 }
3992               else if (*pos == '\r')
3993                 {
3994                   if (pos[1] == '\n')
3995                     pos++;
3996                   pos++;
3997                   goto next_line;
3998                 }
3999               goto dflt;
4000
4001             case '#':
4002               if (bol)
4003                 {
4004                   /* Line directive.  */
                        /* First flush the text scanned before the '#'.  */
4005                   if (pos - 1 > base && !pfile->state.skipping)
4006                     cb (pfile, CPP_DO_print, data,
4007                         line_count, base, pos - 1 - base);
4008
4009                   /* Prep things for directive handling. */
4010                   buffer->next_line = pos;
4011                   buffer->need_line = true;
4012                   bool ok = _cpp_get_fresh_line (pfile);
4013                   gcc_checking_assert (ok);
4014
4015                   /* Ensure proper column numbering for generated
4016                      error messages. */
4017                   buffer->line_base -= pos - line_start;
4018
                        /* The second argument is true when something other
                           than the '#' itself precedes POS on this line.  */
4019                   _cpp_handle_directive (pfile, line_start + 1 != pos);
4020
4021                   /* Sanitize the line settings.  Duplicate #include's can
4022                      mess things up. */
4023                   // FIXME: Necessary?
4024                   pfile->line_table->highest_location
4025                     = pfile->line_table->highest_line;
4026
4027                   if (!pfile->state.skipping
4028                       && pfile->buffer->next_line < pfile->buffer->rlimit)
4029                     cb (pfile, CPP_DO_location, data,
4030                         pfile->line_table->highest_line);
4031
                        /* Re-read pfile->buffer and its next_line from
                           scratch.  NOTE(review): presumably the directive
                           may have switched buffers (e.g. #include) --
                           confirm.  */
4032                   goto restart;
4033                 }
4034               goto dflt;
4035
4036             case '/':
4037               {
4038                 const unsigned char *peek = do_peek_next (pos, limit);
4039                 if (!(*peek == '/' || *peek == '*'))
4040                   goto dflt;
4041
4042                 /* Line or block comment  */
                      /* STAR records that the previous character was a '*'
                         that may close a block comment; ESC records a
                         preceding backslash.  */
4043                 bool is_block = *peek == '*';
4044                 bool star = false;
4045                 bool esc = false;
4046                 location_t sloc
4047                   = linemap_position_for_column (pfile->line_table,
4048                                                  pos - line_start);
4049
4050                 while (pos < limit)
4051                   {
4052                     char c = *pos++;
4053                     switch (c)
4054                       {
4055                       case '\\':
4056                         esc = true;
4057                         break;
4058
4059                       case '\r':
4060                         if (*pos == '\n')
4061                           pos++;
4062                         /* FALLTHROUGH  */
4063
4064                       case '\n':
4065                         {
4066                           CPP_INCREMENT_LINE (pfile, 0);
4067                           line_count++;
4068                           line_start = pos;
                                /* An unescaped line ending terminates a line
                                   comment.  */
4069                           if (!esc && !is_block)
4070                             {
4071                               bol = true;
4072                               goto done_comment;
4073                             }
4074                         }
4075                         if (!esc)
4076                           star = false;
4077                         esc = false;
4078                         break;
4079
4080                       case '*':
                              /* POS > PEEK excludes the '*' that opens the
                                 comment.  */
4081                         if (pos > peek && !esc)
4082                           star = is_block;
4083                         esc = false;
4084                         break;
4085
4086                       case '/':
4087                         if (star)
4088                           goto done_comment;
4089                         /* FALLTHROUGH  */
4090
4091                       default:
4092                         star = false;
4093                         esc = false;
4094                         break;
4095                       }
4096                   }
4097                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4098                                      "unterminated comment");
4099               done_comment:
4100                 lwm = pos;
4101                 break;
4102               }
4103
4104             case '\'':
4105               if (!CPP_OPTION (pfile, digit_separators))
4106                 goto delimited_string;
4107
4108               /* Possibly a number punctuator.  */
4109               if (!ISIDNUM (*do_peek_next (pos, limit)))
4110                 goto delimited_string;
4111
4112               goto quote_peek;
4113
4114             case '\"':
4115               if (!CPP_OPTION (pfile, rliterals))
4116                 goto delimited_string;
4117
4118             quote_peek:
4119               {
4120                 /* For ' see if it's a number punctuator
4121                    \.?<digit>(<digit>|<identifier-nondigit>
4122                    |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
4123                 /* For " see if it's a raw string
4124                    {U,L,u,u8}R.  This includes CPP_NUMBER detection,
4125                    because that could be 0e+R.  */
                      /* The loop below walks backwards from the quote with
                         do_peek_prev, stopping at LWM.  */
4126                 const unsigned char *peek = pos - 1;
4127                 bool quote_first = c == '"';
4128                 bool quote_eight = false;
4129                 bool maybe_number_start = false;
4130                 bool want_number = false;
4131
4132                 while ((peek = do_peek_prev (peek, lwm)))
4133                   {
4134                     unsigned char p = *peek;
4135                     if (quote_first)
4136                       {
4137                         if (!raw)
4138                           {
                                  /* The character right before the '"' must
                                     be 'R' for a raw string.  */
4139                             if (p != 'R')
4140                               break;
4141                             raw = true;
4142                             continue;
4143                           }
4144
4145                         quote_first = false;
4146                         if (p == 'L' || p == 'U' || p == 'u')
4147                           ;
4148                         else if (p == '8')
4149                           quote_eight = true;
4150                         else
4151                           goto second_raw;
4152                       }
4153                     else if (quote_eight)
4154                       {
                              /* An '8' before the 'R' is only valid as part
                                 of "u8".  */
4155                         if (p != 'u')
4156                           {
4157                             raw = false;
4158                             break;
4159                           }
4160                         quote_eight = false;
4161                       }
4162                     else if (c == '"')
4163                       {
4164                       second_raw:;
4165                         if (!want_number && ISIDNUM (p))
4166                           {
4167                             raw = false;
4168                             break;
4169                           }
4170                       }
4171
4172                     if (ISDIGIT (p))
4173                       maybe_number_start = true;
4174                     else if (p == '.')
4175                       want_number = true;
4176                     else if (ISIDNUM (p))
4177                       maybe_number_start = false;
4178                     else if (p == '+' || p == '-')
4179                       {
                              /* A sign only continues a number when it
                                 follows an exponent letter.  */
4180                         if (const unsigned char *peek_prev
4181                             = do_peek_prev (peek, lwm))
4182                           {
4183                             p = *peek_prev;
4184                             if (p == 'e' || p == 'E'
4185                                 || p == 'p' || p == 'P')
4186                               {
4187                                 want_number = true;
4188                                 maybe_number_start = false;
4189                               }
4190                             else
4191                               break;
4192                           }
4193                         else
4194                           break;
4195                       }
4196                     else if (p == '\'' || p == '\"')
4197                       {
4198                         /* If this is lwm, this must be the end of a
4199                            previous string.  So this is a trailing
4200                            literal type, (a) if those are allowed,
4201                              and (b) maybe_start is false.  Otherwise
4202                              this must be a CPP_NUMBER because we've
4203                              met another ', and we'd have checked that
4204                              in its own right.  */
4205                         if (peek == lwm && CPP_OPTION (pfile, uliterals))
4206                           {
4207                             if  (!maybe_number_start && !want_number)
4208                               /* Must be a literal type.  */
4209                               raw = false;
4210                           }
4211                         else if (p == '\''
4212                                  && CPP_OPTION (pfile, digit_separators))
4213                           maybe_number_start = true;
4214                         break;
4215                       }
4216                     else if (c == '\'')
4217                       break;
4218                     else if (!quote_first && !quote_eight)
4219                       break;
4220                   }
4221
4222                 if (maybe_number_start)
4223                   {
4224                     if (c == '\'')
4225                       /* A CPP NUMBER.  */
4226                       goto dflt;
4227                     raw = false;
4228                   }
4229
4230                 goto delimited_string;
4231               }
4232
4233             delimited_string:
4234               {
4235                 /* (Possibly raw) string or char literal.  */
                      /* END is the closing quote character.  DELIM and
                         DELIM_LEN describe a raw string's delimiter, and ESC
                         counts consecutive backslashes.  */
4236                 unsigned char end = c;
4237                 int delim_len = -1;
4238                 const unsigned char *delim = NULL;
4239                 location_t sloc = linemap_position_for_column (pfile->line_table,
4240                                                                pos - line_start);
4241                 int esc = 0;
4242
4243                 if (raw)
4244                   {
4245                     /* There can be no line breaks in the delimiter.  */
4246                     delim = pos;
4247                     for (delim_len = 0; (c = *pos++) != '('; delim_len++)
4248                       {
4249                         if (delim_len == 16)
4250                           {
4251                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4252                                                  sloc, 0,
4253                                                  "raw string delimiter"
4254                                                  " longer than %d"
4255                                                  " characters",
4256                                                  delim_len);
                                  /* Rewind and scan it as an ordinary
                                     string.  */
4257                             raw = false;
4258                             pos = delim;
4259                             break;
4260                           }
4261                         if (strchr (") \\\t\v\f\n", c))
4262                           {
4263                             cpp_error_with_line (pfile, CPP_DL_ERROR,
4264                                                  sloc, 0,
4265                                                  "invalid character '%c'"
4266                                                  " in raw string"
4267                                                  " delimiter", c);
4268                             raw = false;
4269                             pos = delim;
4270                             break;
4271                           }
4272                         if (pos >= limit)
4273                           goto bad_string;
4274                       }
4275                   }
4276
4277                 while (pos < limit)
4278                   {
4279                     char c = *pos++;
4280                     switch (c)
4281                       {
4282                       case '\\':
4283                         if (!raw)
4284                           esc++;
4285                         break;
4286
4287                       case '\r':
4288                         if (*pos == '\n')
4289                           pos++;
4290                         /* FALLTHROUGH  */
4291
4292                       case '\n':
4293                         {
4294                           CPP_INCREMENT_LINE (pfile, 0);
4295                           line_count++;
4296                           line_start = pos;
4297                         }
4298                         if (esc)
4299                           esc--;
4300                         break;
4301
4302                       case ')':
                              /* A raw string ends at ')' followed by the same
                                 delimiter and the closing quote.  */
4303                         if (raw
4304                             && pos + delim_len + 1 < limit
4305                             && pos[delim_len] == end
4306                             && !memcmp (delim, pos, delim_len))
4307                           {
4308                             pos += delim_len + 1;
4309                             raw = false;
4310                             goto done_string;
4311                           }
4312                         break;
4313
4314                       default:
                              /* A quote closes the literal unless preceded by
                                 an odd number of backslashes.  */
4315                         if (!raw && !(esc & 1) && c == end)
4316                           goto done_string;
4317                         esc = 0;
4318                         break;
4319                       }
4320                   }
4321               bad_string:
4322                 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
4323                                      "unterminated literal");
4324
4325               done_string:
4326                 raw = false;
4327                 lwm = pos - 1;
4328               }
4329               goto dflt;
4330
4331             default:
4332             dflt:
4333               bol = false;
4334               pfile->mi_valid = false;
4335               break;
4336             }
4337         }
4338
            /* Flush whatever text of this buffer has not been handed to CB
               yet, then move on to the next buffer, if any.  */
4339       if (buffer->rlimit > base && !pfile->state.skipping)
4340         cb (pfile, CPP_DO_print, data, line_count, base, buffer->rlimit - base);
4341
4342       _cpp_pop_buffer (pfile);
4343     }
4344   while (pfile->buffer);
4345 }