libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000-2014 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 3, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
/* Spelling category of a token type.  One of these is recorded for
   every token type in the token_spellings table and is retrieved with
   TOKEN_SPELL.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
  34
/* One entry of the token_spellings table: the spelling category of a
   token type plus a string naming it.  */
struct token_spelling
{
  /* How tokens of this type are spelled.  */
  enum spell_type category;
  /* For OP entries the operator text; for TK entries the stringized
     enumerator name.  */
  const unsigned char *name;
};
  40
/* Spellings of the six digraph tokens.
   NOTE(review): the code that indexes this table lies outside this
   part of the file; confirm the expected ordering there before
   reordering entries.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
  43
/* Build the token_spellings table by expanding TTYPE_TABLE (defined
   outside this file) with OP and TK temporarily defined.  An OP entry
   yields the category SPELL_OPERATOR and the operator text S; a TK
   entry yields the category SPELL_<S> and the stringized enumerator E.
   The table has one slot per token type (N_TTYPES).  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Accessors for the table entry selected by TOKEN's type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52
/* Forward declarations of the file-local helpers.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
  66
  67
  68 /* Utility routine:
  69
  70    Compares, the token TOKEN to the NUL-terminated string STRING.
  71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
  72 int
  73 cpp_ideq (const cpp_token *token, const char *string)
  74 {
  75   if (token->type != CPP_NAME)
  76     return 0;
  77
  78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
  79 }
  80
  81 /* Record a note TYPE at byte POS into the current cleaned logical
  82    line.  */
  83 static void
  84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  85 {
  86   if (buffer->notes_used == buffer->notes_cap)
  87     {
  88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
  89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  90                                   buffer->notes_cap);
  91     }
  92
  93   buffer->notes[buffer->notes_used].pos = pos;
  94   buffer->notes[buffer->notes_used].type = type;
  95   buffer->notes_used++;
  96 }
  97
  98 \f
  99 /* Fast path to find line special characters using optimized character
 100    scanning algorithms.  Anything complicated falls back to the slow
 101    path below.  Since this loop is very hot it's worth doing these kinds
 102    of optimizations.
 103
 104    One of the paths through the ifdefs should provide
 105
 106      const uchar *search_line_fast (const uchar *s, const uchar *end);
 107
 108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
 109    the found character.
 110
 111    Note that the last character of the buffer is *always* a newline,
 112    as forced by _cpp_convert_input.  This fact can be used to avoid
 113    explicitly looking for the end of the buffer.  */
 114
/* Configure gives us an ifdef test.  Turn it into a value that is
   always defined, so it can be used in ordinary `if' conditions and in
   #if expressions below.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array bound
   evaluates to 1 when the size is acceptable and to -1 (an invalid
   array size) otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
 134
 135 /* Return X with the first N bytes forced to values that won't match one
 136    of the interesting characters.  Note that NUL is not interesting.  */
 137
 138 static inline word_type
 139 acc_char_mask_misalign (word_type val, unsigned int n)
 140 {
 141   word_type mask = -1;
 142   if (WORDS_BIGENDIAN)
 143     mask >>= n * 8;
 144   else
 145     mask <<= n * 8;
 146   return val & mask;
 147 }
 148
 149 /* Return X replicated to all byte positions within WORD_TYPE.  */
 150
 151 static inline word_type
 152 acc_char_replicate (uchar x)
 153 {
 154   word_type ret;
 155
 156   ret = (x << 24) | (x << 16) | (x << 8) | x;
 157   if (sizeof(word_type) == 8)
 158     ret = (ret << 16 << 16) | ret;
 159   return ret;
 160 }
 161
 162 /* Return non-zero if some byte of VAL is (probably) C.  */
 163
 164 static inline word_type
 165 acc_char_cmp (word_type val, word_type c)
 166 {
 167 #if defined(__GNUC__) && defined(__alpha__)
 168   /* We can get exact results using a compare-bytes instruction.
 169      Get (val == c) via (0 >= (val ^ c)).  */
 170   return __builtin_alpha_cmpbge (0, val ^ c);
 171 #else
 172   word_type magic = 0x7efefefeU;
 173   if (sizeof(word_type) == 8)
 174     magic = (magic << 16 << 16) | 0xfefefefeU;
 175   magic |= 1;
 176
 177   val ^= c;
 178   return ((val + magic) ^ ~val) & ~magic;
 179 #endif
 180 }
 181
 182 /* Given the result of acc_char_cmp is non-zero, return the index of
 183    the found character.  If this was a false positive, return -1.  */
 184
 185 static inline int
 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
 187                 word_type val ATTRIBUTE_UNUSED)
 188 {
 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
 190   /* The cmpbge instruction sets *bits* of the result corresponding to
 191      matches in the bytes with no false positives.  */
 192   return __builtin_ctzl (cmp);
 193 #else
 194   unsigned int i;
 195
 196   /* ??? It would be nice to force unrolling here,
 197      and have all of these constants folded.  */
 198   for (i = 0; i < sizeof(word_type); ++i)
 199     {
 200       uchar c;
 201       if (WORDS_BIGENDIAN)
 202         c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
 203       else
 204         c = (val >> i * 8) & 0xff;
 205
 206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
 207         return i;
 208     }
 209
 210   return -1;
 211 #endif
 212 }
 213
 214 /* A version of the fast scanner using bit fiddling techniques.
 215
 216    For 32-bit words, one would normally perform 16 comparisons and
 217    16 branches.  With this algorithm one performs 24 arithmetic
 218    operations and one branch.  Whether this is faster with a 32-bit
 219    word size is going to be somewhat system dependent.
 220
 221    For 64-bit words, we eliminate twice the number of comparisons
 222    and branches without increasing the number of arithmetic operations.
 223    It's almost certainly going to be a win with 64-bit word size.  */
 224
 225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
 226   ATTRIBUTE_UNUSED;
 227
 228 static const uchar *
 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 230 {
 231   const word_type repl_nl = acc_char_replicate ('\n');
 232   const word_type repl_cr = acc_char_replicate ('\r');
 233   const word_type repl_bs = acc_char_replicate ('\\');
 234   const word_type repl_qm = acc_char_replicate ('?');
 235
 236   unsigned int misalign;
 237   const word_type *p;
 238   word_type val, t;
 239
 240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
 241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
 242   val = *p;
 243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
 244   if (misalign)
 245     val = acc_char_mask_misalign (val, misalign);
 246
 247   /* Main loop.  */
 248   while (1)
 249     {
 250       t  = acc_char_cmp (val, repl_nl);
 251       t |= acc_char_cmp (val, repl_cr);
 252       t |= acc_char_cmp (val, repl_bs);
 253       t |= acc_char_cmp (val, repl_qm);
 254
 255       if (__builtin_expect (t != 0, 0))
 256         {
 257           int i = acc_char_index (t, val);
 258           if (i >= 0)
 259             return (const uchar *)p + i;
 260         }
 261
 262       val = *++p;
 263     }
 264 }
 265
 266 /* Disable on Solaris 2/x86 until the following problem can be properly
 267    autoconfed:
 268
 269    The Solaris 10+ assembler tags objects with the instruction set
 270    extensions used, so SSE4.2 executables cannot run on machines that
 271    don't support that extension.  */
 272
 273 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
 274
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds newline, row 1 carriage return, row 2 backslash and
   row 3 question mark, each repeated 16 times.  The array is 16-byte
   aligned so that a row can be loaded as a vector.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
 289
/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop is unbounded and relies on the buffer being
   terminated by a newline.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each 16-byte row are needed here.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block, i.e. those at or after S.  The idea here is that the
     AND with the mask within the loop is "free", since we need some AND
     or TEST insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block has already
     been loaded, so enter the loop past the load.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of the second and later blocks is valid.  */
      mask = -1;

    start:
      /* OR together the per-byte equality results for the four
         characters, then collect one bit per byte into FOUND.  */
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  /* Leave MMX state before returning to ordinary code.  */
  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 355
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop is unbounded and relies on the buffer being
   terminated by a newline.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  The first load therefore starts at or
     before S and never straddles a 16-byte boundary.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block, i.e. those at or after S.  The idea here is that the
     AND with the mask within the loop is "free", since we need some AND
     or TEST insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded, so enter the loop past the load.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of the second and later blocks is valid.  */
      mask = -1;

    start:
      /* OR together the per-byte equality results for the four
         characters, then collect one bit per byte into FOUND.  */
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
 408
#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   consulted only to decide whether an unaligned 16-byte load at S is
   safe; the main loop is unbounded and relies on the buffer being
   terminated by a newline.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The set of characters to look for; only the first four elements
     are significant, matching the length 4 passed to PCMPESTRI.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 is the offset of a match within the 16 bytes
         just examined.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.  S is
     pre-decremented so that the add at the top of the loop brings it to
     the first block; the loop repeats while the carry flag is clear
     (no match in the block).  On exit INDEX (in %ecx) is the offset of
     the match within the block S points to.  */
  __asm (      "sub $16, %1\n"
        "       .balign 16\n"
        "0:     add $16, %1\n"
        "       %vpcmpestri $0, (%1), %2\n"
        "       jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif
 471
 472 /* Check the CPU capabilities.  */
 473
 474 #include "../gcc/config/i386/cpuid.h"
 475
/* Signature shared by all the scanner implementations, and the pointer
   through which the selected one is called.  The pointer is set by
   init_vectorized_lexer.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast;
 478
 479 #define HAVE_init_vectorized_lexer 1
 480 static inline void
 481 init_vectorized_lexer (void)
 482 {
 483   unsigned dummy, ecx = 0, edx = 0;
 484   search_line_fast_type impl = search_line_acc_char;
 485   int minimum = 0;
 486
 487 #if defined(__SSE4_2__)
 488   minimum = 3;
 489 #elif defined(__SSE2__)
 490   minimum = 2;
 491 #elif defined(__SSE__)
 492   minimum = 1;
 493 #endif
 494
 495   if (minimum == 3)
 496     impl = search_line_sse42;
 497   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
 498     {
 499       if (minimum == 3 || (ecx & bit_SSE4_2))
 500         impl = search_line_sse42;
 501       else if (minimum == 2 || (edx & bit_SSE2))
 502         impl = search_line_sse2;
 503       else if (minimum == 1 || (edx & bit_SSE))
 504         impl = search_line_mmx;
 505     }
 506   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
 507     {
 508       if (minimum == 1
 509           || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
 510         impl = search_line_mmx;
 511     }
 512
 513   search_line_fast = impl;
 514 }
 515
 516 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__)
 517
/* A version of the fast scanner using AltiVec vectorized byte compares.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop is unbounded and relies on the buffer being
   terminated by a newline.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
#ifdef __BIG_ENDIAN__
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
#else
  mask = __builtin_vec_lvsl(0, s);
  mask = __builtin_vec_perm(ones, zero, mask);
#endif
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block has
     already been loaded and masked, so enter the loop past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past each
       all-zero word.  The loop above guarantees that at least one word
       is non-zero, so the last word need not be tested.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU: the remaining two words are handled like the
           two-word case.  */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
 640
 641 #elif defined (__ARM_NEON__)
 642 #include "arm_neon.h"
 643
/* A version of the fast scanner using ARM NEON vectorized byte compares.

   Starting at S, return a pointer to the first \n, \r, \\ or ?.  END is
   unused; the loop is unbounded and relies on the buffer being
   terminated by a newline.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte within an 8-byte half its own bit (0x01 .. 0x80),
     so that the match bytes can later be summed into a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block, i.e. those at or after S.  The idea here is that the
     AND with the mask within the loop is "free", since we need some AND
     or TEST insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block has
     already been loaded, so enter the loop past the load.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every byte of the second and later blocks is valid.  */
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      /* Keep one distinct bit per matching byte, then fold the bytes of
         each 8-byte half together with pairwise additions.  */
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Merge the two per-half results into the low 16 bits of lane 0:
         the shift by 24 moves the upper lane's 8 bits to bits 8-15.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
 703
 704 #else
 705
/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */

#define search_line_fast  search_line_acc_char
 710
 711 #endif
 712
 713 /* Initialize the lexer if needed.  */
 714
 715 void
 716 _cpp_init_lexer (void)
 717 {
 718 #ifdef HAVE_init_vectorized_lexer
 719   init_vectorized_lexer ();
 720 #endif
 721 }
 722
/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  On return the buffer's
   cur and line_base point at the start of the cleaned line, the line is
   terminated with '\n', next_line points past the consumed input, and
   the buffer's line notes describe every escaped newline and trigraph
   encountered, followed by a sentinel note.  The line is cleaned in
   place: once something has to be removed, the remaining characters
   are copied down over it.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;       /* Read position.  */
  uchar c, *d, *p;      /* D is the write position; P is a scratch scan
                           pointer.  */

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  The replacement character overwrites
                         the first '?'; S is left on the trigraph's last
                         character so that the slow path resumes with the
                         character after it.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      /* Without a backslash on the line the newline cannot be
         escaped.  */
      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: skip backwards over any horizontal
         whitespace and see whether the last backslash immediately
         precedes it.  The scan stops at that backslash at the latest.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type records whether whitespace
         separated the backslash from the newline.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy characters down from S to D, deleting escaped
         newlines and replacing trigraphs as we go.  Within this loop
         buffer->next_line marks where the current physical line starts
         in the cleaned output, which bounds the backwards scan for a
         backslash.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* The buffer is marked from_stage3: no trigraph or escaped-newline
         processing is done, only the end of the line is located.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  /* Terminate the cleaned line; this also turns a '\r' line ending
     into '\n'.  */
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}
 866
 867 /* Return true if the trigraph indicated by NOTE should be warned
 868    about in a comment.  */
 869 static bool
 870 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 871 {
 872   const uchar *p;
 873
 874   /* Within comments we don't warn about trigraphs, unless the
 875      trigraph forms an escaped newline, as that may change
 876      behavior.  */
 877   if (note->type != '/')
 878     return false;
 879
 880   /* If -trigraphs, then this was an escaped newline iff the next note
 881      is coincident.  */
 882   if (CPP_OPTION (pfile, trigraphs))
 883     return note[1].pos == note->pos;
 884
 885   /* Otherwise, see if this forms an escaped newline.  */
 886   p = note->pos + 3;
 887   while (is_nvspace (*p))
 888     p++;
 889
 890   /* There might have been escaped newlines between the trigraph and the
 891      newline we found.  Hence the position test.  */
 892   return (*p == '\n' && p < note[1].pos);
 893 }
 894
 895 /* Process the notes created by add_line_note as far as the current
 896    location.  IN_COMMENT nonzero limits which diagnostics are issued.  */
 897 void
 898 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 899 {
 900   cpp_buffer *buffer = pfile->buffer;
 901
 902   for (;;)
 903     {
 904       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 905       unsigned int col;
 906       /* A note beyond buffer->cur is left for a later call.  */
 907       if (note->pos > buffer->cur)
 908         break;
 909       /* Consume the note; COL is passed to any diagnostic below.  */
 910       buffer->cur_note++;
 911       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 912       /* '\\' and ' ' notes both record a backslash-newline.  */
 913       if (note->type == '\\' || note->type == ' ')
 914         {
 915           if (note->type == ' ' && !in_comment)
 916             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 917                                  "backslash and newline separated by space");
 918
 919           if (buffer->next_line > buffer->rlimit)
 920             {
 921               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 922                                    "backslash-newline at end of file");
 923               /* Prevent "no newline at end of file" warning.  */
 924               buffer->next_line = buffer->rlimit;
 925             }
 926           /* Rebase the line at the note, then invoke CPP_INCREMENT_LINE.  */
 927           buffer->line_base = note->pos;
 928           CPP_INCREMENT_LINE (pfile, 0);
 929         }
 930       else if (_cpp_trigraph_map[note->type])
 931         {
 932           if (CPP_OPTION (pfile, warn_trigraphs)
 933               && (!in_comment || warn_in_comment (pfile, note)))
 934             {
 935               if (CPP_OPTION (pfile, trigraphs))
 936                 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
 937                                        pfile->line_table->highest_line, col,
 938                                        "trigraph ??%c converted to %c",
 939                                        note->type,
 940                                        (int) _cpp_trigraph_map[note->type]);
 941               else
 942                 {
 943                   cpp_warning_with_line
 944                     (pfile, CPP_W_TRIGRAPHS,
 945                      pfile->line_table->highest_line, col,
 946                      "trigraph ??%c ignored, use -trigraphs to enable",
 947                      note->type);
 948                 }
 949             }
 950         }
 951       else if (note->type == 0)
 952         /* Already processed in lex_raw_string.  */;
 953       else
 954         abort ();
 955     }
 956 }
 957
 958 /* Skip a C-style block comment.  We find the end of the comment by
 959    seeing if an asterisk is before every '/' we encounter.  Returns
 960    true if comment terminated by EOF, false otherwise.
 961
 962    Buffer->cur points to the initial asterisk of the comment.  */
 963 bool
 964 _cpp_skip_block_comment (cpp_reader *pfile)
 965 {
 966   cpp_buffer *buffer = pfile->buffer;
 967   const uchar *cur = buffer->cur;
 968   uchar c;
 969   /* Skip the initial '*'; a '/' right behind it cannot end the comment.  */
 970   cur++;
 971   if (*cur == '/')
 972     cur++;
 973
 974   for (;;)
 975     {
 976       /* People like decorating comments with '*', so check for '/'
 977          instead for efficiency.  */
 978       c = *cur++;
 979       /* CUR is already one past C, so cur[-2] is what preceded C.  */
 980       if (c == '/')
 981         {
 982           if (cur[-2] == '*')
 983             break;
 984
 985           /* Warn about potential nested comments, but not if the '/'
 986              comes immediately before the true comment delimiter.
 987              Don't bother to get it right across escaped newlines.  */
 988           if (CPP_OPTION (pfile, warn_comments)
 989               && cur[0] == '*' && cur[1] != '/')
 990             {
 991               buffer->cur = cur;
 992               cpp_warning_with_line (pfile, CPP_W_COMMENTS,
 993                                      pfile->line_table->highest_line,
 994                                      CPP_BUF_COL (buffer),
 995                                      "\"/*\" within comment");
 996             }
 997         }
 998       else if (c == '\n')
 999         {
1000           unsigned int cols;
1001           buffer->cur = cur - 1;
1002           _cpp_process_line_notes (pfile, true);
1003           if (buffer->next_line >= buffer->rlimit)
1004             return true;
1005           _cpp_clean_line (pfile);
1006
1007           cols = buffer->next_line - buffer->line_base;
1008           CPP_INCREMENT_LINE (pfile, cols);
1009
1010           cur = buffer->cur;
1011         }
1012     }
1013
1014   buffer->cur = cur;
1015   _cpp_process_line_notes (pfile, true);
1016   return false;
1017 }
1018
1019 /* Move buffer->cur to the newline that ends a C++ line comment.  Escaped
1020    newlines are dealt with through the line notes.  Returns nonzero when
1021    the comment spilled over onto further lines.  */
1022 static int
1023 skip_line_comment (cpp_reader *pfile)
1024 {
1025   const source_location line_before = pfile->line_table->highest_line;
1026   const uchar *end = pfile->buffer->cur;
1027
1028   for (; *end != '\n'; end++)
1029     continue;
1030   pfile->buffer->cur = end;
1031   _cpp_process_line_notes (pfile, true);
1032   return line_before != pfile->line_table->highest_line;
1033 }
1034
1035 /* Skip non-vertical space starting with C; cur ends on its terminator.  */
1036 static void
1037 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1038 {
1039   cpp_buffer *buffer = pfile->buffer;
1040   bool nul_seen = false;
1041
1042   /* C is classified first, then replaced by the next character; the
1043      loop runs while that is non-vertical space: ' ' \t \f \v \0.  */
1044   for (;;)
1045     {
1046       if (c == '\0')
1047         nul_seen = true;
1048       /* Plain blanks and tabs need nothing; that leaves \f and \v,
1049          which draw a pedantic warning inside a directive.  */
1050       else if (c != ' ' && c != '\t'
1051                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
1052         cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1053                              CPP_BUF_COL (buffer),
1054                              "%s in preprocessing directive",
1055                              c == '\f' ? "form feed" : "vertical tab");
1056       c = *buffer->cur++;
1057       if (!is_nvspace (c))
1058         break;
1059     }
1060
1061   if (nul_seen)
1062     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1063   /* Un-read the character that ended the run.  */
1064   buffer->cur--;
1065 }
1066
1067 /* Return 1 if STRING, the spelling of a number token, consists solely
1068    of identifier characters (so no '.', '+' or '-'), and 0 otherwise.  */
1069 static int
1070 name_p (cpp_reader *pfile, const cpp_string *string)
1071 {
1072   unsigned int idx = 0;
1073
1074   /* The first character that cannot be in an identifier settles it.  */
1075   while (idx < string->len && is_idchar (string->text[idx]))
1076     idx++;
1077
1078   return idx == string->len;
1079 }
1080
1081 /* Called after an identifier or similar sequence has been parsed: warn
1082    when its normalization state S shows it is not in NFC/NFKC.  */
1083 static void
1084 warn_about_normalization (cpp_reader *pfile,
1085                           const cpp_token *token,
1086                           const struct normalize_state *s)
1087 {
1088   unsigned char *spelling;
1089   size_t spelling_len;
1090
1091   /* Stay quiet while skipping, or if S is within the accepted level.  */
1092   if (pfile->state.skipping
1093       || CPP_OPTION (pfile, warn_normalize) >= NORMALIZE_STATE_RESULT (s))
1094     return;
1095   /* Spell the token using UCNs, even where UTF-8 would normally do.  */
1096   spelling = XNEWVEC (unsigned char, cpp_token_len (token));
1097   spelling_len = cpp_spell_token (pfile, token, spelling, false) - spelling;
1098   if (NORMALIZE_STATE_RESULT (s) != normalized_C)
1099     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1100                            "`%.*s' is not in NFC", (int) spelling_len, spelling);
1101   else
1102     cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1103                            "`%.*s' is not in NFKC", (int) spelling_len, spelling);
1104   free (spelling);
1105 }
1106
1107 /* Returns TRUE if the sequence starting at buffer->cur is valid in an
1108    identifier, having stepped buffer->cur forward.  FIRST is TRUE if this
1109    starts an identifier.  */ static bool
1110 forms_identifier_p (cpp_reader *pfile, int first,
1111                     struct normalize_state *state)
1112 {
1113   cpp_buffer *const buf = pfile->buffer;
1114
1115   /* A dollar sign counts only when the options permit it.  */
1116   if (*buf->cur == '$')
1117     {
1118       if (CPP_OPTION (pfile, dollars_in_ident))
1119         {
1120           buf->cur++;
1121           /* Complain at most once: the option is cleared on first use.  */
1122           if (!pfile->state.skipping && CPP_OPTION (pfile, warn_dollars))
1123             {
1124               CPP_OPTION (pfile, warn_dollars) = 0;
1125               cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1126             }
1127           return true;
1128         }
1129       return false;
1130     }
1131   /* Otherwise only a backslash followed by 'u' or 'U' can qualify, and
1132      only with extended identifiers enabled.  */
1133   if (!CPP_OPTION (pfile, extended_identifiers)
1134       || *buf->cur != '\\'
1135       || (buf->cur[1] != 'u' && buf->cur[1] != 'U'))
1136     return false;
1137
1138   buf->cur += 2;
1139   if (_cpp_valid_ucn (pfile, &buf->cur, buf->rlimit, 1 + !first, state))
1140     return true;
1141   buf->cur -= 2;
1142   return false;
1143 }
1144
1145 /* Hash the identifier starting at BASE and return its cpp_hashnode.  */
1146 static cpp_hashnode *
1147 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1148 {
1149   const uchar *end;
1150   unsigned int length;
1151   unsigned int hash;
1152   cpp_hashnode *node;
1153
1154   /* The first character is hashed unconditionally; every following
1155      ISIDNUM character extends the identifier.  */
1156   hash = HT_HASHSTEP (0, *base);
1157   for (end = base + 1; ISIDNUM (*end); end++)
1158     hash = HT_HASHSTEP (hash, *end);
1159   length = end - base;
1160   hash = HT_HASHFINISH (hash, length);
1161   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1162                                             base, length, hash, HT_ALLOC));
1163
1164   /* Most identifiers need no diagnostics at lexing time; leave early.  */
1165   if (!__builtin_expect ((node->flags & NODE_DIAGNOSTIC)
1166                          && !pfile->state.skipping, 0))
1167     return node;
1168
1169   /* Using a poisoned identifier is an error, except where that is
1170      explicitly tolerated (poisoning the same identifier twice).  */
1171   if (!pfile->state.poisoned_ok && (node->flags & NODE_POISONED))
1172     cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1173                NODE_NAME (node));
1174
1175   /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1176      replacement list of a variadic macro.  */
1177   if (!pfile->state.va_args_ok
1178       && node == pfile->spec_nodes.n__VA_ARGS__)
1179     {
1180       if (!CPP_OPTION (pfile, cplusplus))
1181         cpp_error (pfile, CPP_DL_PEDWARN,
1182                    "__VA_ARGS__ can only appear in the expansion"
1183                    " of a C99 variadic macro");
1184       else
1185         cpp_error (pfile, CPP_DL_PEDWARN,
1186                    "__VA_ARGS__ can only appear in the expansion"
1187                    " of a C++11 variadic macro");
1188     }
1189
1190   /* For -Wc++-compat, warn about use of C++ named operators.  */
1191   if (node->flags & NODE_WARN_OPERATOR)
1192     cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1193                  "identifier \"%s\" is a special operator name in C++",
1194                  NODE_NAME (node));
1195
1196   return node;
1197 }
1198
1199 /* Return the cpp_hashnode for the identifier spelled by NAME in PFILE,
1200    by handing NAME to lex_identifier_intern.  */
1201 cpp_hashnode *
1202 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1203 {
1204   /* NOTE(review): the underlying lookup passes HT_ALLOC, so a NULL
1205      result looks unlikely -- confirm against ht_lookup_with_hash.  */
1206   return lex_identifier_intern (pfile, (const uchar *) name);
1207 }
1208
1209 /* Lex an identifier starting at BUFFER->CUR - 1; BASE is its first byte.  */
1210 static cpp_hashnode *
1211 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1212                 struct normalize_state *nst)
1213 {
1214   cpp_hashnode *result;
1215   const uchar *cur;
1216   unsigned int len;
1217   unsigned int hash = HT_HASHSTEP (0, *base);
1218   /* Fast path: hash plain identifier characters while scanning them.  */
1219   cur = pfile->buffer->cur;
1220   if (! starts_ucn)
1221     {
1222       while (ISIDNUM (*cur))
1223         {
1224           hash = HT_HASHSTEP (hash, *cur);
1225           cur++;
1226         }
1227       NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
1228     }
1229   pfile->buffer->cur = cur;
1230   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1231     {
1232       /* Slower version for identifiers containing UCNs (or $).  */
1233       do {
1234         while (ISIDNUM (*pfile->buffer->cur))
1235           {
1236             NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
1237             pfile->buffer->cur++;
1238           }
1239       } while (forms_identifier_p (pfile, false, nst));
1240       result = _cpp_interpret_identifier (pfile, base,
1241                                           pfile->buffer->cur - base);
1242     }
1243   else
1244     {
1245       len = cur - base;
1246       hash = HT_HASHFINISH (hash, len);
1247       /* Plain identifier: look it up with the hash computed above.  */
1248       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1249                                                   base, len, hash, HT_ALLOC));
1250     }
1251
1252   /* Rarely, identifiers require diagnostics when lexed.  */
1253   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1254                         && !pfile->state.skipping, 0))
1255     {
1256       /* It is allowed to poison the same identifier twice.  */
1257       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1258         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1259                    NODE_NAME (result));
1260
1261       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1262          replacement list of a variadic macro.  */
1263       if (result == pfile->spec_nodes.n__VA_ARGS__
1264           && !pfile->state.va_args_ok)
1265         {
1266           if (CPP_OPTION (pfile, cplusplus))
1267             cpp_error (pfile, CPP_DL_PEDWARN,
1268                        "__VA_ARGS__ can only appear in the expansion"
1269                        " of a C++11 variadic macro");
1270           else
1271             cpp_error (pfile, CPP_DL_PEDWARN,
1272                        "__VA_ARGS__ can only appear in the expansion"
1273                        " of a C99 variadic macro");
1274         }
1275
1276       /* For -Wc++-compat, warn about use of C++ named operators.  */
1277       if (result->flags & NODE_WARN_OPERATOR)
1278         cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1279                      "identifier \"%s\" is a special operator name in C++",
1280                      NODE_NAME (result));
1281     }
1282
1283   return result;
1284 }
1285
1286 /* Lex the number that begins at BUFFER->CUR - 1, spelling it to NUMBER.  */
1287 static void
1288 lex_number (cpp_reader *pfile, cpp_string *number,
1289             struct normalize_state *nst)
1290 {
1291   const uchar *const start = pfile->buffer->cur - 1;
1292   const uchar *scan;
1293   uchar *copy;
1294
1295   /* Alternate between runs of ordinary number characters and whatever
1296      forms_identifier_p is willing to accept, until the latter says no.  */
1297   do
1298     {
1299       /* N.B. ISIDNUM does not include $.  */
1300       for (scan = pfile->buffer->cur;
1301            (ISIDNUM (*scan) || *scan == '.' || DIGIT_SEP (*scan)
1302             || VALID_SIGN (*scan, scan[-1]));
1303            scan++)
1304         {
1305           NORMALIZE_STATE_UPDATE_IDNUM (nst, *scan);
1306         }
1307       pfile->buffer->cur = scan;
1308     }
1309   while (forms_identifier_p (pfile, false, nst));
1310
1311   /* Hand back a NUL-terminated private copy of the spelling.  */
1312   number->len = scan - start;
1313   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
1314   memcpy (copy, start, number->len);
1315   copy[number->len] = '\0';
1316   number->text = copy;
1317 }
1318
1319 /* Make TOKEN a TYPE token spelled by a NUL-terminated copy of LEN bytes at BASE.  */
1320 static void
1321 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1322                 unsigned int len, enum cpp_ttype type)
1323 {
1324   uchar *const copy = _cpp_unaligned_alloc (pfile, len + 1);
1325
1326   token->type = type;
1327   token->val.str.text = copy;
1328   token->val.str.len = len;
1329   memcpy (copy, base, len);
1330   copy[len] = '\0';
1331 }
1332
1333 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1334    sequence from *FIRST_BUFF_P to *LAST_BUFF_P, updating both as needed.  */
1335
1336 static void
1337 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1338                 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1339 {
1340   _cpp_buff *first_buff = *first_buff_p;
1341   _cpp_buff *last_buff = *last_buff_p;
1342   /* Start the sequence if empty; else fill LAST_BUFF before moving on.  */
1343   if (first_buff == NULL)
1344     first_buff = last_buff = _cpp_get_buff (pfile, len);
1345   else if (len > BUFF_ROOM (last_buff))
1346     {
1347       size_t room = BUFF_ROOM (last_buff);
1348       memcpy (BUFF_FRONT (last_buff), base, room);
1349       BUFF_FRONT (last_buff) += room;
1350       base += room;
1351       len -= room;
1352       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1353     }
1354   /* Copy what is left; LAST_BUFF is expected to have room for it now.  */
1355   memcpy (BUFF_FRONT (last_buff), base, len);
1356   BUFF_FRONT (last_buff) += len;
1357   /* Hand the possibly changed ends of the sequence back to the caller.  */
1358   *first_buff_p = first_buff;
1359   *last_buff_p = last_buff;
1360 }
1361
1362
1363 /* Returns true if the identifier starting at BASE is currently defined
1364    as a macro.  This might not work if compiling with -save-temps, or
1365    preprocessing separately from compilation.  */
1366
1367 static bool
1368 is_macro (cpp_reader *pfile, const uchar *base)
1369 {
1370   const uchar *end;
1371   unsigned int hash;
1372   cpp_hashnode *node;
1373
1374   /* Text that cannot begin an identifier cannot name a macro.  */
1375   if (! ISIDST (*base))
1376     return false;
1377   hash = HT_HASHSTEP (0, *base);
1378   for (end = base + 1; ISIDNUM (*end); ++end)
1379     hash = HT_HASHSTEP (hash, *end);
1380   hash = HT_HASHFINISH (hash, end - base);
1381
1382   /* The lookup uses HT_NO_INSERT, so the result may be null.  */
1383   node = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, base,
1384                                             end - base, hash, HT_NO_INSERT));
1385   return node != NULL && node->type == NT_MACRO;
1386 }
1387
1388
1389 /* Lexes a raw string.  The stored string contains the spelling, including
1390    double quotes, delimiter string, '(' and ')', any leading
1391    'L', 'u', 'U' or 'u8' and 'R' modifier.  TOKEN receives the type of the
1392    literal, or CPP_OTHER / CPP_EOF if it is malformed or unterminated.
1393    BASE is the start of the spelling; CUR is stepped once before scanning.
1394    The spelling is NUL-terminated, but it is not guaranteed that this
1395    is the first NUL since embedded NULs are preserved.  */
1396
1397 static void
1398 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1399                 const uchar *cur)
1400 {
1401   uchar raw_prefix[17];
1402   uchar temp_buffer[18];
1403   const uchar *orig_base;
1404   unsigned int raw_prefix_len = 0, raw_suffix_len = 0;
1405   enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX };
1406   raw_str_phase phase = RAW_STR_PREFIX;
1407   enum cpp_ttype type;
1408   size_t total_len = 0;
1409   /* Index into temp_buffer during phases other than RAW_STR,
1410      during RAW_STR phase 17 to tell BUF_APPEND that nothing should
1411      be appended to temp_buffer.  */
1412   size_t temp_buffer_len = 0;
1413   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1414   size_t raw_prefix_start;
1415   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1416   /* The encoding prefix at BASE picks the initial string type.  */
1417   type = (*base == 'L' ? CPP_WSTRING :
1418           *base == 'U' ? CPP_STRING32 :
1419           *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1420           : CPP_STRING);
1421   /* Append to the buffer chain, mirroring short non-BASE pieces into temp_buffer.  */
1422 #define BUF_APPEND(STR,LEN)                                     \
1423       do {                                                      \
1424         bufring_append (pfile, (const uchar *)(STR), (LEN),     \
1425                         &first_buff, &last_buff);               \
1426         total_len += (LEN);                                     \
1427         if (__builtin_expect (temp_buffer_len < 17, 0)          \
1428             && (const uchar *)(STR) != base                     \
1429             && (LEN) <= 2)                                      \
1430           {                                                     \
1431             memcpy (temp_buffer + temp_buffer_len,              \
1432                     (const uchar *)(STR), (LEN));               \
1433             temp_buffer_len += (LEN);                           \
1434           }                                                     \
1435       } while (0);
1436   /* Remember the original start, then step CUR forward by one.  */
1437   orig_base = base;
1438   ++cur;
1439   raw_prefix_start = cur - base;
1440   for (;;)
1441     {
1442       cppchar_t c;
1443
1444       /* If we previously performed any trigraph or line splicing
1445          transformations, undo them in between the opening and closing
1446          double quote.  */
1447       while (note->pos < cur)
1448         ++note;
1449       for (; note->pos == cur; ++note)
1450         {
1451           switch (note->type)
1452             {
1453             case '\\':
1454             case ' ':
1455               /* Restore backslash followed by newline.  */
1456               BUF_APPEND (base, cur - base);
1457               base = cur;
1458               BUF_APPEND ("\\", 1);
1459             after_backslash:
1460               if (note->type == ' ')
1461                 {
1462                   /* GNU backslash whitespace newline extension.  FIXME
1463                      could be any sequence of non-vertical space.  When we
1464                      can properly restore any such sequence, we should mark
1465                      this note as handled so _cpp_process_line_notes
1466                      doesn't warn.  */
1467                   BUF_APPEND (" ", 1);
1468                 }
1469
1470               BUF_APPEND ("\n", 1);
1471               break;
1472
1473             case 0:
1474               /* Already handled.  */
1475               break;
1476
1477             default:
1478               if (_cpp_trigraph_map[note->type])
1479                 {
1480                   /* Don't warn about this trigraph in
1481                      _cpp_process_line_notes, since trigraphs show up as
1482                      trigraphs in raw strings.  */
1483                   uchar type = note->type;
1484                   note->type = 0;
1485
1486                   if (!CPP_OPTION (pfile, trigraphs))
1487                     /* If we didn't convert the trigraph in the first
1488                        place, don't do anything now either.  */
1489                     break;
1490
1491                   BUF_APPEND (base, cur - base);
1492                   base = cur;
1493                   BUF_APPEND ("??", 2);
1494
1495                   /* ??/ followed by newline gets two line notes, one for
1496                      the trigraph and one for the backslash/newline.  */
1497                   if (type == '/' && note[1].pos == cur)
1498                     {
1499                       if (note[1].type != '\\'
1500                           && note[1].type != ' ')
1501                         abort ();
1502                       BUF_APPEND ("/", 1);
1503                       ++note;
1504                       goto after_backslash;
1505                     }
1506                   else
1507                     {
1508                       /* Skip the replacement character.  */
1509                       base = ++cur;
1510                       BUF_APPEND (&type, 1);
1511                       c = type;
1512                       goto check_c;
1513                     }
1514                 }
1515               else
1516                 abort ();
1517               break;
1518             }
1519         }
1520       c = *cur++;
1521       if (__builtin_expect (temp_buffer_len < 17, 0))
1522         temp_buffer[temp_buffer_len++] = c;
1523
1524      check_c:
1525       if (phase == RAW_STR_PREFIX)
1526         {
1527           while (raw_prefix_len < temp_buffer_len)
1528             {
1529               raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len];
1530               switch (raw_prefix[raw_prefix_len])
1531                 {
1532                 case ' ': case '(': case ')': case '\\': case '\t':
1533                 case '\v': case '\f': case '\n': default:
1534                   break;
1535                 /* Basic source charset except the above chars.  */
1536                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1537                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1538                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1539                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1540                 case 'y': case 'z':
1541                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1542                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1543                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1544                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1545                 case 'Y': case 'Z':
1546                 case '0': case '1': case '2': case '3': case '4': case '5':
1547                 case '6': case '7': case '8': case '9':
1548                 case '_': case '{': case '}': case '#': case '[': case ']':
1549                 case '<': case '>': case '%': case ':': case ';': case '.':
1550                 case '?': case '*': case '+': case '-': case '/': case '^':
1551                 case '&': case '|': case '~': case '!': case '=': case ',':
1552                 case '"': case '\'':
1553                   if (raw_prefix_len < 16)
1554                     {
1555                       raw_prefix_len++;
1556                       continue;
1557                     }
1558                   break;
1559                 }
1560               /* The delimiter stops here; only '(' may legitimately end it.  */
1561               if (raw_prefix[raw_prefix_len] != '(')
1562                 {
1563                   int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1;
1564                   if (raw_prefix_len == 16)
1565                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1566                                          col, "raw string delimiter longer "
1567                                               "than 16 characters");
1568                   else if (raw_prefix[raw_prefix_len] == '\n')
1569                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1570                                          col, "invalid new-line in raw "
1571                                               "string delimiter");
1572                   else
1573                     cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
1574                                          col, "invalid character '%c' in "
1575                                               "raw string delimiter",
1576                                          (int) raw_prefix[raw_prefix_len]);
1577                   pfile->buffer->cur = orig_base + raw_prefix_start - 1;
1578                   create_literal (pfile, token, orig_base,
1579                                   raw_prefix_start - 1, CPP_OTHER);
1580                   if (first_buff)
1581                     _cpp_release_buff (pfile, first_buff);
1582                   return;
1583                 }
1584               raw_prefix[raw_prefix_len] = '"';
1585               phase = RAW_STR;
1586               /* Nothing should be appended to temp_buffer during
1587                  RAW_STR phase.  */
1588               temp_buffer_len = 17;
1589               break;
1590             }
1591           continue;
1592         }
1593       else if (phase == RAW_STR_SUFFIX)
1594         {
1595           while (raw_suffix_len <= raw_prefix_len
1596                  && raw_suffix_len < temp_buffer_len
1597                  && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len])
1598             raw_suffix_len++;
1599           if (raw_suffix_len > raw_prefix_len)
1600             break;
1601           if (raw_suffix_len == temp_buffer_len)
1602             continue;
1603           phase = RAW_STR;
1604           /* Nothing should be appended to temp_buffer during
1605              RAW_STR phase.  */
1606           temp_buffer_len = 17;
1607         }
1608       if (c == ')')
1609         {
1610           phase = RAW_STR_SUFFIX;
1611           raw_suffix_len = 0;
1612           temp_buffer_len = 0;
1613         }
1614       else if (c == '\n')
1615         {
1616           if (pfile->state.in_directive
1617               || (pfile->state.parsing_args
1618                   && pfile->buffer->next_line >= pfile->buffer->rlimit))
1619             {
1620               cur--;
1621               type = CPP_OTHER;
1622               cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1623                                    "unterminated raw string");
1624               break;
1625             }
1626           /* Bank the text up to this newline before fetching a fresh line.  */
1627           BUF_APPEND (base, cur - base);
1628
1629           if (pfile->buffer->cur < pfile->buffer->rlimit)
1630             CPP_INCREMENT_LINE (pfile, 0);
1631           pfile->buffer->need_line = true;
1632
1633           pfile->buffer->cur = cur-1;
1634           _cpp_process_line_notes (pfile, false);
1635           if (!_cpp_get_fresh_line (pfile))
1636             {
1637               source_location src_loc = token->src_loc;
1638               token->type = CPP_EOF;
1639               /* Tell the compiler the line number of the EOF token.  */
1640               token->src_loc = pfile->line_table->highest_line;
1641               token->flags = BOL;
1642               if (first_buff != NULL)
1643                 _cpp_release_buff (pfile, first_buff);
1644               cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1645                                    "unterminated raw string");
1646               return;
1647             }
1648
1649           cur = base = pfile->buffer->cur;
1650           note = &pfile->buffer->notes[pfile->buffer->cur_note];
1651         }
1652     }
1653
1654   if (CPP_OPTION (pfile, user_literals))
1655     {
1656       /* If a string format macro, say from inttypes.h, is placed touching
1657          a string literal it could be parsed as a C++11 user-defined string
1658          literal thus breaking the program.
1659          Try to identify macros with is_macro. A warning is issued. */
1660       if (is_macro (pfile, cur))
1661         {
1662           /* Raise a warning, but do not consume subsequent tokens.  */
1663           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1664             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1665                                    token->src_loc, 0,
1666                                    "invalid suffix on literal; C++11 requires "
1667                                    "a space between literal and string macro");
1668         }
1669       /* Grab user defined literal suffix.  */
1670       else if (ISIDST (*cur))
1671         {
1672           type = cpp_userdef_string_add_type (type);
1673           ++cur;
1674
1675           while (ISIDNUM (*cur))
1676             ++cur;
1677         }
1678     }
1679   /* Contiguous spelling if nothing was buffered, else splice the pieces.  */
1680   pfile->buffer->cur = cur;
1681   if (first_buff == NULL)
1682     create_literal (pfile, token, base, cur - base, type);
1683   else
1684     {
1685       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1686
1687       token->type = type;
1688       token->val.str.len = total_len + (cur - base);
1689       token->val.str.text = dest;
1690       last_buff = first_buff;
1691       while (last_buff != NULL)
1692         {
1693           memcpy (dest, last_buff->base,
1694                   BUFF_FRONT (last_buff) - last_buff->base);
1695           dest += BUFF_FRONT (last_buff) - last_buff->base;
1696           last_buff = last_buff->next;
1697         }
1698       _cpp_release_buff (pfile, first_buff);
1699       memcpy (dest, base, cur - base);
1700       dest[cur - base] = '\0';
1701     }
1702 }
1703
1704 /* Lexes a string, character constant, or angle-bracketed header file
1705    name.  The stored string contains the spelling, including opening
1706    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1707    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1708    if it was not properly terminated, or CPP_LESS for an unterminated
1709    header name which must be relexed as normal tokens.
1710
1711    The spelling is NUL-terminated, but it is not guaranteed that this
1712    is the first NUL since embedded NULs are preserved.  */
1713 static void
1714 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1715 {
1716   bool saw_NUL = false;
1717   const uchar *cur;
1718   cppchar_t terminator;
1719   enum cpp_ttype type;
1720
1721   cur = base;
1722   terminator = *cur++;
1723   if (terminator == 'L' || terminator == 'U')
1724     terminator = *cur++;
1725   else if (terminator == 'u')
1726     {
1727       terminator = *cur++;
1728       if (terminator == '8')
1729         terminator = *cur++;
1730     }
1731   if (terminator == 'R')
1732     {
1733       lex_raw_string (pfile, token, base, cur);
1734       return;
1735     }
1736   if (terminator == '"')
1737     type = (*base == 'L' ? CPP_WSTRING :
1738             *base == 'U' ? CPP_STRING32 :
1739             *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1740                          : CPP_STRING);
1741   else if (terminator == '\'')
1742     type = (*base == 'L' ? CPP_WCHAR :
1743             *base == 'U' ? CPP_CHAR32 :
1744             *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1745   else
1746     terminator = '>', type = CPP_HEADER_NAME;
1747
1748   for (;;)
1749     {
1750       cppchar_t c = *cur++;
1751
1752       /* In #include-style directives, terminators are not escapable.  */
1753       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1754         cur++;
1755       else if (c == terminator)
1756         break;
1757       else if (c == '\n')
1758         {
1759           cur--;
1760           /* Unmatched quotes always yield undefined behavior, but
1761              greedy lexing means that what appears to be an unterminated
1762              header name may actually be a legitimate sequence of tokens.  */
1763           if (terminator == '>')
1764             {
1765               token->type = CPP_LESS;
1766               return;
1767             }
1768           type = CPP_OTHER;
1769           break;
1770         }
1771       else if (c == '\0')
1772         saw_NUL = true;
1773     }
1774
1775   if (saw_NUL && !pfile->state.skipping)
1776     cpp_error (pfile, CPP_DL_WARNING,
1777                "null character(s) preserved in literal");
1778
1779   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1780     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1781                (int) terminator);
1782
1783   if (CPP_OPTION (pfile, user_literals))
1784     {
1785       /* If a string format macro, say from inttypes.h, is placed touching
1786          a string literal it could be parsed as a C++11 user-defined string
1787          literal thus breaking the program.
1788          Try to identify macros with is_macro. A warning is issued. */
1789       if (is_macro (pfile, cur))
1790         {
1791           /* Raise a warning, but do not consume subsequent tokens.  */
1792           if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
1793             cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1794                                    token->src_loc, 0,
1795                                    "invalid suffix on literal; C++11 requires "
1796                                    "a space between literal and string macro");
1797         }
1798       /* Grab user defined literal suffix.  */
1799       else if (ISIDST (*cur))
1800         {
1801           type = cpp_userdef_char_add_type (type);
1802           type = cpp_userdef_string_add_type (type);
1803           ++cur;
1804
1805           while (ISIDNUM (*cur))
1806             ++cur;
1807         }
1808     }
1809
1810   pfile->buffer->cur = cur;
1811   create_literal (pfile, token, base, cur - base, type);
1812 }
1813
1814 /* Return the comment table. The client may not make any assumption
1815    about the ordering of the table.  */
1816 cpp_comment_table *
1817 cpp_get_comments (cpp_reader *pfile)
1818 {
1819   return &pfile->comments;
1820 }
1821
1822 /* Append a comment to the end of the comment table. */
1823 static void
1824 store_comment (cpp_reader *pfile, cpp_token *token)
1825 {
1826   int len;
1827
1828   if (pfile->comments.allocated == 0)
1829     {
1830       pfile->comments.allocated = 256;
1831       pfile->comments.entries = (cpp_comment *) xmalloc
1832         (pfile->comments.allocated * sizeof (cpp_comment));
1833     }
1834
1835   if (pfile->comments.count == pfile->comments.allocated)
1836     {
1837       pfile->comments.allocated *= 2;
1838       pfile->comments.entries = (cpp_comment *) xrealloc
1839         (pfile->comments.entries,
1840          pfile->comments.allocated * sizeof (cpp_comment));
1841     }
1842
1843   len = token->val.str.len;
1844
1845   /* Copy comment. Note, token may not be NULL terminated. */
1846   pfile->comments.entries[pfile->comments.count].comment =
1847     (char *) xmalloc (sizeof (char) * (len + 1));
1848   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1849           token->val.str.text, len);
1850   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1851
1852   /* Set source location. */
1853   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1854
1855   /* Increment the count of entries in the comment table. */
1856   pfile->comments.count++;
1857 }
1858
1859 /* The stored comment includes the comment start and any terminator.  */
1860 static void
1861 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1862               cppchar_t type)
1863 {
1864   unsigned char *buffer;
1865   unsigned int len, clen, i;
1866
1867   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1868
1869   /* C++ comments probably (not definitely) have moved past a new
1870      line, which we don't want to save in the comment.  */
1871   if (is_vspace (pfile->buffer->cur[-1]))
1872     len--;
1873
1874   /* If we are currently in a directive or in argument parsing, then
1875      we need to store all C++ comments as C comments internally, and
1876      so we need to allocate a little extra space in that case.
1877
1878      Note that the only time we encounter a directive here is
1879      when we are saving comments in a "#define".  */
1880   clen = ((pfile->state.in_directive || pfile->state.parsing_args)
1881           && type == '/') ? len + 2 : len;
1882
1883   buffer = _cpp_unaligned_alloc (pfile, clen);
1884
1885   token->type = CPP_COMMENT;
1886   token->val.str.len = clen;
1887   token->val.str.text = buffer;
1888
1889   buffer[0] = '/';
1890   memcpy (buffer + 1, from, len - 1);
1891
1892   /* Finish conversion to a C comment, if necessary.  */
1893   if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
1894     {
1895       buffer[1] = '*';
1896       buffer[clen - 2] = '*';
1897       buffer[clen - 1] = '/';
1898       /* As there can be in a C++ comments illegal sequences for C comments
1899          we need to filter them out.  */
1900       for (i = 2; i < (clen - 2); i++)
1901         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1902           buffer[i] = '|';
1903     }
1904
1905   /* Finally store this comment for use by clients of libcpp. */
1906   store_comment (pfile, token);
1907 }
1908
1909 /* Allocate COUNT tokens for RUN.  */
1910 void
1911 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1912 {
1913   run->base = XNEWVEC (cpp_token, count);
1914   run->limit = run->base + count;
1915   run->next = NULL;
1916 }
1917
1918 /* Returns the next tokenrun, or creates one if there is none.  */
1919 static tokenrun *
1920 next_tokenrun (tokenrun *run)
1921 {
1922   if (run->next == NULL)
1923     {
1924       run->next = XNEW (tokenrun);
1925       run->next->prev = run;
1926       _cpp_init_tokenrun (run->next, 250);
1927     }
1928
1929   return run->next;
1930 }
1931
1932 /* Return the number of not yet processed token in a given
1933    context.  */
1934 int
1935 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1936 {
1937   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1938     return (LAST (context).token - FIRST (context).token);
1939   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1940            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1941     return (LAST (context).ptoken - FIRST (context).ptoken);
1942   else
1943       abort ();
1944 }
1945
1946 /* Returns the token present at index INDEX in a given context.  If
1947    INDEX is zero, the next token to be processed is returned.  */
1948 static const cpp_token*
1949 _cpp_token_from_context_at (cpp_context *context, int index)
1950 {
1951   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1952     return &(FIRST (context).token[index]);
1953   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1954            || context->tokens_kind == TOKENS_KIND_EXTENDED)
1955     return FIRST (context).ptoken[index];
1956  else
1957    abort ();
1958 }
1959
1960 /* Look ahead in the input stream.  */
1961 const cpp_token *
1962 cpp_peek_token (cpp_reader *pfile, int index)
1963 {
1964   cpp_context *context = pfile->context;
1965   const cpp_token *peektok;
1966   int count;
1967
1968   /* First, scan through any pending cpp_context objects.  */
1969   while (context->prev)
1970     {
1971       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1972
1973       if (index < (int) sz)
1974         return _cpp_token_from_context_at (context, index);
1975       index -= (int) sz;
1976       context = context->prev;
1977     }
1978
1979   /* We will have to read some new tokens after all (and do so
1980      without invalidating preceding tokens).  */
1981   count = index;
1982   pfile->keep_tokens++;
1983
1984   do
1985     {
1986       peektok = _cpp_lex_token (pfile);
1987       if (peektok->type == CPP_EOF)
1988         return peektok;
1989     }
1990   while (index--);
1991
1992   _cpp_backup_tokens_direct (pfile, count + 1);
1993   pfile->keep_tokens--;
1994
1995   return peektok;
1996 }
1997
1998 /* Allocate a single token that is invalidated at the same time as the
1999    rest of the tokens on the line.  Has its line and col set to the
2000    same as the last lexed token, so that diagnostics appear in the
2001    right place.  */
2002 cpp_token *
2003 _cpp_temp_token (cpp_reader *pfile)
2004 {
2005   cpp_token *old, *result;
2006   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2007   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2008
2009   old = pfile->cur_token - 1;
2010   /* Any pre-existing lookaheads must not be clobbered.  */
2011   if (la)
2012     {
2013       if (sz <= la)
2014         {
2015           tokenrun *next = next_tokenrun (pfile->cur_run);
2016
2017           if (sz < la)
2018             memmove (next->base + 1, next->base,
2019                      (la - sz) * sizeof (cpp_token));
2020
2021           next->base[0] = pfile->cur_run->limit[-1];
2022         }
2023
2024       if (sz > 1)
2025         memmove (pfile->cur_token + 1, pfile->cur_token,
2026                  MIN (la, sz - 1) * sizeof (cpp_token));
2027     }
2028
2029   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2030     {
2031       pfile->cur_run = next_tokenrun (pfile->cur_run);
2032       pfile->cur_token = pfile->cur_run->base;
2033     }
2034
2035   result = pfile->cur_token++;
2036   result->src_loc = old->src_loc;
2037   return result;
2038 }
2039
2040 /* Lex a token into RESULT (external interface).  Takes care of issues
2041    like directive handling, token lookahead, multiple include
2042    optimization and skipping.  */
2043 const cpp_token *
2044 _cpp_lex_token (cpp_reader *pfile)
2045 {
2046   cpp_token *result;
2047
2048   for (;;)
2049     {
2050       if (pfile->cur_token == pfile->cur_run->limit)
2051         {
2052           pfile->cur_run = next_tokenrun (pfile->cur_run);
2053           pfile->cur_token = pfile->cur_run->base;
2054         }
2055       /* We assume that the current token is somewhere in the current
2056          run.  */
2057       if (pfile->cur_token < pfile->cur_run->base
2058           || pfile->cur_token >= pfile->cur_run->limit)
2059         abort ();
2060
2061       if (pfile->lookaheads)
2062         {
2063           pfile->lookaheads--;
2064           result = pfile->cur_token++;
2065         }
2066       else
2067         result = _cpp_lex_direct (pfile);
2068
2069       if (result->flags & BOL)
2070         {
2071           /* Is this a directive.  If _cpp_handle_directive returns
2072              false, it is an assembler #.  */
2073           if (result->type == CPP_HASH
2074               /* 6.10.3 p 11: Directives in a list of macro arguments
2075                  gives undefined behavior.  This implementation
2076                  handles the directive as normal.  */
2077               && pfile->state.parsing_args != 1)
2078             {
2079               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2080                 {
2081                   if (pfile->directive_result.type == CPP_PADDING)
2082                     continue;
2083                   result = &pfile->directive_result;
2084                 }
2085             }
2086           else if (pfile->state.in_deferred_pragma)
2087             result = &pfile->directive_result;
2088
2089           if (pfile->cb.line_change && !pfile->state.skipping)
2090             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2091         }
2092
2093       /* We don't skip tokens in directives.  */
2094       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2095         break;
2096
2097       /* Outside a directive, invalidate controlling macros.  At file
2098          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2099          get here and MI optimization works.  */
2100       pfile->mi_valid = false;
2101
2102       if (!pfile->state.skipping || result->type == CPP_EOF)
2103         break;
2104     }
2105
2106   return result;
2107 }
2108
2109 /* Returns true if a fresh line has been loaded.  */
2110 bool
2111 _cpp_get_fresh_line (cpp_reader *pfile)
2112 {
2113   int return_at_eof;
2114
2115   /* We can't get a new line until we leave the current directive.  */
2116   if (pfile->state.in_directive)
2117     return false;
2118
2119   for (;;)
2120     {
2121       cpp_buffer *buffer = pfile->buffer;
2122
2123       if (!buffer->need_line)
2124         return true;
2125
2126       if (buffer->next_line < buffer->rlimit)
2127         {
2128           _cpp_clean_line (pfile);
2129           return true;
2130         }
2131
2132       /* First, get out of parsing arguments state.  */
2133       if (pfile->state.parsing_args)
2134         return false;
2135
2136       /* End of buffer.  Non-empty files should end in a newline.  */
2137       if (buffer->buf != buffer->rlimit
2138           && buffer->next_line > buffer->rlimit
2139           && !buffer->from_stage3)
2140         {
2141           /* Clip to buffer size.  */
2142           buffer->next_line = buffer->rlimit;
2143         }
2144
2145       return_at_eof = buffer->return_at_eof;
2146       _cpp_pop_buffer (pfile);
2147       if (pfile->buffer == NULL || return_at_eof)
2148         return false;
2149     }
2150 }
2151
/* Helper for two-character operators.  If the next input character
   is CHAR, consume it and set the type of RESULT to THEN_TYPE;
   otherwise leave the input alone and set the type to ELSE_TYPE.
   Relies on `buffer' and `result' being in scope at the point of
   use.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
  do                                                    \
    {                                                   \
      if (*buffer->cur == CHAR)                         \
        {                                               \
          buffer->cur++;                                \
          result->type = THEN_TYPE;                     \
        }                                               \
      else                                              \
        result->type = ELSE_TYPE;                       \
    }                                                   \
  while (0)
2160
2161 /* Lex one token directly from the current buffer into the slot at
2162    pfile->cur_token (which is then advanced) and return a pointer to it.
2163
2164    Does not handle issues such as token lookahead, multiple-include
2165    optimization, directives, skipping etc.  This function is only
2166    suitable for use by _cpp_lex_token, and in special cases like
2167    lex_expansion_token which doesn't care for any of these issues.
2168
2169    At a newline: yields CPP_PRAGMA_EOL when in a deferred pragma, CPP_EOF
2170    when no fresh line can be loaded; otherwise lexing continues on the
2171    next line, restarting in the base token run unless keep_tokens is set.  */
2172 cpp_token *
2173 _cpp_lex_direct (cpp_reader *pfile)
2174 {
2175   cppchar_t c;
2176   cpp_buffer *buffer;
2177   const unsigned char *comment_start;
2178   cpp_token *result = pfile->cur_token++;  /* Claim the next token slot.  */
2179
2180  fresh_line:  /* Entered initially and again after each newline.  */
2181   result->flags = 0;
2182   buffer = pfile->buffer;
2183   if (buffer->need_line)
2184     {
2185       if (pfile->state.in_deferred_pragma)
2186         {
2187           result->type = CPP_PRAGMA_EOL;
2188           pfile->state.in_deferred_pragma = false;
2189           if (!pfile->state.pragma_allow_expansion)
2190             pfile->state.prevent_expansion--;
2191           return result;
2192         }
2193       if (!_cpp_get_fresh_line (pfile))
2194         {
2195           result->type = CPP_EOF;
2196           if (!pfile->state.in_directive)
2197             {
2198               /* Tell the compiler the line number of the EOF token.  */
2199               result->src_loc = pfile->line_table->highest_line;
2200               result->flags = BOL;
2201             }
2202           return result;
2203         }
2204       if (!pfile->keep_tokens)  /* No tokens to keep: restart in the base run.  */
2205         {
2206           pfile->cur_run = &pfile->base_run;
2207           result = pfile->base_run.base;
2208           pfile->cur_token = result + 1;
2209         }
2210       result->flags = BOL;
2211       if (pfile->state.parsing_args == 2)
2212         result->flags |= PREV_WHITE;
2213     }
2214   buffer = pfile->buffer;
2215  update_tokens_line:  /* Re-entered after skipping a comment.  */
2216   result->src_loc = pfile->line_table->highest_line;
2217
2218  skipped_white:  /* Re-entered after skipping whitespace.  */
2219   if (buffer->cur >= buffer->notes[buffer->cur_note].pos
2220       && !pfile->overlaid_buffer)
2221     {
2222       _cpp_process_line_notes (pfile, false);
2223       result->src_loc = pfile->line_table->highest_line;
2224     }
2225   c = *buffer->cur++;
2226
2227   if (pfile->forced_token_location_p)
2228     result->src_loc = *pfile->forced_token_location_p;
2229   else
2230     result->src_loc = linemap_position_for_column (pfile->line_table,
2231                                           CPP_BUF_COLUMN (buffer, buffer->cur));
2232
2233   switch (c)
2234     {
2235     case ' ': case '\t': case '\f': case '\v': case '\0':
2236       result->flags |= PREV_WHITE;
2237       skip_whitespace (pfile, c);
2238       goto skipped_white;
2239
2240     case '\n':
2241       if (buffer->cur < buffer->rlimit)
2242         CPP_INCREMENT_LINE (pfile, 0);
2243       buffer->need_line = true;
2244       goto fresh_line;
2245
2246     case '0': case '1': case '2': case '3': case '4':
2247     case '5': case '6': case '7': case '8': case '9':
2248       {
2249         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2250         result->type = CPP_NUMBER;
2251         lex_number (pfile, &result->val.str, &nst);
2252         warn_about_normalization (pfile, result, &nst);
2253         break;
2254       }
2255
2256     case 'L':
2257     case 'u':
2258     case 'U':
2259     case 'R':
2260       /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
2261          wide strings or raw strings.  */
2262       if (c == 'L' || CPP_OPTION (pfile, rliterals)
2263           || (c != 'R' && CPP_OPTION (pfile, uliterals)))
2264         {
2265           if ((*buffer->cur == '\'' && c != 'R')
2266               || *buffer->cur == '"'
2267               || (*buffer->cur == 'R'
2268                   && c != 'R'
2269                   && buffer->cur[1] == '"'
2270                   && CPP_OPTION (pfile, rliterals))
2271               || (*buffer->cur == '8'
2272                   && c == 'u'
2273                   && (buffer->cur[1] == '"'
2274                       || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
2275                           && CPP_OPTION (pfile, rliterals)))))
2276             {
2277               lex_string (pfile, result, buffer->cur - 1);
2278               break;
2279             }
2280         }
2281       /* Fall through.  */
2282
2283     case '_':
2284     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2285     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2286     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2287     case 's': case 't':           case 'v': case 'w': case 'x':
2288     case 'y': case 'z':
2289     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2290     case 'G': case 'H': case 'I': case 'J': case 'K':
2291     case 'M': case 'N': case 'O': case 'P': case 'Q':
2292     case 'S': case 'T':           case 'V': case 'W': case 'X':
2293     case 'Y': case 'Z':
2294       result->type = CPP_NAME;
2295       {
2296         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2297         result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
2298                                                 &nst);
2299         warn_about_normalization (pfile, result, &nst);
2300       }
2301
2302       /* Convert named operators to their proper types.  */
2303       if (result->val.node.node->flags & NODE_OPERATOR)
2304         {
2305           result->flags |= NAMED_OP;
2306           result->type = (enum cpp_ttype) result->val.node.node->directive_index;
2307         }
2308       break;
2309
2310     case '\'':
2311     case '"':
2312       lex_string (pfile, result, buffer->cur - 1);
2313       break;
2314
2315     case '/':
2316       /* A potential block or line comment.  */
2317       comment_start = buffer->cur;
2318       c = *buffer->cur;
2319
2320       if (c == '*')
2321         {
2322           if (_cpp_skip_block_comment (pfile))
2323             cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
2324         }
2325       else if (c == '/' && ! CPP_OPTION (pfile, traditional))
2326         {
2327           /* Don't warn for system headers.  */
2328           if (cpp_in_system_header (pfile))
2329             ;
2330           /* Warn about comments if pedantically GNUC89, and not
2331              in system headers.  */
2332           else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
2333                    && CPP_PEDANTIC (pfile)
2334                    && ! buffer->warned_cplusplus_comments)
2335             {
2336               cpp_error (pfile, CPP_DL_PEDWARN,
2337                          "C++ style comments are not allowed in ISO C90");
2338               cpp_error (pfile, CPP_DL_PEDWARN,
2339                          "(this will be reported only once per input file)");
2340               buffer->warned_cplusplus_comments = 1;
2341             }
2342           /* Or if specifically desired via -Wc90-c99-compat.  */
2343           else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
2344                    && ! CPP_OPTION (pfile, cplusplus)
2345                    && ! buffer->warned_cplusplus_comments)
2346             {
2347               cpp_error (pfile, CPP_DL_WARNING,
2348                          "C++ style comments are incompatible with C90");
2349               cpp_error (pfile, CPP_DL_WARNING,
2350                          "(this will be reported only once per input file)");
2351               buffer->warned_cplusplus_comments = 1;
2352             }
2353           /* In C89/C94, C++ style comments are forbidden.  */
2354           else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
2355                     || CPP_OPTION (pfile, lang) == CLK_STDC94))
2356             {
2357               /* But don't be confused about valid code such as
2358                  - // immediately followed by *,
2359                  - // in a preprocessing directive,
2360                  - // in an #if 0 block.  */
2361               if (buffer->cur[1] == '*'
2362                   || pfile->state.in_directive
2363                   || pfile->state.skipping)
2364                 {
2365                   result->type = CPP_DIV;
2366                   break;
2367                 }
2368               else if (! buffer->warned_cplusplus_comments)
2369                 {
2370                   cpp_error (pfile, CPP_DL_ERROR,
2371                              "C++ style comments are not allowed in ISO C90");
2372                   cpp_error (pfile, CPP_DL_ERROR,
2373                              "(this will be reported only once per input "
2374                              "file)");
2375                   buffer->warned_cplusplus_comments = 1;
2376                 }
2377             }
2378           if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
2379             cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
2380         }
2381       else if (c == '=')
2382         {
2383           buffer->cur++;
2384           result->type = CPP_DIV_EQ;
2385           break;
2386         }
2387       else
2388         {
2389           result->type = CPP_DIV;
2390           break;
2391         }
2392
2393       if (!pfile->state.save_comments)  /* Discard: count it as whitespace.  */
2394         {
2395           result->flags |= PREV_WHITE;
2396           goto update_tokens_line;
2397         }
2398
2399       /* Save the comment as a token in its own right.  */
2400       save_comment (pfile, result, comment_start, c);
2401       break;
2402
2403     case '<':
2404       if (pfile->state.angled_headers)
2405         {
2406           lex_string (pfile, result, buffer->cur - 1);
2407           if (result->type != CPP_LESS)
2408             break;
2409         }
2410
2411       result->type = CPP_LESS;
2412       if (*buffer->cur == '=')
2413         buffer->cur++, result->type = CPP_LESS_EQ;
2414       else if (*buffer->cur == '<')
2415         {
2416           buffer->cur++;
2417           IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
2418         }
2419       else if (CPP_OPTION (pfile, digraphs))
2420         {
2421           if (*buffer->cur == ':')
2422             {
2423               /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
2424                  three characters are <:: and the subsequent character
2425                  is neither : nor >, the < is treated as a preprocessor
2426                  token by itself".  */
2427               if (CPP_OPTION (pfile, cplusplus)
2428                   && CPP_OPTION (pfile, lang) != CLK_CXX98
2429                   && CPP_OPTION (pfile, lang) != CLK_GNUCXX
2430                   && buffer->cur[1] == ':'
2431                   && buffer->cur[2] != ':' && buffer->cur[2] != '>')
2432                 break;
2433
2434               buffer->cur++;
2435               result->flags |= DIGRAPH;
2436               result->type = CPP_OPEN_SQUARE;
2437             }
2438           else if (*buffer->cur == '%')
2439             {
2440               buffer->cur++;
2441               result->flags |= DIGRAPH;
2442               result->type = CPP_OPEN_BRACE;
2443             }
2444         }
2445       break;
2446
2447     case '>':
2448       result->type = CPP_GREATER;
2449       if (*buffer->cur == '=')
2450         buffer->cur++, result->type = CPP_GREATER_EQ;
2451       else if (*buffer->cur == '>')
2452         {
2453           buffer->cur++;
2454           IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
2455         }
2456       break;
2457
2458     case '%':
2459       result->type = CPP_MOD;
2460       if (*buffer->cur == '=')
2461         buffer->cur++, result->type = CPP_MOD_EQ;
2462       else if (CPP_OPTION (pfile, digraphs))
2463         {
2464           if (*buffer->cur == ':')
2465             {
2466               buffer->cur++;
2467               result->flags |= DIGRAPH;
2468               result->type = CPP_HASH;
2469               if (*buffer->cur == '%' && buffer->cur[1] == ':')
2470                 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
2471             }
2472           else if (*buffer->cur == '>')
2473             {
2474               buffer->cur++;
2475               result->flags |= DIGRAPH;
2476               result->type = CPP_CLOSE_BRACE;
2477             }
2478         }
2479       break;
2480
2481     case '.':
2482       result->type = CPP_DOT;
2483       if (ISDIGIT (*buffer->cur))
2484         {
2485           struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2486           result->type = CPP_NUMBER;
2487           lex_number (pfile, &result->val.str, &nst);
2488           warn_about_normalization (pfile, result, &nst);
2489         }
2490       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
2491         buffer->cur += 2, result->type = CPP_ELLIPSIS;
2492       else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2493         buffer->cur++, result->type = CPP_DOT_STAR;
2494       break;
2495
2496     case '+':
2497       result->type = CPP_PLUS;
2498       if (*buffer->cur == '+')
2499         buffer->cur++, result->type = CPP_PLUS_PLUS;
2500       else if (*buffer->cur == '=')
2501         buffer->cur++, result->type = CPP_PLUS_EQ;
2502       break;
2503
2504     case '-':
2505       result->type = CPP_MINUS;
2506       if (*buffer->cur == '>')
2507         {
2508           buffer->cur++;
2509           result->type = CPP_DEREF;
2510           if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
2511             buffer->cur++, result->type = CPP_DEREF_STAR;
2512         }
2513       else if (*buffer->cur == '-')
2514         buffer->cur++, result->type = CPP_MINUS_MINUS;
2515       else if (*buffer->cur == '=')
2516         buffer->cur++, result->type = CPP_MINUS_EQ;
2517       break;
2518
2519     case '&':
2520       result->type = CPP_AND;
2521       if (*buffer->cur == '&')
2522         buffer->cur++, result->type = CPP_AND_AND;
2523       else if (*buffer->cur == '=')
2524         buffer->cur++, result->type = CPP_AND_EQ;
2525       break;
2526
2527     case '|':
2528       result->type = CPP_OR;
2529       if (*buffer->cur == '|')
2530         buffer->cur++, result->type = CPP_OR_OR;
2531       else if (*buffer->cur == '=')
2532         buffer->cur++, result->type = CPP_OR_EQ;
2533       break;
2534
2535     case ':':
2536       result->type = CPP_COLON;
2537       if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
2538         buffer->cur++, result->type = CPP_SCOPE;
2539       else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
2540         {
2541           buffer->cur++;
2542           result->flags |= DIGRAPH;
2543           result->type = CPP_CLOSE_SQUARE;
2544         }
2545       break;
2546
2547     case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
2548     case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
2549     case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
2550     case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
2551     case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
2552
2553     case '?': result->type = CPP_QUERY; break;
2554     case '~': result->type = CPP_COMPL; break;
2555     case ',': result->type = CPP_COMMA; break;
2556     case '(': result->type = CPP_OPEN_PAREN; break;
2557     case ')': result->type = CPP_CLOSE_PAREN; break;
2558     case '[': result->type = CPP_OPEN_SQUARE; break;
2559     case ']': result->type = CPP_CLOSE_SQUARE; break;
2560     case '{': result->type = CPP_OPEN_BRACE; break;
2561     case '}': result->type = CPP_CLOSE_BRACE; break;
2562     case ';': result->type = CPP_SEMICOLON; break;
2563
2564       /* @ is a punctuator in Objective-C.  */
2565     case '@': result->type = CPP_ATSIGN; break;
2566
2567     case '$':
2568     case '\\':
2569       {
2570         const uchar *base = --buffer->cur;  /* Back up onto the character.  */
2571         struct normalize_state nst = INITIAL_NORMALIZE_STATE;
2572
2573         if (forms_identifier_p (pfile, true, &nst))
2574           {
2575             result->type = CPP_NAME;
2576             result->val.node.node = lex_identifier (pfile, base, true, &nst);
2577             warn_about_normalization (pfile, result, &nst);
2578             break;
2579           }
2580         buffer->cur++;  /* Not an identifier; advance and fall through.  */
2581       }
2582
2583     default:  /* Any other character: a one-character CPP_OTHER token.  */
2584       create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
2585       break;
2586     }
2587
2588   return result;
2589 }
2590
2591 /* An upper bound on the number of bytes needed to spell TOKEN.
2592    Does not include preceding whitespace.  */
2593 unsigned int
2594 cpp_token_len (const cpp_token *token)
2595 {
2596   unsigned int len;
2597
2598   switch (TOKEN_SPELL (token))
2599     {
2600     default:            len = 6;                                break;
2601     case SPELL_LITERAL: len = token->val.str.len;               break;
2602     case SPELL_IDENT:   len = NODE_LEN (token->val.node.node) * 10;     break;
2603     }
2604
2605   return len;
2606 }
2607
/* Decode the UTF-8 sequence that starts at NAME and store the
   corresponding \U escape (a backslash, 'U' and eight lower-case hex
   digits) in BUFFER.  Exactly 10 bytes are written to BUFFER and no
   terminating NUL is added.  Returns the number of bytes of NAME that
   were consumed.  Aborts if a continuation byte is malformed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  const unsigned char *src = name;
  unsigned long code_point;
  unsigned lead;
  int seq_len = 0;
  int remaining;
  int shift;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  for (lead = *src; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* The payload of the first byte is whatever follows the length
     marker.  */
  code_point = *src & (0x7F >> seq_len);

  /* Each continuation byte must look like 10xxxxxx and contributes
     six more bits.  */
  for (remaining = seq_len - 1; remaining > 0; remaining--)
    {
      const unsigned char next = *++src;

      /* Ill-formed UTF-8.  */
      if ((next & 0xC0) != 0x80)
        abort ();

      code_point = (code_point << 6) | (next & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  buffer += 2;
  for (shift = 28; shift >= 0; shift -= 4)
    *buffer++ = hex_digits[(code_point >> shift) & 0xF];

  return seq_len;
}
2641
2642 /* Given a token TYPE corresponding to a digraph, return a pointer to
2643    the spelling of the digraph.  */
2644 static const unsigned char *
2645 cpp_digraph2name (enum cpp_ttype type)
2646 {
2647   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2648 }
2649
2650 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2651    already contain the enough space to hold the token's spelling.
2652    Returns a pointer to the character after the last character written.
2653    FORSTRING is true if this is to be the spelling after translation
2654    phase 1 (this is different for UCNs).
2655    FIXME: Would be nice if we didn't need the PFILE argument.  */
2656 unsigned char *
2657 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2658                  unsigned char *buffer, bool forstring)
2659 {
2660   switch (TOKEN_SPELL (token))
2661     {
2662     case SPELL_OPERATOR:
2663       {
2664         const unsigned char *spelling;
2665         unsigned char c;
2666
2667         if (token->flags & DIGRAPH)
2668           spelling = cpp_digraph2name (token->type);
2669         else if (token->flags & NAMED_OP)
2670           goto spell_ident;
2671         else
2672           spelling = TOKEN_NAME (token);
2673
2674         while ((c = *spelling++) != '\0')
2675           *buffer++ = c;
2676       }
2677       break;
2678
2679     spell_ident:
2680     case SPELL_IDENT:
2681       if (forstring)
2682         {
2683           memcpy (buffer, NODE_NAME (token->val.node.node),
2684                   NODE_LEN (token->val.node.node));
2685           buffer += NODE_LEN (token->val.node.node);
2686         }
2687       else
2688         {
2689           size_t i;
2690           const unsigned char * name = NODE_NAME (token->val.node.node);
2691
2692           for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2693             if (name[i] & ~0x7F)
2694               {
2695                 i += utf8_to_ucn (buffer, name + i) - 1;
2696                 buffer += 10;
2697               }
2698             else
2699               *buffer++ = NODE_NAME (token->val.node.node)[i];
2700         }
2701       break;
2702
2703     case SPELL_LITERAL:
2704       memcpy (buffer, token->val.str.text, token->val.str.len);
2705       buffer += token->val.str.len;
2706       break;
2707
2708     case SPELL_NONE:
2709       cpp_error (pfile, CPP_DL_ICE,
2710                  "unspellable token %s", TOKEN_NAME (token));
2711       break;
2712     }
2713
2714   return buffer;
2715 }
2716
2717 /* Returns TOKEN spelt as a null-terminated string.  The string is
2718    freed when the reader is destroyed.  Useful for diagnostics.  */
2719 unsigned char *
2720 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2721 {
2722   unsigned int len = cpp_token_len (token) + 1;
2723   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2724
2725   end = cpp_spell_token (pfile, token, start, false);
2726   end[0] = '\0';
2727
2728   return start;
2729 }
2730
2731 /* Returns a pointer to a string which spells the token defined by
2732    TYPE and FLAGS.  Used by C front ends, which really should move to
2733    using cpp_token_as_text.  */
2734 const char *
2735 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2736 {
2737   if (flags & DIGRAPH)
2738     return (const char *) cpp_digraph2name (type);
2739   else if (flags & NAMED_OP)
2740     return cpp_named_operator2name (type);
2741
2742   return (const char *) token_spellings[type].name;
2743 }
2744
2745 /* Writes the spelling of token to FP, without any preceding space.
2746    Separated from cpp_spell_token for efficiency - to avoid stdio
2747    double-buffering.  */
2748 void
2749 cpp_output_token (const cpp_token *token, FILE *fp)
2750 {
2751   switch (TOKEN_SPELL (token))
2752     {
2753     case SPELL_OPERATOR:
2754       {
2755         const unsigned char *spelling;
2756         int c;
2757
2758         if (token->flags & DIGRAPH)
2759           spelling = cpp_digraph2name (token->type);
2760         else if (token->flags & NAMED_OP)
2761           goto spell_ident;
2762         else
2763           spelling = TOKEN_NAME (token);
2764
2765         c = *spelling;
2766         do
2767           putc (c, fp);
2768         while ((c = *++spelling) != '\0');
2769       }
2770       break;
2771
2772     spell_ident:
2773     case SPELL_IDENT:
2774       {
2775         size_t i;
2776         const unsigned char * name = NODE_NAME (token->val.node.node);
2777
2778         for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2779           if (name[i] & ~0x7F)
2780             {
2781               unsigned char buffer[10];
2782               i += utf8_to_ucn (buffer, name + i) - 1;
2783               fwrite (buffer, 1, 10, fp);
2784             }
2785           else
2786             fputc (NODE_NAME (token->val.node.node)[i], fp);
2787       }
2788       break;
2789
2790     case SPELL_LITERAL:
2791       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2792       break;
2793
2794     case SPELL_NONE:
2795       /* An error, most probably.  */
2796       break;
2797     }
2798 }
2799
2800 /* Compare two tokens.  */
2801 int
2802 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2803 {
2804   if (a->type == b->type && a->flags == b->flags)
2805     switch (TOKEN_SPELL (a))
2806       {
2807       default:                  /* Keep compiler happy.  */
2808       case SPELL_OPERATOR:
2809         /* token_no is used to track where multiple consecutive ##
2810            tokens were originally located.  */
2811         return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2812       case SPELL_NONE:
2813         return (a->type != CPP_MACRO_ARG
2814                 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2815       case SPELL_IDENT:
2816         return a->val.node.node == b->val.node.node;
2817       case SPELL_LITERAL:
2818         return (a->val.str.len == b->val.str.len
2819                 && !memcmp (a->val.str.text, b->val.str.text,
2820                             a->val.str.len));
2821       }
2822
2823   return 0;
2824 }
2825
2826 /* Returns nonzero if a space should be inserted to avoid an
2827    accidental token paste for output.  For simplicity, it is
2828    conservative, and occasionally advises a space where one is not
2829    needed, e.g. "." and ".2".  */
2830 int
2831 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2832                  const cpp_token *token2)
2833 {
2834   enum cpp_ttype a = token1->type, b = token2->type;
2835   cppchar_t c;
2836
2837   if (token1->flags & NAMED_OP)
2838     a = CPP_NAME;
2839   if (token2->flags & NAMED_OP)
2840     b = CPP_NAME;
2841
2842   c = EOF;
2843   if (token2->flags & DIGRAPH)
2844     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2845   else if (token_spellings[b].category == SPELL_OPERATOR)
2846     c = token_spellings[b].name[0];
2847
2848   /* Quickly get everything that can paste with an '='.  */
2849   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2850     return 1;
2851
2852   switch (a)
2853     {
2854     case CPP_GREATER:   return c == '>';
2855     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
2856     case CPP_PLUS:      return c == '+';
2857     case CPP_MINUS:     return c == '-' || c == '>';
2858     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
2859     case CPP_MOD:       return c == ':' || c == '>';
2860     case CPP_AND:       return c == '&';
2861     case CPP_OR:        return c == '|';
2862     case CPP_COLON:     return c == ':' || c == '>';
2863     case CPP_DEREF:     return c == '*';
2864     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
2865     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
2866     case CPP_NAME:      return ((b == CPP_NUMBER
2867                                  && name_p (pfile, &token2->val.str))
2868                                 || b == CPP_NAME
2869                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
2870     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
2871                                 || c == '.' || c == '+' || c == '-');
2872                                       /* UCNs */
2873     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
2874                                  && b == CPP_NAME)
2875                                 || (CPP_OPTION (pfile, objc)
2876                                     && token1->val.str.text[0] == '@'
2877                                     && (b == CPP_NAME || b == CPP_STRING)));
2878     case CPP_STRING:
2879     case CPP_WSTRING:
2880     case CPP_UTF8STRING:
2881     case CPP_STRING16:
2882     case CPP_STRING32:  return (CPP_OPTION (pfile, user_literals)
2883                                 && (b == CPP_NAME
2884                                     || (TOKEN_SPELL (token2) == SPELL_LITERAL
2885                                         && ISIDST (token2->val.str.text[0]))));
2886
2887     default:            break;
2888     }
2889
2890   return 0;
2891 }
2892
2893 /* Output all the remaining tokens on the current line, and a newline
2894    character, to FP.  Leading whitespace is removed.  If there are
2895    macros, special token padding is not performed.  */
2896 void
2897 cpp_output_line (cpp_reader *pfile, FILE *fp)
2898 {
2899   const cpp_token *token;
2900
2901   token = cpp_get_token (pfile);
2902   while (token->type != CPP_EOF)
2903     {
2904       cpp_output_token (token, fp);
2905       token = cpp_get_token (pfile);
2906       if (token->flags & PREV_WHITE)
2907         putc (' ', fp);
2908     }
2909
2910   putc ('\n', fp);
2911 }
2912
2913 /* Return a string representation of all the remaining tokens on the
2914    current line.  The result is allocated using xmalloc and must be
2915    freed by the caller.  */
2916 unsigned char *
2917 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2918 {
2919   const cpp_token *token;
2920   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2921   unsigned int alloced = 120 + out;
2922   unsigned char *result = (unsigned char *) xmalloc (alloced);
2923
2924   /* If DIR_NAME is empty, there are no initial contents.  */
2925   if (dir_name)
2926     {
2927       sprintf ((char *) result, "#%s ", dir_name);
2928       out += 2;
2929     }
2930
2931   token = cpp_get_token (pfile);
2932   while (token->type != CPP_EOF)
2933     {
2934       unsigned char *last;
2935       /* Include room for a possible space and the terminating nul.  */
2936       unsigned int len = cpp_token_len (token) + 2;
2937
2938       if (out + len > alloced)
2939         {
2940           alloced *= 2;
2941           if (out + len > alloced)
2942             alloced = out + len;
2943           result = (unsigned char *) xrealloc (result, alloced);
2944         }
2945
2946       last = cpp_spell_token (pfile, token, &result[out], 0);
2947       out = last - result;
2948
2949       token = cpp_get_token (pfile);
2950       if (token->flags & PREV_WHITE)
2951         result[out++] = ' ';
2952     }
2953
2954   result[out] = '\0';
2955   return result;
2956 }
2957
/* Memory buffers.  Changing the minimum size or the two sizing
   formulas below can have a dramatic effect on performance.  The
   values here are reasonable defaults, but might be tuned.  If you
   adjust them, be sure to test across a range of uses of cpplib,
   including heavy nested function-like macro expansion.  Also check
   the change in peak memory usage (NJAMD is a good tool for this).

   MIN_BUFF_SIZE is the smallest buffer ever allocated.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest buffer that will be
   reused to satisfy a request for MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the distance from BUFF's cur
   to its limit.  Every macro argument is parenthesized so that an
   argument containing a low-precedence operator (e.g. ?:) is
   evaluated as a unit.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2972
2973 /* Create a new allocation buffer.  Place the control block at the end
2974    of the buffer, so that buffer overflows will cause immediate chaos.  */
2975 static _cpp_buff *
2976 new_buff (size_t len)
2977 {
2978   _cpp_buff *result;
2979   unsigned char *base;
2980
2981   if (len < MIN_BUFF_SIZE)
2982     len = MIN_BUFF_SIZE;
2983   len = CPP_ALIGN (len);
2984
2985 #ifdef ENABLE_VALGRIND_CHECKING
2986   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2987      struct first.  */
2988   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2989   base = XNEWVEC (unsigned char, len + slen);
2990   result = (_cpp_buff *) base;
2991   base += slen;
2992 #else
2993   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2994   result = (_cpp_buff *) (base + len);
2995 #endif
2996   result->base = base;
2997   result->cur = base;
2998   result->limit = base + len;
2999   result->next = NULL;
3000   return result;
3001 }
3002
3003 /* Place a chain of unwanted allocation buffers on the free list.  */
3004 void
3005 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
3006 {
3007   _cpp_buff *end = buff;
3008
3009   while (end->next)
3010     end = end->next;
3011   end->next = pfile->free_buffs;
3012   pfile->free_buffs = buff;
3013 }
3014
3015 /* Return a free buffer of size at least MIN_SIZE.  */
3016 _cpp_buff *
3017 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
3018 {
3019   _cpp_buff *result, **p;
3020
3021   for (p = &pfile->free_buffs;; p = &(*p)->next)
3022     {
3023       size_t size;
3024
3025       if (*p == NULL)
3026         return new_buff (min_size);
3027       result = *p;
3028       size = result->limit - result->base;
3029       /* Return a buffer that's big enough, but don't waste one that's
3030          way too big.  */
3031       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
3032         break;
3033     }
3034
3035   *p = result->next;
3036   result->next = NULL;
3037   result->cur = result->base;
3038   return result;
3039 }
3040
3041 /* Creates a new buffer with enough space to hold the uncommitted
3042    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3043    the excess bytes to the new buffer.  Chains the new buffer after
3044    BUFF, and returns the new buffer.  */
3045 _cpp_buff *
3046 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3047 {
3048   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3049   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3050
3051   buff->next = new_buff;
3052   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3053   return new_buff;
3054 }
3055
3056 /* Creates a new buffer with enough space to hold the uncommitted
3057    remaining bytes of the buffer pointed to by BUFF, and at least
3058    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3059    Chains the new buffer before the buffer pointed to by BUFF, and
3060    updates the pointer to point to the new buffer.  */
3061 void
3062 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3063 {
3064   _cpp_buff *new_buff, *old_buff = *pbuff;
3065   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3066
3067   new_buff = _cpp_get_buff (pfile, size);
3068   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3069   new_buff->next = old_buff;
3070   *pbuff = new_buff;
3071 }
3072
3073 /* Free a chain of buffers starting at BUFF.  */
3074 void
3075 _cpp_free_buff (_cpp_buff *buff)
3076 {
3077   _cpp_buff *next;
3078
3079   for (; buff; buff = next)
3080     {
3081       next = buff->next;
3082 #ifdef ENABLE_VALGRIND_CHECKING
3083       free (buff);
3084 #else
3085       free (buff->base);
3086 #endif
3087     }
3088 }
3089
3090 /* Allocate permanent, unaligned storage of length LEN.  */
3091 unsigned char *
3092 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3093 {
3094   _cpp_buff *buff = pfile->u_buff;
3095   unsigned char *result = buff->cur;
3096
3097   if (len > (size_t) (buff->limit - result))
3098     {
3099       buff = _cpp_get_buff (pfile, len);
3100       buff->next = pfile->u_buff;
3101       pfile->u_buff = buff;
3102       result = buff->cur;
3103     }
3104
3105   buff->cur = result + len;
3106   return result;
3107 }
3108
3109 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3110    That buffer is used for growing allocations when saving macro
3111    replacement lists in a #define, and when parsing an answer to an
3112    assertion in #assert, #unassert or #if (and therefore possibly
3113    whilst expanding macros).  It therefore must not be used by any
3114    code that they might call: specifically the lexer and the guts of
3115    the macro expander.
3116
3117    All existing other uses clearly fit this restriction: storing
3118    registered pragmas during initialization.  */
3119 unsigned char *
3120 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3121 {
3122   _cpp_buff *buff = pfile->a_buff;
3123   unsigned char *result = buff->cur;
3124
3125   if (len > (size_t) (buff->limit - result))
3126     {
3127       buff = _cpp_get_buff (pfile, len);
3128       buff->next = pfile->a_buff;
3129       pfile->a_buff = buff;
3130       result = buff->cur;
3131     }
3132
3133   buff->cur = result + len;
3134   return result;
3135 }
3136
3137 /* Say which field of TOK is in use.  */
3138
3139 enum cpp_token_fld_kind
3140 cpp_token_val_index (const cpp_token *tok)
3141 {
3142   switch (TOKEN_SPELL (tok))
3143     {
3144     case SPELL_IDENT:
3145       return CPP_TOKEN_FLD_NODE;
3146     case SPELL_LITERAL:
3147       return CPP_TOKEN_FLD_STR;
3148     case SPELL_OPERATOR:
3149       if (tok->type == CPP_PASTE)
3150         return CPP_TOKEN_FLD_TOKEN_NO;
3151       else
3152         return CPP_TOKEN_FLD_NONE;
3153     case SPELL_NONE:
3154       if (tok->type == CPP_MACRO_ARG)
3155         return CPP_TOKEN_FLD_ARG_NO;
3156       else if (tok->type == CPP_PADDING)
3157         return CPP_TOKEN_FLD_SOURCE;
3158       else if (tok->type == CPP_PRAGMA)
3159         return CPP_TOKEN_FLD_PRAGMA;
3160       /* else fall through */
3161     default:
3162       return CPP_TOKEN_FLD_NONE;
3163     }
3164 }
3165
3166 /* All tokens lexed in R after calling this function will be forced to have
3167    their source_location the same as the location referenced by P, until
3168    cpp_stop_forcing_token_locations is called for R.  */
3169
3170 void
3171 cpp_force_token_locations (cpp_reader *r, source_location *p)
3172 {
3173   r->forced_token_location_p = p;
3174 }
3175
3176 /* Go back to assigning locations naturally for lexed tokens.  */
3177
3178 void
3179 cpp_stop_forcing_token_locations (cpp_reader *r)
3180 {
3181   r->forced_token_location_p = NULL;
3182 }