gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2016 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic-core.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* This is a cache used by get_next_line to store the content of a
  33    file to be searched for file lines.  */
  34 struct fcache
  35 {
  36   /* These are information used to store a line boundary.  */
  37   struct line_info
  38   {
  39     /* The line number.  It starts from 1.  */
  40     size_t line_num;
  41
  42     /* The position (byte count) of the beginning of the line,
  43        relative to the file data pointer.  This starts at zero.  */
  44     size_t start_pos;
  45
  46     /* The position (byte count) of the last byte of the line.  This
  47        normally points to the '\n' character, or to one byte after the
  48        last byte of the file, if the file doesn't contain a '\n'
  49        character.  */
  50     size_t end_pos;
  51
  52     line_info (size_t l, size_t s, size_t e)
  53       : line_num (l), start_pos (s), end_pos (e)
  54     {}
  55
  56     line_info ()
  57       :line_num (0), start_pos (0), end_pos (0)
  58     {}
  59   };
  60
  61   /* The number of time this file has been accessed.  This is used
  62      to designate which file cache to evict from the cache
  63      array.  */
  64   unsigned use_count;
  65
  66   const char *file_path;
  67
  68   FILE *fp;
  69
  70   /* This points to the content of the file that we've read so
  71      far.  */
  72   char *data;
  73
  74   /*  The size of the DATA array above.*/
  75   size_t size;
  76
  77   /* The number of bytes read from the underlying file so far.  This
  78      must be less (or equal) than SIZE above.  */
  79   size_t nb_read;
  80
  81   /* The index of the beginning of the current line.  */
  82   size_t line_start_idx;
  83
  84   /* The number of the previous line read.  This starts at 1.  Zero
  85      means we've read no line so far.  */
  86   size_t line_num;
  87
  88   /* This is the total number of lines of the current file.  At the
  89      moment, we try to get this information from the line map
  90      subsystem.  Note that this is just a hint.  When using the C++
  91      front-end, this hint is correct because the input file is then
  92      completely tokenized before parsing starts; so the line map knows
  93      the number of lines before compilation really starts.  For e.g,
  94      the C front-end, it can happen that we start emitting diagnostics
  95      before the line map has seen the end of the file.  */
  96   size_t total_lines;
  97
  98   /* Could this file be missing a trailing newline on its final line?
  99      Initially true (to cope with empty files), set to true/false
 100      as each line is read.  */
 101   bool missing_trailing_newline;
 102
 103   /* This is a record of the beginning and end of the lines we've seen
 104      while reading the file.  This is useful to avoid walking the data
 105      from the beginning when we are asked to read a line that is
 106      before LINE_START_IDX above.  Note that the maximum size of this
 107      record is fcache_line_record_size, so that the memory consumption
 108      doesn't explode.  We thus scale total_lines down to
 109      fcache_line_record_size.  */
 110   vec<line_info, va_heap> line_record;
 111
 112   fcache ();
 113   ~fcache ();
 114 };
 115
 116 /* Current position in real source file.  */
 117
 118 location_t input_location = UNKNOWN_LOCATION;
 119
 120 struct line_maps *line_table;
 121
 122 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 123    This needs to be a global so that it can be a GC root, and thus
 124    prevent the stashed copy from being garbage-collected if the GC runs
 125    during a line_table_test.  */
 126
 127 struct line_maps *saved_line_table;
 128
 129 static fcache *fcache_tab;
 130 static const size_t fcache_tab_size = 16;
 131 static const size_t fcache_buffer_size = 4 * 1024;
 132 static const size_t fcache_line_record_size = 100;
 133
 134 /* Expand the source location LOC into a human readable location.  If
 135    LOC resolves to a builtin location, the file name of the readable
 136    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 137    TRUE and LOC is virtual, then it is resolved to the expansion
 138    point of the involved macro.  Otherwise, it is resolved to the
 139    spelling location of the token.
 140
 141    When resolving to the spelling location of the token, if the
 142    resulting location is for a built-in location (that is, it has no
 143    associated line/column) in the context of a macro expansion, the
 144    returned location is the first one (while unwinding the macro
 145    location towards its expansion point) that is in real source
 146    code.  */
 147
 148 static expanded_location
 149 expand_location_1 (source_location loc,
 150                    bool expansion_point_p)
 151 {
 152   expanded_location xloc;
 153   const line_map_ordinary *map;
 154   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 155   tree block = NULL;
 156
 157   if (IS_ADHOC_LOC (loc))
 158     {
 159       block = LOCATION_BLOCK (loc);
 160       loc = LOCATION_LOCUS (loc);
 161     }
 162
 163   memset (&xloc, 0, sizeof (xloc));
 164
 165   if (loc >= RESERVED_LOCATION_COUNT)
 166     {
 167       if (!expansion_point_p)
 168         {
 169           /* We want to resolve LOC to its spelling location.
 170
 171              But if that spelling location is a reserved location that
 172              appears in the context of a macro expansion (like for a
 173              location for a built-in token), let's consider the first
 174              location (toward the expansion point) that is not reserved;
 175              that is, the first location that is in real source code.  */
 176           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 177                                                           loc, NULL);
 178           lrk = LRK_SPELLING_LOCATION;
 179         }
 180       loc = linemap_resolve_location (line_table, loc,
 181                                       lrk, &map);
 182       xloc = linemap_expand_location (line_table, map, loc);
 183     }
 184
 185   xloc.data = block;
 186   if (loc <= BUILTINS_LOCATION)
 187     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 188
 189   return xloc;
 190 }
 191
 192 /* Initialize the set of cache used for files accessed by caret
 193    diagnostic.  */
 194
 195 static void
 196 diagnostic_file_cache_init (void)
 197 {
 198   if (fcache_tab == NULL)
 199     fcache_tab = new fcache[fcache_tab_size];
 200 }
 201
 202 /* Free the resources used by the set of cache used for files accessed
 203    by caret diagnostic.  */
 204
 205 void
 206 diagnostic_file_cache_fini (void)
 207 {
 208   if (fcache_tab)
 209     {
 210       delete [] (fcache_tab);
 211       fcache_tab = NULL;
 212     }
 213 }
 214
 215 /* Return the total lines number that have been read so far by the
 216    line map (in the preprocessor) so far.  For languages like C++ that
 217    entirely preprocess the input file before starting to parse, this
 218    equals the actual number of lines of the file.  */
 219
 220 static size_t
 221 total_lines_num (const char *file_path)
 222 {
 223   size_t r = 0;
 224   source_location l = 0;
 225   if (linemap_get_file_highest_location (line_table, file_path, &l))
 226     {
 227       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 228       expanded_location xloc = expand_location (l);
 229       r = xloc.line;
 230     }
 231   return r;
 232 }
 233
 234 /* Lookup the cache used for the content of a given file accessed by
 235    caret diagnostic.  Return the found cached file, or NULL if no
 236    cached file was found.  */
 237
 238 static fcache*
 239 lookup_file_in_cache_tab (const char *file_path)
 240 {
 241   if (file_path == NULL)
 242     return NULL;
 243
 244   diagnostic_file_cache_init ();
 245
 246   /* This will contain the found cached file.  */
 247   fcache *r = NULL;
 248   for (unsigned i = 0; i < fcache_tab_size; ++i)
 249     {
 250       fcache *c = &fcache_tab[i];
 251       if (c->file_path && !strcmp (c->file_path, file_path))
 252         {
 253           ++c->use_count;
 254           r = c;
 255         }
 256     }
 257
 258   if (r)
 259     ++r->use_count;
 260
 261   return r;
 262 }
 263
 264 /* Purge any mention of FILENAME from the cache of files used for
 265    printing source code.  For use in selftests when working
 266    with tempfiles.  */
 267
 268 void
 269 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 270 {
 271   gcc_assert (file_path);
 272
 273   fcache *r = lookup_file_in_cache_tab (file_path);
 274   if (!r)
 275     /* Not found.  */
 276     return;
 277
 278   r->file_path = NULL;
 279   if (r->fp)
 280     fclose (r->fp);
 281   r->fp = NULL;
 282   r->nb_read = 0;
 283   r->line_start_idx = 0;
 284   r->line_num = 0;
 285   r->line_record.truncate (0);
 286   r->use_count = 0;
 287   r->total_lines = 0;
 288   r->missing_trailing_newline = true;
 289 }
 290
 291 /* Return the file cache that has been less used, recently, or the
 292    first empty one.  If HIGHEST_USE_COUNT is non-null,
 293    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 294    in the cache table.  */
 295
 296 static fcache*
 297 evicted_cache_tab_entry (unsigned *highest_use_count)
 298 {
 299   diagnostic_file_cache_init ();
 300
 301   fcache *to_evict = &fcache_tab[0];
 302   unsigned huc = to_evict->use_count;
 303   for (unsigned i = 1; i < fcache_tab_size; ++i)
 304     {
 305       fcache *c = &fcache_tab[i];
 306       bool c_is_empty = (c->file_path == NULL);
 307
 308       if (c->use_count < to_evict->use_count
 309           || (to_evict->file_path && c_is_empty))
 310         /* We evict C because it's either an entry with a lower use
 311            count or one that is empty.  */
 312         to_evict = c;
 313
 314       if (huc < c->use_count)
 315         huc = c->use_count;
 316
 317       if (c_is_empty)
 318         /* We've reached the end of the cache; subsequent elements are
 319            all empty.  */
 320         break;
 321     }
 322
 323   if (highest_use_count)
 324     *highest_use_count = huc;
 325
 326   return to_evict;
 327 }
 328
 329 /* Create the cache used for the content of a given file to be
 330    accessed by caret diagnostic.  This cache is added to an array of
 331    cache and can be retrieved by lookup_file_in_cache_tab.  This
 332    function returns the created cache.  Note that only the last
 333    fcache_tab_size files are cached.  */
 334
 335 static fcache*
 336 add_file_to_cache_tab (const char *file_path)
 337 {
 338
 339   FILE *fp = fopen (file_path, "r");
 340   if (fp == NULL)
 341     return NULL;
 342
 343   unsigned highest_use_count = 0;
 344   fcache *r = evicted_cache_tab_entry (&highest_use_count);
 345   r->file_path = file_path;
 346   if (r->fp)
 347     fclose (r->fp);
 348   r->fp = fp;
 349   r->nb_read = 0;
 350   r->line_start_idx = 0;
 351   r->line_num = 0;
 352   r->line_record.truncate (0);
 353   /* Ensure that this cache entry doesn't get evicted next time
 354      add_file_to_cache_tab is called.  */
 355   r->use_count = ++highest_use_count;
 356   r->total_lines = total_lines_num (file_path);
 357   r->missing_trailing_newline = true;
 358
 359   return r;
 360 }
 361
 362 /* Lookup the cache used for the content of a given file accessed by
 363    caret diagnostic.  If no cached file was found, create a new cache
 364    for this file, add it to the array of cached file and return
 365    it.  */
 366
 367 static fcache*
 368 lookup_or_add_file_to_cache_tab (const char *file_path)
 369 {
 370   fcache *r = lookup_file_in_cache_tab (file_path);
 371   if (r == NULL)
 372     r = add_file_to_cache_tab (file_path);
 373   return r;
 374 }
 375
 376 /* Default constructor for a cache of file used by caret
 377    diagnostic.  */
 378
 379 fcache::fcache ()
 380 : use_count (0), file_path (NULL), fp (NULL), data (0),
 381   size (0), nb_read (0), line_start_idx (0), line_num (0),
 382   total_lines (0), missing_trailing_newline (true)
 383 {
 384   line_record.create (0);
 385 }
 386
 387 /* Destructor for a cache of file used by caret diagnostic.  */
 388
 389 fcache::~fcache ()
 390 {
 391   if (fp)
 392     {
 393       fclose (fp);
 394       fp = NULL;
 395     }
 396   if (data)
 397     {
 398       XDELETEVEC (data);
 399       data = 0;
 400     }
 401   line_record.release ();
 402 }
 403
 404 /* Returns TRUE iff the cache would need to be filled with data coming
 405    from the file.  That is, either the cache is empty or full or the
 406    current line is empty.  Note that if the cache is full, it would
 407    need to be extended and filled again.  */
 408
 409 static bool
 410 needs_read (fcache *c)
 411 {
 412   return (c->nb_read == 0
 413           || c->nb_read == c->size
 414           || (c->line_start_idx >= c->nb_read - 1));
 415 }
 416
 417 /*  Return TRUE iff the cache is full and thus needs to be
 418     extended.  */
 419
 420 static bool
 421 needs_grow (fcache *c)
 422 {
 423   return c->nb_read == c->size;
 424 }
 425
 426 /* Grow the cache if it needs to be extended.  */
 427
 428 static void
 429 maybe_grow (fcache *c)
 430 {
 431   if (!needs_grow (c))
 432     return;
 433
 434   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
 435   c->data = XRESIZEVEC (char, c->data, size + 1);
 436   c->size = size;
 437 }
 438
 439 /*  Read more data into the cache.  Extends the cache if need be.
 440     Returns TRUE iff new data could be read.  */
 441
 442 static bool
 443 read_data (fcache *c)
 444 {
 445   if (feof (c->fp) || ferror (c->fp))
 446     return false;
 447
 448   maybe_grow (c);
 449
 450   char * from = c->data + c->nb_read;
 451   size_t to_read = c->size - c->nb_read;
 452   size_t nb_read = fread (from, 1, to_read, c->fp);
 453
 454   if (ferror (c->fp))
 455     return false;
 456
 457   c->nb_read += nb_read;
 458   return !!nb_read;
 459 }
 460
 461 /* Read new data iff the cache needs to be filled with more data
 462    coming from the file FP.  Return TRUE iff the cache was filled with
 463    mode data.  */
 464
 465 static bool
 466 maybe_read_data (fcache *c)
 467 {
 468   if (!needs_read (c))
 469     return false;
 470   return read_data (c);
 471 }
 472
 473 /* Read a new line from file FP, using C as a cache for the data
 474    coming from the file.  Upon successful completion, *LINE is set to
 475    the beginning of the line found.  Space for that line has been
 476    allocated in the cache thus *LINE has the same life time as C.
 477    *LINE_LEN is set to the length of the line.  Note that the line
 478    does not contain any terminal delimiter.  This function returns
 479    true if some data was read or process from the cache, false
 480    otherwise.  Note that subsequent calls to get_next_line return the
 481    next lines of the file and might overwrite the content of
 482    *LINE.  */
 483
 484 static bool
 485 get_next_line (fcache *c, char **line, ssize_t *line_len)
 486 {
 487   /* Fill the cache with data to process.  */
 488   maybe_read_data (c);
 489
 490   size_t remaining_size = c->nb_read - c->line_start_idx;
 491   if (remaining_size == 0)
 492     /* There is no more data to process.  */
 493     return false;
 494
 495   char *line_start = c->data + c->line_start_idx;
 496
 497   char *next_line_start = NULL;
 498   size_t len = 0;
 499   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 500   if (line_end == NULL)
 501     {
 502       /* We haven't found the end-of-line delimiter in the cache.
 503          Fill the cache with more data from the file and look for the
 504          '\n'.  */
 505       while (maybe_read_data (c))
 506         {
 507           line_start = c->data + c->line_start_idx;
 508           remaining_size = c->nb_read - c->line_start_idx;
 509           line_end = (char *) memchr (line_start, '\n', remaining_size);
 510           if (line_end != NULL)
 511             {
 512               next_line_start = line_end + 1;
 513               break;
 514             }
 515         }
 516       if (line_end == NULL)
 517         {
 518           /* We've loadded all the file into the cache and still no
 519              '\n'.  Let's say the line ends up at one byte passed the
 520              end of the file.  This is to stay consistent with the case
 521              of when the line ends up with a '\n' and line_end points to
 522              that terminal '\n'.  That consistency is useful below in
 523              the len calculation.  */
 524           line_end = c->data + c->nb_read ;
 525           c->missing_trailing_newline = true;
 526         }
 527       else
 528         c->missing_trailing_newline = false;
 529     }
 530   else
 531     {
 532       next_line_start = line_end + 1;
 533       c->missing_trailing_newline = false;
 534     }
 535
 536   if (ferror (c->fp))
 537     return -1;
 538
 539   /* At this point, we've found the end of the of line.  It either
 540      points to the '\n' or to one byte after the last byte of the
 541      file.  */
 542   gcc_assert (line_end != NULL);
 543
 544   len = line_end - line_start;
 545
 546   if (c->line_start_idx < c->nb_read)
 547     *line = line_start;
 548
 549   ++c->line_num;
 550
 551   /* Before we update our line record, make sure the hint about the
 552      total number of lines of the file is correct.  If it's not, then
 553      we give up recording line boundaries from now on.  */
 554   bool update_line_record = true;
 555   if (c->line_num > c->total_lines)
 556     update_line_record = false;
 557
 558     /* Now update our line record so that re-reading lines from the
 559      before c->line_start_idx is faster.  */
 560   if (update_line_record
 561       && c->line_record.length () < fcache_line_record_size)
 562     {
 563       /* If the file lines fits in the line record, we just record all
 564          its lines ...*/
 565       if (c->total_lines <= fcache_line_record_size
 566           && c->line_num > c->line_record.length ())
 567         c->line_record.safe_push (fcache::line_info (c->line_num,
 568                                                  c->line_start_idx,
 569                                                  line_end - c->data));
 570       else if (c->total_lines > fcache_line_record_size)
 571         {
 572           /* ... otherwise, we just scale total_lines down to
 573              (fcache_line_record_size lines.  */
 574           size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
 575           if (c->line_record.length () == 0
 576               || n >= c->line_record.length ())
 577             c->line_record.safe_push (fcache::line_info (c->line_num,
 578                                                      c->line_start_idx,
 579                                                      line_end - c->data));
 580         }
 581     }
 582
 583   /* Update c->line_start_idx so that it points to the next line to be
 584      read.  */
 585   if (next_line_start)
 586     c->line_start_idx = next_line_start - c->data;
 587   else
 588     /* We didn't find any terminal '\n'.  Let's consider that the end
 589        of line is the end of the data in the cache.  The next
 590        invocation of get_next_line will either read more data from the
 591        underlying file or return false early because we've reached the
 592        end of the file.  */
 593     c->line_start_idx = c->nb_read;
 594
 595   *line_len = len;
 596
 597   return true;
 598 }
 599
 600 /* Reads the next line from FILE into *LINE.  If *LINE is too small
 601    (or NULL) it is allocated (or extended) to have enough space to
 602    containe the line.  *LINE_LENGTH must contain the size of the
 603    initial*LINE buffer.  It's then updated by this function to the
 604    actual length of the returned line.  Note that the returned line
 605    can contain several zero bytes.  Also note that the returned string
 606    is allocated in static storage that is going to be re-used by
 607    subsequent invocations of read_line.  */
 608
 609 static bool
 610 read_next_line (fcache *cache, char ** line, ssize_t *line_len)
 611 {
 612   char *l = NULL;
 613   ssize_t len = 0;
 614
 615   if (!get_next_line (cache, &l, &len))
 616     return false;
 617
 618   if (*line == NULL)
 619     *line = XNEWVEC (char, len);
 620   else
 621     if (*line_len < len)
 622         *line = XRESIZEVEC (char, *line, len);
 623
 624   memcpy (*line, l, len);
 625   *line_len = len;
 626
 627   return true;
 628 }
 629
 630 /* Consume the next bytes coming from the cache (or from its
 631    underlying file if there are remaining unread bytes in the file)
 632    until we reach the next end-of-line (or end-of-file).  There is no
 633    copying from the cache involved.  Return TRUE upon successful
 634    completion.  */
 635
 636 static bool
 637 goto_next_line (fcache *cache)
 638 {
 639   char *l;
 640   ssize_t len;
 641
 642   return get_next_line (cache, &l, &len);
 643 }
 644
 645 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 646    The line is copied into *LINE.  *LINE_LEN must have been set to the
 647    length of *LINE.  If *LINE is too small (or NULL) it's extended (or
 648    allocated) and *LINE_LEN is adjusted accordingly.  *LINE ends up
 649    with a terminal zero byte and can contain additional zero bytes.
 650    This function returns bool if a line was read.  */
 651
 652 static bool
 653 read_line_num (fcache *c, size_t line_num,
 654                char ** line, ssize_t *line_len)
 655 {
 656   gcc_assert (line_num > 0);
 657
 658   if (line_num <= c->line_num)
 659     {
 660       /* We've been asked to read lines that are before c->line_num.
 661          So lets use our line record (if it's not empty) to try to
 662          avoid re-reading the file from the beginning again.  */
 663
 664       if (c->line_record.is_empty ())
 665         {
 666           c->line_start_idx = 0;
 667           c->line_num = 0;
 668         }
 669       else
 670         {
 671           fcache::line_info *i = NULL;
 672           if (c->total_lines <= fcache_line_record_size)
 673             {
 674               /* In languages where the input file is not totally
 675                  preprocessed up front, the c->total_lines hint
 676                  can be smaller than the number of lines of the
 677                  file.  In that case, only the first
 678                  c->total_lines have been recorded.
 679
 680                  Otherwise, the first c->total_lines we've read have
 681                  their start/end recorded here.  */
 682               i = (line_num <= c->total_lines)
 683                 ? &c->line_record[line_num - 1]
 684                 : &c->line_record[c->total_lines - 1];
 685               gcc_assert (i->line_num <= line_num);
 686             }
 687           else
 688             {
 689               /*  So the file had more lines than our line record
 690                   size.  Thus the number of lines we've recorded has
 691                   been scaled down to fcache_line_reacord_size.  Let's
 692                   pick the start/end of the recorded line that is
 693                   closest to line_num.  */
 694               size_t n = (line_num <= c->total_lines)
 695                 ? line_num * fcache_line_record_size / c->total_lines
 696                 : c ->line_record.length () - 1;
 697               if (n < c->line_record.length ())
 698                 {
 699                   i = &c->line_record[n];
 700                   gcc_assert (i->line_num <= line_num);
 701                 }
 702             }
 703
 704           if (i && i->line_num == line_num)
 705             {
 706               /* We have the start/end of the line.  Let's just copy
 707                  it again and we are done.  */
 708               ssize_t len = i->end_pos - i->start_pos + 1;
 709               if (*line_len < len)
 710                 *line = XRESIZEVEC (char, *line, len);
 711               memmove (*line, c->data + i->start_pos, len);
 712               (*line)[len - 1] = '\0';
 713               *line_len = --len;
 714               return true;
 715             }
 716
 717           if (i)
 718             {
 719               c->line_start_idx = i->start_pos;
 720               c->line_num = i->line_num - 1;
 721             }
 722           else
 723             {
 724               c->line_start_idx = 0;
 725               c->line_num = 0;
 726             }
 727         }
 728     }
 729
 730   /*  Let's walk from line c->line_num up to line_num - 1, without
 731       copying any line.  */
 732   while (c->line_num < line_num - 1)
 733     if (!goto_next_line (c))
 734       return false;
 735
 736   /* The line we want is the next one.  Let's read and copy it back to
 737      the caller.  */
 738   return read_next_line (c, line, line_len);
 739 }
 740
 741 /* Return the physical source line that corresponds to FILE_PATH/LINE in a
 742    buffer that is statically allocated.  The newline is replaced by
 743    the null character.  Note that the line can contain several null
 744    characters, so LINE_LEN, if non-null, points to the actual length
 745    of the line.  */
 746
 747 const char *
 748 location_get_source_line (const char *file_path, int line,
 749                           int *line_len)
 750 {
 751   static char *buffer;
 752   static ssize_t len;
 753
 754   if (line == 0)
 755     return NULL;
 756
 757   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 758   if (c == NULL)
 759     return NULL;
 760
 761   bool read = read_line_num (c, line, &buffer, &len);
 762
 763   if (read && line_len)
 764     *line_len = len;
 765
 766   return read ? buffer : NULL;
 767 }
 768
 769 /* Determine if FILE_PATH missing a trailing newline on its final line.
 770    Only valid to call once all of the file has been loaded, by
 771    requesting a line number beyond the end of the file.  */
 772
 773 bool
 774 location_missing_trailing_newline (const char *file_path)
 775 {
 776   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 777   if (c == NULL)
 778     return false;
 779
 780   return c->missing_trailing_newline;
 781 }
 782
 783 /* Test if the location originates from the spelling location of a
 784    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 785    virtual) location of a built-in token that appears in the expansion
 786    list of a macro.  Please note that this function also works on
 787    tokens that result from built-in tokens.  For instance, the
 788    function would return true if passed a token "4" that is the result
 789    of the expansion of the built-in __LINE__ macro.  */
 790 bool
 791 is_location_from_builtin_token (source_location loc)
 792 {
 793   const line_map_ordinary *map = NULL;
 794   loc = linemap_resolve_location (line_table, loc,
 795                                   LRK_SPELLING_LOCATION, &map);
 796   return loc == BUILTINS_LOCATION;
 797 }
 798
 799 /* Expand the source location LOC into a human readable location.  If
 800    LOC is virtual, it resolves to the expansion point of the involved
 801    macro.  If LOC resolves to a builtin location, the file name of the
 802    readable location is set to the string "<built-in>".  */
 803
 804 expanded_location
 805 expand_location (source_location loc)
 806 {
 807   return expand_location_1 (loc, /*expansion_point_p=*/true);
 808 }
 809
 810 /* Expand the source location LOC into a human readable location.  If
 811    LOC is virtual, it resolves to the expansion location of the
 812    relevant macro.  If LOC resolves to a builtin location, the file
 813    name of the readable location is set to the string
 814    "<built-in>".  */
 815
 816 expanded_location
 817 expand_location_to_spelling_point (source_location loc)
 818 {
 819   return expand_location_1 (loc, /*expansion_point_p=*/false);
 820 }
 821
 822 /* The rich_location class within libcpp requires a way to expand
 823    source_location instances, and relies on the client code
 824    providing a symbol named
 825      linemap_client_expand_location_to_spelling_point
 826    to do this.
 827
 828    This is the implementation for libcommon.a (all host binaries),
 829    which simply calls into expand_location_to_spelling_point.  */
 830
 831 expanded_location
 832 linemap_client_expand_location_to_spelling_point (source_location loc)
 833 {
 834   return expand_location_to_spelling_point (loc);
 835 }
 836
 837
 838 /* If LOCATION is in a system header and if it is a virtual location for
 839    a token coming from the expansion of a macro, unwind it to the
 840    location of the expansion point of the macro.  Otherwise, just return
 841    LOCATION.
 842
 843    This is used for instance when we want to emit diagnostics about a
 844    token that may be located in a macro that is itself defined in a
 845    system header, for example, for the NULL macro.  In such a case, if
 846    LOCATION were passed directly to diagnostic functions such as
 847    warning_at, the diagnostic would be suppressed (unless
 848    -Wsystem-headers).  */
 849
 850 source_location
 851 expansion_point_location_if_in_system_header (source_location location)
 852 {
 853   if (in_system_header_at (location))
 854     location = linemap_resolve_location (line_table, location,
 855                                          LRK_MACRO_EXPANSION_POINT,
 856                                          NULL);
 857   return location;
 858 }
 859
 860 /* If LOCATION is a virtual location for a token coming from the expansion
 861    of a macro, unwind to the location of the expansion point of the macro.  */
 862
 863 source_location
 864 expansion_point_location (source_location location)
 865 {
 866   return linemap_resolve_location (line_table, location,
 867                                    LRK_MACRO_EXPANSION_POINT, NULL);
 868 }
 869
 870 /* Construct a location with caret at CARET, ranging from START to
 871    finish e.g.
 872
 873                  11111111112
 874         12345678901234567890
 875      522
 876      523   return foo + bar;
 877                   ~~~~^~~~~
 878      524
 879
 880    The location's caret is at the "+", line 523 column 15, but starts
 881    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
 882    of "bar" at column 19.  */
 883
 884 location_t
 885 make_location (location_t caret, location_t start, location_t finish)
 886 {
 887   location_t pure_loc = get_pure_location (caret);
 888   source_range src_range;
 889   src_range.m_start = get_start (start);
 890   src_range.m_finish = get_finish (finish);
 891   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
 892                                                    pure_loc,
 893                                                    src_range,
 894                                                    NULL);
 895   return combined_loc;
 896 }
 897
 898 #define ONE_K 1024
 899 #define ONE_M (ONE_K * ONE_K)
 900
 901 /* Display a number as an integer multiple of either:
 902    - 1024, if said integer is >= to 10 K (in base 2)
 903    - 1024 * 1024, if said integer is >= 10 M in (base 2)
 904  */
 905 #define SCALE(x) ((unsigned long) ((x) < 10 * ONE_K \
 906                   ? (x) \
 907                   : ((x) < 10 * ONE_M \
 908                      ? (x) / ONE_K \
 909                      : (x) / ONE_M)))
 910
 911 /* For a given integer, display either:
 912    - the character 'k', if the number is higher than 10 K (in base 2)
 913      but strictly lower than 10 M (in base 2)
 914    - the character 'M' if the number is higher than 10 M (in base2)
 915    - the charcter ' ' if the number is strictly lower  than 10 K  */
 916 #define STAT_LABEL(x) ((x) < 10 * ONE_K ? ' ' : ((x) < 10 * ONE_M ? 'k' : 'M'))
 917
 918 /* Display an integer amount as multiple of 1K or 1M (in base 2).
 919    Display the correct unit (either k, M, or ' ') after the amout, as
 920    well.  */
 921 #define FORMAT_AMOUNT(size) SCALE (size), STAT_LABEL (size)
 922
 923 /* Dump statistics to stderr about the memory usage of the line_table
 924    set of line maps.  This also displays some statistics about macro
 925    expansion.  */
 926
 927 void
 928 dump_line_table_statistics (void)
 929 {
 930   struct linemap_stats s;
 931   long total_used_map_size,
 932     macro_maps_size,
 933     total_allocated_map_size;
 934
 935   memset (&s, 0, sizeof (s));
 936
 937   linemap_get_statistics (line_table, &s);
 938
 939   macro_maps_size = s.macro_maps_used_size
 940     + s.macro_maps_locations_size;
 941
 942   total_allocated_map_size = s.ordinary_maps_allocated_size
 943     + s.macro_maps_allocated_size
 944     + s.macro_maps_locations_size;
 945
 946   total_used_map_size = s.ordinary_maps_used_size
 947     + s.macro_maps_used_size
 948     + s.macro_maps_locations_size;
 949
 950   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
 951            s.num_expanded_macros);
 952   if (s.num_expanded_macros != 0)
 953     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
 954              s.num_macro_tokens / s.num_expanded_macros);
 955   fprintf (stderr,
 956            "\nLine Table allocations during the "
 957            "compilation process\n");
 958   fprintf (stderr, "Number of ordinary maps used:        %5ld%c\n",
 959            SCALE (s.num_ordinary_maps_used),
 960            STAT_LABEL (s.num_ordinary_maps_used));
 961   fprintf (stderr, "Ordinary map used size:              %5ld%c\n",
 962            SCALE (s.ordinary_maps_used_size),
 963            STAT_LABEL (s.ordinary_maps_used_size));
 964   fprintf (stderr, "Number of ordinary maps allocated:   %5ld%c\n",
 965            SCALE (s.num_ordinary_maps_allocated),
 966            STAT_LABEL (s.num_ordinary_maps_allocated));
 967   fprintf (stderr, "Ordinary maps allocated size:        %5ld%c\n",
 968            SCALE (s.ordinary_maps_allocated_size),
 969            STAT_LABEL (s.ordinary_maps_allocated_size));
 970   fprintf (stderr, "Number of macro maps used:           %5ld%c\n",
 971            SCALE (s.num_macro_maps_used),
 972            STAT_LABEL (s.num_macro_maps_used));
 973   fprintf (stderr, "Macro maps used size:                %5ld%c\n",
 974            SCALE (s.macro_maps_used_size),
 975            STAT_LABEL (s.macro_maps_used_size));
 976   fprintf (stderr, "Macro maps locations size:           %5ld%c\n",
 977            SCALE (s.macro_maps_locations_size),
 978            STAT_LABEL (s.macro_maps_locations_size));
 979   fprintf (stderr, "Macro maps size:                     %5ld%c\n",
 980            SCALE (macro_maps_size),
 981            STAT_LABEL (macro_maps_size));
 982   fprintf (stderr, "Duplicated maps locations size:      %5ld%c\n",
 983            SCALE (s.duplicated_macro_maps_locations_size),
 984            STAT_LABEL (s.duplicated_macro_maps_locations_size));
 985   fprintf (stderr, "Total allocated maps size:           %5ld%c\n",
 986            SCALE (total_allocated_map_size),
 987            STAT_LABEL (total_allocated_map_size));
 988   fprintf (stderr, "Total used maps size:                %5ld%c\n",
 989            SCALE (total_used_map_size),
 990            STAT_LABEL (total_used_map_size));
 991   fprintf (stderr, "Ad-hoc table size:                   %5ld%c\n",
 992            SCALE (s.adhoc_table_size),
 993            STAT_LABEL (s.adhoc_table_size));
 994   fprintf (stderr, "Ad-hoc table entries used:           %5ld\n",
 995            s.adhoc_table_entries_used);
 996   fprintf (stderr, "optimized_ranges: %i\n",
 997            line_table->num_optimized_ranges);
 998   fprintf (stderr, "unoptimized_ranges: %i\n",
 999            line_table->num_unoptimized_ranges);
1000
1001   fprintf (stderr, "\n");
1002 }
1003
1004 /* Get location one beyond the final location in ordinary map IDX.  */
1005
1006 static source_location
1007 get_end_location (struct line_maps *set, unsigned int idx)
1008 {
1009   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1010     return set->highest_location;
1011
1012   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1013   return MAP_START_LOCATION (next_map);
1014 }
1015
/* Helper function for write_digit_row.
   Write the character for the units digit of DIGIT (i.e. DIGIT % 10)
   to STREAM.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1023
1024 /* Helper function for dump_location_info.
1025    Write a row of numbers to STREAM, numbering a source line,
1026    giving the units, tens, hundreds etc of the column number.  */
1027
1028 static void
1029 write_digit_row (FILE *stream, int indent,
1030                  const line_map_ordinary *map,
1031                  source_location loc, int max_col, int divisor)
1032 {
1033   fprintf (stream, "%*c", indent, ' ');
1034   fprintf (stream, "|");
1035   for (int column = 1; column < max_col; column++)
1036     {
1037       source_location column_loc = loc + (column << map->m_range_bits);
1038       write_digit (stream, column_loc / divisor);
1039     }
1040   fprintf (stream, "\n");
1041 }
1042
1043 /* Write a half-closed (START) / half-open (END) interval of
1044    source_location to STREAM.  */
1045
1046 static void
1047 dump_location_range (FILE *stream,
1048                      source_location start, source_location end)
1049 {
1050   fprintf (stream,
1051            "  source_location interval: %u <= loc < %u\n",
1052            start, end);
1053 }
1054
1055 /* Write a labelled description of a half-closed (START) / half-open (END)
1056    interval of source_location to STREAM.  */
1057
1058 static void
1059 dump_labelled_location_range (FILE *stream,
1060                               const char *name,
1061                               source_location start, source_location end)
1062 {
1063   fprintf (stream, "%s\n", name);
1064   dump_location_range (stream, start, end);
1065   fprintf (stream, "\n");
1066 }
1067
1068 /* Write a visualization of the locations in the line_table to STREAM.  */
1069
1070 void
1071 dump_location_info (FILE *stream)
1072 {
1073   /* Visualize the reserved locations.  */
1074   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1075                                 0, RESERVED_LOCATION_COUNT);
1076
1077   /* Visualize the ordinary line_map instances, rendering the sources. */
1078   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1079     {
1080       source_location end_location = get_end_location (line_table, idx);
1081       /* half-closed: doesn't include this one. */
1082
1083       const line_map_ordinary *map
1084         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1085       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1086       dump_location_range (stream,
1087                            MAP_START_LOCATION (map), end_location);
1088       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1089       fprintf (stream, "  starting at line: %i\n",
1090                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1091       fprintf (stream, "  column and range bits: %i\n",
1092                map->m_column_and_range_bits);
1093       fprintf (stream, "  column bits: %i\n",
1094                map->m_column_and_range_bits - map->m_range_bits);
1095       fprintf (stream, "  range bits: %i\n",
1096                map->m_range_bits);
1097
1098       /* Render the span of source lines that this "map" covers.  */
1099       for (source_location loc = MAP_START_LOCATION (map);
1100            loc < end_location;
1101            loc += (1 << map->m_range_bits) )
1102         {
1103           gcc_assert (pure_location_p (line_table, loc) );
1104
1105           expanded_location exploc
1106             = linemap_expand_location (line_table, map, loc);
1107
1108           if (0 == exploc.column)
1109             {
1110               /* Beginning of a new source line: draw the line.  */
1111
1112               int line_size;
1113               const char *line_text = location_get_source_line (exploc.file,
1114                                                                 exploc.line,
1115                                                                 &line_size);
1116               if (!line_text)
1117                 break;
1118               fprintf (stream,
1119                        "%s:%3i|loc:%5i|%.*s\n",
1120                        exploc.file, exploc.line,
1121                        loc,
1122                        line_size, line_text);
1123
1124               /* "loc" is at column 0, which means "the whole line".
1125                  Render the locations *within* the line, by underlining
1126                  it, showing the source_location numeric values
1127                  at each column.  */
1128               int max_col = (1 << map->m_column_and_range_bits) - 1;
1129               if (max_col > line_size)
1130                 max_col = line_size + 1;
1131
1132               int indent = 14 + strlen (exploc.file);
1133
1134               /* Thousands.  */
1135               if (end_location > 999)
1136                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1137
1138               /* Hundreds.  */
1139               if (end_location > 99)
1140                 write_digit_row (stream, indent, map, loc, max_col, 100);
1141
1142               /* Tens.  */
1143               write_digit_row (stream, indent, map, loc, max_col, 10);
1144
1145               /* Units.  */
1146               write_digit_row (stream, indent, map, loc, max_col, 1);
1147             }
1148         }
1149       fprintf (stream, "\n");
1150     }
1151
1152   /* Visualize unallocated values.  */
1153   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1154                                 line_table->highest_location,
1155                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1156
1157   /* Visualize the macro line_map instances, rendering the sources. */
1158   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1159     {
1160       /* Each macro map that is allocated owns source_location values
1161          that are *lower* that the one before them.
1162          Hence it's meaningful to view them either in order of ascending
1163          source locations, or in order of ascending macro map index.  */
1164       const bool ascending_source_locations = true;
1165       unsigned int idx = (ascending_source_locations
1166                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1167                           : i);
1168       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1169       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1170                idx,
1171                linemap_map_get_macro_name (map),
1172                MACRO_MAP_NUM_MACRO_TOKENS (map));
1173       dump_location_range (stream,
1174                            map->start_location,
1175                            (map->start_location
1176                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1177       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1178               "expansion point is location %i",
1179               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1180       fprintf (stream, "  map->start_location: %u\n",
1181                map->start_location);
1182
1183       fprintf (stream, "  macro_locations:\n");
1184       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1185         {
1186           source_location x = MACRO_MAP_LOCATIONS (map)[2 * i];
1187           source_location y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1188
1189           /* linemap_add_macro_token encodes token numbers in an expansion
1190              by putting them after MAP_START_LOCATION. */
1191
1192           /* I'm typically seeing 4 uninitialized entries at the end of
1193              0xafafafaf.
1194              This appears to be due to macro.c:replace_args
1195              adding 2 extra args for padding tokens; presumably there may
1196              be a leading and/or trailing padding token injected,
1197              each for 2 more location slots.
1198              This would explain there being up to 4 source_locations slots
1199              that may be uninitialized.  */
1200
1201           fprintf (stream, "    %u: %u, %u\n",
1202                    i,
1203                    x,
1204                    y);
1205           if (x == y)
1206             {
1207               if (x < MAP_START_LOCATION (map))
1208                 inform (x, "token %u has x-location == y-location == %u", i, x);
1209               else
1210                 fprintf (stream,
1211                          "x-location == y-location == %u encodes token # %u\n",
1212                          x, x - MAP_START_LOCATION (map));
1213                 }
1214           else
1215             {
1216               inform (x, "token %u has x-location == %u", i, x);
1217               inform (x, "token %u has y-location == %u", i, y);
1218             }
1219         }
1220       fprintf (stream, "\n");
1221     }
1222
1223   /* It appears that MAX_SOURCE_LOCATION itself is never assigned to a
1224      macro map, presumably due to an off-by-one error somewhere
1225      between the logic in linemap_enter_macro and
1226      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1227   dump_labelled_location_range (stream, "MAX_SOURCE_LOCATION",
1228                                 MAX_SOURCE_LOCATION,
1229                                 MAX_SOURCE_LOCATION + 1);
1230
1231   /* Visualize ad-hoc values.  */
1232   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1233                                 MAX_SOURCE_LOCATION + 1, UINT_MAX);
1234 }
1235
1236 /* string_concat's constructor.  */
1237
1238 string_concat::string_concat (int num, location_t *locs)
1239   : m_num (num)
1240 {
1241   m_locs = ggc_vec_alloc <location_t> (num);
1242   for (int i = 0; i < num; i++)
1243     m_locs[i] = locs[i];
1244 }
1245
1246 /* string_concat_db's constructor.  */
1247
1248 string_concat_db::string_concat_db ()
1249 {
1250   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1251 }
1252
1253 /* Record that a string concatenation occurred, covering NUM
1254    string literal tokens.  LOCS is an array of size NUM, containing the
1255    locations of the tokens.  A copy of LOCS is taken.  */
1256
1257 void
1258 string_concat_db::record_string_concatenation (int num, location_t *locs)
1259 {
1260   gcc_assert (num > 1);
1261   gcc_assert (locs);
1262
1263   location_t key_loc = get_key_loc (locs[0]);
1264
1265   string_concat *concat
1266     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1267   m_table->put (key_loc, concat);
1268 }
1269
1270 /* Determine if LOC was the location of the the initial token of a
1271    concatenation of string literal tokens.
1272    If so, *OUT_NUM is written to with the number of tokens, and
1273    *OUT_LOCS with the location of an array of locations of the
1274    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1275    storage owned by the string_concat_db.
1276    Otherwise, return false.  */
1277
1278 bool
1279 string_concat_db::get_string_concatenation (location_t loc,
1280                                             int *out_num,
1281                                             location_t **out_locs)
1282 {
1283   gcc_assert (out_num);
1284   gcc_assert (out_locs);
1285
1286   location_t key_loc = get_key_loc (loc);
1287
1288   string_concat **concat = m_table->get (key_loc);
1289   if (!concat)
1290     return false;
1291
1292   *out_num = (*concat)->m_num;
1293   *out_locs =(*concat)->m_locs;
1294   return true;
1295 }
1296
1297 /* Internal function.  Canonicalize LOC into a form suitable for
1298    use as a key within the database, stripping away macro expansion,
1299    ad-hoc information, and range information, using the location of
1300    the start of LOC within an ordinary linemap.  */
1301
1302 location_t
1303 string_concat_db::get_key_loc (location_t loc)
1304 {
1305   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1306                                   NULL);
1307
1308   loc = get_range_from_loc (line_table, loc).m_start;
1309
1310   return loc;
1311 }
1312
1313 /* Helper class for use within get_substring_ranges_for_loc.
1314    An vec of cpp_string with responsibility for releasing all of the
1315    str->text for each str in the vector.  */
1316
1317 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1318 {
1319  public:
1320   auto_cpp_string_vec (int alloc)
1321     : auto_vec <cpp_string> (alloc) {}
1322
1323   ~auto_cpp_string_vec ()
1324   {
1325     /* Clean up the copies within this vec.  */
1326     int i;
1327     cpp_string *str;
1328     FOR_EACH_VEC_ELT (*this, i, str)
1329       free (const_cast <unsigned char *> (str->text));
1330   }
1331 };
1332
1333 /* Attempt to populate RANGES with source location information on the
1334    individual characters within the string literal found at STRLOC.
1335    If CONCATS is non-NULL, then any string literals that the token at
1336    STRLOC  was concatenated with are also added to RANGES.
1337
1338    Return NULL if successful, or an error message if any errors occurred (in
1339    which case RANGES may be only partially populated and should not
1340    be used).
1341
1342    This is implemented by re-parsing the relevant source line(s).  */
1343
1344 static const char *
1345 get_substring_ranges_for_loc (cpp_reader *pfile,
1346                               string_concat_db *concats,
1347                               location_t strloc,
1348                               enum cpp_ttype type,
1349                               cpp_substring_ranges &ranges)
1350 {
1351   gcc_assert (pfile);
1352
1353   if (strloc == UNKNOWN_LOCATION)
1354     return "unknown location";
1355
1356   /* If string concatenation has occurred at STRLOC, get the locations
1357      of all of the literal tokens making up the compound string.
1358      Otherwise, just use STRLOC.  */
1359   int num_locs = 1;
1360   location_t *strlocs = &strloc;
1361   if (concats)
1362     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1363
1364   auto_cpp_string_vec strs (num_locs);
1365   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1366   for (int i = 0; i < num_locs; i++)
1367     {
1368       /* Get range of strloc.  We will use it to locate the start and finish
1369          of the literal token within the line.  */
1370       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1371
1372       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1373         /* If the string is within a macro expansion, we can't get at the
1374            end location.  */
1375         return "macro expansion";
1376
1377       if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1378         /* If so, we can't reliably determine where the token started within
1379            its line.  */
1380         return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1381
1382       if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1383         /* If so, we can't reliably determine where the token finished within
1384            its line.  */
1385         return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1386
1387       expanded_location start
1388         = expand_location_to_spelling_point (src_range.m_start);
1389       expanded_location finish
1390         = expand_location_to_spelling_point (src_range.m_finish);
1391       if (start.file != finish.file)
1392         return "range endpoints are in different files";
1393       if (start.line != finish.line)
1394         return "range endpoints are on different lines";
1395       if (start.column > finish.column)
1396         return "range endpoints are reversed";
1397
1398       int line_width;
1399       const char *line = location_get_source_line (start.file, start.line,
1400                                                    &line_width);
1401       if (line == NULL)
1402         return "unable to read source line";
1403
1404       /* Determine the location of the literal (including quotes
1405          and leading prefix chars, such as the 'u' in a u""
1406          token).  */
1407       const char *literal = line + start.column - 1;
1408       int literal_length = finish.column - start.column + 1;
1409
1410       gcc_assert (line_width >= (start.column - 1 + literal_length));
1411       cpp_string from;
1412       from.len = literal_length;
1413       /* Make a copy of the literal, to avoid having to rely on
1414          the lifetime of the copy of the line within the cache.
1415          This will be released by the auto_cpp_string_vec dtor.  */
1416       from.text = XDUPVEC (unsigned char, literal, literal_length);
1417       strs.safe_push (from);
1418
1419       /* For very long lines, a new linemap could have started
1420          halfway through the token.
1421          Ensure that the loc_reader uses the linemap of the
1422          *end* of the token for its start location.  */
1423       const line_map_ordinary *final_ord_map;
1424       linemap_resolve_location (line_table, src_range.m_finish,
1425                                 LRK_MACRO_EXPANSION_POINT, &final_ord_map);
1426       location_t start_loc
1427         = linemap_position_for_line_and_column (line_table, final_ord_map,
1428                                                 start.line, start.column);
1429
1430       cpp_string_location_reader loc_reader (start_loc, line_table);
1431       loc_readers.safe_push (loc_reader);
1432     }
1433
1434   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1435   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1436                                                  loc_readers.address (),
1437                                                  num_locs, &ranges, type);
1438   if (err)
1439     return err;
1440
1441   /* Success: "ranges" should now contain information on the string.  */
1442   return NULL;
1443 }
1444
1445 /* Attempt to populate *OUT_LOC with source location information on the
1446    given characters within the string literal found at STRLOC.
1447    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1448    character set.
1449
1450    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1451    and string literal "012345\n789"
1452    *OUT_LOC is written to with:
1453      "012345\n789"
1454          ~^~~~~
1455
1456    If CONCATS is non-NULL, then any string literals that the token at
1457    STRLOC was concatenated with are also considered.
1458
1459    This is implemented by re-parsing the relevant source line(s).
1460
1461    Return NULL if successful, or an error message if any errors occurred.
1462    Error messages are intended for GCC developers (to help debugging) rather
1463    than for end-users.  */
1464
1465 const char *
1466 get_source_location_for_substring (cpp_reader *pfile,
1467                                    string_concat_db *concats,
1468                                    location_t strloc,
1469                                    enum cpp_ttype type,
1470                                    int caret_idx, int start_idx, int end_idx,
1471                                    source_location *out_loc)
1472 {
1473   gcc_checking_assert (caret_idx >= 0);
1474   gcc_checking_assert (start_idx >= 0);
1475   gcc_checking_assert (end_idx >= 0);
1476   gcc_assert (out_loc);
1477
1478   cpp_substring_ranges ranges;
1479   const char *err
1480     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1481   if (err)
1482     return err;
1483
1484   if (caret_idx >= ranges.get_num_ranges ())
1485     return "caret_idx out of range";
1486   if (start_idx >= ranges.get_num_ranges ())
1487     return "start_idx out of range";
1488   if (end_idx >= ranges.get_num_ranges ())
1489     return "end_idx out of range";
1490
1491   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1492                             ranges.get_range (start_idx).m_start,
1493                             ranges.get_range (end_idx).m_finish);
1494   return NULL;
1495 }
1496
1497 #if CHECKING_P
1498
1499 namespace selftest {
1500
1501 /* Selftests of location handling.  */
1502
1503 /* Attempt to populate *OUT_RANGE with source location information on the
1504    given character within the string literal found at STRLOC.
1505    CHAR_IDX refers to an offset within the execution character set.
1506    If CONCATS is non-NULL, then any string literals that the token at
1507    STRLOC was concatenated with are also considered.
1508
1509    This is implemented by re-parsing the relevant source line(s).
1510
1511    Return NULL if successful, or an error message if any errors occurred.
1512    Error messages are intended for GCC developers (to help debugging) rather
1513    than for end-users.  */
1514
1515 static const char *
1516 get_source_range_for_char (cpp_reader *pfile,
1517                            string_concat_db *concats,
1518                            location_t strloc,
1519                            enum cpp_ttype type,
1520                            int char_idx,
1521                            source_range *out_range)
1522 {
1523   gcc_checking_assert (char_idx >= 0);
1524   gcc_assert (out_range);
1525
1526   cpp_substring_ranges ranges;
1527   const char *err
1528     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1529   if (err)
1530     return err;
1531
1532   if (char_idx >= ranges.get_num_ranges ())
1533     return "char_idx out of range";
1534
1535   *out_range = ranges.get_range (char_idx);
1536   return NULL;
1537 }
1538
1539 /* As get_source_range_for_char, but write to *OUT the number
1540    of ranges that are available.  */
1541
1542 static const char *
1543 get_num_source_ranges_for_substring (cpp_reader *pfile,
1544                                      string_concat_db *concats,
1545                                      location_t strloc,
1546                                      enum cpp_ttype type,
1547                                      int *out)
1548 {
1549   gcc_assert (out);
1550
1551   cpp_substring_ranges ranges;
1552   const char *err
1553     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1554
1555   if (err)
1556     return err;
1557
1558   *out = ranges.get_num_ranges ();
1559   return NULL;
1560 }
1561
1562 /* Selftests of location handling.  */
1563
1564 /* Helper function for verifying location data: when location_t
1565    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1566    as having column 0.  */
1567
1568 static bool
1569 should_have_column_data_p (location_t loc)
1570 {
1571   if (IS_ADHOC_LOC (loc))
1572     loc = get_location_from_adhoc_loc (line_table, loc);
1573   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1574     return false;
1575   return true;
1576 }
1577
1578 /* Selftest for should_have_column_data_p.  */
1579
1580 static void
1581 test_should_have_column_data_p ()
1582 {
1583   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1584   ASSERT_TRUE
1585     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1586   ASSERT_FALSE
1587     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1588 }
1589
1590 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1591    on LOC.  */
1592
1593 static void
1594 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1595               location_t loc)
1596 {
1597   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1598   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1599   /* If location_t values are sufficiently high, then column numbers
1600      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1601      When close to the threshold, column numbers *may* be present: if
1602      the final linemap before the threshold contains a line that straddles
1603      the threshold, locations in that line have column information.  */
1604   if (should_have_column_data_p (loc))
1605     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1606 }
1607
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for source_location/location_t beyond which line-map.c changes
   behavior (disabling of the range-packing optimization, disabling
   of column-tracking).  We can exercise these by starting the line_table
   at interesting values at or near these thresholds.

   The following struct describes a particular case within our test
   matrix.  */

struct line_table_case
{
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to install as the line_table's default_range_bits.  */
  int m_default_range_bits;

  /* If non-zero, the value at which the line_table's highest_location
     and highest_line are started.  */
  int m_base_location;
};
1634
1635 /* Constructor.  Store the old value of line_table, and create a new
1636    one, using sane defaults.  */
1637
1638 line_table_test::line_table_test ()
1639 {
1640   gcc_assert (saved_line_table == NULL);
1641   saved_line_table = line_table;
1642   line_table = ggc_alloc<line_maps> ();
1643   linemap_init (line_table, BUILTINS_LOCATION);
1644   gcc_assert (saved_line_table->reallocator);
1645   line_table->reallocator = saved_line_table->reallocator;
1646   gcc_assert (saved_line_table->round_alloc_size);
1647   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1648   line_table->default_range_bits = 0;
1649 }
1650
1651 /* Constructor.  Store the old value of line_table, and create a new
1652    one, using the sitation described in CASE_.  */
1653
1654 line_table_test::line_table_test (const line_table_case &case_)
1655 {
1656   gcc_assert (saved_line_table == NULL);
1657   saved_line_table = line_table;
1658   line_table = ggc_alloc<line_maps> ();
1659   linemap_init (line_table, BUILTINS_LOCATION);
1660   gcc_assert (saved_line_table->reallocator);
1661   line_table->reallocator = saved_line_table->reallocator;
1662   gcc_assert (saved_line_table->round_alloc_size);
1663   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1664   line_table->default_range_bits = case_.m_default_range_bits;
1665   if (case_.m_base_location)
1666     {
1667       line_table->highest_location = case_.m_base_location;
1668       line_table->highest_line = case_.m_base_location;
1669     }
1670 }
1671
1672 /* Destructor.  Restore the old value of line_table.  */
1673
1674 line_table_test::~line_table_test ()
1675 {
1676   gcc_assert (saved_line_table != NULL);
1677   line_table = saved_line_table;
1678   saved_line_table = NULL;
1679 }
1680
/* Verify basic operation of ordinary linemaps, using the line_table
   configuration described by CASE_.

   Build maps for two files ("foo.c" and "bar.c"), create locations at
   various lines and columns within them, and verify that the file,
   line and column can be recovered from each location_t.  Also verify
   that a range built by make_location can be taken apart again.  */

static void
test_accessing_ordinary_linemaps (const line_table_case &case_)
{
  /* Override line_table for the duration of this function.  */
  line_table_test ltt (case_);

  /* Build a simple linemap describing some locations. */
  linemap_add (line_table, LC_ENTER, false, "foo.c", 0);

  /* Two locations on line 1 of foo.c.  */
  linemap_line_start (line_table, 1, 100);
  location_t loc_a = linemap_position_for_column (line_table, 1);
  location_t loc_b = linemap_position_for_column (line_table, 23);

  /* Two locations on line 2 of foo.c.  */
  linemap_line_start (line_table, 2, 100);
  location_t loc_c = linemap_position_for_column (line_table, 1);
  location_t loc_d = linemap_position_for_column (line_table, 17);

  /* Example of a very long line.  */
  linemap_line_start (line_table, 3, 2000);
  location_t loc_e = linemap_position_for_column (line_table, 700);

  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Multiple files.  */
  linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
  linemap_line_start (line_table, 1, 200);
  location_t loc_f = linemap_position_for_column (line_table, 150);
  linemap_add (line_table, LC_LEAVE, false, NULL, 0);

  /* Verify that we can recover the location info.  */
  assert_loceq ("foo.c", 1, 1, loc_a);
  assert_loceq ("foo.c", 1, 23, loc_b);
  assert_loceq ("foo.c", 2, 1, loc_c);
  assert_loceq ("foo.c", 2, 17, loc_d);
  assert_loceq ("foo.c", 3, 700, loc_e);
  assert_loceq ("bar.c", 1, 150, loc_f);

  /* A location obtained directly from linemap_position_for_column is
     neither a builtin location nor a range.  */
  ASSERT_FALSE (is_location_from_builtin_token (loc_a));
  ASSERT_TRUE (pure_location_p (line_table, loc_a));

  /* Verify using make_location to build a range, and extracting data
     back from it.  The caret is at loc_c, the range runs from loc_b
     to loc_d.  */
  location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
  ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
  ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
  source_range src_range = get_range_from_loc (line_table, range_c_b_d);
  ASSERT_EQ (loc_b, src_range.m_start);
  ASSERT_EQ (loc_d, src_range.m_finish);
}
1731
/* Verify various properties of UNKNOWN_LOCATION: it has no filename,
   and both its line and its column are zero.  */

static void
test_unknown_location ()
{
  ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
  ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
}
1741
/* Verify various properties of BUILTINS_LOCATION: it is reported as
   being in the (translated) "<built-in>" file at line 0, column 0,
   and is_location_from_builtin_token holds for it.  */

static void
test_builtins ()
{
  assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
  ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
}
1750
/* Regression test for make_location.
   Ensure that we use pure locations for the start/finish of the range,
   rather than storing a packed or ad-hoc range as the start/finish.
   CASE_ describes the line_table configuration to use.  */

static void
test_make_location_nonpure_range_endpoints (const line_table_case &case_)
{
  /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
     with C++ frontend.
     ....................0000000001111111111222.
     ....................1234567890123456789012.  */
  const char *content = "     r += !aaa == bbb;\n";
  temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
  /* Override line_table for the duration of this function.  */
  line_table_test ltt (case_);
  linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);

  /* Locations for the columns of interest: '!' is at column 11, "aaa"
     spans columns 12-14 and "bbb" ends at column 21.  */
  const location_t c11 = linemap_position_for_column (line_table, 11);
  const location_t c12 = linemap_position_for_column (line_table, 12);
  const location_t c13 = linemap_position_for_column (line_table, 13);
  const location_t c14 = linemap_position_for_column (line_table, 14);
  const location_t c21 = linemap_position_for_column (line_table, 21);

  /* The checks below compare column-level locations, so give up if the
     locations are beyond LINE_MAP_MAX_LOCATION_WITH_COLS.  */
  if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
    return;

  /* Use column 13 for the caret location, arbitrarily, to verify that we
     handle start != caret.  */
  const location_t aaa = make_location (c13, c12, c14);
  ASSERT_EQ (c13, get_pure_location (aaa));
  ASSERT_EQ (c12, get_start (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
  ASSERT_EQ (c14, get_finish (aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));

  /* Make a location using a location with a range as the start-point.  */
  const location_t not_aaa = make_location (c11, aaa, c14);
  ASSERT_EQ (c11, get_pure_location (not_aaa));
  /* It should use the start location of the range, not store the range
     itself.  */
  ASSERT_EQ (c12, get_start (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
  ASSERT_EQ (c14, get_finish (not_aaa));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));

  /* Similarly, make a location with a range as the end-point.
     First build the range itself (columns 12-21)...  */
  const location_t aaa_eq_bbb = make_location (c12, c12, c21);
  ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
  /* ...then use that range as the finish of another location.  */
  const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
  /* It should use the finish location of the range, not store the range
     itself.  */
  ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
  ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
  ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
  ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
}
1811
1812 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
1813
1814 static void
1815 test_reading_source_line ()
1816 {
1817   /* Create a tempfile and write some text to it.  */
1818   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1819                         "01234567890123456789\n"
1820                         "This is the test text\n"
1821                         "This is the 3rd line\n");
1822
1823   /* Read back a specific line from the tempfile.  */
1824   int line_size;
1825   const char *source_line = location_get_source_line (tmp.get_filename (),
1826                                                       2, &line_size);
1827   ASSERT_TRUE (source_line != NULL);
1828   ASSERT_EQ (21, line_size);
1829   if (!strncmp ("This is the test text",
1830                 source_line, line_size))
1831     ::selftest::pass (SELFTEST_LOCATION,
1832                       "source_line matched expected value");
1833   else
1834     ::selftest::fail (SELFTEST_LOCATION,
1835                       "source_line did not match expected value");
1836
1837 }
1838
1839 /* Tests of lexing.  */
1840
/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.

   The text returned by cpp_token_as_text is compared against
   EXPECTED_TEXT with ASSERT_STREQ.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));    \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
  SELFTEST_END_STMT
1849
1850 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1851    and ranges from EXP_START_COL to EXP_FINISH_COL.
1852    Use LOC as the effective location of the selftest.  */
1853
1854 static void
1855 assert_token_loc_eq (const location &loc,
1856                      const cpp_token *tok,
1857                      const char *exp_filename, int exp_linenum,
1858                      int exp_start_col, int exp_finish_col)
1859 {
1860   location_t tok_loc = tok->src_loc;
1861   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1862   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1863
1864   /* If location_t values are sufficiently high, then column numbers
1865      will be unavailable.  */
1866   if (!should_have_column_data_p (tok_loc))
1867     return;
1868
1869   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
1870   source_range tok_range = get_range_from_loc (line_table, tok_loc);
1871   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
1872   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
1873 }
1874
/* Use assert_token_loc_eq to verify the TOK->src_loc, using
   SELFTEST_LOCATION as the effective location of the selftest.

   TOK is expected to be within EXP_FILENAME at EXP_LINENUM, ranging
   from EXP_START_COL to EXP_FINISH_COL.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
                            EXP_START_COL, EXP_FINISH_COL) \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
                       (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
1882
/* Test of lexing a file using libcpp, verifying tokens and their
   location information.  CASE_ describes the line_table configuration
   to use.

   The file contains a name, a string literal and a number, separated
   by comments; we expect to get back exactly those three tokens
   followed by EOF.  */

static void
test_lexer (const line_table_case &case_)
{
  /* Create a tempfile and write some text to it.  */
  const char *content =
    /*00000000011111111112222222222333333.3333444444444.455555555556
      12345678901234567890123456789012345.6789012345678.901234567890.  */
    ("test_name /* c-style comment */\n"
     "                                  \"test literal\"\n"
     " // test c++-style comment\n"
     "   42\n");
  temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);

  /* Override line_table for the duration of this function; the reader
     created below uses the overridden table.  */
  line_table_test ltt (case_);

  cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);

  const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
  ASSERT_NE (fname, NULL);

  /* Verify that we get the expected tokens back, with the correct
     location information.  */

  location_t loc;
  const cpp_token *tok;
  /* The name on line 1, columns 1-9.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_NAME);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);

  /* The string literal on line 2, columns 35-48, including its quotes.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);

  /* The number on line 4, columns 4-5; line 3 holds only a comment.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_NUMBER);
  ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
  ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);

  /* Nothing else should remain.  */
  tok = cpp_get_token_with_location (parser, &loc);
  ASSERT_NE (tok, NULL);
  ASSERT_EQ (tok->type, CPP_EOF);

  cpp_finish (parser, NULL);
  cpp_destroy (parser);
}
1936
/* Forward decls.  lexer_test_options::apply takes a lexer_test &, and
   the lexer_test constructor takes a lexer_test_options *.  */

struct lexer_test;
class lexer_test_options;
1941
1942 /* A class for specifying options of a lexer_test.
1943    The "apply" vfunc is called during the lexer_test constructor.  */
1944
1945 class lexer_test_options
1946 {
1947  public:
1948   virtual void apply (lexer_test &) = 0;
1949 };
1950
/* A struct for writing lexer tests.

   The constructor writes the given content to a tempfile, overrides
   line_table, and starts a cpp_reader on the tempfile; the destructor
   verifies that the next token is EOF and cleans up the reader.  */

struct lexer_test
{
  lexer_test (const line_table_case &case_, const char *content,
              lexer_test_options *options);
  ~lexer_test ();

  /* Get the next token from m_parser.  */
  const cpp_token *get_token ();

  /* The tempfile holding the content being lexed.  */
  temp_source_file m_tempfile;
  /* Overrides line_table for the lifetime of this object.  This is
     declared before m_parser because the constructor creates m_parser
     from the overridden line_table.  */
  line_table_test m_ltt;
  /* The reader used to lex m_tempfile.  */
  cpp_reader *m_parser;
  /* The string concatenation database handed to the substring-location
     queries made on behalf of this test.  */
  string_concat_db m_concats;
};
1966
1967 /* Use an EBCDIC encoding for the execution charset, specifically
1968    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
1969
1970    This exercises iconv integration within libcpp.
1971    Not every build of iconv supports the given charset,
1972    so we need to flag this error and handle it gracefully.  */
1973
1974 class ebcdic_execution_charset : public lexer_test_options
1975 {
1976  public:
1977   ebcdic_execution_charset () : m_num_iconv_errors (0)
1978     {
1979       gcc_assert (s_singleton == NULL);
1980       s_singleton = this;
1981     }
1982   ~ebcdic_execution_charset ()
1983     {
1984       gcc_assert (s_singleton == this);
1985       s_singleton = NULL;
1986     }
1987
1988   void apply (lexer_test &test) FINAL OVERRIDE
1989   {
1990     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
1991     cpp_opts->narrow_charset = "IBM1047";
1992
1993     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
1994     callbacks->error = on_error;
1995   }
1996
1997   static bool on_error (cpp_reader *pfile ATTRIBUTE_UNUSED,
1998                         int level ATTRIBUTE_UNUSED,
1999                         int reason ATTRIBUTE_UNUSED,
2000                         rich_location *richloc ATTRIBUTE_UNUSED,
2001                         const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2002     ATTRIBUTE_FPTR_PRINTF(5,0)
2003   {
2004     gcc_assert (s_singleton);
2005     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2006        when the local iconv build doesn't support the conversion.  */
2007     if (strstr (msgid, "not supported by iconv"))
2008       {
2009         s_singleton->m_num_iconv_errors++;
2010         return true;
2011       }
2012
2013     /* Otherwise, we have an unexpected error.  */
2014     abort ();
2015   }
2016
2017   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2018
2019  private:
2020   static ebcdic_execution_charset *s_singleton;
2021   int m_num_iconv_errors;
2022 };
2023
2024 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2025
/* Constructor.  Override line_table with a new instance based on CASE_,
   and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
   start parsing the tempfile.

   If OPTIONS is non-NULL, its "apply" vfunc is called on this object
   once the cpp_reader exists, before cpp_init_iconv and before the
   tempfile is read.  */

lexer_test::lexer_test (const line_table_case &case_, const char *content,
                        lexer_test_options *options) :
  /* Create a tempfile and write the text to it.  */
  m_tempfile (SELFTEST_LOCATION, ".c", content),
  m_ltt (case_),
  m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
  m_concats ()
{
  /* Let the options adjust the reader before cpp_init_iconv runs.  */
  if (options)
    options->apply (*this);

  cpp_init_iconv (m_parser);

  /* Parse the file.  */
  const char *fname = cpp_read_main_file (m_parser,
                                          m_tempfile.get_filename ());
  ASSERT_NE (fname, NULL);
}
2048
2049 /* Destructor.  Verify that the next token in m_parser is EOF.  */
2050
2051 lexer_test::~lexer_test ()
2052 {
2053   location_t loc;
2054   const cpp_token *tok;
2055
2056   tok = cpp_get_token_with_location (m_parser, &loc);
2057   ASSERT_NE (tok, NULL);
2058   ASSERT_EQ (tok->type, CPP_EOF);
2059
2060   cpp_finish (m_parser, NULL);
2061   cpp_destroy (m_parser);
2062 }
2063
2064 /* Get the next token from m_parser.  */
2065
2066 const cpp_token *
2067 lexer_test::get_token ()
2068 {
2069   location_t loc;
2070   const cpp_token *tok;
2071
2072   tok = cpp_get_token_with_location (m_parser, &loc);
2073   ASSERT_NE (tok, NULL);
2074   return tok;
2075 }
2076
2077 /* Verify that locations within string literals are correctly handled.  */
2078
2079 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2080    using the string concatenation database for TEST.
2081
2082    Assert that the character at index IDX is on EXPECTED_LINE,
2083    and that it begins at column EXPECTED_START_COL and ends at
2084    EXPECTED_FINISH_COL (unless the locations are beyond
2085    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2086    columns).  */
2087
2088 static void
2089 assert_char_at_range (const location &loc,
2090                       lexer_test& test,
2091                       location_t strloc, enum cpp_ttype type, int idx,
2092                       int expected_line, int expected_start_col,
2093                       int expected_finish_col)
2094 {
2095   cpp_reader *pfile = test.m_parser;
2096   string_concat_db *concats = &test.m_concats;
2097
2098   source_range actual_range;
2099   const char *err
2100     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2101                                  &actual_range);
2102   if (should_have_column_data_p (strloc))
2103     ASSERT_EQ_AT (loc, NULL, err);
2104   else
2105     {
2106       ASSERT_STREQ_AT (loc,
2107                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2108                        err);
2109       return;
2110     }
2111
2112   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2113   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2114   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2115   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2116
2117   if (should_have_column_data_p (actual_range.m_start))
2118     {
2119       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2120       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2121     }
2122   if (should_have_column_data_p (actual_range.m_finish))
2123     {
2124       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2125       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2126     }
2127 }
2128
/* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
   the effective location of any errors.  The remaining arguments are
   passed through to assert_char_at_range in the same order.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
                             EXPECTED_START_COL, EXPECTED_FINISH_COL)   \
  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
                        (IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
                        (EXPECTED_FINISH_COL))
2137
2138 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2139    using the string concatenation database for TEST.
2140
2141    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2142
2143 static void
2144 assert_num_substring_ranges (const location &loc,
2145                              lexer_test& test,
2146                              location_t strloc,
2147                              enum cpp_ttype type,
2148                              int expected_num_ranges)
2149 {
2150   cpp_reader *pfile = test.m_parser;
2151   string_concat_db *concats = &test.m_concats;
2152
2153   int actual_num_ranges = -1;
2154   const char *err
2155     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2156                                            &actual_num_ranges);
2157   if (should_have_column_data_p (strloc))
2158     ASSERT_EQ_AT (loc, NULL, err);
2159   else
2160     {
2161       ASSERT_STREQ_AT (loc,
2162                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2163                        err);
2164       return;
2165     }
2166   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2167 }
2168
/* Macro for calling assert_num_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors.  The
   remaining arguments are passed through in the same order.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
                                    EXPECTED_NUM_RANGES)                \
  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
                               (TYPE), (EXPECTED_NUM_RANGES))
2176
2177
2178 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2179    returns an error (using the string concatenation database for TEST).  */
2180
2181 static void
2182 assert_has_no_substring_ranges (const location &loc,
2183                                 lexer_test& test,
2184                                 location_t strloc,
2185                                 enum cpp_ttype type,
2186                                 const char *expected_err)
2187 {
2188   cpp_reader *pfile = test.m_parser;
2189   string_concat_db *concats = &test.m_concats;
2190   cpp_substring_ranges ranges;
2191   const char *actual_err
2192     = get_substring_ranges_for_loc (pfile, concats, strloc,
2193                                     type, ranges);
2194   if (should_have_column_data_p (strloc))
2195     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2196   else
2197     ASSERT_STREQ_AT (loc,
2198                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2199                      actual_err);
2200 }
2201
/* Macro for calling assert_has_no_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors.  ERR is
   the expected error message.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)    \
    assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
                                    (STRLOC), (TYPE), (ERR))
2205
2206 /* Lex a simple string literal.  Verify the substring location data, before
2207    and after running cpp_interpret_string on it.  */
2208
2209 static void
2210 test_lexer_string_locations_simple (const line_table_case &case_)
2211 {
2212   /* Digits 0-9 (with 0 at column 10), the simple way.
2213      ....................000000000.11111111112.2222222223333333333
2214      ....................123456789.01234567890.1234567890123456789
2215      We add a trailing comment to ensure that we correctly locate
2216      the end of the string literal token.  */
2217   const char *content = "        \"0123456789\" /* not a string */\n";
2218   lexer_test test (case_, content, NULL);
2219
2220   /* Verify that we get the expected token back, with the correct
2221      location information.  */
2222   const cpp_token *tok = test.get_token ();
2223   ASSERT_EQ (tok->type, CPP_STRING);
2224   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2225   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2226
2227   /* At this point in lexing, the quote characters are treated as part of
2228      the string (they are stripped off by cpp_interpret_string).  */
2229
2230   ASSERT_EQ (tok->val.str.len, 12);
2231
2232   /* Verify that cpp_interpret_string works.  */
2233   cpp_string dst_string;
2234   const enum cpp_ttype type = CPP_STRING;
2235   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2236                                       &dst_string, type);
2237   ASSERT_TRUE (result);
2238   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2239   free (const_cast <unsigned char *> (dst_string.text));
2240
2241   /* Verify ranges of individual characters.  This no longer includes the
2242      opening quote, but does include the closing quote.  */
2243   for (int i = 0; i <= 10; i++)
2244     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2245                           10 + i, 10 + i);
2246
2247   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2248 }
2249
/* As test_lexer_string_locations_simple, but use an EBCDIC execution
   encoding.  In this configuration we expect cpp_interpret_string to
   produce EBCDIC text, and substring location information to be
   unavailable.  */

static void
test_lexer_string_locations_ebcdic (const line_table_case &case_)
{
  /* EBCDIC support requires iconv.  */
  if (!HAVE_ICONV)
    return;

  /* Digits 0-9 (with 0 at column 10), the simple way.
     ....................000000000.11111111112.2222222223333333333
     ....................123456789.01234567890.1234567890123456789
     We add a trailing comment to ensure that we correctly locate
     the end of the string literal token.  */
  const char *content = "        \"0123456789\" /* not a string */\n";
  /* Declared before TEST so that it is destroyed after it: TEST's
     destructor still runs the parser, which may invoke the error
     callback that use_ebcdic installed.  */
  ebcdic_execution_charset use_ebcdic;
  lexer_test test (case_, content, &use_ebcdic);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);

  /* At this point in lexing, the quote characters are treated as part of
     the string (they are stripped off by cpp_interpret_string).  */

  ASSERT_EQ (tok->val.str.len, 12);

  /* The remainder of the test requires an iconv implementation that
     can convert from UTF-8 to the EBCDIC encoding requested above.  */
  if (use_ebcdic.iconv_errors_occurred_p ())
    return;

  /* Verify that cpp_interpret_string works.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);
  /* We should now have EBCDIC-encoded text, specifically
     IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
     The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
  ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
                (const char *)dst_string.text);
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify that we don't attempt to record substring location information
     for such cases.  */
  ASSERT_HAS_NO_SUBSTRING_RANGES
    (test, tok->src_loc, type,
     "execution character set != source character set");
}
2305
2306 /* Lex a string literal containing a hex-escaped character.
2307    Verify the substring location data, before and after running
2308    cpp_interpret_string on it.  */
2309
2310 static void
2311 test_lexer_string_locations_hex (const line_table_case &case_)
2312 {
2313   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2314      and with a space in place of digit 6, to terminate the escaped
2315      hex code.
2316      ....................000000000.111111.11112222.
2317      ....................123456789.012345.67890123.  */
2318   const char *content = "        \"01234\\x35 789\"\n";
2319   lexer_test test (case_, content, NULL);
2320
2321   /* Verify that we get the expected token back, with the correct
2322      location information.  */
2323   const cpp_token *tok = test.get_token ();
2324   ASSERT_EQ (tok->type, CPP_STRING);
2325   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2326   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2327
2328   /* At this point in lexing, the quote characters are treated as part of
2329      the string (they are stripped off by cpp_interpret_string).  */
2330   ASSERT_EQ (tok->val.str.len, 15);
2331
2332   /* Verify that cpp_interpret_string works.  */
2333   cpp_string dst_string;
2334   const enum cpp_ttype type = CPP_STRING;
2335   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2336                                       &dst_string, type);
2337   ASSERT_TRUE (result);
2338   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2339   free (const_cast <unsigned char *> (dst_string.text));
2340
2341   /* Verify ranges of individual characters.  This no longer includes the
2342      opening quote, but does include the closing quote.  */
2343   for (int i = 0; i <= 4; i++)
2344     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2345   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2346   for (int i = 6; i <= 10; i++)
2347     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2348
2349   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2350 }
2351
2352 /* Lex a string literal containing an octal-escaped character.
2353    Verify the substring location data after running cpp_interpret_string
2354    on it.  */
2355
2356 static void
2357 test_lexer_string_locations_oct (const line_table_case &case_)
2358 {
2359   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2360      and with a space in place of digit 6, to terminate the escaped
2361      octal code.
2362      ....................000000000.111111.11112222.2222223333333333444
2363      ....................123456789.012345.67890123.4567890123456789012  */
2364   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2365   lexer_test test (case_, content, NULL);
2366
2367   /* Verify that we get the expected token back, with the correct
2368      location information.  */
2369   const cpp_token *tok = test.get_token ();
2370   ASSERT_EQ (tok->type, CPP_STRING);
2371   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2372
2373   /* Verify that cpp_interpret_string works.  */
2374   cpp_string dst_string;
2375   const enum cpp_ttype type = CPP_STRING;
2376   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2377                                       &dst_string, type);
2378   ASSERT_TRUE (result);
2379   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2380   free (const_cast <unsigned char *> (dst_string.text));
2381
2382   /* Verify ranges of individual characters.  This no longer includes the
2383      opening quote, but does include the closing quote.  */
2384   for (int i = 0; i < 5; i++)
2385     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2386   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2387   for (int i = 6; i <= 10; i++)
2388     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2389
2390   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2391 }
2392
2393 /* Test of string literal containing letter escapes.  */
2394
2395 static void
2396 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2397 {
2398   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2399      .....................000000000.1.11111.1.1.11222.22222223333333
2400      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2401   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2402   lexer_test test (case_, content, NULL);
2403
2404   /* Verify that we get the expected tokens back.  */
2405   const cpp_token *tok = test.get_token ();
2406   ASSERT_EQ (tok->type, CPP_STRING);
2407   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2408
2409   /* Verify ranges of individual characters. */
2410   /* "\t".  */
2411   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2412                         0, 1, 10, 11);
2413   /* "foo". */
2414   for (int i = 1; i <= 3; i++)
2415     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2416                           i, 1, 11 + i, 11 + i);
2417   /* "\\" and "\n".  */
2418   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2419                         4, 1, 15, 16);
2420   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2421                         5, 1, 17, 18);
2422
2423   /* "bar" and closing quote for nul-terminator.  */
2424   for (int i = 6; i <= 9; i++)
2425     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2426                           i, 1, 13 + i, 13 + i);
2427
2428   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2429 }
2430
2431 /* Another test of a string literal containing a letter escape.
2432    Based on string seen in
2433      printf ("%-%\n");
2434    in gcc.dg/format/c90-printf-1.c.  */
2435
2436 static void
2437 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2438 {
2439   /* .....................000000000.1111.11.1111.22222222223.
2440      .....................123456789.0123.45.6789.01234567890.  */
2441   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2442   lexer_test test (case_, content, NULL);
2443
2444   /* Verify that we get the expected tokens back.  */
2445   const cpp_token *tok = test.get_token ();
2446   ASSERT_EQ (tok->type, CPP_STRING);
2447   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2448
2449   /* Verify ranges of individual characters. */
2450   /* "%-%".  */
2451   for (int i = 0; i < 3; i++)
2452     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2453                           i, 1, 10 + i, 10 + i);
2454   /* "\n".  */
2455   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2456                         3, 1, 13, 14);
2457
2458   /* Closing quote for nul-terminator.  */
2459   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2460                         4, 1, 15, 15);
2461
2462   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2463 }
2464
2465 /* Lex a string literal containing UCN 4 characters.
2466    Verify the substring location data after running cpp_interpret_string
2467    on it.  */
2468
2469 static void
2470 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2471 {
2472   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2473      as UCN 4.
2474      ....................000000000.111111.111122.222222223.33333333344444
2475      ....................123456789.012345.678901.234567890.12345678901234  */
2476   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2477   lexer_test test (case_, content, NULL);
2478
2479   /* Verify that we get the expected token back, with the correct
2480      location information.  */
2481   const cpp_token *tok = test.get_token ();
2482   ASSERT_EQ (tok->type, CPP_STRING);
2483   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2484
2485   /* Verify that cpp_interpret_string works.
2486      The string should be encoded in the execution character
2487      set.  Assuming that that is UTF-8, we should have the following:
2488      -----------  ----  -----  -------  ----------------
2489      Byte offset  Byte  Octal  Unicode  Source Column(s)
2490      -----------  ----  -----  -------  ----------------
2491      0            0x30         '0'      10
2492      1            0x31         '1'      11
2493      2            0x32         '2'      12
2494      3            0x33         '3'      13
2495      4            0x34         '4'      14
2496      5            0xE2  \342   U+2174   15-20
2497      6            0x85  \205    (cont)  15-20
2498      7            0xB4  \264    (cont)  15-20
2499      8            0xE2  \342   U+2175   21-26
2500      9            0x85  \205    (cont)  21-26
2501      10           0xB5  \265    (cont)  21-26
2502      11           0x37         '7'      27
2503      12           0x38         '8'      28
2504      13           0x39         '9'      29
2505      14           0x00                  30 (closing quote)
2506      -----------  ----  -----  -------  ---------------.  */
2507
2508   cpp_string dst_string;
2509   const enum cpp_ttype type = CPP_STRING;
2510   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2511                                       &dst_string, type);
2512   ASSERT_TRUE (result);
2513   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2514                 (const char *)dst_string.text);
2515   free (const_cast <unsigned char *> (dst_string.text));
2516
2517   /* Verify ranges of individual characters.  This no longer includes the
2518      opening quote, but does include the closing quote.
2519      '01234'.  */
2520   for (int i = 0; i <= 4; i++)
2521     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2522   /* U+2174.  */
2523   for (int i = 5; i <= 7; i++)
2524     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2525   /* U+2175.  */
2526   for (int i = 8; i <= 10; i++)
2527     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2528   /* '789' and nul terminator  */
2529   for (int i = 11; i <= 14; i++)
2530     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2531
2532   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2533 }
2534
2535 /* Lex a string literal containing UCN 8 characters.
2536    Verify the substring location data after running cpp_interpret_string
2537    on it.  */
2538
2539 static void
2540 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2541 {
2542   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2543      ....................000000000.111111.1111222222.2222333333333.344444
2544      ....................123456789.012345.6789012345.6789012345678.901234  */
2545   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2546   lexer_test test (case_, content, NULL);
2547
2548   /* Verify that we get the expected token back, with the correct
2549      location information.  */
2550   const cpp_token *tok = test.get_token ();
2551   ASSERT_EQ (tok->type, CPP_STRING);
2552   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2553                            "\"01234\\U00002174\\U00002175789\"");
2554
2555   /* Verify that cpp_interpret_string works.
2556      The UTF-8 encoding of the string is identical to that from
2557      the ucn4 testcase above; the only difference is the column
2558      locations.  */
2559   cpp_string dst_string;
2560   const enum cpp_ttype type = CPP_STRING;
2561   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2562                                       &dst_string, type);
2563   ASSERT_TRUE (result);
2564   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2565                 (const char *)dst_string.text);
2566   free (const_cast <unsigned char *> (dst_string.text));
2567
2568   /* Verify ranges of individual characters.  This no longer includes the
2569      opening quote, but does include the closing quote.
2570      '01234'.  */
2571   for (int i = 0; i <= 4; i++)
2572     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2573   /* U+2174.  */
2574   for (int i = 5; i <= 7; i++)
2575     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2576   /* U+2175.  */
2577   for (int i = 8; i <= 10; i++)
2578     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2579   /* '789' at columns 35-37  */
2580   for (int i = 11; i <= 13; i++)
2581     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2582   /* Closing quote/nul-terminator at column 38.  */
2583   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2584
2585   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2586 }
2587
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   and return it in host byte order.  The bytes are read individually,
   so the result does not depend on the host's endianness.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t host_value = 0;

  /* Most significant byte comes first; shift it up as later bytes
     arrive.  */
  for (int k = 0; k < 4; k++)
    host_value = (host_value << 8) | (uint32_t) bytes[k];

  return host_value;
}
2599
2600 /* Lex a wide string literal and verify that attempts to read substring
2601    location data from it fail gracefully.  */
2602
2603 static void
2604 test_lexer_string_locations_wide_string (const line_table_case &case_)
2605 {
2606   /* Digits 0-9.
2607      ....................000000000.11111111112.22222222233333
2608      ....................123456789.01234567890.12345678901234  */
2609   const char *content = "       L\"0123456789\" /* non-str */\n";
2610   lexer_test test (case_, content, NULL);
2611
2612   /* Verify that we get the expected token back, with the correct
2613      location information.  */
2614   const cpp_token *tok = test.get_token ();
2615   ASSERT_EQ (tok->type, CPP_WSTRING);
2616   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2617
2618   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2619   cpp_string dst_string;
2620   const enum cpp_ttype type = CPP_WSTRING;
2621   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2622                                       &dst_string, type);
2623   ASSERT_TRUE (result);
2624   /* The cpp_reader defaults to big-endian with
2625      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2626      now be encoded as UTF-32BE.  */
2627   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2628   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2629   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2630   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2631   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2632   free (const_cast <unsigned char *> (dst_string.text));
2633
2634   /* We don't yet support generating substring location information
2635      for L"" strings.  */
2636   ASSERT_HAS_NO_SUBSTRING_RANGES
2637     (test, tok->src_loc, type,
2638      "execution character set != source character set");
2639 }
2640
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   and return it in host byte order.  The bytes are read individually,
   so the result does not depend on the host's endianness.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const unsigned int high_byte = bytes[0];
  const unsigned int low_byte = bytes[1];

  return (uint16_t) ((high_byte << 8) | low_byte);
}
2649
2650 /* Lex a u"" string literal and verify that attempts to read substring
2651    location data from it fail gracefully.  */
2652
2653 static void
2654 test_lexer_string_locations_string16 (const line_table_case &case_)
2655 {
2656   /* Digits 0-9.
2657      ....................000000000.11111111112.22222222233333
2658      ....................123456789.01234567890.12345678901234  */
2659   const char *content = "       u\"0123456789\" /* non-str */\n";
2660   lexer_test test (case_, content, NULL);
2661
2662   /* Verify that we get the expected token back, with the correct
2663      location information.  */
2664   const cpp_token *tok = test.get_token ();
2665   ASSERT_EQ (tok->type, CPP_STRING16);
2666   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2667
2668   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2669   cpp_string dst_string;
2670   const enum cpp_ttype type = CPP_STRING16;
2671   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2672                                       &dst_string, type);
2673   ASSERT_TRUE (result);
2674
2675   /* The cpp_reader defaults to big-endian, so dst_string should
2676      now be encoded as UTF-16BE.  */
2677   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2678   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2679   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2680   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2681   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2682   free (const_cast <unsigned char *> (dst_string.text));
2683
2684   /* We don't yet support generating substring location information
2685      for L"" strings.  */
2686   ASSERT_HAS_NO_SUBSTRING_RANGES
2687     (test, tok->src_loc, type,
2688      "execution character set != source character set");
2689 }
2690
2691 /* Lex a U"" string literal and verify that attempts to read substring
2692    location data from it fail gracefully.  */
2693
2694 static void
2695 test_lexer_string_locations_string32 (const line_table_case &case_)
2696 {
2697   /* Digits 0-9.
2698      ....................000000000.11111111112.22222222233333
2699      ....................123456789.01234567890.12345678901234  */
2700   const char *content = "       U\"0123456789\" /* non-str */\n";
2701   lexer_test test (case_, content, NULL);
2702
2703   /* Verify that we get the expected token back, with the correct
2704      location information.  */
2705   const cpp_token *tok = test.get_token ();
2706   ASSERT_EQ (tok->type, CPP_STRING32);
2707   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2708
2709   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2710   cpp_string dst_string;
2711   const enum cpp_ttype type = CPP_STRING32;
2712   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2713                                       &dst_string, type);
2714   ASSERT_TRUE (result);
2715
2716   /* The cpp_reader defaults to big-endian, so dst_string should
2717      now be encoded as UTF-32BE.  */
2718   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2719   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2720   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2721   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2722   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2723   free (const_cast <unsigned char *> (dst_string.text));
2724
2725   /* We don't yet support generating substring location information
2726      for L"" strings.  */
2727   ASSERT_HAS_NO_SUBSTRING_RANGES
2728     (test, tok->src_loc, type,
2729      "execution character set != source character set");
2730 }
2731
2732 /* Lex a u8-string literal.
2733    Verify the substring location data after running cpp_interpret_string
2734    on it.  */
2735
2736 static void
2737 test_lexer_string_locations_u8 (const line_table_case &case_)
2738 {
2739   /* Digits 0-9.
2740      ....................000000000.11111111112.22222222233333
2741      ....................123456789.01234567890.12345678901234  */
2742   const char *content = "      u8\"0123456789\" /* non-str */\n";
2743   lexer_test test (case_, content, NULL);
2744
2745   /* Verify that we get the expected token back, with the correct
2746      location information.  */
2747   const cpp_token *tok = test.get_token ();
2748   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2749   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2750
2751   /* Verify that cpp_interpret_string works.  */
2752   cpp_string dst_string;
2753   const enum cpp_ttype type = CPP_STRING;
2754   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2755                                       &dst_string, type);
2756   ASSERT_TRUE (result);
2757   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2758   free (const_cast <unsigned char *> (dst_string.text));
2759
2760   /* Verify ranges of individual characters.  This no longer includes the
2761      opening quote, but does include the closing quote.  */
2762   for (int i = 0; i <= 10; i++)
2763     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2764 }
2765
2766 /* Lex a string literal containing UTF-8 source characters.
2767    Verify the substring location data after running cpp_interpret_string
2768    on it.  */
2769
2770 static void
2771 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2772 {
2773  /* This string literal is written out to the source file as UTF-8,
2774     and is of the form "before mojibake after", where "mojibake"
2775     is written as the following four unicode code points:
2776        U+6587 CJK UNIFIED IDEOGRAPH-6587
2777        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2778        U+5316 CJK UNIFIED IDEOGRAPH-5316
2779        U+3051 HIRAGANA LETTER KE.
2780      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2781      "before" and "after" are 1 byte per unicode character.
2782
2783      The numbering shown are "columns", which are *byte* numbers within
2784      the line, rather than unicode character numbers.
2785
2786      .................... 000000000.1111111.
2787      .................... 123456789.0123456.  */
2788   const char *content = ("        \"before "
2789                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
2790                               UTF-8: 0xE6 0x96 0x87
2791                               C octal escaped UTF-8: \346\226\207
2792                             "column" numbers: 17-19.  */
2793                          "\346\226\207"
2794
2795                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2796                               UTF-8: 0xE5 0xAD 0x97
2797                               C octal escaped UTF-8: \345\255\227
2798                             "column" numbers: 20-22.  */
2799                          "\345\255\227"
2800
2801                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
2802                               UTF-8: 0xE5 0x8C 0x96
2803                               C octal escaped UTF-8: \345\214\226
2804                             "column" numbers: 23-25.  */
2805                          "\345\214\226"
2806
2807                          /* U+3051 HIRAGANA LETTER KE
2808                               UTF-8: 0xE3 0x81 0x91
2809                               C octal escaped UTF-8: \343\201\221
2810                             "column" numbers: 26-28.  */
2811                          "\343\201\221"
2812
2813                          /* column numbers 29 onwards
2814                           2333333.33334444444444
2815                           9012345.67890123456789. */
2816                          " after\" /* non-str */\n");
2817   lexer_test test (case_, content, NULL);
2818
2819   /* Verify that we get the expected token back, with the correct
2820      location information.  */
2821   const cpp_token *tok = test.get_token ();
2822   ASSERT_EQ (tok->type, CPP_STRING);
2823   ASSERT_TOKEN_AS_TEXT_EQ
2824     (test.m_parser, tok,
2825      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
2826
2827   /* Verify that cpp_interpret_string works.  */
2828   cpp_string dst_string;
2829   const enum cpp_ttype type = CPP_STRING;
2830   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2831                                       &dst_string, type);
2832   ASSERT_TRUE (result);
2833   ASSERT_STREQ
2834     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
2835      (const char *)dst_string.text);
2836   free (const_cast <unsigned char *> (dst_string.text));
2837
2838   /* Verify ranges of individual characters.  This no longer includes the
2839      opening quote, but does include the closing quote.
2840      Assuming that both source and execution encodings are UTF-8, we have
2841      a run of 25 octets in each, plus the NUL terminator.  */
2842   for (int i = 0; i < 25; i++)
2843     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2844   /* NUL-terminator should use the closing quote at column 35.  */
2845   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
2846
2847   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
2848 }
2849
2850 /* Test of string literal concatenation.  */
2851
2852 static void
2853 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
2854 {
2855   /* Digits 0-9.
2856      .....................000000000.111111.11112222222222
2857      .....................123456789.012345.67890123456789.  */
2858   const char *content = ("        \"01234\" /* non-str */\n"
2859                          "        \"56789\" /* non-str */\n");
2860   lexer_test test (case_, content, NULL);
2861
2862   location_t input_locs[2];
2863
2864   /* Verify that we get the expected tokens back.  */
2865   auto_vec <cpp_string> input_strings;
2866   const cpp_token *tok_a = test.get_token ();
2867   ASSERT_EQ (tok_a->type, CPP_STRING);
2868   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
2869   input_strings.safe_push (tok_a->val.str);
2870   input_locs[0] = tok_a->src_loc;
2871
2872   const cpp_token *tok_b = test.get_token ();
2873   ASSERT_EQ (tok_b->type, CPP_STRING);
2874   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
2875   input_strings.safe_push (tok_b->val.str);
2876   input_locs[1] = tok_b->src_loc;
2877
2878   /* Verify that cpp_interpret_string works.  */
2879   cpp_string dst_string;
2880   const enum cpp_ttype type = CPP_STRING;
2881   bool result = cpp_interpret_string (test.m_parser,
2882                                       input_strings.address (), 2,
2883                                       &dst_string, type);
2884   ASSERT_TRUE (result);
2885   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2886   free (const_cast <unsigned char *> (dst_string.text));
2887
2888   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2889   test.m_concats.record_string_concatenation (2, input_locs);
2890
2891   location_t initial_loc = input_locs[0];
2892
2893   /* "01234" on line 1.  */
2894   for (int i = 0; i <= 4; i++)
2895     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
2896   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
2897   for (int i = 5; i <= 10; i++)
2898     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
2899
2900   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
2901 }
2902
2903 /* Another test of string literal concatenation.  */
2904
2905 static void
2906 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
2907 {
2908   /* Digits 0-9.
2909      .....................000000000.111.11111112222222
2910      .....................123456789.012.34567890123456.  */
2911   const char *content = ("        \"01\" /* non-str */\n"
2912                          "        \"23\" /* non-str */\n"
2913                          "        \"45\" /* non-str */\n"
2914                          "        \"67\" /* non-str */\n"
2915                          "        \"89\" /* non-str */\n");
2916   lexer_test test (case_, content, NULL);
2917
2918   auto_vec <cpp_string> input_strings;
2919   location_t input_locs[5];
2920
2921   /* Verify that we get the expected tokens back.  */
2922   for (int i = 0; i < 5; i++)
2923     {
2924       const cpp_token *tok = test.get_token ();
2925       ASSERT_EQ (tok->type, CPP_STRING);
2926       input_strings.safe_push (tok->val.str);
2927       input_locs[i] = tok->src_loc;
2928     }
2929
2930   /* Verify that cpp_interpret_string works.  */
2931   cpp_string dst_string;
2932   const enum cpp_ttype type = CPP_STRING;
2933   bool result = cpp_interpret_string (test.m_parser,
2934                                       input_strings.address (), 5,
2935                                       &dst_string, type);
2936   ASSERT_TRUE (result);
2937   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2938   free (const_cast <unsigned char *> (dst_string.text));
2939
2940   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
2941   test.m_concats.record_string_concatenation (5, input_locs);
2942
2943   location_t initial_loc = input_locs[0];
2944
2945   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
2946      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
2947      and expect get_source_range_for_substring to fail.
2948      However, for a string concatenation test, we can have a case
2949      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
2950      but subsequent strings can be after it.
2951      Attempting to detect this within assert_char_at_range
2952      would overcomplicate the logic for the common test cases, so
2953      we detect it here.  */
2954   if (should_have_column_data_p (input_locs[0])
2955       && !should_have_column_data_p (input_locs[4]))
2956     {
2957       /* Verify that get_source_range_for_substring gracefully rejects
2958          this case.  */
2959       source_range actual_range;
2960       const char *err
2961         = get_source_range_for_char (test.m_parser, &test.m_concats,
2962                                      initial_loc, type, 0, &actual_range);
2963       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
2964       return;
2965     }
2966
2967   for (int i = 0; i < 5; i++)
2968     for (int j = 0; j < 2; j++)
2969       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
2970                             i + 1, 10 + j, 10 + j);
2971
2972   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
2973   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
2974
2975   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
2976 }
2977
2978 /* Another test of string literal concatenation, this time combined with
2979    various kinds of escaped characters.  */
2980
2981 static void
2982 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
2983 {
2984   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
2985      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
2986   const char *content
2987     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
2988        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
2989     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
2990   lexer_test test (case_, content, NULL);
2991
2992   auto_vec <cpp_string> input_strings;
2993   location_t input_locs[4];
2994
2995   /* Verify that we get the expected tokens back.  */
2996   for (int i = 0; i < 4; i++)
2997     {
2998       const cpp_token *tok = test.get_token ();
2999       ASSERT_EQ (tok->type, CPP_STRING);
3000       input_strings.safe_push (tok->val.str);
3001       input_locs[i] = tok->src_loc;
3002     }
3003
3004   /* Verify that cpp_interpret_string works.  */
3005   cpp_string dst_string;
3006   const enum cpp_ttype type = CPP_STRING;
3007   bool result = cpp_interpret_string (test.m_parser,
3008                                       input_strings.address (), 4,
3009                                       &dst_string, type);
3010   ASSERT_TRUE (result);
3011   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3012   free (const_cast <unsigned char *> (dst_string.text));
3013
3014   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3015   test.m_concats.record_string_concatenation (4, input_locs);
3016
3017   location_t initial_loc = input_locs[0];
3018
3019   for (int i = 0; i <= 4; i++)
3020     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3021   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3022   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3023   for (int i = 7; i <= 9; i++)
3024     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3025
3026   /* NUL-terminator should use the location of the final closing quote.  */
3027   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3028
3029   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3030 }
3031
3032 /* Test of string literal in a macro.  */
3033
3034 static void
3035 test_lexer_string_locations_macro (const line_table_case &case_)
3036 {
3037   /* Digits 0-9.
3038      .....................0000000001111111111.22222222223.
3039      .....................1234567890123456789.01234567890.  */
3040   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3041                          "  MACRO");
3042   lexer_test test (case_, content, NULL);
3043
3044   /* Verify that we get the expected tokens back.  */
3045   const cpp_token *tok = test.get_token ();
3046   ASSERT_EQ (tok->type, CPP_PADDING);
3047
3048   tok = test.get_token ();
3049   ASSERT_EQ (tok->type, CPP_STRING);
3050   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3051
3052   /* Verify ranges of individual characters.  We ought to
3053      see columns within the macro definition.  */
3054   for (int i = 0; i <= 10; i++)
3055     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3056                           i, 1, 20 + i, 20 + i);
3057
3058   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3059
3060   tok = test.get_token ();
3061   ASSERT_EQ (tok->type, CPP_PADDING);
3062 }
3063
3064 /* Test of stringification of a macro argument.  */
3065
3066 static void
3067 test_lexer_string_locations_stringified_macro_argument
3068   (const line_table_case &case_)
3069 {
3070   /* .....................000000000111111111122222222223.
3071      .....................123456789012345678901234567890.  */
3072   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3073                          "MACRO(foo)\n");
3074   lexer_test test (case_, content, NULL);
3075
3076   /* Verify that we get the expected token back.  */
3077   const cpp_token *tok = test.get_token ();
3078   ASSERT_EQ (tok->type, CPP_PADDING);
3079
3080   tok = test.get_token ();
3081   ASSERT_EQ (tok->type, CPP_STRING);
3082   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3083
3084   /* We don't support getting the location of a stringified macro
3085      argument.  Verify that it fails gracefully.  */
3086   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3087                                   "cpp_interpret_string_1 failed");
3088
3089   tok = test.get_token ();
3090   ASSERT_EQ (tok->type, CPP_PADDING);
3091
3092   tok = test.get_token ();
3093   ASSERT_EQ (tok->type, CPP_PADDING);
3094 }
3095
3096 /* Ensure that we are fail gracefully if something attempts to pass
3097    in a location that isn't a string literal token.  Seen on this code:
3098
3099      const char a[] = " %d ";
3100      __builtin_printf (a, 0.5);
3101                        ^
3102
3103    when c-format.c erroneously used the indicated one-character
3104    location as the format string location, leading to a read past the
3105    end of a string buffer in cpp_interpret_string_1.  */
3106
3107 static void
3108 test_lexer_string_locations_non_string (const line_table_case &case_)
3109 {
3110   /* .....................000000000111111111122222222223.
3111      .....................123456789012345678901234567890.  */
3112   const char *content = ("         a\n");
3113   lexer_test test (case_, content, NULL);
3114
3115   /* Verify that we get the expected token back.  */
3116   const cpp_token *tok = test.get_token ();
3117   ASSERT_EQ (tok->type, CPP_NAME);
3118   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3119
3120   /* At this point, libcpp is attempting to interpret the name as a
3121      string literal, despite it not starting with a quote.  We don't detect
3122      that, but we should at least fail gracefully.  */
3123   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3124                                   "cpp_interpret_string_1 failed");
3125 }
3126
3127 /* Ensure that we can read substring information for a token which
3128    starts in one linemap and ends in another .  Adapted from
3129    gcc.dg/cpp/pr69985.c.  */
3130
3131 static void
3132 test_lexer_string_locations_long_line (const line_table_case &case_)
3133 {
3134   /* .....................000000.000111111111
3135      .....................123456.789012346789.  */
3136   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3137                          "     \"0123456789012345678901234567890123456789"
3138                          "0123456789012345678901234567890123456789"
3139                          "0123456789012345678901234567890123456789"
3140                          "0123456789\"\n");
3141
3142   lexer_test test (case_, content, NULL);
3143
3144   /* Verify that we get the expected token back.  */
3145   const cpp_token *tok = test.get_token ();
3146   ASSERT_EQ (tok->type, CPP_STRING);
3147
3148   if (!should_have_column_data_p (line_table->highest_location))
3149     return;
3150
3151   /* Verify ranges of individual characters.  */
3152   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3153   for (int i = 0; i < 131; i++)
3154     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3155                           i, 2, 7 + i, 7 + i);
3156 }
3157
3158 /* Test of lexing char constants.  */
3159
3160 static void
3161 test_lexer_char_constants (const line_table_case &case_)
3162 {
3163   /* Various char constants.
3164      .....................0000000001111111111.22222222223.
3165      .....................1234567890123456789.01234567890.  */
3166   const char *content = ("         'a'\n"
3167                          "        u'a'\n"
3168                          "        U'a'\n"
3169                          "        L'a'\n"
3170                          "         'abc'\n");
3171   lexer_test test (case_, content, NULL);
3172
3173   /* Verify that we get the expected tokens back.  */
3174   /* 'a'.  */
3175   const cpp_token *tok = test.get_token ();
3176   ASSERT_EQ (tok->type, CPP_CHAR);
3177   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3178
3179   unsigned int chars_seen;
3180   int unsignedp;
3181   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3182                                           &chars_seen, &unsignedp);
3183   ASSERT_EQ (cc, 'a');
3184   ASSERT_EQ (chars_seen, 1);
3185
3186   /* u'a'.  */
3187   tok = test.get_token ();
3188   ASSERT_EQ (tok->type, CPP_CHAR16);
3189   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3190
3191   /* U'a'.  */
3192   tok = test.get_token ();
3193   ASSERT_EQ (tok->type, CPP_CHAR32);
3194   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3195
3196   /* L'a'.  */
3197   tok = test.get_token ();
3198   ASSERT_EQ (tok->type, CPP_WCHAR);
3199   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3200
3201   /* 'abc' (c-char-sequence).  */
3202   tok = test.get_token ();
3203   ASSERT_EQ (tok->type, CPP_CHAR);
3204   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3205 }
3206 /* A table of interesting location_t values, giving one axis of our test
3207    matrix.  */
3208
3209 static const location_t boundary_locations[] = {
3210   /* Zero means "don't override the default values for a new line_table".  */
3211   0,
3212
3213   /* An arbitrary non-zero value that isn't close to one of
3214      the boundary values below.  */
3215   0x10000,
3216
3217   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3218   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3219   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3220   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3221   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3222   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3223
3224   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3225   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3226   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3227   LINE_MAP_MAX_LOCATION_WITH_COLS,
3228   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3229   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3230 };
3231
3232 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3233
3234 void
3235 for_each_line_table_case (void (*testcase) (const line_table_case &))
3236 {
3237   /* As noted above in the description of struct line_table_case,
3238      we want to explore a test matrix of interesting line_table
3239      situations, running various selftests for each case within the
3240      matrix.  */
3241
3242   /* Run all tests with:
3243      (a) line_table->default_range_bits == 0, and
3244      (b) line_table->default_range_bits == 5.  */
3245   int num_cases_tested = 0;
3246   for (int default_range_bits = 0; default_range_bits <= 5;
3247        default_range_bits += 5)
3248     {
3249       /* ...and use each of the "interesting" location values as
3250          the starting location within line_table.  */
3251       const int num_boundary_locations
3252         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3253       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3254         {
3255           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3256
3257           testcase (c);
3258
3259           num_cases_tested++;
3260         }
3261     }
3262
3263   /* Verify that we fully covered the test matrix.  */
3264   ASSERT_EQ (num_cases_tested, 2 * 12);
3265 }
3266
3267 /* Run all of the selftests within this file.  */
3268
3269 void
3270 input_c_tests ()
3271 {
3272   test_should_have_column_data_p ();
3273   test_unknown_location ();
3274   test_builtins ();
3275   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3276
3277   for_each_line_table_case (test_accessing_ordinary_linemaps);
3278   for_each_line_table_case (test_lexer);
3279   for_each_line_table_case (test_lexer_string_locations_simple);
3280   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3281   for_each_line_table_case (test_lexer_string_locations_hex);
3282   for_each_line_table_case (test_lexer_string_locations_oct);
3283   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3284   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3285   for_each_line_table_case (test_lexer_string_locations_ucn4);
3286   for_each_line_table_case (test_lexer_string_locations_ucn8);
3287   for_each_line_table_case (test_lexer_string_locations_wide_string);
3288   for_each_line_table_case (test_lexer_string_locations_string16);
3289   for_each_line_table_case (test_lexer_string_locations_string32);
3290   for_each_line_table_case (test_lexer_string_locations_u8);
3291   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3292   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3293   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3294   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3295   for_each_line_table_case (test_lexer_string_locations_macro);
3296   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3297   for_each_line_table_case (test_lexer_string_locations_non_string);
3298   for_each_line_table_case (test_lexer_string_locations_long_line);
3299   for_each_line_table_case (test_lexer_char_constants);
3300
3301   test_reading_source_line ();
3302 }
3303
3304 } // namespace selftest
3305
3306 #endif /* CHECKING_P */