gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2020 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic.h"
  25 #include "diagnostic-core.h"
  26 #include "selftest.h"
  27 #include "cpplib.h"
  28
  29 #ifndef HAVE_ICONV
  30 #define HAVE_ICONV 0
  31 #endif
  32
/* A cache of the content of one source file.  It is used by
   get_next_line to look up individual lines of the file without
   re-reading the file from disk each time.  */
class fcache
{
public:
  /* The boundaries of one line of the cached file.  */
  class line_info
  {
  public:
    /* The line number.  It starts from 1.  */
    size_t line_num;

    /* The position (byte count) of the beginning of the line,
       relative to the file data pointer.  This starts at zero.  */
    size_t start_pos;

    /* The position (byte count) of the end of the line, relative to
       the file data pointer.  This is normally the offset of the
       terminating '\n' character, or one byte past the last byte of
       the file if the line is not terminated by a '\n'.  */
    size_t end_pos;

    /* Build a record for line L spanning bytes [S, E).  */
    line_info (size_t l, size_t s, size_t e)
      : line_num (l), start_pos (s), end_pos (e)
    {}

    /* Build an all-zero record.  */
    line_info ()
      :line_num (0), start_pos (0), end_pos (0)
    {}
  };

  /* The number of times this file has been accessed.  This is used
     to choose which file cache to evict from the cache array.  */
  unsigned use_count;

  /* The file_path is the key for identifying a particular file in
     the cache.  A NULL file_path marks an unused cache entry.
     For libcpp-using code, the underlying buffer for this field is
     owned by the corresponding _cpp_file within the cpp_reader.  */
  const char *file_path;

  /* The stream the file content is read from.  It is opened by
     add_file_to_cache_tab and closed when the entry is evicted or
     destroyed; NULL when no file is open.  */
  FILE *fp;

  /* This points to the content of the file that we've read so
     far.  */
  char *data;

  /* The size, in bytes, of the DATA array above.  */
  size_t size;

  /* The number of bytes read from the underlying file so far.  This
     must be less than or equal to SIZE above.  */
  size_t nb_read;

  /* The index, within DATA, of the beginning of the current line.  */
  size_t line_start_idx;

  /* The number of the previous line read.  This starts at 1.  Zero
     means we've read no line so far.  */
  size_t line_num;

  /* This is the total number of lines of the current file.  At the
     moment, we try to get this information from the line map
     subsystem.  Note that this is just a hint.  When using the C++
     front-end, this hint is correct because the input file is then
     completely tokenized before parsing starts; so the line map knows
     the number of lines before compilation really starts.  For e.g.
     the C front-end, it can happen that we start emitting diagnostics
     before the line map has seen the end of the file.  */
  size_t total_lines;

  /* Could this file be missing a trailing newline on its final line?
     Initially true (to cope with empty files), set to true/false
     as each line is read.  */
  bool missing_trailing_newline;

  /* This is a record of the beginning and end of the lines we've seen
     while reading the file.  This is useful to avoid walking the data
     from the beginning when we are asked to read a line that is
     before LINE_START_IDX above.  Note that the maximum size of this
     record is fcache_line_record_size, so that the memory consumption
     doesn't explode.  We thus scale total_lines down to
     fcache_line_record_size.  */
  vec<line_info, va_heap> line_record;

  fcache ();
  ~fcache ();
};
 122
/* Current position in real source file.  */

location_t input_location = UNKNOWN_LOCATION;

/* The set of line maps that the functions in this file consult to
   resolve and expand locations.  */

class line_maps *line_table;

/* A stashed copy of "line_table" for use by selftest::line_table_test.
   This needs to be a global so that it can be a GC root, and thus
   prevent the stashed copy from being garbage-collected if the GC runs
   during a line_table_test.  */

class line_maps *saved_line_table;

/* The table of file caches.  It is NULL until it is allocated on
   demand by diagnostic_file_cache_init, and is released by
   diagnostic_file_cache_fini.  */
static fcache *fcache_tab;
/* The number of entries in fcache_tab, i.e. the maximum number of
   files cached at any one time.  */
static const size_t fcache_tab_size = 16;
/* The initial size, in bytes, of the data buffer of a file cache;
   maybe_grow doubles the buffer each time it fills up.  */
static const size_t fcache_buffer_size = 4 * 1024;
/* The maximum number of entries kept in fcache::line_record.  */
static const size_t fcache_line_record_size = 100;
 140
/* Expand the source location LOC into a human readable location.  If
   LOC resolves to a builtin location, the file name of the readable
   location is set to the string "<built-in>".  If EXPANSION_POINT_P is
   TRUE and LOC is virtual, then it is resolved to the expansion
   point of the involved macro.  Otherwise, it is resolved to the
   spelling location of the token.

   When resolving to the spelling location of the token, if the
   resulting location is for a built-in location (that is, it has no
   associated line/column) in the context of a macro expansion, the
   returned location is the first one (while unwinding the macro
   location towards its expansion point) that is in real source
   code.

   ASPECT controls which part of the location to use: the caret, the
   start of its range, or the finish of its range.

   If LOC is an ad-hoc location, its block is stored in the DATA field
   of the result.  */

static expanded_location
expand_location_1 (location_t loc,
                   bool expansion_point_p,
                   enum location_aspect aspect)
{
  expanded_location xloc;
  const line_map_ordinary *map;
  enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
  tree block = NULL;

  /* Split an ad-hoc location into its block and its underlying
     locus; only the locus is resolved below.  */
  if (IS_ADHOC_LOC (loc))
    {
      block = LOCATION_BLOCK (loc);
      loc = LOCATION_LOCUS (loc);
    }

  /* Start from an all-zero result; this is what reserved locations
     get, apart from the DATA and FILE fields set at the end.  */
  memset (&xloc, 0, sizeof (xloc));

  if (loc >= RESERVED_LOCATION_COUNT)
    {
      if (!expansion_point_p)
        {
          /* We want to resolve LOC to its spelling location.

             But if that spelling location is a reserved location that
             appears in the context of a macro expansion (like for a
             location for a built-in token), let's consider the first
             location (toward the expansion point) that is not reserved;
             that is, the first location that is in real source code.  */
          loc = linemap_unwind_to_first_non_reserved_loc (line_table,
                                                          loc, NULL);
          lrk = LRK_SPELLING_LOCATION;
        }
      loc = linemap_resolve_location (line_table, loc, lrk, &map);

      /* loc is now either in an ordinary map, or is a reserved location.
         If it is a compound location, the caret is in a spelling location,
         but the start/finish might still be a virtual location.
         Depending on what the caller asked for, we may need to recurse
         one level in order to resolve any virtual locations in the
         end-points.  */
      switch (aspect)
        {
        default:
          gcc_unreachable ();
          /* Fall through.  */
        case LOCATION_ASPECT_CARET:
          break;
        case LOCATION_ASPECT_START:
          {
            /* Recurse only if the start differs from LOC itself.  */
            location_t start = get_start (loc);
            if (start != loc)
              return expand_location_1 (start, expansion_point_p, aspect);
          }
          break;
        case LOCATION_ASPECT_FINISH:
          {
            /* Recurse only if the finish differs from LOC itself.  */
            location_t finish = get_finish (loc);
            if (finish != loc)
              return expand_location_1 (finish, expansion_point_p, aspect);
          }
          break;
        }
      xloc = linemap_expand_location (line_table, map, loc);
    }

  xloc.data = block;
  /* Reserved locations have no real file: report no file at all for
     UNKNOWN_LOCATION and the translated "<built-in>" otherwise.  */
  if (loc <= BUILTINS_LOCATION)
    xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");

  return xloc;
}
 229
 230 /* Initialize the set of cache used for files accessed by caret
 231    diagnostic.  */
 232
 233 static void
 234 diagnostic_file_cache_init (void)
 235 {
 236   if (fcache_tab == NULL)
 237     fcache_tab = new fcache[fcache_tab_size];
 238 }
 239
 240 /* Free the resources used by the set of cache used for files accessed
 241    by caret diagnostic.  */
 242
 243 void
 244 diagnostic_file_cache_fini (void)
 245 {
 246   if (fcache_tab)
 247     {
 248       delete [] (fcache_tab);
 249       fcache_tab = NULL;
 250     }
 251 }
 252
 253 /* Return the total lines number that have been read so far by the
 254    line map (in the preprocessor) so far.  For languages like C++ that
 255    entirely preprocess the input file before starting to parse, this
 256    equals the actual number of lines of the file.  */
 257
 258 static size_t
 259 total_lines_num (const char *file_path)
 260 {
 261   size_t r = 0;
 262   location_t l = 0;
 263   if (linemap_get_file_highest_location (line_table, file_path, &l))
 264     {
 265       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 266       expanded_location xloc = expand_location (l);
 267       r = xloc.line;
 268     }
 269   return r;
 270 }
 271
 272 /* Lookup the cache used for the content of a given file accessed by
 273    caret diagnostic.  Return the found cached file, or NULL if no
 274    cached file was found.  */
 275
 276 static fcache*
 277 lookup_file_in_cache_tab (const char *file_path)
 278 {
 279   if (file_path == NULL)
 280     return NULL;
 281
 282   diagnostic_file_cache_init ();
 283
 284   /* This will contain the found cached file.  */
 285   fcache *r = NULL;
 286   for (unsigned i = 0; i < fcache_tab_size; ++i)
 287     {
 288       fcache *c = &fcache_tab[i];
 289       if (c->file_path && !strcmp (c->file_path, file_path))
 290         {
 291           ++c->use_count;
 292           r = c;
 293         }
 294     }
 295
 296   if (r)
 297     ++r->use_count;
 298
 299   return r;
 300 }
 301
 302 /* Purge any mention of FILENAME from the cache of files used for
 303    printing source code.  For use in selftests when working
 304    with tempfiles.  */
 305
 306 void
 307 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 308 {
 309   gcc_assert (file_path);
 310
 311   fcache *r = lookup_file_in_cache_tab (file_path);
 312   if (!r)
 313     /* Not found.  */
 314     return;
 315
 316   r->file_path = NULL;
 317   if (r->fp)
 318     fclose (r->fp);
 319   r->fp = NULL;
 320   r->nb_read = 0;
 321   r->line_start_idx = 0;
 322   r->line_num = 0;
 323   r->line_record.truncate (0);
 324   r->use_count = 0;
 325   r->total_lines = 0;
 326   r->missing_trailing_newline = true;
 327 }
 328
 329 /* Return the file cache that has been less used, recently, or the
 330    first empty one.  If HIGHEST_USE_COUNT is non-null,
 331    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 332    in the cache table.  */
 333
 334 static fcache*
 335 evicted_cache_tab_entry (unsigned *highest_use_count)
 336 {
 337   diagnostic_file_cache_init ();
 338
 339   fcache *to_evict = &fcache_tab[0];
 340   unsigned huc = to_evict->use_count;
 341   for (unsigned i = 1; i < fcache_tab_size; ++i)
 342     {
 343       fcache *c = &fcache_tab[i];
 344       bool c_is_empty = (c->file_path == NULL);
 345
 346       if (c->use_count < to_evict->use_count
 347           || (to_evict->file_path && c_is_empty))
 348         /* We evict C because it's either an entry with a lower use
 349            count or one that is empty.  */
 350         to_evict = c;
 351
 352       if (huc < c->use_count)
 353         huc = c->use_count;
 354
 355       if (c_is_empty)
 356         /* We've reached the end of the cache; subsequent elements are
 357            all empty.  */
 358         break;
 359     }
 360
 361   if (highest_use_count)
 362     *highest_use_count = huc;
 363
 364   return to_evict;
 365 }
 366
 367 /* Create the cache used for the content of a given file to be
 368    accessed by caret diagnostic.  This cache is added to an array of
 369    cache and can be retrieved by lookup_file_in_cache_tab.  This
 370    function returns the created cache.  Note that only the last
 371    fcache_tab_size files are cached.  */
 372
 373 static fcache*
 374 add_file_to_cache_tab (const char *file_path)
 375 {
 376
 377   FILE *fp = fopen (file_path, "r");
 378   if (fp == NULL)
 379     return NULL;
 380
 381   unsigned highest_use_count = 0;
 382   fcache *r = evicted_cache_tab_entry (&highest_use_count);
 383   r->file_path = file_path;
 384   if (r->fp)
 385     fclose (r->fp);
 386   r->fp = fp;
 387   r->nb_read = 0;
 388   r->line_start_idx = 0;
 389   r->line_num = 0;
 390   r->line_record.truncate (0);
 391   /* Ensure that this cache entry doesn't get evicted next time
 392      add_file_to_cache_tab is called.  */
 393   r->use_count = ++highest_use_count;
 394   r->total_lines = total_lines_num (file_path);
 395   r->missing_trailing_newline = true;
 396
 397   return r;
 398 }
 399
 400 /* Lookup the cache used for the content of a given file accessed by
 401    caret diagnostic.  If no cached file was found, create a new cache
 402    for this file, add it to the array of cached file and return
 403    it.  */
 404
 405 static fcache*
 406 lookup_or_add_file_to_cache_tab (const char *file_path)
 407 {
 408   fcache *r = lookup_file_in_cache_tab (file_path);
 409   if (r == NULL)
 410     r = add_file_to_cache_tab (file_path);
 411   return r;
 412 }
 413
 414 /* Default constructor for a cache of file used by caret
 415    diagnostic.  */
 416
 417 fcache::fcache ()
 418 : use_count (0), file_path (NULL), fp (NULL), data (0),
 419   size (0), nb_read (0), line_start_idx (0), line_num (0),
 420   total_lines (0), missing_trailing_newline (true)
 421 {
 422   line_record.create (0);
 423 }
 424
 425 /* Destructor for a cache of file used by caret diagnostic.  */
 426
 427 fcache::~fcache ()
 428 {
 429   if (fp)
 430     {
 431       fclose (fp);
 432       fp = NULL;
 433     }
 434   if (data)
 435     {
 436       XDELETEVEC (data);
 437       data = 0;
 438     }
 439   line_record.release ();
 440 }
 441
 442 /* Returns TRUE iff the cache would need to be filled with data coming
 443    from the file.  That is, either the cache is empty or full or the
 444    current line is empty.  Note that if the cache is full, it would
 445    need to be extended and filled again.  */
 446
 447 static bool
 448 needs_read (fcache *c)
 449 {
 450   return (c->nb_read == 0
 451           || c->nb_read == c->size
 452           || (c->line_start_idx >= c->nb_read - 1));
 453 }
 454
 455 /*  Return TRUE iff the cache is full and thus needs to be
 456     extended.  */
 457
 458 static bool
 459 needs_grow (fcache *c)
 460 {
 461   return c->nb_read == c->size;
 462 }
 463
 464 /* Grow the cache if it needs to be extended.  */
 465
 466 static void
 467 maybe_grow (fcache *c)
 468 {
 469   if (!needs_grow (c))
 470     return;
 471
 472   size_t size = c->size == 0 ? fcache_buffer_size : c->size * 2;
 473   c->data = XRESIZEVEC (char, c->data, size);
 474   c->size = size;
 475 }
 476
 477 /*  Read more data into the cache.  Extends the cache if need be.
 478     Returns TRUE iff new data could be read.  */
 479
 480 static bool
 481 read_data (fcache *c)
 482 {
 483   if (feof (c->fp) || ferror (c->fp))
 484     return false;
 485
 486   maybe_grow (c);
 487
 488   char * from = c->data + c->nb_read;
 489   size_t to_read = c->size - c->nb_read;
 490   size_t nb_read = fread (from, 1, to_read, c->fp);
 491
 492   if (ferror (c->fp))
 493     return false;
 494
 495   c->nb_read += nb_read;
 496   return !!nb_read;
 497 }
 498
 499 /* Read new data iff the cache needs to be filled with more data
 500    coming from the file FP.  Return TRUE iff the cache was filled with
 501    mode data.  */
 502
 503 static bool
 504 maybe_read_data (fcache *c)
 505 {
 506   if (!needs_read (c))
 507     return false;
 508   return read_data (c);
 509 }
 510
 511 /* Read a new line from file FP, using C as a cache for the data
 512    coming from the file.  Upon successful completion, *LINE is set to
 513    the beginning of the line found.  *LINE points directly in the
 514    line cache and is only valid until the next call of get_next_line.
 515    *LINE_LEN is set to the length of the line.  Note that the line
 516    does not contain any terminal delimiter.  This function returns
 517    true if some data was read or process from the cache, false
 518    otherwise.  Note that subsequent calls to get_next_line might
 519    make the content of *LINE invalid.  */
 520
 521 static bool
 522 get_next_line (fcache *c, char **line, ssize_t *line_len)
 523 {
 524   /* Fill the cache with data to process.  */
 525   maybe_read_data (c);
 526
 527   size_t remaining_size = c->nb_read - c->line_start_idx;
 528   if (remaining_size == 0)
 529     /* There is no more data to process.  */
 530     return false;
 531
 532   char *line_start = c->data + c->line_start_idx;
 533
 534   char *next_line_start = NULL;
 535   size_t len = 0;
 536   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 537   if (line_end == NULL)
 538     {
 539       /* We haven't found the end-of-line delimiter in the cache.
 540          Fill the cache with more data from the file and look for the
 541          '\n'.  */
 542       while (maybe_read_data (c))
 543         {
 544           line_start = c->data + c->line_start_idx;
 545           remaining_size = c->nb_read - c->line_start_idx;
 546           line_end = (char *) memchr (line_start, '\n', remaining_size);
 547           if (line_end != NULL)
 548             {
 549               next_line_start = line_end + 1;
 550               break;
 551             }
 552         }
 553       if (line_end == NULL)
 554         {
 555           /* We've loadded all the file into the cache and still no
 556              '\n'.  Let's say the line ends up at one byte passed the
 557              end of the file.  This is to stay consistent with the case
 558              of when the line ends up with a '\n' and line_end points to
 559              that terminal '\n'.  That consistency is useful below in
 560              the len calculation.  */
 561           line_end = c->data + c->nb_read ;
 562           c->missing_trailing_newline = true;
 563         }
 564       else
 565         c->missing_trailing_newline = false;
 566     }
 567   else
 568     {
 569       next_line_start = line_end + 1;
 570       c->missing_trailing_newline = false;
 571     }
 572
 573   if (ferror (c->fp))
 574     return false;
 575
 576   /* At this point, we've found the end of the of line.  It either
 577      points to the '\n' or to one byte after the last byte of the
 578      file.  */
 579   gcc_assert (line_end != NULL);
 580
 581   len = line_end - line_start;
 582
 583   if (c->line_start_idx < c->nb_read)
 584     *line = line_start;
 585
 586   ++c->line_num;
 587
 588   /* Before we update our line record, make sure the hint about the
 589      total number of lines of the file is correct.  If it's not, then
 590      we give up recording line boundaries from now on.  */
 591   bool update_line_record = true;
 592   if (c->line_num > c->total_lines)
 593     update_line_record = false;
 594
 595     /* Now update our line record so that re-reading lines from the
 596      before c->line_start_idx is faster.  */
 597   if (update_line_record
 598       && c->line_record.length () < fcache_line_record_size)
 599     {
 600       /* If the file lines fits in the line record, we just record all
 601          its lines ...*/
 602       if (c->total_lines <= fcache_line_record_size
 603           && c->line_num > c->line_record.length ())
 604         c->line_record.safe_push (fcache::line_info (c->line_num,
 605                                                  c->line_start_idx,
 606                                                  line_end - c->data));
 607       else if (c->total_lines > fcache_line_record_size)
 608         {
 609           /* ... otherwise, we just scale total_lines down to
 610              (fcache_line_record_size lines.  */
 611           size_t n = (c->line_num * fcache_line_record_size) / c->total_lines;
 612           if (c->line_record.length () == 0
 613               || n >= c->line_record.length ())
 614             c->line_record.safe_push (fcache::line_info (c->line_num,
 615                                                      c->line_start_idx,
 616                                                      line_end - c->data));
 617         }
 618     }
 619
 620   /* Update c->line_start_idx so that it points to the next line to be
 621      read.  */
 622   if (next_line_start)
 623     c->line_start_idx = next_line_start - c->data;
 624   else
 625     /* We didn't find any terminal '\n'.  Let's consider that the end
 626        of line is the end of the data in the cache.  The next
 627        invocation of get_next_line will either read more data from the
 628        underlying file or return false early because we've reached the
 629        end of the file.  */
 630     c->line_start_idx = c->nb_read;
 631
 632   *line_len = len;
 633
 634   return true;
 635 }
 636
 637 /* Consume the next bytes coming from the cache (or from its
 638    underlying file if there are remaining unread bytes in the file)
 639    until we reach the next end-of-line (or end-of-file).  There is no
 640    copying from the cache involved.  Return TRUE upon successful
 641    completion.  */
 642
 643 static bool
 644 goto_next_line (fcache *cache)
 645 {
 646   char *l;
 647   ssize_t len;
 648
 649   return get_next_line (cache, &l, &len);
 650 }
 651
/* Read an arbitrary line number LINE_NUM from the file cached in C.
   LINE_NUM starts at 1.  If the line was read successfully, *LINE
   points to the beginning of the line in the file cache and
   *LINE_LEN is the length of the line.  *LINE is not nul-terminated,
   but may contain zero bytes.  *LINE is only valid until the next
   call of read_line_num.  This function returns true if a line was
   read, false otherwise.  */

static bool
read_line_num (fcache *c, size_t line_num,
               char **line, ssize_t *line_len)
{
  gcc_assert (line_num > 0);

  if (line_num <= c->line_num)
    {
      /* We've been asked to read lines that are before c->line_num.
         So let's use our line record (if it's not empty) to try to
         avoid re-reading the file from the beginning again.  */

      if (c->line_record.is_empty ())
        {
          /* Nothing recorded: restart from the first line.  */
          c->line_start_idx = 0;
          c->line_num = 0;
        }
      else
        {
          fcache::line_info *i = NULL;
          if (c->total_lines <= fcache_line_record_size)
            {
              /* In languages where the input file is not totally
                 preprocessed up front, the c->total_lines hint
                 can be smaller than the number of lines of the
                 file.  In that case, only the first
                 c->total_lines have been recorded.

                 Otherwise, the first c->total_lines we've read have
                 their start/end recorded here.  */
              i = (line_num <= c->total_lines)
                ? &c->line_record[line_num - 1]
                : &c->line_record[c->total_lines - 1];
              gcc_assert (i->line_num <= line_num);
            }
          else
            {
              /*  So the file had more lines than our line record
                  size.  Thus the number of lines we've recorded has
                  been scaled down to fcache_line_record_size.  Let's
                  pick the start/end of the recorded line that is
                  closest to line_num.  */
              size_t n = (line_num <= c->total_lines)
                ? line_num * fcache_line_record_size / c->total_lines
                : c ->line_record.length () - 1;
              if (n < c->line_record.length ())
                {
                  i = &c->line_record[n];
                  gcc_assert (i->line_num <= line_num);
                }
            }

          if (i && i->line_num == line_num)
            {
              /* We have the start/end of the line.  */
              *line = c->data + i->start_pos;
              *line_len = i->end_pos - i->start_pos;
              return true;
            }

          if (i)
            {
              /* Resume reading at the recorded line, which is at or
                 before the requested one.  */
              c->line_start_idx = i->start_pos;
              c->line_num = i->line_num - 1;
            }
          else
            {
              /* No usable record: restart from the first line.  */
              c->line_start_idx = 0;
              c->line_num = 0;
            }
        }
    }

  /*  Let's walk from line c->line_num up to line_num - 1, without
      copying any line.  */
  while (c->line_num < line_num - 1)
    if (!goto_next_line (c))
      return false;

  /* The line we want is the next one.  Let's read it and hand it back
     to the caller.  */
  return get_next_line (c, line, line_len);
}
 742
 743 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 744    The line is not nul-terminated.  The returned pointer is only
 745    valid until the next call of location_get_source_line.
 746    Note that the line can contain several null characters,
 747    so the returned value's length has the actual length of the line.
 748    If the function fails, a NULL char_span is returned.  */
 749
 750 char_span
 751 location_get_source_line (const char *file_path, int line)
 752 {
 753   char *buffer = NULL;
 754   ssize_t len;
 755
 756   if (line == 0)
 757     return char_span (NULL, 0);
 758
 759   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 760   if (c == NULL)
 761     return char_span (NULL, 0);
 762
 763   bool read = read_line_num (c, line, &buffer, &len);
 764   if (!read)
 765     return char_span (NULL, 0);
 766
 767   return char_span (buffer, len);
 768 }
 769
 770 /* Determine if FILE_PATH missing a trailing newline on its final line.
 771    Only valid to call once all of the file has been loaded, by
 772    requesting a line number beyond the end of the file.  */
 773
 774 bool
 775 location_missing_trailing_newline (const char *file_path)
 776 {
 777   fcache *c = lookup_or_add_file_to_cache_tab (file_path);
 778   if (c == NULL)
 779     return false;
 780
 781   return c->missing_trailing_newline;
 782 }
 783
 784 /* Test if the location originates from the spelling location of a
 785    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 786    virtual) location of a built-in token that appears in the expansion
 787    list of a macro.  Please note that this function also works on
 788    tokens that result from built-in tokens.  For instance, the
 789    function would return true if passed a token "4" that is the result
 790    of the expansion of the built-in __LINE__ macro.  */
 791 bool
 792 is_location_from_builtin_token (location_t loc)
 793 {
 794   const line_map_ordinary *map = NULL;
 795   loc = linemap_resolve_location (line_table, loc,
 796                                   LRK_SPELLING_LOCATION, &map);
 797   return loc == BUILTINS_LOCATION;
 798 }
 799
 800 /* Expand the source location LOC into a human readable location.  If
 801    LOC is virtual, it resolves to the expansion point of the involved
 802    macro.  If LOC resolves to a builtin location, the file name of the
 803    readable location is set to the string "<built-in>".  */
 804
 805 expanded_location
 806 expand_location (location_t loc)
 807 {
 808   return expand_location_1 (loc, /*expansion_point_p=*/true,
 809                             LOCATION_ASPECT_CARET);
 810 }
 811
/* Expand the source location LOC into a human readable location.  If
   LOC is virtual, it resolves to the spelling location of the
   relevant token rather than to the expansion point of the macro.
   If LOC resolves to a builtin location, the file name of the
   readable location is set to the string "<built-in>".

   ASPECT controls which part of the location to use.  */

expanded_location
expand_location_to_spelling_point (location_t loc,
                                   enum location_aspect aspect)
{
  return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
}
 824
 825 /* The rich_location class within libcpp requires a way to expand
 826    location_t instances, and relies on the client code
 827    providing a symbol named
 828      linemap_client_expand_location_to_spelling_point
 829    to do this.
 830
 831    This is the implementation for libcommon.a (all host binaries),
 832    which simply calls into expand_location_1.  */
 833
 834 expanded_location
 835 linemap_client_expand_location_to_spelling_point (location_t loc,
 836                                                   enum location_aspect aspect)
 837 {
 838   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
 839 }
 840
 841
 842 /* If LOCATION is in a system header and if it is a virtual location for
 843    a token coming from the expansion of a macro, unwind it to the
 844    location of the expansion point of the macro.  Otherwise, just return
 845    LOCATION.
 846
 847    This is used for instance when we want to emit diagnostics about a
 848    token that may be located in a macro that is itself defined in a
 849    system header, for example, for the NULL macro.  In such a case, if
 850    LOCATION were passed directly to diagnostic functions such as
 851    warning_at, the diagnostic would be suppressed (unless
 852    -Wsystem-headers).  */
 853
 854 location_t
 855 expansion_point_location_if_in_system_header (location_t location)
 856 {
 857   if (in_system_header_at (location))
 858     location = linemap_resolve_location (line_table, location,
 859                                          LRK_MACRO_EXPANSION_POINT,
 860                                          NULL);
 861   return location;
 862 }
 863
 864 /* If LOCATION is a virtual location for a token coming from the expansion
 865    of a macro, unwind to the location of the expansion point of the macro.  */
 866
 867 location_t
 868 expansion_point_location (location_t location)
 869 {
 870   return linemap_resolve_location (line_table, location,
 871                                    LRK_MACRO_EXPANSION_POINT, NULL);
 872 }
 873
 874 /* Construct a location with caret at CARET, ranging from START to
 875    finish e.g.
 876
 877                  11111111112
 878         12345678901234567890
 879      522
 880      523   return foo + bar;
 881                   ~~~~^~~~~
 882      524
 883
 884    The location's caret is at the "+", line 523 column 15, but starts
 885    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
 886    of "bar" at column 19.  */
 887
 888 location_t
 889 make_location (location_t caret, location_t start, location_t finish)
 890 {
 891   location_t pure_loc = get_pure_location (caret);
 892   source_range src_range;
 893   src_range.m_start = get_start (start);
 894   src_range.m_finish = get_finish (finish);
 895   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
 896                                                    pure_loc,
 897                                                    src_range,
 898                                                    NULL);
 899   return combined_loc;
 900 }
 901
 902 /* Same as above, but taking a source range rather than two locations.  */
 903
 904 location_t
 905 make_location (location_t caret, source_range src_range)
 906 {
 907   location_t pure_loc = get_pure_location (caret);
 908   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
 909 }
 910
 911 /* An expanded_location stores the column in byte units.  This function
 912    converts that column to display units.  That requires reading the associated
 913    source line in order to calculate the display width.  If that cannot be done
 914    for any reason, then returns the byte column as a fallback.  */
 915 int
 916 location_compute_display_column (expanded_location exploc)
 917 {
 918   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
 919     return exploc.column;
 920   char_span line = location_get_source_line (exploc.file, exploc.line);
 921   /* If line is NULL, this function returns exploc.column which is the
 922      desired fallback.  */
 923   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
 924                                             exploc.column);
 925 }
 926
 927 /* Dump statistics to stderr about the memory usage of the line_table
 928    set of line maps.  This also displays some statistics about macro
 929    expansion.  */
 930
 931 void
 932 dump_line_table_statistics (void)
 933 {
 934   struct linemap_stats s;
 935   long total_used_map_size,
 936     macro_maps_size,
 937     total_allocated_map_size;
 938
 939   memset (&s, 0, sizeof (s));
 940
 941   linemap_get_statistics (line_table, &s);
 942
 943   macro_maps_size = s.macro_maps_used_size
 944     + s.macro_maps_locations_size;
 945
 946   total_allocated_map_size = s.ordinary_maps_allocated_size
 947     + s.macro_maps_allocated_size
 948     + s.macro_maps_locations_size;
 949
 950   total_used_map_size = s.ordinary_maps_used_size
 951     + s.macro_maps_used_size
 952     + s.macro_maps_locations_size;
 953
 954   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
 955            s.num_expanded_macros);
 956   if (s.num_expanded_macros != 0)
 957     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
 958              s.num_macro_tokens / s.num_expanded_macros);
 959   fprintf (stderr,
 960            "\nLine Table allocations during the "
 961            "compilation process\n");
 962   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
 963            SIZE_AMOUNT (s.num_ordinary_maps_used));
 964   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
 965            SIZE_AMOUNT (s.ordinary_maps_used_size));
 966   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
 967            SIZE_AMOUNT (s.num_ordinary_maps_allocated));
 968   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
 969            SIZE_AMOUNT (s.ordinary_maps_allocated_size));
 970   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
 971            SIZE_AMOUNT (s.num_macro_maps_used));
 972   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
 973            SIZE_AMOUNT (s.macro_maps_used_size));
 974   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
 975            SIZE_AMOUNT (s.macro_maps_locations_size));
 976   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
 977            SIZE_AMOUNT (macro_maps_size));
 978   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
 979            SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
 980   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
 981            SIZE_AMOUNT (total_allocated_map_size));
 982   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
 983            SIZE_AMOUNT (total_used_map_size));
 984   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
 985            SIZE_AMOUNT (s.adhoc_table_size));
 986   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
 987            SIZE_AMOUNT (s.adhoc_table_entries_used));
 988   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
 989            SIZE_AMOUNT (line_table->num_optimized_ranges));
 990   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
 991            SIZE_AMOUNT (line_table->num_unoptimized_ranges));
 992
 993   fprintf (stderr, "\n");
 994 }
 995
 996 /* Get location one beyond the final location in ordinary map IDX.  */
 997
 998 static location_t
 999 get_end_location (class line_maps *set, unsigned int idx)
1000 {
1001   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1002     return set->highest_location;
1003
1004   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1005   return MAP_START_LOCATION (next_map);
1006 }
1007
/* Helper function for write_digit_row.
   Write the least significant decimal digit of DIGIT to STREAM as a
   single character.  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1015
1016 /* Helper function for dump_location_info.
1017    Write a row of numbers to STREAM, numbering a source line,
1018    giving the units, tens, hundreds etc of the column number.  */
1019
1020 static void
1021 write_digit_row (FILE *stream, int indent,
1022                  const line_map_ordinary *map,
1023                  location_t loc, int max_col, int divisor)
1024 {
1025   fprintf (stream, "%*c", indent, ' ');
1026   fprintf (stream, "|");
1027   for (int column = 1; column < max_col; column++)
1028     {
1029       location_t column_loc = loc + (column << map->m_range_bits);
1030       write_digit (stream, column_loc / divisor);
1031     }
1032   fprintf (stream, "\n");
1033 }
1034
1035 /* Write a half-closed (START) / half-open (END) interval of
1036    location_t to STREAM.  */
1037
1038 static void
1039 dump_location_range (FILE *stream,
1040                      location_t start, location_t end)
1041 {
1042   fprintf (stream,
1043            "  location_t interval: %u <= loc < %u\n",
1044            start, end);
1045 }
1046
1047 /* Write a labelled description of a half-closed (START) / half-open (END)
1048    interval of location_t to STREAM.  */
1049
1050 static void
1051 dump_labelled_location_range (FILE *stream,
1052                               const char *name,
1053                               location_t start, location_t end)
1054 {
1055   fprintf (stream, "%s\n", name);
1056   dump_location_range (stream, start, end);
1057   fprintf (stream, "\n");
1058 }
1059
1060 /* Write a visualization of the locations in the line_table to STREAM.  */
1061
1062 void
1063 dump_location_info (FILE *stream)
1064 {
1065   /* Visualize the reserved locations.  */
1066   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1067                                 0, RESERVED_LOCATION_COUNT);
1068
1069   /* Visualize the ordinary line_map instances, rendering the sources. */
1070   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1071     {
1072       location_t end_location = get_end_location (line_table, idx);
1073       /* half-closed: doesn't include this one. */
1074
1075       const line_map_ordinary *map
1076         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1077       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1078       dump_location_range (stream,
1079                            MAP_START_LOCATION (map), end_location);
1080       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1081       fprintf (stream, "  starting at line: %i\n",
1082                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1083       fprintf (stream, "  column and range bits: %i\n",
1084                map->m_column_and_range_bits);
1085       fprintf (stream, "  column bits: %i\n",
1086                map->m_column_and_range_bits - map->m_range_bits);
1087       fprintf (stream, "  range bits: %i\n",
1088                map->m_range_bits);
1089       const char * reason;
1090       switch (map->reason) {
1091       case LC_ENTER:
1092         reason = "LC_ENTER";
1093         break;
1094       case LC_LEAVE:
1095         reason = "LC_LEAVE";
1096         break;
1097       case LC_RENAME:
1098         reason = "LC_RENAME";
1099         break;
1100       case LC_RENAME_VERBATIM:
1101         reason = "LC_RENAME_VERBATIM";
1102         break;
1103       case LC_ENTER_MACRO:
1104         reason = "LC_RENAME_MACRO";
1105         break;
1106       default:
1107         reason = "Unknown";
1108       }
1109       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1110
1111       const line_map_ordinary *includer_map
1112         = linemap_included_from_linemap (line_table, map);
1113       fprintf (stream, "  included from location: %d",
1114                linemap_included_from (map));
1115       if (includer_map) {
1116         fprintf (stream, " (in ordinary map %d)",
1117                  int (includer_map - line_table->info_ordinary.maps));
1118       }
1119       fprintf (stream, "\n");
1120
1121       /* Render the span of source lines that this "map" covers.  */
1122       for (location_t loc = MAP_START_LOCATION (map);
1123            loc < end_location;
1124            loc += (1 << map->m_range_bits) )
1125         {
1126           gcc_assert (pure_location_p (line_table, loc) );
1127
1128           expanded_location exploc
1129             = linemap_expand_location (line_table, map, loc);
1130
1131           if (exploc.column == 0)
1132             {
1133               /* Beginning of a new source line: draw the line.  */
1134
1135               char_span line_text = location_get_source_line (exploc.file,
1136                                                               exploc.line);
1137               if (!line_text)
1138                 break;
1139               fprintf (stream,
1140                        "%s:%3i|loc:%5i|%.*s\n",
1141                        exploc.file, exploc.line,
1142                        loc,
1143                        (int)line_text.length (), line_text.get_buffer ());
1144
1145               /* "loc" is at column 0, which means "the whole line".
1146                  Render the locations *within* the line, by underlining
1147                  it, showing the location_t numeric values
1148                  at each column.  */
1149               size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1150               if (max_col > line_text.length ())
1151                 max_col = line_text.length () + 1;
1152
1153               int len_lnum = num_digits (exploc.line);
1154               if (len_lnum < 3)
1155                 len_lnum = 3;
1156               int len_loc = num_digits (loc);
1157               if (len_loc < 5)
1158                 len_loc = 5;
1159
1160               int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1161
1162               /* Thousands.  */
1163               if (end_location > 999)
1164                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1165
1166               /* Hundreds.  */
1167               if (end_location > 99)
1168                 write_digit_row (stream, indent, map, loc, max_col, 100);
1169
1170               /* Tens.  */
1171               write_digit_row (stream, indent, map, loc, max_col, 10);
1172
1173               /* Units.  */
1174               write_digit_row (stream, indent, map, loc, max_col, 1);
1175             }
1176         }
1177       fprintf (stream, "\n");
1178     }
1179
1180   /* Visualize unallocated values.  */
1181   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1182                                 line_table->highest_location,
1183                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1184
1185   /* Visualize the macro line_map instances, rendering the sources. */
1186   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1187     {
1188       /* Each macro map that is allocated owns location_t values
1189          that are *lower* that the one before them.
1190          Hence it's meaningful to view them either in order of ascending
1191          source locations, or in order of ascending macro map index.  */
1192       const bool ascending_location_ts = true;
1193       unsigned int idx = (ascending_location_ts
1194                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1195                           : i);
1196       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1197       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1198                idx,
1199                linemap_map_get_macro_name (map),
1200                MACRO_MAP_NUM_MACRO_TOKENS (map));
1201       dump_location_range (stream,
1202                            map->start_location,
1203                            (map->start_location
1204                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1205       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1206               "expansion point is location %i",
1207               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1208       fprintf (stream, "  map->start_location: %u\n",
1209                map->start_location);
1210
1211       fprintf (stream, "  macro_locations:\n");
1212       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1213         {
1214           location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1215           location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1216
1217           /* linemap_add_macro_token encodes token numbers in an expansion
1218              by putting them after MAP_START_LOCATION. */
1219
1220           /* I'm typically seeing 4 uninitialized entries at the end of
1221              0xafafafaf.
1222              This appears to be due to macro.c:replace_args
1223              adding 2 extra args for padding tokens; presumably there may
1224              be a leading and/or trailing padding token injected,
1225              each for 2 more location slots.
1226              This would explain there being up to 4 location_ts slots
1227              that may be uninitialized.  */
1228
1229           fprintf (stream, "    %u: %u, %u\n",
1230                    i,
1231                    x,
1232                    y);
1233           if (x == y)
1234             {
1235               if (x < MAP_START_LOCATION (map))
1236                 inform (x, "token %u has %<x-location == y-location == %u%>",
1237                         i, x);
1238               else
1239                 fprintf (stream,
1240                          "x-location == y-location == %u encodes token # %u\n",
1241                          x, x - MAP_START_LOCATION (map));
1242                 }
1243           else
1244             {
1245               inform (x, "token %u has %<x-location == %u%>", i, x);
1246               inform (x, "token %u has %<y-location == %u%>", i, y);
1247             }
1248         }
1249       fprintf (stream, "\n");
1250     }
1251
1252   /* It appears that MAX_LOCATION_T itself is never assigned to a
1253      macro map, presumably due to an off-by-one error somewhere
1254      between the logic in linemap_enter_macro and
1255      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1256   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1257                                 MAX_LOCATION_T,
1258                                 MAX_LOCATION_T + 1);
1259
1260   /* Visualize ad-hoc values.  */
1261   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1262                                 MAX_LOCATION_T + 1, UINT_MAX);
1263 }
1264
1265 /* string_concat's constructor.  */
1266
1267 string_concat::string_concat (int num, location_t *locs)
1268   : m_num (num)
1269 {
1270   m_locs = ggc_vec_alloc <location_t> (num);
1271   for (int i = 0; i < num; i++)
1272     m_locs[i] = locs[i];
1273 }
1274
1275 /* string_concat_db's constructor.  */
1276
1277 string_concat_db::string_concat_db ()
1278 {
1279   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1280 }
1281
1282 /* Record that a string concatenation occurred, covering NUM
1283    string literal tokens.  LOCS is an array of size NUM, containing the
1284    locations of the tokens.  A copy of LOCS is taken.  */
1285
1286 void
1287 string_concat_db::record_string_concatenation (int num, location_t *locs)
1288 {
1289   gcc_assert (num > 1);
1290   gcc_assert (locs);
1291
1292   location_t key_loc = get_key_loc (locs[0]);
1293
1294   string_concat *concat
1295     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1296   m_table->put (key_loc, concat);
1297 }
1298
1299 /* Determine if LOC was the location of the initial token of a
1300    concatenation of string literal tokens.
1301    If so, *OUT_NUM is written to with the number of tokens, and
1302    *OUT_LOCS with the location of an array of locations of the
1303    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1304    storage owned by the string_concat_db.
1305    Otherwise, return false.  */
1306
1307 bool
1308 string_concat_db::get_string_concatenation (location_t loc,
1309                                             int *out_num,
1310                                             location_t **out_locs)
1311 {
1312   gcc_assert (out_num);
1313   gcc_assert (out_locs);
1314
1315   location_t key_loc = get_key_loc (loc);
1316
1317   string_concat **concat = m_table->get (key_loc);
1318   if (!concat)
1319     return false;
1320
1321   *out_num = (*concat)->m_num;
1322   *out_locs =(*concat)->m_locs;
1323   return true;
1324 }
1325
1326 /* Internal function.  Canonicalize LOC into a form suitable for
1327    use as a key within the database, stripping away macro expansion,
1328    ad-hoc information, and range information, using the location of
1329    the start of LOC within an ordinary linemap.  */
1330
1331 location_t
1332 string_concat_db::get_key_loc (location_t loc)
1333 {
1334   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1335                                   NULL);
1336
1337   loc = get_range_from_loc (line_table, loc).m_start;
1338
1339   return loc;
1340 }
1341
1342 /* Helper class for use within get_substring_ranges_for_loc.
1343    An vec of cpp_string with responsibility for releasing all of the
1344    str->text for each str in the vector.  */
1345
1346 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1347 {
1348  public:
1349   auto_cpp_string_vec (int alloc)
1350     : auto_vec <cpp_string> (alloc) {}
1351
1352   ~auto_cpp_string_vec ()
1353   {
1354     /* Clean up the copies within this vec.  */
1355     int i;
1356     cpp_string *str;
1357     FOR_EACH_VEC_ELT (*this, i, str)
1358       free (const_cast <unsigned char *> (str->text));
1359   }
1360 };
1361
1362 /* Attempt to populate RANGES with source location information on the
1363    individual characters within the string literal found at STRLOC.
1364    If CONCATS is non-NULL, then any string literals that the token at
1365    STRLOC  was concatenated with are also added to RANGES.
1366
1367    Return NULL if successful, or an error message if any errors occurred (in
1368    which case RANGES may be only partially populated and should not
1369    be used).
1370
1371    This is implemented by re-parsing the relevant source line(s).  */
1372
1373 static const char *
1374 get_substring_ranges_for_loc (cpp_reader *pfile,
1375                               string_concat_db *concats,
1376                               location_t strloc,
1377                               enum cpp_ttype type,
1378                               cpp_substring_ranges &ranges)
1379 {
1380   gcc_assert (pfile);
1381
1382   if (strloc == UNKNOWN_LOCATION)
1383     return "unknown location";
1384
1385   /* Reparsing the strings requires accurate location information.
1386      If -ftrack-macro-expansion has been overridden from its default
1387      of 2, then we might have a location of a macro expansion point,
1388      rather than the location of the literal itself.
1389      Avoid this by requiring that we have full macro expansion tracking
1390      for substring locations to be available.  */
1391   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1392     return "track_macro_expansion != 2";
1393
1394   /* If #line or # 44 "file"-style directives are present, then there's
1395      no guarantee that the line numbers we have can be used to locate
1396      the strings.  For example, we might have a .i file with # directives
1397      pointing back to lines within a .c file, but the .c file might
1398      have been edited since the .i file was created.
1399      In such a case, the safest course is to disable on-demand substring
1400      locations.  */
1401   if (line_table->seen_line_directive)
1402     return "seen line directive";
1403
1404   /* If string concatenation has occurred at STRLOC, get the locations
1405      of all of the literal tokens making up the compound string.
1406      Otherwise, just use STRLOC.  */
1407   int num_locs = 1;
1408   location_t *strlocs = &strloc;
1409   if (concats)
1410     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1411
1412   auto_cpp_string_vec strs (num_locs);
1413   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1414   for (int i = 0; i < num_locs; i++)
1415     {
1416       /* Get range of strloc.  We will use it to locate the start and finish
1417          of the literal token within the line.  */
1418       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1419
1420       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1421         {
1422           /* If the string token was within a macro expansion, then we can
1423              cope with it for the simple case where we have a single token.
1424              Otherwise, bail out.  */
1425           if (src_range.m_start != src_range.m_finish)
1426             return "macro expansion";
1427         }
1428       else
1429         {
1430           if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1431             /* If so, we can't reliably determine where the token started within
1432                its line.  */
1433             return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1434
1435           if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1436             /* If so, we can't reliably determine where the token finished
1437                within its line.  */
1438             return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1439         }
1440
1441       expanded_location start
1442         = expand_location_to_spelling_point (src_range.m_start,
1443                                              LOCATION_ASPECT_START);
1444       expanded_location finish
1445         = expand_location_to_spelling_point (src_range.m_finish,
1446                                              LOCATION_ASPECT_FINISH);
1447       if (start.file != finish.file)
1448         return "range endpoints are in different files";
1449       if (start.line != finish.line)
1450         return "range endpoints are on different lines";
1451       if (start.column > finish.column)
1452         return "range endpoints are reversed";
1453
1454       char_span line = location_get_source_line (start.file, start.line);
1455       if (!line)
1456         return "unable to read source line";
1457
1458       /* Determine the location of the literal (including quotes
1459          and leading prefix chars, such as the 'u' in a u""
1460          token).  */
1461       size_t literal_length = finish.column - start.column + 1;
1462
1463       /* Ensure that we don't crash if we got the wrong location.  */
1464       if (line.length () < (start.column - 1 + literal_length))
1465         return "line is not wide enough";
1466
1467       char_span literal = line.subspan (start.column - 1, literal_length);
1468
1469       cpp_string from;
1470       from.len = literal_length;
1471       /* Make a copy of the literal, to avoid having to rely on
1472          the lifetime of the copy of the line within the cache.
1473          This will be released by the auto_cpp_string_vec dtor.  */
1474       from.text = (unsigned char *)literal.xstrdup ();
1475       strs.safe_push (from);
1476
1477       /* For very long lines, a new linemap could have started
1478          halfway through the token.
1479          Ensure that the loc_reader uses the linemap of the
1480          *end* of the token for its start location.  */
1481       const line_map_ordinary *start_ord_map;
1482       linemap_resolve_location (line_table, src_range.m_start,
1483                                 LRK_SPELLING_LOCATION, &start_ord_map);
1484       const line_map_ordinary *final_ord_map;
1485       linemap_resolve_location (line_table, src_range.m_finish,
1486                                 LRK_SPELLING_LOCATION, &final_ord_map);
1487       if (start_ord_map == NULL || final_ord_map == NULL)
1488         return "failed to get ordinary maps";
1489       /* Bulletproofing.  We ought to only have different ordinary maps
1490          for start vs finish due to line-length jumps.  */
1491       if (start_ord_map != final_ord_map
1492           && start_ord_map->to_file != final_ord_map->to_file)
1493         return "start and finish are spelled in different ordinary maps";
1494       /* The file from linemap_resolve_location ought to match that from
1495          expand_location_to_spelling_point.  */
1496       if (start_ord_map->to_file != start.file)
1497         return "mismatching file after resolving linemap";
1498
1499       location_t start_loc
1500         = linemap_position_for_line_and_column (line_table, final_ord_map,
1501                                                 start.line, start.column);
1502
1503       cpp_string_location_reader loc_reader (start_loc, line_table);
1504       loc_readers.safe_push (loc_reader);
1505     }
1506
1507   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1508   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1509                                                  loc_readers.address (),
1510                                                  num_locs, &ranges, type);
1511   if (err)
1512     return err;
1513
1514   /* Success: "ranges" should now contain information on the string.  */
1515   return NULL;
1516 }
1517
1518 /* Attempt to populate *OUT_LOC with source location information on the
1519    given characters within the string literal found at STRLOC.
1520    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1521    character set.
1522
1523    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1524    and string literal "012345\n789"
1525    *OUT_LOC is written to with:
1526      "012345\n789"
1527          ~^~~~~
1528
1529    If CONCATS is non-NULL, then any string literals that the token at
1530    STRLOC was concatenated with are also considered.
1531
1532    This is implemented by re-parsing the relevant source line(s).
1533
1534    Return NULL if successful, or an error message if any errors occurred.
1535    Error messages are intended for GCC developers (to help debugging) rather
1536    than for end-users.  */
1537
1538 const char *
1539 get_location_within_string (cpp_reader *pfile,
1540                             string_concat_db *concats,
1541                             location_t strloc,
1542                             enum cpp_ttype type,
1543                             int caret_idx, int start_idx, int end_idx,
1544                             location_t *out_loc)
1545 {
1546   gcc_checking_assert (caret_idx >= 0);
1547   gcc_checking_assert (start_idx >= 0);
1548   gcc_checking_assert (end_idx >= 0);
1549   gcc_assert (out_loc);
1550
1551   cpp_substring_ranges ranges;
1552   const char *err
1553     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1554   if (err)
1555     return err;
1556
1557   if (caret_idx >= ranges.get_num_ranges ())
1558     return "caret_idx out of range";
1559   if (start_idx >= ranges.get_num_ranges ())
1560     return "start_idx out of range";
1561   if (end_idx >= ranges.get_num_ranges ())
1562     return "end_idx out of range";
1563
1564   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1565                             ranges.get_range (start_idx).m_start,
1566                             ranges.get_range (end_idx).m_finish);
1567   return NULL;
1568 }
1569
1570 #if CHECKING_P
1571
1572 namespace selftest {
1573
1574 /* Selftests of location handling.  */
1575
1576 /* Attempt to populate *OUT_RANGE with source location information on the
1577    given character within the string literal found at STRLOC.
1578    CHAR_IDX refers to an offset within the execution character set.
1579    If CONCATS is non-NULL, then any string literals that the token at
1580    STRLOC was concatenated with are also considered.
1581
1582    This is implemented by re-parsing the relevant source line(s).
1583
1584    Return NULL if successful, or an error message if any errors occurred.
1585    Error messages are intended for GCC developers (to help debugging) rather
1586    than for end-users.  */
1587
1588 static const char *
1589 get_source_range_for_char (cpp_reader *pfile,
1590                            string_concat_db *concats,
1591                            location_t strloc,
1592                            enum cpp_ttype type,
1593                            int char_idx,
1594                            source_range *out_range)
1595 {
1596   gcc_checking_assert (char_idx >= 0);
1597   gcc_assert (out_range);
1598
1599   cpp_substring_ranges ranges;
1600   const char *err
1601     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1602   if (err)
1603     return err;
1604
1605   if (char_idx >= ranges.get_num_ranges ())
1606     return "char_idx out of range";
1607
1608   *out_range = ranges.get_range (char_idx);
1609   return NULL;
1610 }
1611
1612 /* As get_source_range_for_char, but write to *OUT the number
1613    of ranges that are available.  */
1614
1615 static const char *
1616 get_num_source_ranges_for_substring (cpp_reader *pfile,
1617                                      string_concat_db *concats,
1618                                      location_t strloc,
1619                                      enum cpp_ttype type,
1620                                      int *out)
1621 {
1622   gcc_assert (out);
1623
1624   cpp_substring_ranges ranges;
1625   const char *err
1626     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1627
1628   if (err)
1629     return err;
1630
1631   *out = ranges.get_num_ranges ();
1632   return NULL;
1633 }
1634
1635 /* Selftests of location handling.  */
1636
1637 /* Verify that compare() on linenum_type handles comparisons over the full
1638    range of the type.  */
1639
1640 static void
1641 test_linenum_comparisons ()
1642 {
1643   linenum_type min_line (0);
1644   linenum_type max_line (0xffffffff);
1645   ASSERT_EQ (0, compare (min_line, min_line));
1646   ASSERT_EQ (0, compare (max_line, max_line));
1647
1648   ASSERT_GT (compare (max_line, min_line), 0);
1649   ASSERT_LT (compare (min_line, max_line), 0);
1650 }
1651
1652 /* Helper function for verifying location data: when location_t
1653    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1654    as having column 0.  */
1655
1656 static bool
1657 should_have_column_data_p (location_t loc)
1658 {
1659   if (IS_ADHOC_LOC (loc))
1660     loc = get_location_from_adhoc_loc (line_table, loc);
1661   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1662     return false;
1663   return true;
1664 }
1665
1666 /* Selftest for should_have_column_data_p.  */
1667
1668 static void
1669 test_should_have_column_data_p ()
1670 {
1671   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1672   ASSERT_TRUE
1673     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1674   ASSERT_FALSE
1675     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1676 }
1677
1678 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1679    on LOC.  */
1680
1681 static void
1682 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1683               location_t loc)
1684 {
1685   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1686   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1687   /* If location_t values are sufficiently high, then column numbers
1688      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1689      When close to the threshold, column numbers *may* be present: if
1690      the final linemap before the threshold contains a line that straddles
1691      the threshold, locations in that line have column information.  */
1692   if (should_have_column_data_p (loc))
1693     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1694 }
1695
/* Several selftests build a line table and populate it with one or
   more line maps.

   To maximize coverage those tests are run under a range of
   configurations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for location_t beyond which line-map.c changes behavior
   (disabling of the range-packing optimization, disabling of
   column-tracking).  These can be exercised by starting the line_table
   at interesting values at or near these thresholds.

   An instance of the following class describes one cell of that test
   matrix.  */

class line_table_case
{
public:
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value given to line_table->default_range_bits for this case.  */
  int m_default_range_bits;

  /* If non-zero, the initial highest_location/highest_line of the
     line_table for this case.  */
  int m_base_location;
};
1723
1724 /* Constructor.  Store the old value of line_table, and create a new
1725    one, using sane defaults.  */
1726
1727 line_table_test::line_table_test ()
1728 {
1729   gcc_assert (saved_line_table == NULL);
1730   saved_line_table = line_table;
1731   line_table = ggc_alloc<line_maps> ();
1732   linemap_init (line_table, BUILTINS_LOCATION);
1733   gcc_assert (saved_line_table->reallocator);
1734   line_table->reallocator = saved_line_table->reallocator;
1735   gcc_assert (saved_line_table->round_alloc_size);
1736   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1737   line_table->default_range_bits = 0;
1738 }
1739
1740 /* Constructor.  Store the old value of line_table, and create a new
1741    one, using the sitation described in CASE_.  */
1742
1743 line_table_test::line_table_test (const line_table_case &case_)
1744 {
1745   gcc_assert (saved_line_table == NULL);
1746   saved_line_table = line_table;
1747   line_table = ggc_alloc<line_maps> ();
1748   linemap_init (line_table, BUILTINS_LOCATION);
1749   gcc_assert (saved_line_table->reallocator);
1750   line_table->reallocator = saved_line_table->reallocator;
1751   gcc_assert (saved_line_table->round_alloc_size);
1752   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1753   line_table->default_range_bits = case_.m_default_range_bits;
1754   if (case_.m_base_location)
1755     {
1756       line_table->highest_location = case_.m_base_location;
1757       line_table->highest_line = case_.m_base_location;
1758     }
1759 }
1760
1761 /* Destructor.  Restore the old value of line_table.  */
1762
1763 line_table_test::~line_table_test ()
1764 {
1765   gcc_assert (saved_line_table != NULL);
1766   line_table = saved_line_table;
1767   saved_line_table = NULL;
1768 }
1769
1770 /* Verify basic operation of ordinary linemaps.  */
1771
1772 static void
1773 test_accessing_ordinary_linemaps (const line_table_case &case_)
1774 {
1775   line_table_test ltt (case_);
1776
1777   /* Build a simple linemap describing some locations. */
1778   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1779
1780   linemap_line_start (line_table, 1, 100);
1781   location_t loc_a = linemap_position_for_column (line_table, 1);
1782   location_t loc_b = linemap_position_for_column (line_table, 23);
1783
1784   linemap_line_start (line_table, 2, 100);
1785   location_t loc_c = linemap_position_for_column (line_table, 1);
1786   location_t loc_d = linemap_position_for_column (line_table, 17);
1787
1788   /* Example of a very long line.  */
1789   linemap_line_start (line_table, 3, 2000);
1790   location_t loc_e = linemap_position_for_column (line_table, 700);
1791
1792   /* Transitioning back to a short line.  */
1793   linemap_line_start (line_table, 4, 0);
1794   location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
1795
1796   if (should_have_column_data_p (loc_back_to_short))
1797     {
1798       /* Verify that we switched to short lines in the linemap.  */
1799       line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1800       ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
1801     }
1802
1803   /* Example of a line that will eventually be seen to be longer
1804      than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
1805      below that.  */
1806   linemap_line_start (line_table, 5, 2000);
1807
1808   location_t loc_start_of_very_long_line
1809     = linemap_position_for_column (line_table, 2000);
1810   location_t loc_too_wide
1811     = linemap_position_for_column (line_table, 4097);
1812   location_t loc_too_wide_2
1813     = linemap_position_for_column (line_table, 4098);
1814
1815   /* ...and back to a sane line length.  */
1816   linemap_line_start (line_table, 6, 100);
1817   location_t loc_sane_again = linemap_position_for_column (line_table, 10);
1818
1819   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1820
1821   /* Multiple files.  */
1822   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1823   linemap_line_start (line_table, 1, 200);
1824   location_t loc_f = linemap_position_for_column (line_table, 150);
1825   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1826
1827   /* Verify that we can recover the location info.  */
1828   assert_loceq ("foo.c", 1, 1, loc_a);
1829   assert_loceq ("foo.c", 1, 23, loc_b);
1830   assert_loceq ("foo.c", 2, 1, loc_c);
1831   assert_loceq ("foo.c", 2, 17, loc_d);
1832   assert_loceq ("foo.c", 3, 700, loc_e);
1833   assert_loceq ("foo.c", 4, 100, loc_back_to_short);
1834
1835   /* In the very wide line, the initial location should be fully tracked.  */
1836   assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
1837   /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
1838      be disabled.  */
1839   assert_loceq ("foo.c", 5, 0, loc_too_wide);
1840   assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
1841   /*...and column-tracking should be re-enabled for subsequent lines.  */
1842   assert_loceq ("foo.c", 6, 10, loc_sane_again);
1843
1844   assert_loceq ("bar.c", 1, 150, loc_f);
1845
1846   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
1847   ASSERT_TRUE (pure_location_p (line_table, loc_a));
1848
1849   /* Verify using make_location to build a range, and extracting data
1850      back from it.  */
1851   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
1852   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
1853   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
1854   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
1855   ASSERT_EQ (loc_b, src_range.m_start);
1856   ASSERT_EQ (loc_d, src_range.m_finish);
1857 }
1858
1859 /* Verify various properties of UNKNOWN_LOCATION.  */
1860
1861 static void
1862 test_unknown_location ()
1863 {
1864   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
1865   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
1866   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
1867 }
1868
1869 /* Verify various properties of BUILTINS_LOCATION.  */
1870
1871 static void
1872 test_builtins ()
1873 {
1874   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
1875   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
1876 }
1877
1878 /* Regression test for make_location.
1879    Ensure that we use pure locations for the start/finish of the range,
1880    rather than storing a packed or ad-hoc range as the start/finish.  */
1881
1882 static void
1883 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
1884 {
1885   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
1886      with C++ frontend.
1887      ....................0000000001111111111222.
1888      ....................1234567890123456789012.  */
1889   const char *content = "     r += !aaa == bbb;\n";
1890   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
1891   line_table_test ltt (case_);
1892   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
1893
1894   const location_t c11 = linemap_position_for_column (line_table, 11);
1895   const location_t c12 = linemap_position_for_column (line_table, 12);
1896   const location_t c13 = linemap_position_for_column (line_table, 13);
1897   const location_t c14 = linemap_position_for_column (line_table, 14);
1898   const location_t c21 = linemap_position_for_column (line_table, 21);
1899
1900   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
1901     return;
1902
1903   /* Use column 13 for the caret location, arbitrarily, to verify that we
1904      handle start != caret.  */
1905   const location_t aaa = make_location (c13, c12, c14);
1906   ASSERT_EQ (c13, get_pure_location (aaa));
1907   ASSERT_EQ (c12, get_start (aaa));
1908   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
1909   ASSERT_EQ (c14, get_finish (aaa));
1910   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
1911
1912   /* Make a location using a location with a range as the start-point.  */
1913   const location_t not_aaa = make_location (c11, aaa, c14);
1914   ASSERT_EQ (c11, get_pure_location (not_aaa));
1915   /* It should use the start location of the range, not store the range
1916      itself.  */
1917   ASSERT_EQ (c12, get_start (not_aaa));
1918   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
1919   ASSERT_EQ (c14, get_finish (not_aaa));
1920   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
1921
1922   /* Similarly, make a location with a range as the end-point.  */
1923   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
1924   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
1925   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
1926   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
1927   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
1928   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
1929   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
1930   /* It should use the finish location of the range, not store the range
1931      itself.  */
1932   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
1933   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
1934   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
1935   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
1936   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
1937 }
1938
1939 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
1940
1941 static void
1942 test_reading_source_line ()
1943 {
1944   /* Create a tempfile and write some text to it.  */
1945   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
1946                         "01234567890123456789\n"
1947                         "This is the test text\n"
1948                         "This is the 3rd line");
1949
1950   /* Read back a specific line from the tempfile.  */
1951   char_span source_line = location_get_source_line (tmp.get_filename (), 3);
1952   ASSERT_TRUE (source_line);
1953   ASSERT_TRUE (source_line.get_buffer () != NULL);
1954   ASSERT_EQ (20, source_line.length ());
1955   ASSERT_TRUE (!strncmp ("This is the 3rd line",
1956                          source_line.get_buffer (), source_line.length ()));
1957
1958   source_line = location_get_source_line (tmp.get_filename (), 2);
1959   ASSERT_TRUE (source_line);
1960   ASSERT_TRUE (source_line.get_buffer () != NULL);
1961   ASSERT_EQ (21, source_line.length ());
1962   ASSERT_TRUE (!strncmp ("This is the test text",
1963                          source_line.get_buffer (), source_line.length ()));
1964
1965   source_line = location_get_source_line (tmp.get_filename (), 4);
1966   ASSERT_FALSE (source_line);
1967   ASSERT_TRUE (source_line.get_buffer () == NULL);
1968 }
1969
1970 /* Tests of lexing.  */
1971
/* Assert that cpp_token_as_text for token TOK of PARSER yields a string
   equal to EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    const unsigned char *token_text                                     \
      = cpp_token_as_text ((PARSER), (TOK));                            \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *) token_text);          \
  SELFTEST_END_STMT
1980
1981 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
1982    and ranges from EXP_START_COL to EXP_FINISH_COL.
1983    Use LOC as the effective location of the selftest.  */
1984
1985 static void
1986 assert_token_loc_eq (const location &loc,
1987                      const cpp_token *tok,
1988                      const char *exp_filename, int exp_linenum,
1989                      int exp_start_col, int exp_finish_col)
1990 {
1991   location_t tok_loc = tok->src_loc;
1992   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
1993   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
1994
1995   /* If location_t values are sufficiently high, then column numbers
1996      will be unavailable.  */
1997   if (!should_have_column_data_p (tok_loc))
1998     return;
1999
2000   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2001   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2002   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2003   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2004 }
2005
/* Check TOK->src_loc via assert_token_loc_eq, passing SELFTEST_LOCATION
   as the location to report for any failure.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,             \
                            EXP_START_COL, EXP_FINISH_COL)              \
  assert_token_loc_eq (SELFTEST_LOCATION,                               \
                       (TOK),                                           \
                       (EXP_FILENAME), (EXP_LINENUM),                   \
                       (EXP_START_COL), (EXP_FINISH_COL))
2013
2014 /* Test of lexing a file using libcpp, verifying tokens and their
2015    location information.  */
2016
2017 static void
2018 test_lexer (const line_table_case &case_)
2019 {
2020   /* Create a tempfile and write some text to it.  */
2021   const char *content =
2022     /*00000000011111111112222222222333333.3333444444444.455555555556
2023       12345678901234567890123456789012345.6789012345678.901234567890.  */
2024     ("test_name /* c-style comment */\n"
2025      "                                  \"test literal\"\n"
2026      " // test c++-style comment\n"
2027      "   42\n");
2028   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2029
2030   line_table_test ltt (case_);
2031
2032   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2033
2034   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2035   ASSERT_NE (fname, NULL);
2036
2037   /* Verify that we get the expected tokens back, with the correct
2038      location information.  */
2039
2040   location_t loc;
2041   const cpp_token *tok;
2042   tok = cpp_get_token_with_location (parser, &loc);
2043   ASSERT_NE (tok, NULL);
2044   ASSERT_EQ (tok->type, CPP_NAME);
2045   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2046   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2047
2048   tok = cpp_get_token_with_location (parser, &loc);
2049   ASSERT_NE (tok, NULL);
2050   ASSERT_EQ (tok->type, CPP_STRING);
2051   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2052   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2053
2054   tok = cpp_get_token_with_location (parser, &loc);
2055   ASSERT_NE (tok, NULL);
2056   ASSERT_EQ (tok->type, CPP_NUMBER);
2057   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2058   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2059
2060   tok = cpp_get_token_with_location (parser, &loc);
2061   ASSERT_NE (tok, NULL);
2062   ASSERT_EQ (tok->type, CPP_EOF);
2063
2064   cpp_finish (parser, NULL);
2065   cpp_destroy (parser);
2066 }
2067
2068 /* Forward decls.  */
2069
2070 class lexer_test;
2071 class lexer_test_options;
2072
2073 /* A class for specifying options of a lexer_test.
2074    The "apply" vfunc is called during the lexer_test constructor.  */
2075
2076 class lexer_test_options
2077 {
2078  public:
2079   virtual void apply (lexer_test &) = 0;
2080 };
2081
2082 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2083    in its dtor.
2084
2085    This is needed by struct lexer_test to ensure that the cleanup of the
2086    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2087
2088 class cpp_reader_ptr
2089 {
2090  public:
2091   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2092
2093   ~cpp_reader_ptr ()
2094   {
2095     cpp_finish (m_ptr, NULL);
2096     cpp_destroy (m_ptr);
2097   }
2098
2099   operator cpp_reader * () const { return m_ptr; }
2100
2101  private:
2102   cpp_reader *m_ptr;
2103 };
2104
2105 /* A struct for writing lexer tests.  */
2106
2107 class lexer_test
2108 {
2109 public:
2110   lexer_test (const line_table_case &case_, const char *content,
2111               lexer_test_options *options);
2112   ~lexer_test ();
2113
2114   const cpp_token *get_token ();
2115
2116   /* The ordering of these fields matters.
2117      The line_table_test must be first, since the cpp_reader_ptr
2118      uses it.
2119      The cpp_reader must be cleaned up *after* the temp_source_file
2120      since the filenames in input.c's input cache are owned by the
2121      cpp_reader; in particular, when ~temp_source_file evicts the
2122      filename the filenames must still be alive.  */
2123   line_table_test m_ltt;
2124   cpp_reader_ptr m_parser;
2125   temp_source_file m_tempfile;
2126   string_concat_db m_concats;
2127   bool m_implicitly_expect_EOF;
2128 };
2129
2130 /* Use an EBCDIC encoding for the execution charset, specifically
2131    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2132
2133    This exercises iconv integration within libcpp.
2134    Not every build of iconv supports the given charset,
2135    so we need to flag this error and handle it gracefully.  */
2136
2137 class ebcdic_execution_charset : public lexer_test_options
2138 {
2139  public:
2140   ebcdic_execution_charset () : m_num_iconv_errors (0)
2141     {
2142       gcc_assert (s_singleton == NULL);
2143       s_singleton = this;
2144     }
2145   ~ebcdic_execution_charset ()
2146     {
2147       gcc_assert (s_singleton == this);
2148       s_singleton = NULL;
2149     }
2150
2151   void apply (lexer_test &test) FINAL OVERRIDE
2152   {
2153     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2154     cpp_opts->narrow_charset = "IBM1047";
2155
2156     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2157     callbacks->diagnostic = on_diagnostic;
2158   }
2159
2160   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2161                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2162                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2163                              rich_location *richloc ATTRIBUTE_UNUSED,
2164                              const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2165     ATTRIBUTE_FPTR_PRINTF(5,0)
2166   {
2167     gcc_assert (s_singleton);
2168     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2169     const char *msg = "conversion from %s to %s not supported by iconv";
2170 #ifdef ENABLE_NLS
2171     msg = dgettext ("cpplib", msg);
2172 #endif
2173     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2174        when the local iconv build doesn't support the conversion.  */
2175     if (strcmp (msgid, msg) == 0)
2176       {
2177         s_singleton->m_num_iconv_errors++;
2178         return true;
2179       }
2180
2181     /* Otherwise, we have an unexpected error.  */
2182     abort ();
2183   }
2184
2185   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2186
2187  private:
2188   static ebcdic_execution_charset *s_singleton;
2189   int m_num_iconv_errors;
2190 };
2191
2192 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2193
2194 /* A lexer_test_options subclass that records a list of diagnostic
2195    messages emitted by the lexer.  */
2196
2197 class lexer_diagnostic_sink : public lexer_test_options
2198 {
2199  public:
2200   lexer_diagnostic_sink ()
2201   {
2202     gcc_assert (s_singleton == NULL);
2203     s_singleton = this;
2204   }
2205   ~lexer_diagnostic_sink ()
2206   {
2207     gcc_assert (s_singleton == this);
2208     s_singleton = NULL;
2209
2210     int i;
2211     char *str;
2212     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2213       free (str);
2214   }
2215
2216   void apply (lexer_test &test) FINAL OVERRIDE
2217   {
2218     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2219     callbacks->diagnostic = on_diagnostic;
2220   }
2221
2222   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2223                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2224                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2225                              rich_location *richloc ATTRIBUTE_UNUSED,
2226                              const char *msgid, va_list *ap)
2227     ATTRIBUTE_FPTR_PRINTF(5,0)
2228   {
2229     char *msg = xvasprintf (msgid, *ap);
2230     s_singleton->m_diagnostics.safe_push (msg);
2231     return true;
2232   }
2233
2234   auto_vec<char *> m_diagnostics;
2235
2236  private:
2237   static lexer_diagnostic_sink *s_singleton;
2238 };
2239
2240 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2241
2242 /* Constructor.  Override line_table with a new instance based on CASE_,
2243    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2244    start parsing the tempfile.  */
2245
2246 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2247                         lexer_test_options *options)
2248 : m_ltt (case_),
2249   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2250   /* Create a tempfile and write the text to it.  */
2251   m_tempfile (SELFTEST_LOCATION, ".c", content),
2252   m_concats (),
2253   m_implicitly_expect_EOF (true)
2254 {
2255   if (options)
2256     options->apply (*this);
2257
2258   cpp_init_iconv (m_parser);
2259
2260   /* Parse the file.  */
2261   const char *fname = cpp_read_main_file (m_parser,
2262                                           m_tempfile.get_filename ());
2263   ASSERT_NE (fname, NULL);
2264 }
2265
2266 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2267
2268 lexer_test::~lexer_test ()
2269 {
2270   location_t loc;
2271   const cpp_token *tok;
2272
2273   if (m_implicitly_expect_EOF)
2274     {
2275       tok = cpp_get_token_with_location (m_parser, &loc);
2276       ASSERT_NE (tok, NULL);
2277       ASSERT_EQ (tok->type, CPP_EOF);
2278     }
2279 }
2280
2281 /* Get the next token from m_parser.  */
2282
2283 const cpp_token *
2284 lexer_test::get_token ()
2285 {
2286   location_t loc;
2287   const cpp_token *tok;
2288
2289   tok = cpp_get_token_with_location (m_parser, &loc);
2290   ASSERT_NE (tok, NULL);
2291   return tok;
2292 }
2293
2294 /* Verify that locations within string literals are correctly handled.  */
2295
2296 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2297    using the string concatenation database for TEST.
2298
2299    Assert that the character at index IDX is on EXPECTED_LINE,
2300    and that it begins at column EXPECTED_START_COL and ends at
2301    EXPECTED_FINISH_COL (unless the locations are beyond
2302    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2303    columns).  */
2304
2305 static void
2306 assert_char_at_range (const location &loc,
2307                       lexer_test& test,
2308                       location_t strloc, enum cpp_ttype type, int idx,
2309                       int expected_line, int expected_start_col,
2310                       int expected_finish_col)
2311 {
2312   cpp_reader *pfile = test.m_parser;
2313   string_concat_db *concats = &test.m_concats;
2314
2315   source_range actual_range = source_range();
2316   const char *err
2317     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2318                                  &actual_range);
2319   if (should_have_column_data_p (strloc))
2320     ASSERT_EQ_AT (loc, NULL, err);
2321   else
2322     {
2323       ASSERT_STREQ_AT (loc,
2324                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2325                        err);
2326       return;
2327     }
2328
2329   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2330   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2331   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2332   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2333
2334   if (should_have_column_data_p (actual_range.m_start))
2335     {
2336       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2337       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2338     }
2339   if (should_have_column_data_p (actual_range.m_finish))
2340     {
2341       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2342       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2343     }
2344 }
2345
/* Convenience wrapper for assert_char_at_range that passes
   SELFTEST_LOCATION as the location to report for any failure.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX,             \
                             EXPECTED_LINE, EXPECTED_START_COL,         \
                             EXPECTED_FINISH_COL)                       \
  assert_char_at_range (SELFTEST_LOCATION,                              \
                        (LEXER_TEST), (STRLOC), (TYPE), (IDX),          \
                        (EXPECTED_LINE), (EXPECTED_START_COL),          \
                        (EXPECTED_FINISH_COL))
2354
2355 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2356    using the string concatenation database for TEST.
2357
2358    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2359
2360 static void
2361 assert_num_substring_ranges (const location &loc,
2362                              lexer_test& test,
2363                              location_t strloc,
2364                              enum cpp_ttype type,
2365                              int expected_num_ranges)
2366 {
2367   cpp_reader *pfile = test.m_parser;
2368   string_concat_db *concats = &test.m_concats;
2369
2370   int actual_num_ranges = -1;
2371   const char *err
2372     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2373                                            &actual_num_ranges);
2374   if (should_have_column_data_p (strloc))
2375     ASSERT_EQ_AT (loc, NULL, err);
2376   else
2377     {
2378       ASSERT_STREQ_AT (loc,
2379                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2380                        err);
2381       return;
2382     }
2383   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2384 }
2385
/* Convenience wrapper for assert_num_substring_ranges that passes
   SELFTEST_LOCATION as the location to report for any failure.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE,           \
                                    EXPECTED_NUM_RANGES)                \
  assert_num_substring_ranges (SELFTEST_LOCATION,                       \
                               (LEXER_TEST), (STRLOC), (TYPE),          \
                               (EXPECTED_NUM_RANGES))

2393
2394
2395 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2396    returns an error (using the string concatenation database for TEST).  */
2397
2398 static void
2399 assert_has_no_substring_ranges (const location &loc,
2400                                 lexer_test& test,
2401                                 location_t strloc,
2402                                 enum cpp_ttype type,
2403                                 const char *expected_err)
2404 {
2405   cpp_reader *pfile = test.m_parser;
2406   string_concat_db *concats = &test.m_concats;
2407   cpp_substring_ranges ranges;
2408   const char *actual_err
2409     = get_substring_ranges_for_loc (pfile, concats, strloc,
2410                                     type, ranges);
2411   if (should_have_column_data_p (strloc))
2412     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2413   else
2414     ASSERT_STREQ_AT (loc,
2415                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2416                      actual_err);
2417 }
2418
/* Convenience wrapper for assert_has_no_substring_ranges that passes
   SELFTEST_LOCATION as the location to report for any failure.  */
#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)   \
  assert_has_no_substring_ranges (SELFTEST_LOCATION,                    \
                                  (LEXER_TEST), (STRLOC),               \
                                  (TYPE), (ERR))
2422
2423 /* Lex a simple string literal.  Verify the substring location data, before
2424    and after running cpp_interpret_string on it.  */
2425
2426 static void
2427 test_lexer_string_locations_simple (const line_table_case &case_)
2428 {
2429   /* Digits 0-9 (with 0 at column 10), the simple way.
2430      ....................000000000.11111111112.2222222223333333333
2431      ....................123456789.01234567890.1234567890123456789
2432      We add a trailing comment to ensure that we correctly locate
2433      the end of the string literal token.  */
2434   const char *content = "        \"0123456789\" /* not a string */\n";
2435   lexer_test test (case_, content, NULL);
2436
2437   /* Verify that we get the expected token back, with the correct
2438      location information.  */
2439   const cpp_token *tok = test.get_token ();
2440   ASSERT_EQ (tok->type, CPP_STRING);
2441   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2442   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2443
2444   /* At this point in lexing, the quote characters are treated as part of
2445      the string (they are stripped off by cpp_interpret_string).  */
2446
2447   ASSERT_EQ (tok->val.str.len, 12);
2448
2449   /* Verify that cpp_interpret_string works.  */
2450   cpp_string dst_string;
2451   const enum cpp_ttype type = CPP_STRING;
2452   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2453                                       &dst_string, type);
2454   ASSERT_TRUE (result);
2455   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2456   free (const_cast <unsigned char *> (dst_string.text));
2457
2458   /* Verify ranges of individual characters.  This no longer includes the
2459      opening quote, but does include the closing quote.  */
2460   for (int i = 0; i <= 10; i++)
2461     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2462                           10 + i, 10 + i);
2463
2464   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2465 }
2466
2467 /* As test_lexer_string_locations_simple, but use an EBCDIC execution
2468    encoding.  */
2469
2470 static void
2471 test_lexer_string_locations_ebcdic (const line_table_case &case_)
2472 {
2473   /* EBCDIC support requires iconv.  */
2474   if (!HAVE_ICONV)
2475     return;
2476
2477   /* Digits 0-9 (with 0 at column 10), the simple way.
2478      ....................000000000.11111111112.2222222223333333333
2479      ....................123456789.01234567890.1234567890123456789
2480      We add a trailing comment to ensure that we correctly locate
2481      the end of the string literal token.  */
2482   const char *content = "        \"0123456789\" /* not a string */\n";
2483   ebcdic_execution_charset use_ebcdic;
2484   lexer_test test (case_, content, &use_ebcdic);
2485
2486   /* Verify that we get the expected token back, with the correct
2487      location information.  */
2488   const cpp_token *tok = test.get_token ();
2489   ASSERT_EQ (tok->type, CPP_STRING);
2490   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2491   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2492
2493   /* At this point in lexing, the quote characters are treated as part of
2494      the string (they are stripped off by cpp_interpret_string).  */
2495
2496   ASSERT_EQ (tok->val.str.len, 12);
2497
2498   /* The remainder of the test requires an iconv implementation that
2499      can convert from UTF-8 to the EBCDIC encoding requested above.  */
2500   if (use_ebcdic.iconv_errors_occurred_p ())
2501     return;
2502
2503   /* Verify that cpp_interpret_string works.  */
2504   cpp_string dst_string;
2505   const enum cpp_ttype type = CPP_STRING;
2506   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2507                                       &dst_string, type);
2508   ASSERT_TRUE (result);
2509   /* We should now have EBCDIC-encoded text, specifically
2510      IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2511      The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
2512   ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2513                 (const char *)dst_string.text);
2514   free (const_cast <unsigned char *> (dst_string.text));
2515
2516   /* Verify that we don't attempt to record substring location information
2517      for such cases.  */
2518   ASSERT_HAS_NO_SUBSTRING_RANGES
2519     (test, tok->src_loc, type,
2520      "execution character set != source character set");
2521 }
2522
2523 /* Lex a string literal containing a hex-escaped character.
2524    Verify the substring location data, before and after running
2525    cpp_interpret_string on it.  */
2526
2527 static void
2528 test_lexer_string_locations_hex (const line_table_case &case_)
2529 {
2530   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2531      and with a space in place of digit 6, to terminate the escaped
2532      hex code.
2533      ....................000000000.111111.11112222.
2534      ....................123456789.012345.67890123.  */
2535   const char *content = "        \"01234\\x35 789\"\n";
2536   lexer_test test (case_, content, NULL);
2537
2538   /* Verify that we get the expected token back, with the correct
2539      location information.  */
2540   const cpp_token *tok = test.get_token ();
2541   ASSERT_EQ (tok->type, CPP_STRING);
2542   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2543   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2544
2545   /* At this point in lexing, the quote characters are treated as part of
2546      the string (they are stripped off by cpp_interpret_string).  */
2547   ASSERT_EQ (tok->val.str.len, 15);
2548
2549   /* Verify that cpp_interpret_string works.  */
2550   cpp_string dst_string;
2551   const enum cpp_ttype type = CPP_STRING;
2552   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2553                                       &dst_string, type);
2554   ASSERT_TRUE (result);
2555   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2556   free (const_cast <unsigned char *> (dst_string.text));
2557
2558   /* Verify ranges of individual characters.  This no longer includes the
2559      opening quote, but does include the closing quote.  */
2560   for (int i = 0; i <= 4; i++)
2561     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2562   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2563   for (int i = 6; i <= 10; i++)
2564     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2565
2566   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2567 }
2568
2569 /* Lex a string literal containing an octal-escaped character.
2570    Verify the substring location data after running cpp_interpret_string
2571    on it.  */
2572
2573 static void
2574 test_lexer_string_locations_oct (const line_table_case &case_)
2575 {
2576   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2577      and with a space in place of digit 6, to terminate the escaped
2578      octal code.
2579      ....................000000000.111111.11112222.2222223333333333444
2580      ....................123456789.012345.67890123.4567890123456789012  */
2581   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2582   lexer_test test (case_, content, NULL);
2583
2584   /* Verify that we get the expected token back, with the correct
2585      location information.  */
2586   const cpp_token *tok = test.get_token ();
2587   ASSERT_EQ (tok->type, CPP_STRING);
2588   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2589
2590   /* Verify that cpp_interpret_string works.  */
2591   cpp_string dst_string;
2592   const enum cpp_ttype type = CPP_STRING;
2593   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2594                                       &dst_string, type);
2595   ASSERT_TRUE (result);
2596   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2597   free (const_cast <unsigned char *> (dst_string.text));
2598
2599   /* Verify ranges of individual characters.  This no longer includes the
2600      opening quote, but does include the closing quote.  */
2601   for (int i = 0; i < 5; i++)
2602     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2603   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2604   for (int i = 6; i <= 10; i++)
2605     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2606
2607   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2608 }
2609
2610 /* Test of string literal containing letter escapes.  */
2611
2612 static void
2613 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2614 {
2615   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2616      .....................000000000.1.11111.1.1.11222.22222223333333
2617      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2618   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2619   lexer_test test (case_, content, NULL);
2620
2621   /* Verify that we get the expected tokens back.  */
2622   const cpp_token *tok = test.get_token ();
2623   ASSERT_EQ (tok->type, CPP_STRING);
2624   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2625
2626   /* Verify ranges of individual characters. */
2627   /* "\t".  */
2628   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2629                         0, 1, 10, 11);
2630   /* "foo". */
2631   for (int i = 1; i <= 3; i++)
2632     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2633                           i, 1, 11 + i, 11 + i);
2634   /* "\\" and "\n".  */
2635   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2636                         4, 1, 15, 16);
2637   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2638                         5, 1, 17, 18);
2639
2640   /* "bar" and closing quote for nul-terminator.  */
2641   for (int i = 6; i <= 9; i++)
2642     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2643                           i, 1, 13 + i, 13 + i);
2644
2645   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2646 }
2647
2648 /* Another test of a string literal containing a letter escape.
2649    Based on string seen in
2650      printf ("%-%\n");
2651    in gcc.dg/format/c90-printf-1.c.  */
2652
2653 static void
2654 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2655 {
2656   /* .....................000000000.1111.11.1111.22222222223.
2657      .....................123456789.0123.45.6789.01234567890.  */
2658   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2659   lexer_test test (case_, content, NULL);
2660
2661   /* Verify that we get the expected tokens back.  */
2662   const cpp_token *tok = test.get_token ();
2663   ASSERT_EQ (tok->type, CPP_STRING);
2664   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2665
2666   /* Verify ranges of individual characters. */
2667   /* "%-%".  */
2668   for (int i = 0; i < 3; i++)
2669     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2670                           i, 1, 10 + i, 10 + i);
2671   /* "\n".  */
2672   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2673                         3, 1, 13, 14);
2674
2675   /* Closing quote for nul-terminator.  */
2676   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2677                         4, 1, 15, 15);
2678
2679   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2680 }
2681
2682 /* Lex a string literal containing UCN 4 characters.
2683    Verify the substring location data after running cpp_interpret_string
2684    on it.  */
2685
2686 static void
2687 test_lexer_string_locations_ucn4 (const line_table_case &case_)
2688 {
2689   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
2690      as UCN 4.
2691      ....................000000000.111111.111122.222222223.33333333344444
2692      ....................123456789.012345.678901.234567890.12345678901234  */
2693   const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
2694   lexer_test test (case_, content, NULL);
2695
2696   /* Verify that we get the expected token back, with the correct
2697      location information.  */
2698   const cpp_token *tok = test.get_token ();
2699   ASSERT_EQ (tok->type, CPP_STRING);
2700   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
2701
2702   /* Verify that cpp_interpret_string works.
2703      The string should be encoded in the execution character
2704      set.  Assuming that is UTF-8, we should have the following:
2705      -----------  ----  -----  -------  ----------------
2706      Byte offset  Byte  Octal  Unicode  Source Column(s)
2707      -----------  ----  -----  -------  ----------------
2708      0            0x30         '0'      10
2709      1            0x31         '1'      11
2710      2            0x32         '2'      12
2711      3            0x33         '3'      13
2712      4            0x34         '4'      14
2713      5            0xE2  \342   U+2174   15-20
2714      6            0x85  \205    (cont)  15-20
2715      7            0xB4  \264    (cont)  15-20
2716      8            0xE2  \342   U+2175   21-26
2717      9            0x85  \205    (cont)  21-26
2718      10           0xB5  \265    (cont)  21-26
2719      11           0x37         '7'      27
2720      12           0x38         '8'      28
2721      13           0x39         '9'      29
2722      14           0x00                  30 (closing quote)
2723      -----------  ----  -----  -------  ---------------.  */
2724
2725   cpp_string dst_string;
2726   const enum cpp_ttype type = CPP_STRING;
2727   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2728                                       &dst_string, type);
2729   ASSERT_TRUE (result);
2730   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2731                 (const char *)dst_string.text);
2732   free (const_cast <unsigned char *> (dst_string.text));
2733
2734   /* Verify ranges of individual characters.  This no longer includes the
2735      opening quote, but does include the closing quote.
2736      '01234'.  */
2737   for (int i = 0; i <= 4; i++)
2738     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2739   /* U+2174.  */
2740   for (int i = 5; i <= 7; i++)
2741     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
2742   /* U+2175.  */
2743   for (int i = 8; i <= 10; i++)
2744     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
2745   /* '789' and nul terminator  */
2746   for (int i = 11; i <= 14; i++)
2747     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
2748
2749   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2750 }
2751
2752 /* Lex a string literal containing UCN 8 characters.
2753    Verify the substring location data after running cpp_interpret_string
2754    on it.  */
2755
2756 static void
2757 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2758 {
2759   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2760      ....................000000000.111111.1111222222.2222333333333.344444
2761      ....................123456789.012345.6789012345.6789012345678.901234  */
2762   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2763   lexer_test test (case_, content, NULL);
2764
2765   /* Verify that we get the expected token back, with the correct
2766      location information.  */
2767   const cpp_token *tok = test.get_token ();
2768   ASSERT_EQ (tok->type, CPP_STRING);
2769   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2770                            "\"01234\\U00002174\\U00002175789\"");
2771
2772   /* Verify that cpp_interpret_string works.
2773      The UTF-8 encoding of the string is identical to that from
2774      the ucn4 testcase above; the only difference is the column
2775      locations.  */
2776   cpp_string dst_string;
2777   const enum cpp_ttype type = CPP_STRING;
2778   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2779                                       &dst_string, type);
2780   ASSERT_TRUE (result);
2781   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2782                 (const char *)dst_string.text);
2783   free (const_cast <unsigned char *> (dst_string.text));
2784
2785   /* Verify ranges of individual characters.  This no longer includes the
2786      opening quote, but does include the closing quote.
2787      '01234'.  */
2788   for (int i = 0; i <= 4; i++)
2789     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2790   /* U+2174.  */
2791   for (int i = 5; i <= 7; i++)
2792     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2793   /* U+2175.  */
2794   for (int i = 8; i <= 10; i++)
2795     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2796   /* '789' at columns 35-37  */
2797   for (int i = 11; i <= 13; i++)
2798     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2799   /* Closing quote/nul-terminator at column 38.  */
2800   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2801
2802   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2803 }
2804
/* Read the 32-bit big-endian value stored at PTR_BE_VALUE and return it
   in host byte order.  The value is assembled one byte at a time, most
   significant byte first, so the result does not depend on the host's
   endianness.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  uint32_t value = 0;
  for (int i = 0; i < 4; i++)
    value = (value << 8) | (uint32_t) bytes[i];
  return value;
}
2816
2817 /* Lex a wide string literal and verify that attempts to read substring
2818    location data from it fail gracefully.  */
2819
2820 static void
2821 test_lexer_string_locations_wide_string (const line_table_case &case_)
2822 {
2823   /* Digits 0-9.
2824      ....................000000000.11111111112.22222222233333
2825      ....................123456789.01234567890.12345678901234  */
2826   const char *content = "       L\"0123456789\" /* non-str */\n";
2827   lexer_test test (case_, content, NULL);
2828
2829   /* Verify that we get the expected token back, with the correct
2830      location information.  */
2831   const cpp_token *tok = test.get_token ();
2832   ASSERT_EQ (tok->type, CPP_WSTRING);
2833   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
2834
2835   /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
2836   cpp_string dst_string;
2837   const enum cpp_ttype type = CPP_WSTRING;
2838   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2839                                       &dst_string, type);
2840   ASSERT_TRUE (result);
2841   /* The cpp_reader defaults to big-endian with
2842      CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
2843      now be encoded as UTF-32BE.  */
2844   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2845   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2846   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2847   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2848   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2849   free (const_cast <unsigned char *> (dst_string.text));
2850
2851   /* We don't yet support generating substring location information
2852      for L"" strings.  */
2853   ASSERT_HAS_NO_SUBSTRING_RANGES
2854     (test, tok->src_loc, type,
2855      "execution character set != source character set");
2856 }
2857
/* Read the 16-bit big-endian value stored at PTR_BE_VALUE and return it
   in host byte order.  The two bytes are read individually, so the
   result does not depend on the host's endianness.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *) ptr_be_value;
  const unsigned int high = bytes[0];
  const unsigned int low = bytes[1];
  return (uint16_t) ((high << 8) | low);
}
2866
2867 /* Lex a u"" string literal and verify that attempts to read substring
2868    location data from it fail gracefully.  */
2869
2870 static void
2871 test_lexer_string_locations_string16 (const line_table_case &case_)
2872 {
2873   /* Digits 0-9.
2874      ....................000000000.11111111112.22222222233333
2875      ....................123456789.01234567890.12345678901234  */
2876   const char *content = "       u\"0123456789\" /* non-str */\n";
2877   lexer_test test (case_, content, NULL);
2878
2879   /* Verify that we get the expected token back, with the correct
2880      location information.  */
2881   const cpp_token *tok = test.get_token ();
2882   ASSERT_EQ (tok->type, CPP_STRING16);
2883   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
2884
2885   /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
2886   cpp_string dst_string;
2887   const enum cpp_ttype type = CPP_STRING16;
2888   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2889                                       &dst_string, type);
2890   ASSERT_TRUE (result);
2891
2892   /* The cpp_reader defaults to big-endian, so dst_string should
2893      now be encoded as UTF-16BE.  */
2894   const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
2895   ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
2896   ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
2897   ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
2898   ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
2899   free (const_cast <unsigned char *> (dst_string.text));
2900
2901   /* We don't yet support generating substring location information
2902      for L"" strings.  */
2903   ASSERT_HAS_NO_SUBSTRING_RANGES
2904     (test, tok->src_loc, type,
2905      "execution character set != source character set");
2906 }
2907
2908 /* Lex a U"" string literal and verify that attempts to read substring
2909    location data from it fail gracefully.  */
2910
2911 static void
2912 test_lexer_string_locations_string32 (const line_table_case &case_)
2913 {
2914   /* Digits 0-9.
2915      ....................000000000.11111111112.22222222233333
2916      ....................123456789.01234567890.12345678901234  */
2917   const char *content = "       U\"0123456789\" /* non-str */\n";
2918   lexer_test test (case_, content, NULL);
2919
2920   /* Verify that we get the expected token back, with the correct
2921      location information.  */
2922   const cpp_token *tok = test.get_token ();
2923   ASSERT_EQ (tok->type, CPP_STRING32);
2924   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
2925
2926   /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
2927   cpp_string dst_string;
2928   const enum cpp_ttype type = CPP_STRING32;
2929   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2930                                       &dst_string, type);
2931   ASSERT_TRUE (result);
2932
2933   /* The cpp_reader defaults to big-endian, so dst_string should
2934      now be encoded as UTF-32BE.  */
2935   const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
2936   ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
2937   ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
2938   ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
2939   ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
2940   free (const_cast <unsigned char *> (dst_string.text));
2941
2942   /* We don't yet support generating substring location information
2943      for L"" strings.  */
2944   ASSERT_HAS_NO_SUBSTRING_RANGES
2945     (test, tok->src_loc, type,
2946      "execution character set != source character set");
2947 }
2948
2949 /* Lex a u8-string literal.
2950    Verify the substring location data after running cpp_interpret_string
2951    on it.  */
2952
2953 static void
2954 test_lexer_string_locations_u8 (const line_table_case &case_)
2955 {
2956   /* Digits 0-9.
2957      ....................000000000.11111111112.22222222233333
2958      ....................123456789.01234567890.12345678901234  */
2959   const char *content = "      u8\"0123456789\" /* non-str */\n";
2960   lexer_test test (case_, content, NULL);
2961
2962   /* Verify that we get the expected token back, with the correct
2963      location information.  */
2964   const cpp_token *tok = test.get_token ();
2965   ASSERT_EQ (tok->type, CPP_UTF8STRING);
2966   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
2967
2968   /* Verify that cpp_interpret_string works.  */
2969   cpp_string dst_string;
2970   const enum cpp_ttype type = CPP_STRING;
2971   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2972                                       &dst_string, type);
2973   ASSERT_TRUE (result);
2974   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2975   free (const_cast <unsigned char *> (dst_string.text));
2976
2977   /* Verify ranges of individual characters.  This no longer includes the
2978      opening quote, but does include the closing quote.  */
2979   for (int i = 0; i <= 10; i++)
2980     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2981 }
2982
2983 /* Lex a string literal containing UTF-8 source characters.
2984    Verify the substring location data after running cpp_interpret_string
2985    on it.  */
2986
2987 static void
2988 test_lexer_string_locations_utf8_source (const line_table_case &case_)
2989 {
2990  /* This string literal is written out to the source file as UTF-8,
2991     and is of the form "before mojibake after", where "mojibake"
2992     is written as the following four unicode code points:
2993        U+6587 CJK UNIFIED IDEOGRAPH-6587
2994        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
2995        U+5316 CJK UNIFIED IDEOGRAPH-5316
2996        U+3051 HIRAGANA LETTER KE.
2997      Each of these is 3 bytes wide when encoded in UTF-8, whereas the
2998      "before" and "after" are 1 byte per unicode character.
2999
3000      The numbering shown are "columns", which are *byte* numbers within
3001      the line, rather than unicode character numbers.
3002
3003      .................... 000000000.1111111.
3004      .................... 123456789.0123456.  */
3005   const char *content = ("        \"before "
3006                          /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3007                               UTF-8: 0xE6 0x96 0x87
3008                               C octal escaped UTF-8: \346\226\207
3009                             "column" numbers: 17-19.  */
3010                          "\346\226\207"
3011
3012                          /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3013                               UTF-8: 0xE5 0xAD 0x97
3014                               C octal escaped UTF-8: \345\255\227
3015                             "column" numbers: 20-22.  */
3016                          "\345\255\227"
3017
3018                          /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3019                               UTF-8: 0xE5 0x8C 0x96
3020                               C octal escaped UTF-8: \345\214\226
3021                             "column" numbers: 23-25.  */
3022                          "\345\214\226"
3023
3024                          /* U+3051 HIRAGANA LETTER KE
3025                               UTF-8: 0xE3 0x81 0x91
3026                               C octal escaped UTF-8: \343\201\221
3027                             "column" numbers: 26-28.  */
3028                          "\343\201\221"
3029
3030                          /* column numbers 29 onwards
3031                           2333333.33334444444444
3032                           9012345.67890123456789. */
3033                          " after\" /* non-str */\n");
3034   lexer_test test (case_, content, NULL);
3035
3036   /* Verify that we get the expected token back, with the correct
3037      location information.  */
3038   const cpp_token *tok = test.get_token ();
3039   ASSERT_EQ (tok->type, CPP_STRING);
3040   ASSERT_TOKEN_AS_TEXT_EQ
3041     (test.m_parser, tok,
3042      "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3043
3044   /* Verify that cpp_interpret_string works.  */
3045   cpp_string dst_string;
3046   const enum cpp_ttype type = CPP_STRING;
3047   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3048                                       &dst_string, type);
3049   ASSERT_TRUE (result);
3050   ASSERT_STREQ
3051     ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3052      (const char *)dst_string.text);
3053   free (const_cast <unsigned char *> (dst_string.text));
3054
3055   /* Verify ranges of individual characters.  This no longer includes the
3056      opening quote, but does include the closing quote.
3057      Assuming that both source and execution encodings are UTF-8, we have
3058      a run of 25 octets in each, plus the NUL terminator.  */
3059   for (int i = 0; i < 25; i++)
3060     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3061   /* NUL-terminator should use the closing quote at column 35.  */
3062   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3063
3064   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3065 }
3066
3067 /* Test of string literal concatenation.  */
3068
3069 static void
3070 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3071 {
3072   /* Digits 0-9.
3073      .....................000000000.111111.11112222222222
3074      .....................123456789.012345.67890123456789.  */
3075   const char *content = ("        \"01234\" /* non-str */\n"
3076                          "        \"56789\" /* non-str */\n");
3077   lexer_test test (case_, content, NULL);
3078
3079   location_t input_locs[2];
3080
3081   /* Verify that we get the expected tokens back.  */
3082   auto_vec <cpp_string> input_strings;
3083   const cpp_token *tok_a = test.get_token ();
3084   ASSERT_EQ (tok_a->type, CPP_STRING);
3085   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3086   input_strings.safe_push (tok_a->val.str);
3087   input_locs[0] = tok_a->src_loc;
3088
3089   const cpp_token *tok_b = test.get_token ();
3090   ASSERT_EQ (tok_b->type, CPP_STRING);
3091   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3092   input_strings.safe_push (tok_b->val.str);
3093   input_locs[1] = tok_b->src_loc;
3094
3095   /* Verify that cpp_interpret_string works.  */
3096   cpp_string dst_string;
3097   const enum cpp_ttype type = CPP_STRING;
3098   bool result = cpp_interpret_string (test.m_parser,
3099                                       input_strings.address (), 2,
3100                                       &dst_string, type);
3101   ASSERT_TRUE (result);
3102   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3103   free (const_cast <unsigned char *> (dst_string.text));
3104
3105   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3106   test.m_concats.record_string_concatenation (2, input_locs);
3107
3108   location_t initial_loc = input_locs[0];
3109
3110   /* "01234" on line 1.  */
3111   for (int i = 0; i <= 4; i++)
3112     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3113   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3114   for (int i = 5; i <= 10; i++)
3115     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3116
3117   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3118 }
3119
3120 /* Another test of string literal concatenation.  */
3121
3122 static void
3123 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3124 {
3125   /* Digits 0-9.
3126      .....................000000000.111.11111112222222
3127      .....................123456789.012.34567890123456.  */
3128   const char *content = ("        \"01\" /* non-str */\n"
3129                          "        \"23\" /* non-str */\n"
3130                          "        \"45\" /* non-str */\n"
3131                          "        \"67\" /* non-str */\n"
3132                          "        \"89\" /* non-str */\n");
3133   lexer_test test (case_, content, NULL);
3134
3135   auto_vec <cpp_string> input_strings;
3136   location_t input_locs[5];
3137
3138   /* Verify that we get the expected tokens back.  */
3139   for (int i = 0; i < 5; i++)
3140     {
3141       const cpp_token *tok = test.get_token ();
3142       ASSERT_EQ (tok->type, CPP_STRING);
3143       input_strings.safe_push (tok->val.str);
3144       input_locs[i] = tok->src_loc;
3145     }
3146
3147   /* Verify that cpp_interpret_string works.  */
3148   cpp_string dst_string;
3149   const enum cpp_ttype type = CPP_STRING;
3150   bool result = cpp_interpret_string (test.m_parser,
3151                                       input_strings.address (), 5,
3152                                       &dst_string, type);
3153   ASSERT_TRUE (result);
3154   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3155   free (const_cast <unsigned char *> (dst_string.text));
3156
3157   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3158   test.m_concats.record_string_concatenation (5, input_locs);
3159
3160   location_t initial_loc = input_locs[0];
3161
3162   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3163      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3164      and expect get_source_range_for_substring to fail.
3165      However, for a string concatenation test, we can have a case
3166      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3167      but subsequent strings can be after it.
3168      Attempting to detect this within assert_char_at_range
3169      would overcomplicate the logic for the common test cases, so
3170      we detect it here.  */
3171   if (should_have_column_data_p (input_locs[0])
3172       && !should_have_column_data_p (input_locs[4]))
3173     {
3174       /* Verify that get_source_range_for_substring gracefully rejects
3175          this case.  */
3176       source_range actual_range;
3177       const char *err
3178         = get_source_range_for_char (test.m_parser, &test.m_concats,
3179                                      initial_loc, type, 0, &actual_range);
3180       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3181       return;
3182     }
3183
3184   for (int i = 0; i < 5; i++)
3185     for (int j = 0; j < 2; j++)
3186       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3187                             i + 1, 10 + j, 10 + j);
3188
3189   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3190   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3191
3192   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3193 }
3194
3195 /* Another test of string literal concatenation, this time combined with
3196    various kinds of escaped characters.  */
3197
3198 static void
3199 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3200 {
3201   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3202      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3203   const char *content
3204     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3205        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3206     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3207   lexer_test test (case_, content, NULL);
3208
3209   auto_vec <cpp_string> input_strings;
3210   location_t input_locs[4];
3211
3212   /* Verify that we get the expected tokens back.  */
3213   for (int i = 0; i < 4; i++)
3214     {
3215       const cpp_token *tok = test.get_token ();
3216       ASSERT_EQ (tok->type, CPP_STRING);
3217       input_strings.safe_push (tok->val.str);
3218       input_locs[i] = tok->src_loc;
3219     }
3220
3221   /* Verify that cpp_interpret_string works.  */
3222   cpp_string dst_string;
3223   const enum cpp_ttype type = CPP_STRING;
3224   bool result = cpp_interpret_string (test.m_parser,
3225                                       input_strings.address (), 4,
3226                                       &dst_string, type);
3227   ASSERT_TRUE (result);
3228   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3229   free (const_cast <unsigned char *> (dst_string.text));
3230
3231   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3232   test.m_concats.record_string_concatenation (4, input_locs);
3233
3234   location_t initial_loc = input_locs[0];
3235
3236   for (int i = 0; i <= 4; i++)
3237     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3238   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3239   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3240   for (int i = 7; i <= 9; i++)
3241     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3242
3243   /* NUL-terminator should use the location of the final closing quote.  */
3244   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3245
3246   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3247 }
3248
3249 /* Test of string literal in a macro.  */
3250
3251 static void
3252 test_lexer_string_locations_macro (const line_table_case &case_)
3253 {
3254   /* Digits 0-9.
3255      .....................0000000001111111111.22222222223.
3256      .....................1234567890123456789.01234567890.  */
3257   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3258                          "  MACRO");
3259   lexer_test test (case_, content, NULL);
3260
3261   /* Verify that we get the expected tokens back.  */
3262   const cpp_token *tok = test.get_token ();
3263   ASSERT_EQ (tok->type, CPP_PADDING);
3264
3265   tok = test.get_token ();
3266   ASSERT_EQ (tok->type, CPP_STRING);
3267   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3268
3269   /* Verify ranges of individual characters.  We ought to
3270      see columns within the macro definition.  */
3271   for (int i = 0; i <= 10; i++)
3272     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3273                           i, 1, 20 + i, 20 + i);
3274
3275   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3276
3277   tok = test.get_token ();
3278   ASSERT_EQ (tok->type, CPP_PADDING);
3279 }
3280
3281 /* Test of stringification of a macro argument.  */
3282
3283 static void
3284 test_lexer_string_locations_stringified_macro_argument
3285   (const line_table_case &case_)
3286 {
3287   /* .....................000000000111111111122222222223.
3288      .....................123456789012345678901234567890.  */
3289   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3290                          "MACRO(foo)\n");
3291   lexer_test test (case_, content, NULL);
3292
3293   /* Verify that we get the expected token back.  */
3294   const cpp_token *tok = test.get_token ();
3295   ASSERT_EQ (tok->type, CPP_PADDING);
3296
3297   tok = test.get_token ();
3298   ASSERT_EQ (tok->type, CPP_STRING);
3299   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3300
3301   /* We don't support getting the location of a stringified macro
3302      argument.  Verify that it fails gracefully.  */
3303   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3304                                   "cpp_interpret_string_1 failed");
3305
3306   tok = test.get_token ();
3307   ASSERT_EQ (tok->type, CPP_PADDING);
3308
3309   tok = test.get_token ();
3310   ASSERT_EQ (tok->type, CPP_PADDING);
3311 }
3312
3313 /* Ensure that we are fail gracefully if something attempts to pass
3314    in a location that isn't a string literal token.  Seen on this code:
3315
3316      const char a[] = " %d ";
3317      __builtin_printf (a, 0.5);
3318                        ^
3319
3320    when c-format.c erroneously used the indicated one-character
3321    location as the format string location, leading to a read past the
3322    end of a string buffer in cpp_interpret_string_1.  */
3323
3324 static void
3325 test_lexer_string_locations_non_string (const line_table_case &case_)
3326 {
3327   /* .....................000000000111111111122222222223.
3328      .....................123456789012345678901234567890.  */
3329   const char *content = ("         a\n");
3330   lexer_test test (case_, content, NULL);
3331
3332   /* Verify that we get the expected token back.  */
3333   const cpp_token *tok = test.get_token ();
3334   ASSERT_EQ (tok->type, CPP_NAME);
3335   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3336
3337   /* At this point, libcpp is attempting to interpret the name as a
3338      string literal, despite it not starting with a quote.  We don't detect
3339      that, but we should at least fail gracefully.  */
3340   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3341                                   "cpp_interpret_string_1 failed");
3342 }
3343
3344 /* Ensure that we can read substring information for a token which
3345    starts in one linemap and ends in another .  Adapted from
3346    gcc.dg/cpp/pr69985.c.  */
3347
3348 static void
3349 test_lexer_string_locations_long_line (const line_table_case &case_)
3350 {
3351   /* .....................000000.000111111111
3352      .....................123456.789012346789.  */
3353   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3354                          "     \"0123456789012345678901234567890123456789"
3355                          "0123456789012345678901234567890123456789"
3356                          "0123456789012345678901234567890123456789"
3357                          "0123456789\"\n");
3358
3359   lexer_test test (case_, content, NULL);
3360
3361   /* Verify that we get the expected token back.  */
3362   const cpp_token *tok = test.get_token ();
3363   ASSERT_EQ (tok->type, CPP_STRING);
3364
3365   if (!should_have_column_data_p (line_table->highest_location))
3366     return;
3367
3368   /* Verify ranges of individual characters.  */
3369   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3370   for (int i = 0; i < 131; i++)
3371     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3372                           i, 2, 7 + i, 7 + i);
3373 }
3374
3375 /* Test of locations within a raw string that doesn't contain a newline.  */
3376
3377 static void
3378 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3379 {
3380   /* .....................00.0000000111111111122.
3381      .....................12.3456789012345678901.  */
3382   const char *content = ("R\"foo(0123456789)foo\"\n");
3383   lexer_test test (case_, content, NULL);
3384
3385   /* Verify that we get the expected token back.  */
3386   const cpp_token *tok = test.get_token ();
3387   ASSERT_EQ (tok->type, CPP_STRING);
3388
3389   /* Verify that cpp_interpret_string works.  */
3390   cpp_string dst_string;
3391   const enum cpp_ttype type = CPP_STRING;
3392   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3393                                       &dst_string, type);
3394   ASSERT_TRUE (result);
3395   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3396   free (const_cast <unsigned char *> (dst_string.text));
3397
3398   if (!should_have_column_data_p (line_table->highest_location))
3399     return;
3400
3401   /* 0-9, plus the nil terminator.  */
3402   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3403   for (int i = 0; i < 11; i++)
3404     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3405                           i, 1, 7 + i, 7 + i);
3406 }
3407
3408 /* Test of locations within a raw string that contains a newline.  */
3409
3410 static void
3411 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3412 {
3413   /* .....................00.0000.
3414      .....................12.3456.  */
3415   const char *content = ("R\"foo(\n"
3416   /* .....................00000.
3417      .....................12345.  */
3418                          "hello\n"
3419                          "world\n"
3420   /* .....................00000.
3421      .....................12345.  */
3422                          ")foo\"\n");
3423   lexer_test test (case_, content, NULL);
3424
3425   /* Verify that we get the expected token back.  */
3426   const cpp_token *tok = test.get_token ();
3427   ASSERT_EQ (tok->type, CPP_STRING);
3428
3429   /* Verify that cpp_interpret_string works.  */
3430   cpp_string dst_string;
3431   const enum cpp_ttype type = CPP_STRING;
3432   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3433                                       &dst_string, type);
3434   ASSERT_TRUE (result);
3435   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3436   free (const_cast <unsigned char *> (dst_string.text));
3437
3438   if (!should_have_column_data_p (line_table->highest_location))
3439     return;
3440
3441   /* Currently we don't support locations within raw strings that
3442      contain newlines.  */
3443   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3444                                   "range endpoints are on different lines");
3445 }
3446
3447 /* Test of parsing an unterminated raw string.  */
3448
3449 static void
3450 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3451 {
3452   const char *content = "R\"ouch()ouCh\" /* etc */";
3453
3454   lexer_diagnostic_sink diagnostics;
3455   lexer_test test (case_, content, &diagnostics);
3456   test.m_implicitly_expect_EOF = false;
3457
3458   /* Attempt to parse the raw string.  */
3459   const cpp_token *tok = test.get_token ();
3460   ASSERT_EQ (tok->type, CPP_EOF);
3461
3462   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3463   /* We expect the message "unterminated raw string"
3464      in the "cpplib" translation domain.
3465      It's not clear that dgettext is available on all supported hosts,
3466      so this assertion is commented-out for now.
3467        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3468                      diagnostics.m_diagnostics[0]);
3469   */
3470 }
3471
3472 /* Test of lexing char constants.  */
3473
3474 static void
3475 test_lexer_char_constants (const line_table_case &case_)
3476 {
3477   /* Various char constants.
3478      .....................0000000001111111111.22222222223.
3479      .....................1234567890123456789.01234567890.  */
3480   const char *content = ("         'a'\n"
3481                          "        u'a'\n"
3482                          "        U'a'\n"
3483                          "        L'a'\n"
3484                          "         'abc'\n");
3485   lexer_test test (case_, content, NULL);
3486
3487   /* Verify that we get the expected tokens back.  */
3488   /* 'a'.  */
3489   const cpp_token *tok = test.get_token ();
3490   ASSERT_EQ (tok->type, CPP_CHAR);
3491   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3492
3493   unsigned int chars_seen;
3494   int unsignedp;
3495   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3496                                           &chars_seen, &unsignedp);
3497   ASSERT_EQ (cc, 'a');
3498   ASSERT_EQ (chars_seen, 1);
3499
3500   /* u'a'.  */
3501   tok = test.get_token ();
3502   ASSERT_EQ (tok->type, CPP_CHAR16);
3503   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3504
3505   /* U'a'.  */
3506   tok = test.get_token ();
3507   ASSERT_EQ (tok->type, CPP_CHAR32);
3508   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3509
3510   /* L'a'.  */
3511   tok = test.get_token ();
3512   ASSERT_EQ (tok->type, CPP_WCHAR);
3513   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3514
3515   /* 'abc' (c-char-sequence).  */
3516   tok = test.get_token ();
3517   ASSERT_EQ (tok->type, CPP_CHAR);
3518   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3519 }
3520 /* A table of interesting location_t values, giving one axis of our test
3521    matrix.  */
3522
3523 static const location_t boundary_locations[] = {
3524   /* Zero means "don't override the default values for a new line_table".  */
3525   0,
3526
3527   /* An arbitrary non-zero value that isn't close to one of
3528      the boundary values below.  */
3529   0x10000,
3530
3531   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3532   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3533   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3534   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3535   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3536   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3537
3538   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3539   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3540   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3541   LINE_MAP_MAX_LOCATION_WITH_COLS,
3542   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3543   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3544 };
3545
3546 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3547
3548 void
3549 for_each_line_table_case (void (*testcase) (const line_table_case &))
3550 {
3551   /* As noted above in the description of struct line_table_case,
3552      we want to explore a test matrix of interesting line_table
3553      situations, running various selftests for each case within the
3554      matrix.  */
3555
3556   /* Run all tests with:
3557      (a) line_table->default_range_bits == 0, and
3558      (b) line_table->default_range_bits == 5.  */
3559   int num_cases_tested = 0;
3560   for (int default_range_bits = 0; default_range_bits <= 5;
3561        default_range_bits += 5)
3562     {
3563       /* ...and use each of the "interesting" location values as
3564          the starting location within line_table.  */
3565       const int num_boundary_locations
3566         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3567       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3568         {
3569           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3570
3571           testcase (c);
3572
3573           num_cases_tested++;
3574         }
3575     }
3576
3577   /* Verify that we fully covered the test matrix.  */
3578   ASSERT_EQ (num_cases_tested, 2 * 12);
3579 }
3580
3581 /* Verify that when presented with a consecutive pair of locations with
3582    a very large line offset, we don't attempt to consolidate them into
3583    a single ordinary linemap where the line offsets within the line map
3584    would lead to overflow (PR lto/88147).  */
3585
3586 static void
3587 test_line_offset_overflow ()
3588 {
3589   line_table_test ltt (line_table_case (5, 0));
3590
3591   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3592   linemap_line_start (line_table, 1, 100);
3593   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3594   assert_loceq ("foo.c", 2578, 0, loc_a);
3595
3596   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3597   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3598   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3599
3600   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3601   assert_loceq ("foo.c", 404198, 0, loc_b);
3602
3603   /* We should have started a new linemap, rather than attempting to store
3604      a very large line offset.  */
3605   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3606   ASSERT_NE (ordmap_a, ordmap_b);
3607 }
3608
3609 void test_cpp_utf8 ()
3610 {
3611   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3612   {
3613     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8);
3614     ASSERT_EQ (8, w_bad);
3615     int w_ctrl = cpp_display_width ("\r\t\n\v\0\1", 6);
3616     ASSERT_EQ (6, w_ctrl);
3617   }
3618
3619   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3620   {
3621     const int w_pi = cpp_display_width ("\xcf\x80", 2);
3622     ASSERT_EQ (1, w_pi);
3623     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4);
3624     ASSERT_EQ (2, w_emoji);
3625     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2);
3626     ASSERT_EQ (1, w_umlaut_precomposed);
3627     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3);
3628     ASSERT_EQ (1, w_umlaut_combining);
3629     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3);
3630     ASSERT_EQ (2, w_han);
3631     const int w_ascii = cpp_display_width ("GCC", 3);
3632     ASSERT_EQ (3, w_ascii);
3633     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3634                                            "\x9f! \xe4\xb8\xba y\xcc\x88", 24);
3635     ASSERT_EQ (18, w_mixed);
3636   }
3637
3638   /* Verify that cpp_byte_column_to_display_column can go past the end,
3639      and similar edge cases.  */
3640   {
3641     const char *str
3642       /* Display columns.
3643          111111112345  */
3644       = "\xcf\x80 abc";
3645       /* 111122223456
3646          Byte columns.  */
3647
3648     ASSERT_EQ (5, cpp_display_width (str, 6));
3649     ASSERT_EQ (105, cpp_byte_column_to_display_column (str, 6, 106));
3650     ASSERT_EQ (10000, cpp_byte_column_to_display_column (NULL, 0, 10000));
3651     ASSERT_EQ (0, cpp_byte_column_to_display_column (NULL, 10000, 0));
3652   }
3653
3654   /* Verify that cpp_display_column_to_byte_column can go past the end,
3655      and similar edge cases, and check invertibility.  */
3656   {
3657     const char *str
3658       /* Display columns.
3659          000000000000000000000000000000000000011
3660          111111112222222234444444455555555678901  */
3661       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3662       /* 000000000000000000000000000000000111111
3663          111122223333444456666777788889999012345
3664          Byte columns.  */
3665     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2));
3666     ASSERT_EQ (15, cpp_display_column_to_byte_column (str, 15, 11));
3667     ASSERT_EQ (115, cpp_display_column_to_byte_column (str, 15, 111));
3668     ASSERT_EQ (10000, cpp_display_column_to_byte_column (NULL, 0, 10000));
3669     ASSERT_EQ (0, cpp_display_column_to_byte_column (NULL, 10000, 0));
3670
3671     /* Verify that we do not interrupt a UTF-8 sequence.  */
3672     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1));
3673
3674     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3675       {
3676         const int disp_col = cpp_byte_column_to_display_column (str, 15,
3677                                                                 byte_col);
3678         const int byte_col2 = cpp_display_column_to_byte_column (str, 15,
3679                                                                  disp_col);
3680
3681         /* If we ask for the display column in the middle of a UTF-8
3682            sequence, it will return the length of the partial sequence,
3683            matching the behavior of GCC before display column support.
3684            Otherwise check the round trip was successful.  */
3685         if (byte_col < 4)
3686           ASSERT_EQ (byte_col, disp_col);
3687         else if (byte_col >= 6 && byte_col < 9)
3688           ASSERT_EQ (3 + (byte_col - 5), disp_col);
3689         else
3690           ASSERT_EQ (byte_col2, byte_col);
3691       }
3692   }
3693
3694 }
3695
3696 /* Run all of the selftests within this file.  */
3697
3698 void
3699 input_c_tests ()
3700 {
3701   test_linenum_comparisons ();
3702   test_should_have_column_data_p ();
3703   test_unknown_location ();
3704   test_builtins ();
3705   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3706
3707   for_each_line_table_case (test_accessing_ordinary_linemaps);
3708   for_each_line_table_case (test_lexer);
3709   for_each_line_table_case (test_lexer_string_locations_simple);
3710   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3711   for_each_line_table_case (test_lexer_string_locations_hex);
3712   for_each_line_table_case (test_lexer_string_locations_oct);
3713   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3714   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3715   for_each_line_table_case (test_lexer_string_locations_ucn4);
3716   for_each_line_table_case (test_lexer_string_locations_ucn8);
3717   for_each_line_table_case (test_lexer_string_locations_wide_string);
3718   for_each_line_table_case (test_lexer_string_locations_string16);
3719   for_each_line_table_case (test_lexer_string_locations_string32);
3720   for_each_line_table_case (test_lexer_string_locations_u8);
3721   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3722   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3723   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3724   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3725   for_each_line_table_case (test_lexer_string_locations_macro);
3726   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3727   for_each_line_table_case (test_lexer_string_locations_non_string);
3728   for_each_line_table_case (test_lexer_string_locations_long_line);
3729   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3730   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3731   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3732   for_each_line_table_case (test_lexer_char_constants);
3733
3734   test_reading_source_line ();
3735
3736   test_line_offset_overflow ();
3737
3738   test_cpp_utf8 ();
3739 }
3740
3741 } // namespace selftest
3742
3743 #endif /* CHECKING_P */