gcc/input.c

   1 /* Data and functions related to line maps and input files.
   2    Copyright (C) 2004-2021 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "intl.h"
  24 #include "diagnostic.h"
  25 #include "selftest.h"
  26 #include "cpplib.h"
  27
  28 #ifndef HAVE_ICONV
  29 #define HAVE_ICONV 0
  30 #endif
  31
  32 /* Input charset configuration.  */
  33 static const char *default_charset_callback (const char *)
  34 {
  35   return nullptr;
  36 }
  37
  38 void
  39 file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
  40                                       bool should_skip_bom)
  41 {
  42   in_context.ccb = (ccb ? ccb : default_charset_callback);
  43   in_context.should_skip_bom = should_skip_bom;
  44 }
  45
  46 /* This is a cache used by get_next_line to store the content of a
  47    file to be searched for file lines.  */
  48 class file_cache_slot
  49 {
  50 public:
  51   file_cache_slot ();
  52   ~file_cache_slot ();
  53
  54   bool read_line_num (size_t line_num,
  55                       char ** line, ssize_t *line_len);
  56
  57   /* Accessors.  */
  58   const char *get_file_path () const { return m_file_path; }
  59   unsigned get_use_count () const { return m_use_count; }
  60   bool missing_trailing_newline_p () const
  61   {
  62     return m_missing_trailing_newline;
  63   }
  64
  65   void inc_use_count () { m_use_count++; }
  66
  67   bool create (const file_cache::input_context &in_context,
  68                const char *file_path, FILE *fp, unsigned highest_use_count);
  69   void evict ();
  70
  71  private:
  72   /* These are information used to store a line boundary.  */
  73   class line_info
  74   {
  75   public:
  76     /* The line number.  It starts from 1.  */
  77     size_t line_num;
  78
  79     /* The position (byte count) of the beginning of the line,
  80        relative to the file data pointer.  This starts at zero.  */
  81     size_t start_pos;
  82
  83     /* The position (byte count) of the last byte of the line.  This
  84        normally points to the '\n' character, or to one byte after the
  85        last byte of the file, if the file doesn't contain a '\n'
  86        character.  */
  87     size_t end_pos;
  88
  89     line_info (size_t l, size_t s, size_t e)
  90       : line_num (l), start_pos (s), end_pos (e)
  91     {}
  92
  93     line_info ()
  94       :line_num (0), start_pos (0), end_pos (0)
  95     {}
  96   };
  97
  98   bool needs_read_p () const;
  99   bool needs_grow_p () const;
 100   void maybe_grow ();
 101   bool read_data ();
 102   bool maybe_read_data ();
 103   bool get_next_line (char **line, ssize_t *line_len);
 104   bool read_next_line (char ** line, ssize_t *line_len);
 105   bool goto_next_line ();
 106
 107   static const size_t buffer_size = 4 * 1024;
 108   static const size_t line_record_size = 100;
 109
 110   /* The number of time this file has been accessed.  This is used
 111      to designate which file cache to evict from the cache
 112      array.  */
 113   unsigned m_use_count;
 114
 115   /* The file_path is the key for identifying a particular file in
 116      the cache.
 117      For libcpp-using code, the underlying buffer for this field is
 118      owned by the corresponding _cpp_file within the cpp_reader.  */
 119   const char *m_file_path;
 120
 121   FILE *m_fp;
 122
 123   /* This points to the content of the file that we've read so
 124      far.  */
 125   char *m_data;
 126
 127   /* The allocated buffer to be freed may start a little earlier than DATA,
 128      e.g. if a UTF8 BOM was skipped at the beginning.  */
 129   int m_alloc_offset;
 130
 131   /*  The size of the DATA array above.*/
 132   size_t m_size;
 133
 134   /* The number of bytes read from the underlying file so far.  This
 135      must be less (or equal) than SIZE above.  */
 136   size_t m_nb_read;
 137
 138   /* The index of the beginning of the current line.  */
 139   size_t m_line_start_idx;
 140
 141   /* The number of the previous line read.  This starts at 1.  Zero
 142      means we've read no line so far.  */
 143   size_t m_line_num;
 144
 145   /* This is the total number of lines of the current file.  At the
 146      moment, we try to get this information from the line map
 147      subsystem.  Note that this is just a hint.  When using the C++
 148      front-end, this hint is correct because the input file is then
 149      completely tokenized before parsing starts; so the line map knows
 150      the number of lines before compilation really starts.  For e.g,
 151      the C front-end, it can happen that we start emitting diagnostics
 152      before the line map has seen the end of the file.  */
 153   size_t m_total_lines;
 154
 155   /* Could this file be missing a trailing newline on its final line?
 156      Initially true (to cope with empty files), set to true/false
 157      as each line is read.  */
 158   bool m_missing_trailing_newline;
 159
 160   /* This is a record of the beginning and end of the lines we've seen
 161      while reading the file.  This is useful to avoid walking the data
 162      from the beginning when we are asked to read a line that is
 163      before LINE_START_IDX above.  Note that the maximum size of this
 164      record is line_record_size, so that the memory consumption
 165      doesn't explode.  We thus scale total_lines down to
 166      line_record_size.  */
 167   vec<line_info, va_heap> m_line_record;
 168
 169   void offset_buffer (int offset)
 170   {
 171     gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
 172                 : (size_t) offset <= m_size);
 173     gcc_assert (m_data);
 174     m_alloc_offset += offset;
 175     m_data += offset;
 176     m_size -= offset;
 177   }
 178
 179 };
 180
 181 /* Current position in real source file.  */
 182
 183 location_t input_location = UNKNOWN_LOCATION;
 184
 185 class line_maps *line_table;
 186
 187 /* A stashed copy of "line_table" for use by selftest::line_table_test.
 188    This needs to be a global so that it can be a GC root, and thus
 189    prevent the stashed copy from being garbage-collected if the GC runs
 190    during a line_table_test.  */
 191
 192 class line_maps *saved_line_table;
 193
 194 /* Expand the source location LOC into a human readable location.  If
 195    LOC resolves to a builtin location, the file name of the readable
 196    location is set to the string "<built-in>". If EXPANSION_POINT_P is
 197    TRUE and LOC is virtual, then it is resolved to the expansion
 198    point of the involved macro.  Otherwise, it is resolved to the
 199    spelling location of the token.
 200
 201    When resolving to the spelling location of the token, if the
 202    resulting location is for a built-in location (that is, it has no
 203    associated line/column) in the context of a macro expansion, the
 204    returned location is the first one (while unwinding the macro
 205    location towards its expansion point) that is in real source
 206    code.
 207
 208    ASPECT controls which part of the location to use.  */
 209
 210 static expanded_location
 211 expand_location_1 (location_t loc,
 212                    bool expansion_point_p,
 213                    enum location_aspect aspect)
 214 {
 215   expanded_location xloc;
 216   const line_map_ordinary *map;
 217   enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
 218   tree block = NULL;
 219
 220   if (IS_ADHOC_LOC (loc))
 221     {
 222       block = LOCATION_BLOCK (loc);
 223       loc = LOCATION_LOCUS (loc);
 224     }
 225
 226   memset (&xloc, 0, sizeof (xloc));
 227
 228   if (loc >= RESERVED_LOCATION_COUNT)
 229     {
 230       if (!expansion_point_p)
 231         {
 232           /* We want to resolve LOC to its spelling location.
 233
 234              But if that spelling location is a reserved location that
 235              appears in the context of a macro expansion (like for a
 236              location for a built-in token), let's consider the first
 237              location (toward the expansion point) that is not reserved;
 238              that is, the first location that is in real source code.  */
 239           loc = linemap_unwind_to_first_non_reserved_loc (line_table,
 240                                                           loc, NULL);
 241           lrk = LRK_SPELLING_LOCATION;
 242         }
 243       loc = linemap_resolve_location (line_table, loc, lrk, &map);
 244
 245       /* loc is now either in an ordinary map, or is a reserved location.
 246          If it is a compound location, the caret is in a spelling location,
 247          but the start/finish might still be a virtual location.
 248          Depending of what the caller asked for, we may need to recurse
 249          one level in order to resolve any virtual locations in the
 250          end-points.  */
 251       switch (aspect)
 252         {
 253         default:
 254           gcc_unreachable ();
 255           /* Fall through.  */
 256         case LOCATION_ASPECT_CARET:
 257           break;
 258         case LOCATION_ASPECT_START:
 259           {
 260             location_t start = get_start (loc);
 261             if (start != loc)
 262               return expand_location_1 (start, expansion_point_p, aspect);
 263           }
 264           break;
 265         case LOCATION_ASPECT_FINISH:
 266           {
 267             location_t finish = get_finish (loc);
 268             if (finish != loc)
 269               return expand_location_1 (finish, expansion_point_p, aspect);
 270           }
 271           break;
 272         }
 273       xloc = linemap_expand_location (line_table, map, loc);
 274     }
 275
 276   xloc.data = block;
 277   if (loc <= BUILTINS_LOCATION)
 278     xloc.file = loc == UNKNOWN_LOCATION ? NULL : _("<built-in>");
 279
 280   return xloc;
 281 }
 282
 283 /* Initialize the set of cache used for files accessed by caret
 284    diagnostic.  */
 285
 286 static void
 287 diagnostic_file_cache_init (void)
 288 {
 289   gcc_assert (global_dc);
 290   if (global_dc->m_file_cache == NULL)
 291     global_dc->m_file_cache = new file_cache ();
 292 }
 293
 294 /* Free the resources used by the set of cache used for files accessed
 295    by caret diagnostic.  */
 296
 297 void
 298 diagnostic_file_cache_fini (void)
 299 {
 300   if (global_dc->m_file_cache)
 301     {
 302       delete global_dc->m_file_cache;
 303       global_dc->m_file_cache = NULL;
 304     }
 305 }
 306
 307 /* Return the total lines number that have been read so far by the
 308    line map (in the preprocessor) so far.  For languages like C++ that
 309    entirely preprocess the input file before starting to parse, this
 310    equals the actual number of lines of the file.  */
 311
 312 static size_t
 313 total_lines_num (const char *file_path)
 314 {
 315   size_t r = 0;
 316   location_t l = 0;
 317   if (linemap_get_file_highest_location (line_table, file_path, &l))
 318     {
 319       gcc_assert (l >= RESERVED_LOCATION_COUNT);
 320       expanded_location xloc = expand_location (l);
 321       r = xloc.line;
 322     }
 323   return r;
 324 }
 325
 326 /* Lookup the cache used for the content of a given file accessed by
 327    caret diagnostic.  Return the found cached file, or NULL if no
 328    cached file was found.  */
 329
 330 file_cache_slot *
 331 file_cache::lookup_file (const char *file_path)
 332 {
 333   gcc_assert (file_path);
 334
 335   /* This will contain the found cached file.  */
 336   file_cache_slot *r = NULL;
 337   for (unsigned i = 0; i < num_file_slots; ++i)
 338     {
 339       file_cache_slot *c = &m_file_slots[i];
 340       if (c->get_file_path () && !strcmp (c->get_file_path (), file_path))
 341         {
 342           c->inc_use_count ();
 343           r = c;
 344         }
 345     }
 346
 347   if (r)
 348     r->inc_use_count ();
 349
 350   return r;
 351 }
 352
 353 /* Purge any mention of FILENAME from the cache of files used for
 354    printing source code.  For use in selftests when working
 355    with tempfiles.  */
 356
 357 void
 358 diagnostics_file_cache_forcibly_evict_file (const char *file_path)
 359 {
 360   gcc_assert (file_path);
 361
 362   if (!global_dc->m_file_cache)
 363     return;
 364
 365   global_dc->m_file_cache->forcibly_evict_file (file_path);
 366 }
 367
 368 void
 369 file_cache::forcibly_evict_file (const char *file_path)
 370 {
 371   gcc_assert (file_path);
 372
 373   file_cache_slot *r = lookup_file (file_path);
 374   if (!r)
 375     /* Not found.  */
 376     return;
 377
 378   r->evict ();
 379 }
 380
 381 void
 382 file_cache_slot::evict ()
 383 {
 384   m_file_path = NULL;
 385   if (m_fp)
 386     fclose (m_fp);
 387   m_fp = NULL;
 388   m_nb_read = 0;
 389   m_line_start_idx = 0;
 390   m_line_num = 0;
 391   m_line_record.truncate (0);
 392   m_use_count = 0;
 393   m_total_lines = 0;
 394   m_missing_trailing_newline = true;
 395 }
 396
 397 /* Return the file cache that has been less used, recently, or the
 398    first empty one.  If HIGHEST_USE_COUNT is non-null,
 399    *HIGHEST_USE_COUNT is set to the highest use count of the entries
 400    in the cache table.  */
 401
 402 file_cache_slot*
 403 file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
 404 {
 405   diagnostic_file_cache_init ();
 406
 407   file_cache_slot *to_evict = &m_file_slots[0];
 408   unsigned huc = to_evict->get_use_count ();
 409   for (unsigned i = 1; i < num_file_slots; ++i)
 410     {
 411       file_cache_slot *c = &m_file_slots[i];
 412       bool c_is_empty = (c->get_file_path () == NULL);
 413
 414       if (c->get_use_count () < to_evict->get_use_count ()
 415           || (to_evict->get_file_path () && c_is_empty))
 416         /* We evict C because it's either an entry with a lower use
 417            count or one that is empty.  */
 418         to_evict = c;
 419
 420       if (huc < c->get_use_count ())
 421         huc = c->get_use_count ();
 422
 423       if (c_is_empty)
 424         /* We've reached the end of the cache; subsequent elements are
 425            all empty.  */
 426         break;
 427     }
 428
 429   if (highest_use_count)
 430     *highest_use_count = huc;
 431
 432   return to_evict;
 433 }
 434
 435 /* Create the cache used for the content of a given file to be
 436    accessed by caret diagnostic.  This cache is added to an array of
 437    cache and can be retrieved by lookup_file_in_cache_tab.  This
 438    function returns the created cache.  Note that only the last
 439    num_file_slots files are cached.  */
 440
 441 file_cache_slot*
 442 file_cache::add_file (const char *file_path)
 443 {
 444
 445   FILE *fp = fopen (file_path, "r");
 446   if (fp == NULL)
 447     return NULL;
 448
 449   unsigned highest_use_count = 0;
 450   file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
 451   if (!r->create (in_context, file_path, fp, highest_use_count))
 452     return NULL;
 453   return r;
 454 }
 455
 456 /* Populate this slot for use on FILE_PATH and FP, dropping any
 457    existing cached content within it.  */
 458
 459 bool
 460 file_cache_slot::create (const file_cache::input_context &in_context,
 461                          const char *file_path, FILE *fp,
 462                          unsigned highest_use_count)
 463 {
 464   m_file_path = file_path;
 465   if (m_fp)
 466     fclose (m_fp);
 467   m_fp = fp;
 468   if (m_alloc_offset)
 469     offset_buffer (-m_alloc_offset);
 470   m_nb_read = 0;
 471   m_line_start_idx = 0;
 472   m_line_num = 0;
 473   m_line_record.truncate (0);
 474   /* Ensure that this cache entry doesn't get evicted next time
 475      add_file_to_cache_tab is called.  */
 476   m_use_count = ++highest_use_count;
 477   m_total_lines = total_lines_num (file_path);
 478   m_missing_trailing_newline = true;
 479
 480
 481   /* Check the input configuration to determine if we need to do any
 482      transformations, such as charset conversion or BOM skipping.  */
 483   if (const char *input_charset = in_context.ccb (file_path))
 484     {
 485       /* Need a full-blown conversion of the input charset.  */
 486       fclose (m_fp);
 487       m_fp = NULL;
 488       const cpp_converted_source cs
 489         = cpp_get_converted_source (file_path, input_charset);
 490       if (!cs.data)
 491         return false;
 492       if (m_data)
 493         XDELETEVEC (m_data);
 494       m_data = cs.data;
 495       m_nb_read = m_size = cs.len;
 496       m_alloc_offset = cs.data - cs.to_free;
 497     }
 498   else if (in_context.should_skip_bom)
 499     {
 500       if (read_data ())
 501         {
 502           const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
 503           offset_buffer (offset);
 504           m_nb_read -= offset;
 505         }
 506     }
 507
 508   return true;
 509 }
 510
 511 /* file_cache's ctor.  */
 512
 513 file_cache::file_cache ()
 514 : m_file_slots (new file_cache_slot[num_file_slots])
 515 {
 516   initialize_input_context (nullptr, false);
 517 }
 518
 519 /* file_cache's dtor.  */
 520
 521 file_cache::~file_cache ()
 522 {
 523   delete[] m_file_slots;
 524 }
 525
 526 /* Lookup the cache used for the content of a given file accessed by
 527    caret diagnostic.  If no cached file was found, create a new cache
 528    for this file, add it to the array of cached file and return
 529    it.  */
 530
 531 file_cache_slot*
 532 file_cache::lookup_or_add_file (const char *file_path)
 533 {
 534   file_cache_slot *r = lookup_file (file_path);
 535   if (r == NULL)
 536     r = add_file (file_path);
 537   return r;
 538 }
 539
 540 /* Default constructor for a cache of file used by caret
 541    diagnostic.  */
 542
 543 file_cache_slot::file_cache_slot ()
 544 : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
 545   m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
 546   m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
 547 {
 548   m_line_record.create (0);
 549 }
 550
 551 /* Destructor for a cache of file used by caret diagnostic.  */
 552
 553 file_cache_slot::~file_cache_slot ()
 554 {
 555   if (m_fp)
 556     {
 557       fclose (m_fp);
 558       m_fp = NULL;
 559     }
 560   if (m_data)
 561     {
 562       offset_buffer (-m_alloc_offset);
 563       XDELETEVEC (m_data);
 564       m_data = 0;
 565     }
 566   m_line_record.release ();
 567 }
 568
 569 /* Returns TRUE iff the cache would need to be filled with data coming
 570    from the file.  That is, either the cache is empty or full or the
 571    current line is empty.  Note that if the cache is full, it would
 572    need to be extended and filled again.  */
 573
 574 bool
 575 file_cache_slot::needs_read_p () const
 576 {
 577   return m_fp && (m_nb_read == 0
 578           || m_nb_read == m_size
 579           || (m_line_start_idx >= m_nb_read - 1));
 580 }
 581
 582 /*  Return TRUE iff the cache is full and thus needs to be
 583     extended.  */
 584
 585 bool
 586 file_cache_slot::needs_grow_p () const
 587 {
 588   return m_nb_read == m_size;
 589 }
 590
 591 /* Grow the cache if it needs to be extended.  */
 592
 593 void
 594 file_cache_slot::maybe_grow ()
 595 {
 596   if (!needs_grow_p ())
 597     return;
 598
 599   if (!m_data)
 600     {
 601       gcc_assert (m_size == 0 && m_alloc_offset == 0);
 602       m_size = buffer_size;
 603       m_data = XNEWVEC (char, m_size);
 604     }
 605   else
 606     {
 607       const int offset = m_alloc_offset;
 608       offset_buffer (-offset);
 609       m_size *= 2;
 610       m_data = XRESIZEVEC (char, m_data, m_size);
 611       offset_buffer (offset);
 612     }
 613 }
 614
 615 /*  Read more data into the cache.  Extends the cache if need be.
 616     Returns TRUE iff new data could be read.  */
 617
 618 bool
 619 file_cache_slot::read_data ()
 620 {
 621   if (feof (m_fp) || ferror (m_fp))
 622     return false;
 623
 624   maybe_grow ();
 625
 626   char * from = m_data + m_nb_read;
 627   size_t to_read = m_size - m_nb_read;
 628   size_t nb_read = fread (from, 1, to_read, m_fp);
 629
 630   if (ferror (m_fp))
 631     return false;
 632
 633   m_nb_read += nb_read;
 634   return !!nb_read;
 635 }
 636
 637 /* Read new data iff the cache needs to be filled with more data
 638    coming from the file FP.  Return TRUE iff the cache was filled with
 639    mode data.  */
 640
 641 bool
 642 file_cache_slot::maybe_read_data ()
 643 {
 644   if (!needs_read_p ())
 645     return false;
 646   return read_data ();
 647 }
 648
 649 /* Read a new line from file FP, using C as a cache for the data
 650    coming from the file.  Upon successful completion, *LINE is set to
 651    the beginning of the line found.  *LINE points directly in the
 652    line cache and is only valid until the next call of get_next_line.
 653    *LINE_LEN is set to the length of the line.  Note that the line
 654    does not contain any terminal delimiter.  This function returns
 655    true if some data was read or process from the cache, false
 656    otherwise.  Note that subsequent calls to get_next_line might
 657    make the content of *LINE invalid.  */
 658
 659 bool
 660 file_cache_slot::get_next_line (char **line, ssize_t *line_len)
 661 {
 662   /* Fill the cache with data to process.  */
 663   maybe_read_data ();
 664
 665   size_t remaining_size = m_nb_read - m_line_start_idx;
 666   if (remaining_size == 0)
 667     /* There is no more data to process.  */
 668     return false;
 669
 670   char *line_start = m_data + m_line_start_idx;
 671
 672   char *next_line_start = NULL;
 673   size_t len = 0;
 674   char *line_end = (char *) memchr (line_start, '\n', remaining_size);
 675   if (line_end == NULL)
 676     {
 677       /* We haven't found the end-of-line delimiter in the cache.
 678          Fill the cache with more data from the file and look for the
 679          '\n'.  */
 680       while (maybe_read_data ())
 681         {
 682           line_start = m_data + m_line_start_idx;
 683           remaining_size = m_nb_read - m_line_start_idx;
 684           line_end = (char *) memchr (line_start, '\n', remaining_size);
 685           if (line_end != NULL)
 686             {
 687               next_line_start = line_end + 1;
 688               break;
 689             }
 690         }
 691       if (line_end == NULL)
 692         {
 693           /* We've loadded all the file into the cache and still no
 694              '\n'.  Let's say the line ends up at one byte passed the
 695              end of the file.  This is to stay consistent with the case
 696              of when the line ends up with a '\n' and line_end points to
 697              that terminal '\n'.  That consistency is useful below in
 698              the len calculation.  */
 699           line_end = m_data + m_nb_read ;
 700           m_missing_trailing_newline = true;
 701         }
 702       else
 703         m_missing_trailing_newline = false;
 704     }
 705   else
 706     {
 707       next_line_start = line_end + 1;
 708       m_missing_trailing_newline = false;
 709     }
 710
 711   if (m_fp && ferror (m_fp))
 712     return false;
 713
 714   /* At this point, we've found the end of the of line.  It either
 715      points to the '\n' or to one byte after the last byte of the
 716      file.  */
 717   gcc_assert (line_end != NULL);
 718
 719   len = line_end - line_start;
 720
 721   if (m_line_start_idx < m_nb_read)
 722     *line = line_start;
 723
 724   ++m_line_num;
 725
 726   /* Before we update our line record, make sure the hint about the
 727      total number of lines of the file is correct.  If it's not, then
 728      we give up recording line boundaries from now on.  */
 729   bool update_line_record = true;
 730   if (m_line_num > m_total_lines)
 731     update_line_record = false;
 732
 733     /* Now update our line record so that re-reading lines from the
 734      before m_line_start_idx is faster.  */
 735   if (update_line_record
 736       && m_line_record.length () < line_record_size)
 737     {
 738       /* If the file lines fits in the line record, we just record all
 739          its lines ...*/
 740       if (m_total_lines <= line_record_size
 741           && m_line_num > m_line_record.length ())
 742         m_line_record.safe_push
 743           (file_cache_slot::line_info (m_line_num,
 744                                        m_line_start_idx,
 745                                        line_end - m_data));
 746       else if (m_total_lines > line_record_size)
 747         {
 748           /* ... otherwise, we just scale total_lines down to
 749              (line_record_size lines.  */
 750           size_t n = (m_line_num * line_record_size) / m_total_lines;
 751           if (m_line_record.length () == 0
 752               || n >= m_line_record.length ())
 753             m_line_record.safe_push
 754               (file_cache_slot::line_info (m_line_num,
 755                                            m_line_start_idx,
 756                                            line_end - m_data));
 757         }
 758     }
 759
 760   /* Update m_line_start_idx so that it points to the next line to be
 761      read.  */
 762   if (next_line_start)
 763     m_line_start_idx = next_line_start - m_data;
 764   else
 765     /* We didn't find any terminal '\n'.  Let's consider that the end
 766        of line is the end of the data in the cache.  The next
 767        invocation of get_next_line will either read more data from the
 768        underlying file or return false early because we've reached the
 769        end of the file.  */
 770     m_line_start_idx = m_nb_read;
 771
 772   *line_len = len;
 773
 774   return true;
 775 }
 776
 777 /* Consume the next bytes coming from the cache (or from its
 778    underlying file if there are remaining unread bytes in the file)
 779    until we reach the next end-of-line (or end-of-file).  There is no
 780    copying from the cache involved.  Return TRUE upon successful
 781    completion.  */
 782
 783 bool
 784 file_cache_slot::goto_next_line ()
 785 {
 786   char *l;
 787   ssize_t len;
 788
 789   return get_next_line (&l, &len);
 790 }
 791
 792 /* Read an arbitrary line number LINE_NUM from the file cached in C.
 793    If the line was read successfully, *LINE points to the beginning
 794    of the line in the file cache and *LINE_LEN is the length of the
 795    line.  *LINE is not nul-terminated, but may contain zero bytes.
 796    *LINE is only valid until the next call of read_line_num.
 797    This function returns bool if a line was read.  */
 798
 799 bool
 800 file_cache_slot::read_line_num (size_t line_num,
 801                        char ** line, ssize_t *line_len)
 802 {
 803   gcc_assert (line_num > 0);
 804
 805   if (line_num <= m_line_num)
 806     {
 807       /* We've been asked to read lines that are before m_line_num.
 808          So lets use our line record (if it's not empty) to try to
 809          avoid re-reading the file from the beginning again.  */
 810
 811       if (m_line_record.is_empty ())
 812         {
 813           m_line_start_idx = 0;
 814           m_line_num = 0;
 815         }
 816       else
 817         {
 818           file_cache_slot::line_info *i = NULL;
 819           if (m_total_lines <= line_record_size)
 820             {
 821               /* In languages where the input file is not totally
 822                  preprocessed up front, the m_total_lines hint
 823                  can be smaller than the number of lines of the
 824                  file.  In that case, only the first
 825                  m_total_lines have been recorded.
 826
 827                  Otherwise, the first m_total_lines we've read have
 828                  their start/end recorded here.  */
 829               i = (line_num <= m_total_lines)
 830                 ? &m_line_record[line_num - 1]
 831                 : &m_line_record[m_total_lines - 1];
 832               gcc_assert (i->line_num <= line_num);
 833             }
 834           else
 835             {
 836               /*  So the file had more lines than our line record
 837                   size.  Thus the number of lines we've recorded has
 838                   been scaled down to line_record_size.  Let's
 839                   pick the start/end of the recorded line that is
 840                   closest to line_num.  */
 841               size_t n = (line_num <= m_total_lines)
 842                 ? line_num * line_record_size / m_total_lines
 843                 : m_line_record.length () - 1;
 844               if (n < m_line_record.length ())
 845                 {
 846                   i = &m_line_record[n];
 847                   gcc_assert (i->line_num <= line_num);
 848                 }
 849             }
 850
 851           if (i && i->line_num == line_num)
 852             {
 853               /* We have the start/end of the line.  */
 854               *line = m_data + i->start_pos;
 855               *line_len = i->end_pos - i->start_pos;
 856               return true;
 857             }
 858
 859           if (i)
 860             {
 861               m_line_start_idx = i->start_pos;
 862               m_line_num = i->line_num - 1;
 863             }
 864           else
 865             {
 866               m_line_start_idx = 0;
 867               m_line_num = 0;
 868             }
 869         }
 870     }
 871
 872   /*  Let's walk from line m_line_num up to line_num - 1, without
 873       copying any line.  */
 874   while (m_line_num < line_num - 1)
 875     if (!goto_next_line ())
 876       return false;
 877
 878   /* The line we want is the next one.  Let's read and copy it back to
 879      the caller.  */
 880   return get_next_line (line, line_len);
 881 }
 882
 883 /* Return the physical source line that corresponds to FILE_PATH/LINE.
 884    The line is not nul-terminated.  The returned pointer is only
 885    valid until the next call of location_get_source_line.
 886    Note that the line can contain several null characters,
 887    so the returned value's length has the actual length of the line.
 888    If the function fails, a NULL char_span is returned.  */
 889
 890 char_span
 891 location_get_source_line (const char *file_path, int line)
 892 {
 893   char *buffer = NULL;
 894   ssize_t len;
 895
 896   if (line == 0)
 897     return char_span (NULL, 0);
 898
 899   if (file_path == NULL)
 900     return char_span (NULL, 0);
 901
 902   diagnostic_file_cache_init ();
 903
 904   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 905   if (c == NULL)
 906     return char_span (NULL, 0);
 907
 908   bool read = c->read_line_num (line, &buffer, &len);
 909   if (!read)
 910     return char_span (NULL, 0);
 911
 912   return char_span (buffer, len);
 913 }
 914
 915 /* Determine if FILE_PATH missing a trailing newline on its final line.
 916    Only valid to call once all of the file has been loaded, by
 917    requesting a line number beyond the end of the file.  */
 918
 919 bool
 920 location_missing_trailing_newline (const char *file_path)
 921 {
 922   diagnostic_file_cache_init ();
 923
 924   file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
 925   if (c == NULL)
 926     return false;
 927
 928   return c->missing_trailing_newline_p ();
 929 }
 930
 931 /* Test if the location originates from the spelling location of a
 932    builtin-tokens.  That is, return TRUE if LOC is a (possibly
 933    virtual) location of a built-in token that appears in the expansion
 934    list of a macro.  Please note that this function also works on
 935    tokens that result from built-in tokens.  For instance, the
 936    function would return true if passed a token "4" that is the result
 937    of the expansion of the built-in __LINE__ macro.  */
 938 bool
 939 is_location_from_builtin_token (location_t loc)
 940 {
 941   const line_map_ordinary *map = NULL;
 942   loc = linemap_resolve_location (line_table, loc,
 943                                   LRK_SPELLING_LOCATION, &map);
 944   return loc == BUILTINS_LOCATION;
 945 }
 946
 947 /* Expand the source location LOC into a human readable location.  If
 948    LOC is virtual, it resolves to the expansion point of the involved
 949    macro.  If LOC resolves to a builtin location, the file name of the
 950    readable location is set to the string "<built-in>".  */
 951
 952 expanded_location
 953 expand_location (location_t loc)
 954 {
 955   return expand_location_1 (loc, /*expansion_point_p=*/true,
 956                             LOCATION_ASPECT_CARET);
 957 }
 958
 959 /* Expand the source location LOC into a human readable location.  If
 960    LOC is virtual, it resolves to the expansion location of the
 961    relevant macro.  If LOC resolves to a builtin location, the file
 962    name of the readable location is set to the string
 963    "<built-in>".  */
 964
 965 expanded_location
 966 expand_location_to_spelling_point (location_t loc,
 967                                    enum location_aspect aspect)
 968 {
 969   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
 970 }
 971
 972 /* The rich_location class within libcpp requires a way to expand
 973    location_t instances, and relies on the client code
 974    providing a symbol named
 975      linemap_client_expand_location_to_spelling_point
 976    to do this.
 977
 978    This is the implementation for libcommon.a (all host binaries),
 979    which simply calls into expand_location_1.  */
 980
 981 expanded_location
 982 linemap_client_expand_location_to_spelling_point (location_t loc,
 983                                                   enum location_aspect aspect)
 984 {
 985   return expand_location_1 (loc, /*expansion_point_p=*/false, aspect);
 986 }
 987
 988
 989 /* If LOCATION is in a system header and if it is a virtual location for
 990    a token coming from the expansion of a macro, unwind it to the
 991    location of the expansion point of the macro.  Otherwise, just return
 992    LOCATION.
 993
 994    This is used for instance when we want to emit diagnostics about a
 995    token that may be located in a macro that is itself defined in a
 996    system header, for example, for the NULL macro.  In such a case, if
 997    LOCATION were passed directly to diagnostic functions such as
 998    warning_at, the diagnostic would be suppressed (unless
 999    -Wsystem-headers).  */
1000
1001 location_t
1002 expansion_point_location_if_in_system_header (location_t location)
1003 {
1004   if (in_system_header_at (location))
1005     location = linemap_resolve_location (line_table, location,
1006                                          LRK_MACRO_EXPANSION_POINT,
1007                                          NULL);
1008   return location;
1009 }
1010
1011 /* If LOCATION is a virtual location for a token coming from the expansion
1012    of a macro, unwind to the location of the expansion point of the macro.  */
1013
1014 location_t
1015 expansion_point_location (location_t location)
1016 {
1017   return linemap_resolve_location (line_table, location,
1018                                    LRK_MACRO_EXPANSION_POINT, NULL);
1019 }
1020
1021 /* Construct a location with caret at CARET, ranging from START to
1022    finish e.g.
1023
1024                  11111111112
1025         12345678901234567890
1026      522
1027      523   return foo + bar;
1028                   ~~~~^~~~~
1029      524
1030
1031    The location's caret is at the "+", line 523 column 15, but starts
1032    earlier, at the "f" of "foo" at column 11.  The finish is at the "r"
1033    of "bar" at column 19.  */
1034
1035 location_t
1036 make_location (location_t caret, location_t start, location_t finish)
1037 {
1038   location_t pure_loc = get_pure_location (caret);
1039   source_range src_range;
1040   src_range.m_start = get_start (start);
1041   src_range.m_finish = get_finish (finish);
1042   location_t combined_loc = COMBINE_LOCATION_DATA (line_table,
1043                                                    pure_loc,
1044                                                    src_range,
1045                                                    NULL);
1046   return combined_loc;
1047 }
1048
1049 /* Same as above, but taking a source range rather than two locations.  */
1050
1051 location_t
1052 make_location (location_t caret, source_range src_range)
1053 {
1054   location_t pure_loc = get_pure_location (caret);
1055   return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
1056 }
1057
1058 /* An expanded_location stores the column in byte units.  This function
1059    converts that column to display units.  That requires reading the associated
1060    source line in order to calculate the display width.  If that cannot be done
1061    for any reason, then returns the byte column as a fallback.  */
1062 int
1063 location_compute_display_column (expanded_location exploc, int tabstop)
1064 {
1065   if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1066     return exploc.column;
1067   char_span line = location_get_source_line (exploc.file, exploc.line);
1068   /* If line is NULL, this function returns exploc.column which is the
1069      desired fallback.  */
1070   return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
1071                                             exploc.column, tabstop);
1072 }
1073
1074 /* Dump statistics to stderr about the memory usage of the line_table
1075    set of line maps.  This also displays some statistics about macro
1076    expansion.  */
1077
1078 void
1079 dump_line_table_statistics (void)
1080 {
1081   struct linemap_stats s;
1082   long total_used_map_size,
1083     macro_maps_size,
1084     total_allocated_map_size;
1085
1086   memset (&s, 0, sizeof (s));
1087
1088   linemap_get_statistics (line_table, &s);
1089
1090   macro_maps_size = s.macro_maps_used_size
1091     + s.macro_maps_locations_size;
1092
1093   total_allocated_map_size = s.ordinary_maps_allocated_size
1094     + s.macro_maps_allocated_size
1095     + s.macro_maps_locations_size;
1096
1097   total_used_map_size = s.ordinary_maps_used_size
1098     + s.macro_maps_used_size
1099     + s.macro_maps_locations_size;
1100
1101   fprintf (stderr, "Number of expanded macros:                     %5ld\n",
1102            s.num_expanded_macros);
1103   if (s.num_expanded_macros != 0)
1104     fprintf (stderr, "Average number of tokens per macro expansion:  %5ld\n",
1105              s.num_macro_tokens / s.num_expanded_macros);
1106   fprintf (stderr,
1107            "\nLine Table allocations during the "
1108            "compilation process\n");
1109   fprintf (stderr, "Number of ordinary maps used:        " PRsa (5) "\n",
1110            SIZE_AMOUNT (s.num_ordinary_maps_used));
1111   fprintf (stderr, "Ordinary map used size:              " PRsa (5) "\n",
1112            SIZE_AMOUNT (s.ordinary_maps_used_size));
1113   fprintf (stderr, "Number of ordinary maps allocated:   " PRsa (5) "\n",
1114            SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1115   fprintf (stderr, "Ordinary maps allocated size:        " PRsa (5) "\n",
1116            SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1117   fprintf (stderr, "Number of macro maps used:           " PRsa (5) "\n",
1118            SIZE_AMOUNT (s.num_macro_maps_used));
1119   fprintf (stderr, "Macro maps used size:                " PRsa (5) "\n",
1120            SIZE_AMOUNT (s.macro_maps_used_size));
1121   fprintf (stderr, "Macro maps locations size:           " PRsa (5) "\n",
1122            SIZE_AMOUNT (s.macro_maps_locations_size));
1123   fprintf (stderr, "Macro maps size:                     " PRsa (5) "\n",
1124            SIZE_AMOUNT (macro_maps_size));
1125   fprintf (stderr, "Duplicated maps locations size:      " PRsa (5) "\n",
1126            SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1127   fprintf (stderr, "Total allocated maps size:           " PRsa (5) "\n",
1128            SIZE_AMOUNT (total_allocated_map_size));
1129   fprintf (stderr, "Total used maps size:                " PRsa (5) "\n",
1130            SIZE_AMOUNT (total_used_map_size));
1131   fprintf (stderr, "Ad-hoc table size:                   " PRsa (5) "\n",
1132            SIZE_AMOUNT (s.adhoc_table_size));
1133   fprintf (stderr, "Ad-hoc table entries used:           " PRsa (5) "\n",
1134            SIZE_AMOUNT (s.adhoc_table_entries_used));
1135   fprintf (stderr, "optimized_ranges:                    " PRsa (5) "\n",
1136            SIZE_AMOUNT (line_table->num_optimized_ranges));
1137   fprintf (stderr, "unoptimized_ranges:                  " PRsa (5) "\n",
1138            SIZE_AMOUNT (line_table->num_unoptimized_ranges));
1139
1140   fprintf (stderr, "\n");
1141 }
1142
1143 /* Get location one beyond the final location in ordinary map IDX.  */
1144
1145 static location_t
1146 get_end_location (class line_maps *set, unsigned int idx)
1147 {
1148   if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1149     return set->highest_location;
1150
1151   struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, idx + 1);
1152   return MAP_START_LOCATION (next_map);
1153 }
1154
/* Helper function for write_digit_row.  Write to STREAM the single
   character for the units digit of DIGIT, i.e. '0' + (DIGIT % 10).  */

static void
write_digit (FILE *stream, int digit)
{
  const int units = digit % 10;
  fputc ('0' + units, stream);
}
1162
1163 /* Helper function for dump_location_info.
1164    Write a row of numbers to STREAM, numbering a source line,
1165    giving the units, tens, hundreds etc of the column number.  */
1166
1167 static void
1168 write_digit_row (FILE *stream, int indent,
1169                  const line_map_ordinary *map,
1170                  location_t loc, int max_col, int divisor)
1171 {
1172   fprintf (stream, "%*c", indent, ' ');
1173   fprintf (stream, "|");
1174   for (int column = 1; column < max_col; column++)
1175     {
1176       location_t column_loc = loc + (column << map->m_range_bits);
1177       write_digit (stream, column_loc / divisor);
1178     }
1179   fprintf (stream, "\n");
1180 }
1181
1182 /* Write a half-closed (START) / half-open (END) interval of
1183    location_t to STREAM.  */
1184
1185 static void
1186 dump_location_range (FILE *stream,
1187                      location_t start, location_t end)
1188 {
1189   fprintf (stream,
1190            "  location_t interval: %u <= loc < %u\n",
1191            start, end);
1192 }
1193
1194 /* Write a labelled description of a half-closed (START) / half-open (END)
1195    interval of location_t to STREAM.  */
1196
1197 static void
1198 dump_labelled_location_range (FILE *stream,
1199                               const char *name,
1200                               location_t start, location_t end)
1201 {
1202   fprintf (stream, "%s\n", name);
1203   dump_location_range (stream, start, end);
1204   fprintf (stream, "\n");
1205 }
1206
1207 /* Write a visualization of the locations in the line_table to STREAM.  */
1208
1209 void
1210 dump_location_info (FILE *stream)
1211 {
1212   /* Visualize the reserved locations.  */
1213   dump_labelled_location_range (stream, "RESERVED LOCATIONS",
1214                                 0, RESERVED_LOCATION_COUNT);
1215
1216   /* Visualize the ordinary line_map instances, rendering the sources. */
1217   for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (line_table); idx++)
1218     {
1219       location_t end_location = get_end_location (line_table, idx);
1220       /* half-closed: doesn't include this one. */
1221
1222       const line_map_ordinary *map
1223         = LINEMAPS_ORDINARY_MAP_AT (line_table, idx);
1224       fprintf (stream, "ORDINARY MAP: %i\n", idx);
1225       dump_location_range (stream,
1226                            MAP_START_LOCATION (map), end_location);
1227       fprintf (stream, "  file: %s\n", ORDINARY_MAP_FILE_NAME (map));
1228       fprintf (stream, "  starting at line: %i\n",
1229                ORDINARY_MAP_STARTING_LINE_NUMBER (map));
1230       fprintf (stream, "  column and range bits: %i\n",
1231                map->m_column_and_range_bits);
1232       fprintf (stream, "  column bits: %i\n",
1233                map->m_column_and_range_bits - map->m_range_bits);
1234       fprintf (stream, "  range bits: %i\n",
1235                map->m_range_bits);
1236       const char * reason;
1237       switch (map->reason) {
1238       case LC_ENTER:
1239         reason = "LC_ENTER";
1240         break;
1241       case LC_LEAVE:
1242         reason = "LC_LEAVE";
1243         break;
1244       case LC_RENAME:
1245         reason = "LC_RENAME";
1246         break;
1247       case LC_RENAME_VERBATIM:
1248         reason = "LC_RENAME_VERBATIM";
1249         break;
1250       case LC_ENTER_MACRO:
1251         reason = "LC_RENAME_MACRO";
1252         break;
1253       default:
1254         reason = "Unknown";
1255       }
1256       fprintf (stream, "  reason: %d (%s)\n", map->reason, reason);
1257
1258       const line_map_ordinary *includer_map
1259         = linemap_included_from_linemap (line_table, map);
1260       fprintf (stream, "  included from location: %d",
1261                linemap_included_from (map));
1262       if (includer_map) {
1263         fprintf (stream, " (in ordinary map %d)",
1264                  int (includer_map - line_table->info_ordinary.maps));
1265       }
1266       fprintf (stream, "\n");
1267
1268       /* Render the span of source lines that this "map" covers.  */
1269       for (location_t loc = MAP_START_LOCATION (map);
1270            loc < end_location;
1271            loc += (1 << map->m_range_bits) )
1272         {
1273           gcc_assert (pure_location_p (line_table, loc) );
1274
1275           expanded_location exploc
1276             = linemap_expand_location (line_table, map, loc);
1277
1278           if (exploc.column == 0)
1279             {
1280               /* Beginning of a new source line: draw the line.  */
1281
1282               char_span line_text = location_get_source_line (exploc.file,
1283                                                               exploc.line);
1284               if (!line_text)
1285                 break;
1286               fprintf (stream,
1287                        "%s:%3i|loc:%5i|%.*s\n",
1288                        exploc.file, exploc.line,
1289                        loc,
1290                        (int)line_text.length (), line_text.get_buffer ());
1291
1292               /* "loc" is at column 0, which means "the whole line".
1293                  Render the locations *within* the line, by underlining
1294                  it, showing the location_t numeric values
1295                  at each column.  */
1296               size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1297               if (max_col > line_text.length ())
1298                 max_col = line_text.length () + 1;
1299
1300               int len_lnum = num_digits (exploc.line);
1301               if (len_lnum < 3)
1302                 len_lnum = 3;
1303               int len_loc = num_digits (loc);
1304               if (len_loc < 5)
1305                 len_loc = 5;
1306
1307               int indent = 6 + strlen (exploc.file) + len_lnum + len_loc;
1308
1309               /* Thousands.  */
1310               if (end_location > 999)
1311                 write_digit_row (stream, indent, map, loc, max_col, 1000);
1312
1313               /* Hundreds.  */
1314               if (end_location > 99)
1315                 write_digit_row (stream, indent, map, loc, max_col, 100);
1316
1317               /* Tens.  */
1318               write_digit_row (stream, indent, map, loc, max_col, 10);
1319
1320               /* Units.  */
1321               write_digit_row (stream, indent, map, loc, max_col, 1);
1322             }
1323         }
1324       fprintf (stream, "\n");
1325     }
1326
1327   /* Visualize unallocated values.  */
1328   dump_labelled_location_range (stream, "UNALLOCATED LOCATIONS",
1329                                 line_table->highest_location,
1330                                 LINEMAPS_MACRO_LOWEST_LOCATION (line_table));
1331
1332   /* Visualize the macro line_map instances, rendering the sources. */
1333   for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (line_table); i++)
1334     {
1335       /* Each macro map that is allocated owns location_t values
1336          that are *lower* that the one before them.
1337          Hence it's meaningful to view them either in order of ascending
1338          source locations, or in order of ascending macro map index.  */
1339       const bool ascending_location_ts = true;
1340       unsigned int idx = (ascending_location_ts
1341                           ? (LINEMAPS_MACRO_USED (line_table) - (i + 1))
1342                           : i);
1343       const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (line_table, idx);
1344       fprintf (stream, "MACRO %i: %s (%u tokens)\n",
1345                idx,
1346                linemap_map_get_macro_name (map),
1347                MACRO_MAP_NUM_MACRO_TOKENS (map));
1348       dump_location_range (stream,
1349                            map->start_location,
1350                            (map->start_location
1351                             + MACRO_MAP_NUM_MACRO_TOKENS (map)));
1352       inform (MACRO_MAP_EXPANSION_POINT_LOCATION (map),
1353               "expansion point is location %i",
1354               MACRO_MAP_EXPANSION_POINT_LOCATION (map));
1355       fprintf (stream, "  map->start_location: %u\n",
1356                map->start_location);
1357
1358       fprintf (stream, "  macro_locations:\n");
1359       for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (map); i++)
1360         {
1361           location_t x = MACRO_MAP_LOCATIONS (map)[2 * i];
1362           location_t y = MACRO_MAP_LOCATIONS (map)[(2 * i) + 1];
1363
1364           /* linemap_add_macro_token encodes token numbers in an expansion
1365              by putting them after MAP_START_LOCATION. */
1366
1367           /* I'm typically seeing 4 uninitialized entries at the end of
1368              0xafafafaf.
1369              This appears to be due to macro.c:replace_args
1370              adding 2 extra args for padding tokens; presumably there may
1371              be a leading and/or trailing padding token injected,
1372              each for 2 more location slots.
1373              This would explain there being up to 4 location_ts slots
1374              that may be uninitialized.  */
1375
1376           fprintf (stream, "    %u: %u, %u\n",
1377                    i,
1378                    x,
1379                    y);
1380           if (x == y)
1381             {
1382               if (x < MAP_START_LOCATION (map))
1383                 inform (x, "token %u has %<x-location == y-location == %u%>",
1384                         i, x);
1385               else
1386                 fprintf (stream,
1387                          "x-location == y-location == %u encodes token # %u\n",
1388                          x, x - MAP_START_LOCATION (map));
1389                 }
1390           else
1391             {
1392               inform (x, "token %u has %<x-location == %u%>", i, x);
1393               inform (x, "token %u has %<y-location == %u%>", i, y);
1394             }
1395         }
1396       fprintf (stream, "\n");
1397     }
1398
1399   /* It appears that MAX_LOCATION_T itself is never assigned to a
1400      macro map, presumably due to an off-by-one error somewhere
1401      between the logic in linemap_enter_macro and
1402      LINEMAPS_MACRO_LOWEST_LOCATION.  */
1403   dump_labelled_location_range (stream, "MAX_LOCATION_T",
1404                                 MAX_LOCATION_T,
1405                                 MAX_LOCATION_T + 1);
1406
1407   /* Visualize ad-hoc values.  */
1408   dump_labelled_location_range (stream, "AD-HOC LOCATIONS",
1409                                 MAX_LOCATION_T + 1, UINT_MAX);
1410 }
1411
1412 /* string_concat's constructor.  */
1413
1414 string_concat::string_concat (int num, location_t *locs)
1415   : m_num (num)
1416 {
1417   m_locs = ggc_vec_alloc <location_t> (num);
1418   for (int i = 0; i < num; i++)
1419     m_locs[i] = locs[i];
1420 }
1421
1422 /* string_concat_db's constructor.  */
1423
1424 string_concat_db::string_concat_db ()
1425 {
1426   m_table = hash_map <location_hash, string_concat *>::create_ggc (64);
1427 }
1428
1429 /* Record that a string concatenation occurred, covering NUM
1430    string literal tokens.  LOCS is an array of size NUM, containing the
1431    locations of the tokens.  A copy of LOCS is taken.  */
1432
1433 void
1434 string_concat_db::record_string_concatenation (int num, location_t *locs)
1435 {
1436   gcc_assert (num > 1);
1437   gcc_assert (locs);
1438
1439   location_t key_loc = get_key_loc (locs[0]);
1440
1441   string_concat *concat
1442     = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1443   m_table->put (key_loc, concat);
1444 }
1445
1446 /* Determine if LOC was the location of the initial token of a
1447    concatenation of string literal tokens.
1448    If so, *OUT_NUM is written to with the number of tokens, and
1449    *OUT_LOCS with the location of an array of locations of the
1450    tokens, and return true.  *OUT_LOCS is a borrowed pointer to
1451    storage owned by the string_concat_db.
1452    Otherwise, return false.  */
1453
1454 bool
1455 string_concat_db::get_string_concatenation (location_t loc,
1456                                             int *out_num,
1457                                             location_t **out_locs)
1458 {
1459   gcc_assert (out_num);
1460   gcc_assert (out_locs);
1461
1462   location_t key_loc = get_key_loc (loc);
1463
1464   string_concat **concat = m_table->get (key_loc);
1465   if (!concat)
1466     return false;
1467
1468   *out_num = (*concat)->m_num;
1469   *out_locs =(*concat)->m_locs;
1470   return true;
1471 }
1472
1473 /* Internal function.  Canonicalize LOC into a form suitable for
1474    use as a key within the database, stripping away macro expansion,
1475    ad-hoc information, and range information, using the location of
1476    the start of LOC within an ordinary linemap.  */
1477
1478 location_t
1479 string_concat_db::get_key_loc (location_t loc)
1480 {
1481   loc = linemap_resolve_location (line_table, loc, LRK_SPELLING_LOCATION,
1482                                   NULL);
1483
1484   loc = get_range_from_loc (line_table, loc).m_start;
1485
1486   return loc;
1487 }
1488
1489 /* Helper class for use within get_substring_ranges_for_loc.
1490    An vec of cpp_string with responsibility for releasing all of the
1491    str->text for each str in the vector.  */
1492
1493 class auto_cpp_string_vec :  public auto_vec <cpp_string>
1494 {
1495  public:
1496   auto_cpp_string_vec (int alloc)
1497     : auto_vec <cpp_string> (alloc) {}
1498
1499   ~auto_cpp_string_vec ()
1500   {
1501     /* Clean up the copies within this vec.  */
1502     int i;
1503     cpp_string *str;
1504     FOR_EACH_VEC_ELT (*this, i, str)
1505       free (const_cast <unsigned char *> (str->text));
1506   }
1507 };
1508
1509 /* Attempt to populate RANGES with source location information on the
1510    individual characters within the string literal found at STRLOC.
1511    If CONCATS is non-NULL, then any string literals that the token at
1512    STRLOC  was concatenated with are also added to RANGES.
1513
1514    Return NULL if successful, or an error message if any errors occurred (in
1515    which case RANGES may be only partially populated and should not
1516    be used).
1517
1518    This is implemented by re-parsing the relevant source line(s).  */
1519
1520 static const char *
1521 get_substring_ranges_for_loc (cpp_reader *pfile,
1522                               string_concat_db *concats,
1523                               location_t strloc,
1524                               enum cpp_ttype type,
1525                               cpp_substring_ranges &ranges)
1526 {
1527   gcc_assert (pfile);
1528
1529   if (strloc == UNKNOWN_LOCATION)
1530     return "unknown location";
1531
1532   /* Reparsing the strings requires accurate location information.
1533      If -ftrack-macro-expansion has been overridden from its default
1534      of 2, then we might have a location of a macro expansion point,
1535      rather than the location of the literal itself.
1536      Avoid this by requiring that we have full macro expansion tracking
1537      for substring locations to be available.  */
1538   if (cpp_get_options (pfile)->track_macro_expansion != 2)
1539     return "track_macro_expansion != 2";
1540
1541   /* If #line or # 44 "file"-style directives are present, then there's
1542      no guarantee that the line numbers we have can be used to locate
1543      the strings.  For example, we might have a .i file with # directives
1544      pointing back to lines within a .c file, but the .c file might
1545      have been edited since the .i file was created.
1546      In such a case, the safest course is to disable on-demand substring
1547      locations.  */
1548   if (line_table->seen_line_directive)
1549     return "seen line directive";
1550
1551   /* If string concatenation has occurred at STRLOC, get the locations
1552      of all of the literal tokens making up the compound string.
1553      Otherwise, just use STRLOC.  */
1554   int num_locs = 1;
1555   location_t *strlocs = &strloc;
1556   if (concats)
1557     concats->get_string_concatenation (strloc, &num_locs, &strlocs);
1558
1559   auto_cpp_string_vec strs (num_locs);
1560   auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1561   for (int i = 0; i < num_locs; i++)
1562     {
1563       /* Get range of strloc.  We will use it to locate the start and finish
1564          of the literal token within the line.  */
1565       source_range src_range = get_range_from_loc (line_table, strlocs[i]);
1566
1567       if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (line_table))
1568         {
1569           /* If the string token was within a macro expansion, then we can
1570              cope with it for the simple case where we have a single token.
1571              Otherwise, bail out.  */
1572           if (src_range.m_start != src_range.m_finish)
1573             return "macro expansion";
1574         }
1575       else
1576         {
1577           if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1578             /* If so, we can't reliably determine where the token started within
1579                its line.  */
1580             return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1581
1582           if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1583             /* If so, we can't reliably determine where the token finished
1584                within its line.  */
1585             return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1586         }
1587
1588       expanded_location start
1589         = expand_location_to_spelling_point (src_range.m_start,
1590                                              LOCATION_ASPECT_START);
1591       expanded_location finish
1592         = expand_location_to_spelling_point (src_range.m_finish,
1593                                              LOCATION_ASPECT_FINISH);
1594       if (start.file != finish.file)
1595         return "range endpoints are in different files";
1596       if (start.line != finish.line)
1597         return "range endpoints are on different lines";
1598       if (start.column > finish.column)
1599         return "range endpoints are reversed";
1600
1601       char_span line = location_get_source_line (start.file, start.line);
1602       if (!line)
1603         return "unable to read source line";
1604
1605       /* Determine the location of the literal (including quotes
1606          and leading prefix chars, such as the 'u' in a u""
1607          token).  */
1608       size_t literal_length = finish.column - start.column + 1;
1609
1610       /* Ensure that we don't crash if we got the wrong location.  */
1611       if (start.column < 1)
1612         return "zero start column";
1613       if (line.length () < (start.column - 1 + literal_length))
1614         return "line is not wide enough";
1615
1616       char_span literal = line.subspan (start.column - 1, literal_length);
1617
1618       cpp_string from;
1619       from.len = literal_length;
1620       /* Make a copy of the literal, to avoid having to rely on
1621          the lifetime of the copy of the line within the cache.
1622          This will be released by the auto_cpp_string_vec dtor.  */
1623       from.text = (unsigned char *)literal.xstrdup ();
1624       strs.safe_push (from);
1625
1626       /* For very long lines, a new linemap could have started
1627          halfway through the token.
1628          Ensure that the loc_reader uses the linemap of the
1629          *end* of the token for its start location.  */
1630       const line_map_ordinary *start_ord_map;
1631       linemap_resolve_location (line_table, src_range.m_start,
1632                                 LRK_SPELLING_LOCATION, &start_ord_map);
1633       const line_map_ordinary *final_ord_map;
1634       linemap_resolve_location (line_table, src_range.m_finish,
1635                                 LRK_SPELLING_LOCATION, &final_ord_map);
1636       if (start_ord_map == NULL || final_ord_map == NULL)
1637         return "failed to get ordinary maps";
1638       /* Bulletproofing.  We ought to only have different ordinary maps
1639          for start vs finish due to line-length jumps.  */
1640       if (start_ord_map != final_ord_map
1641           && start_ord_map->to_file != final_ord_map->to_file)
1642         return "start and finish are spelled in different ordinary maps";
1643       /* The file from linemap_resolve_location ought to match that from
1644          expand_location_to_spelling_point.  */
1645       if (start_ord_map->to_file != start.file)
1646         return "mismatching file after resolving linemap";
1647
1648       location_t start_loc
1649         = linemap_position_for_line_and_column (line_table, final_ord_map,
1650                                                 start.line, start.column);
1651
1652       cpp_string_location_reader loc_reader (start_loc, line_table);
1653       loc_readers.safe_push (loc_reader);
1654     }
1655
1656   /* Rerun cpp_interpret_string, or rather, a modified version of it.  */
1657   const char *err = cpp_interpret_string_ranges (pfile, strs.address (),
1658                                                  loc_readers.address (),
1659                                                  num_locs, &ranges, type);
1660   if (err)
1661     return err;
1662
1663   /* Success: "ranges" should now contain information on the string.  */
1664   return NULL;
1665 }
1666
1667 /* Attempt to populate *OUT_LOC with source location information on the
1668    given characters within the string literal found at STRLOC.
1669    CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1670    character set.
1671
1672    For example, given CARET_IDX = 4, START_IDX = 3, END_IDX  = 7
1673    and string literal "012345\n789"
1674    *OUT_LOC is written to with:
1675      "012345\n789"
1676          ~^~~~~
1677
1678    If CONCATS is non-NULL, then any string literals that the token at
1679    STRLOC was concatenated with are also considered.
1680
1681    This is implemented by re-parsing the relevant source line(s).
1682
1683    Return NULL if successful, or an error message if any errors occurred.
1684    Error messages are intended for GCC developers (to help debugging) rather
1685    than for end-users.  */
1686
1687 const char *
1688 get_location_within_string (cpp_reader *pfile,
1689                             string_concat_db *concats,
1690                             location_t strloc,
1691                             enum cpp_ttype type,
1692                             int caret_idx, int start_idx, int end_idx,
1693                             location_t *out_loc)
1694 {
1695   gcc_checking_assert (caret_idx >= 0);
1696   gcc_checking_assert (start_idx >= 0);
1697   gcc_checking_assert (end_idx >= 0);
1698   gcc_assert (out_loc);
1699
1700   cpp_substring_ranges ranges;
1701   const char *err
1702     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1703   if (err)
1704     return err;
1705
1706   if (caret_idx >= ranges.get_num_ranges ())
1707     return "caret_idx out of range";
1708   if (start_idx >= ranges.get_num_ranges ())
1709     return "start_idx out of range";
1710   if (end_idx >= ranges.get_num_ranges ())
1711     return "end_idx out of range";
1712
1713   *out_loc = make_location (ranges.get_range (caret_idx).m_start,
1714                             ranges.get_range (start_idx).m_start,
1715                             ranges.get_range (end_idx).m_finish);
1716   return NULL;
1717 }
1718
1719 #if CHECKING_P
1720
1721 namespace selftest {
1722
1723 /* Selftests of location handling.  */
1724
1725 /* Attempt to populate *OUT_RANGE with source location information on the
1726    given character within the string literal found at STRLOC.
1727    CHAR_IDX refers to an offset within the execution character set.
1728    If CONCATS is non-NULL, then any string literals that the token at
1729    STRLOC was concatenated with are also considered.
1730
1731    This is implemented by re-parsing the relevant source line(s).
1732
1733    Return NULL if successful, or an error message if any errors occurred.
1734    Error messages are intended for GCC developers (to help debugging) rather
1735    than for end-users.  */
1736
1737 static const char *
1738 get_source_range_for_char (cpp_reader *pfile,
1739                            string_concat_db *concats,
1740                            location_t strloc,
1741                            enum cpp_ttype type,
1742                            int char_idx,
1743                            source_range *out_range)
1744 {
1745   gcc_checking_assert (char_idx >= 0);
1746   gcc_assert (out_range);
1747
1748   cpp_substring_ranges ranges;
1749   const char *err
1750     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1751   if (err)
1752     return err;
1753
1754   if (char_idx >= ranges.get_num_ranges ())
1755     return "char_idx out of range";
1756
1757   *out_range = ranges.get_range (char_idx);
1758   return NULL;
1759 }
1760
1761 /* As get_source_range_for_char, but write to *OUT the number
1762    of ranges that are available.  */
1763
1764 static const char *
1765 get_num_source_ranges_for_substring (cpp_reader *pfile,
1766                                      string_concat_db *concats,
1767                                      location_t strloc,
1768                                      enum cpp_ttype type,
1769                                      int *out)
1770 {
1771   gcc_assert (out);
1772
1773   cpp_substring_ranges ranges;
1774   const char *err
1775     = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1776
1777   if (err)
1778     return err;
1779
1780   *out = ranges.get_num_ranges ();
1781   return NULL;
1782 }
1783
1784 /* Selftests of location handling.  */
1785
1786 /* Verify that compare() on linenum_type handles comparisons over the full
1787    range of the type.  */
1788
1789 static void
1790 test_linenum_comparisons ()
1791 {
1792   linenum_type min_line (0);
1793   linenum_type max_line (0xffffffff);
1794   ASSERT_EQ (0, compare (min_line, min_line));
1795   ASSERT_EQ (0, compare (max_line, max_line));
1796
1797   ASSERT_GT (compare (max_line, min_line), 0);
1798   ASSERT_LT (compare (min_line, max_line), 0);
1799 }
1800
1801 /* Helper function for verifying location data: when location_t
1802    values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
1803    as having column 0.  */
1804
1805 static bool
1806 should_have_column_data_p (location_t loc)
1807 {
1808   if (IS_ADHOC_LOC (loc))
1809     loc = get_location_from_adhoc_loc (line_table, loc);
1810   if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
1811     return false;
1812   return true;
1813 }
1814
1815 /* Selftest for should_have_column_data_p.  */
1816
1817 static void
1818 test_should_have_column_data_p ()
1819 {
1820   ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
1821   ASSERT_TRUE
1822     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
1823   ASSERT_FALSE
1824     (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
1825 }
1826
1827 /* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
1828    on LOC.  */
1829
1830 static void
1831 assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
1832               location_t loc)
1833 {
1834   ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
1835   ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
1836   /* If location_t values are sufficiently high, then column numbers
1837      will be unavailable and LOCATION_COLUMN (loc) will be 0.
1838      When close to the threshold, column numbers *may* be present: if
1839      the final linemap before the threshold contains a line that straddles
1840      the threshold, locations in that line have column information.  */
1841   if (should_have_column_data_p (loc))
1842     ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
1843 }
1844
/* Many of the selftests build a line table and populate it with one or
   more line maps.

   To maximize coverage we want to run such tests in a variety of
   situations:
   - line_table->default_range_bits: some frontends use a non-zero value
   and others use zero
   - the fallback modes within line-map.c: there are various threshold
   values for location_t beyond which line-map.c changes behavior
   (disabling of the range-packing optimization, disabling of
   column-tracking).  These can be exercised by starting the line_table
   at interesting values at or near those thresholds.

   An instance of the following class describes one case within this
   test matrix.  */

class line_table_case
{
public:
  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }

  /* Value to use for line_table->default_range_bits.  */
  int m_default_range_bits;

  /* If non-zero, the value at which to start the line_table's
     location_t values.  */
  int m_base_location;
};
1872
1873 /* Constructor.  Store the old value of line_table, and create a new
1874    one, using sane defaults.  */
1875
1876 line_table_test::line_table_test ()
1877 {
1878   gcc_assert (saved_line_table == NULL);
1879   saved_line_table = line_table;
1880   line_table = ggc_alloc<line_maps> ();
1881   linemap_init (line_table, BUILTINS_LOCATION);
1882   gcc_assert (saved_line_table->reallocator);
1883   line_table->reallocator = saved_line_table->reallocator;
1884   gcc_assert (saved_line_table->round_alloc_size);
1885   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1886   line_table->default_range_bits = 0;
1887 }
1888
1889 /* Constructor.  Store the old value of line_table, and create a new
1890    one, using the sitation described in CASE_.  */
1891
1892 line_table_test::line_table_test (const line_table_case &case_)
1893 {
1894   gcc_assert (saved_line_table == NULL);
1895   saved_line_table = line_table;
1896   line_table = ggc_alloc<line_maps> ();
1897   linemap_init (line_table, BUILTINS_LOCATION);
1898   gcc_assert (saved_line_table->reallocator);
1899   line_table->reallocator = saved_line_table->reallocator;
1900   gcc_assert (saved_line_table->round_alloc_size);
1901   line_table->round_alloc_size = saved_line_table->round_alloc_size;
1902   line_table->default_range_bits = case_.m_default_range_bits;
1903   if (case_.m_base_location)
1904     {
1905       line_table->highest_location = case_.m_base_location;
1906       line_table->highest_line = case_.m_base_location;
1907     }
1908 }
1909
1910 /* Destructor.  Restore the old value of line_table.  */
1911
1912 line_table_test::~line_table_test ()
1913 {
1914   gcc_assert (saved_line_table != NULL);
1915   line_table = saved_line_table;
1916   saved_line_table = NULL;
1917 }
1918
1919 /* Verify basic operation of ordinary linemaps.  */
1920
1921 static void
1922 test_accessing_ordinary_linemaps (const line_table_case &case_)
1923 {
1924   line_table_test ltt (case_);
1925
1926   /* Build a simple linemap describing some locations. */
1927   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
1928
1929   linemap_line_start (line_table, 1, 100);
1930   location_t loc_a = linemap_position_for_column (line_table, 1);
1931   location_t loc_b = linemap_position_for_column (line_table, 23);
1932
1933   linemap_line_start (line_table, 2, 100);
1934   location_t loc_c = linemap_position_for_column (line_table, 1);
1935   location_t loc_d = linemap_position_for_column (line_table, 17);
1936
1937   /* Example of a very long line.  */
1938   linemap_line_start (line_table, 3, 2000);
1939   location_t loc_e = linemap_position_for_column (line_table, 700);
1940
1941   /* Transitioning back to a short line.  */
1942   linemap_line_start (line_table, 4, 0);
1943   location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
1944
1945   if (should_have_column_data_p (loc_back_to_short))
1946     {
1947       /* Verify that we switched to short lines in the linemap.  */
1948       line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (line_table);
1949       ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
1950     }
1951
1952   /* Example of a line that will eventually be seen to be longer
1953      than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
1954      below that.  */
1955   linemap_line_start (line_table, 5, 2000);
1956
1957   location_t loc_start_of_very_long_line
1958     = linemap_position_for_column (line_table, 2000);
1959   location_t loc_too_wide
1960     = linemap_position_for_column (line_table, 4097);
1961   location_t loc_too_wide_2
1962     = linemap_position_for_column (line_table, 4098);
1963
1964   /* ...and back to a sane line length.  */
1965   linemap_line_start (line_table, 6, 100);
1966   location_t loc_sane_again = linemap_position_for_column (line_table, 10);
1967
1968   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1969
1970   /* Multiple files.  */
1971   linemap_add (line_table, LC_ENTER, false, "bar.c", 0);
1972   linemap_line_start (line_table, 1, 200);
1973   location_t loc_f = linemap_position_for_column (line_table, 150);
1974   linemap_add (line_table, LC_LEAVE, false, NULL, 0);
1975
1976   /* Verify that we can recover the location info.  */
1977   assert_loceq ("foo.c", 1, 1, loc_a);
1978   assert_loceq ("foo.c", 1, 23, loc_b);
1979   assert_loceq ("foo.c", 2, 1, loc_c);
1980   assert_loceq ("foo.c", 2, 17, loc_d);
1981   assert_loceq ("foo.c", 3, 700, loc_e);
1982   assert_loceq ("foo.c", 4, 100, loc_back_to_short);
1983
1984   /* In the very wide line, the initial location should be fully tracked.  */
1985   assert_loceq ("foo.c", 5, 2000, loc_start_of_very_long_line);
1986   /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
1987      be disabled.  */
1988   assert_loceq ("foo.c", 5, 0, loc_too_wide);
1989   assert_loceq ("foo.c", 5, 0, loc_too_wide_2);
1990   /*...and column-tracking should be re-enabled for subsequent lines.  */
1991   assert_loceq ("foo.c", 6, 10, loc_sane_again);
1992
1993   assert_loceq ("bar.c", 1, 150, loc_f);
1994
1995   ASSERT_FALSE (is_location_from_builtin_token (loc_a));
1996   ASSERT_TRUE (pure_location_p (line_table, loc_a));
1997
1998   /* Verify using make_location to build a range, and extracting data
1999      back from it.  */
2000   location_t range_c_b_d = make_location (loc_c, loc_b, loc_d);
2001   ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2002   ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2003   source_range src_range = get_range_from_loc (line_table, range_c_b_d);
2004   ASSERT_EQ (loc_b, src_range.m_start);
2005   ASSERT_EQ (loc_d, src_range.m_finish);
2006 }
2007
2008 /* Verify various properties of UNKNOWN_LOCATION.  */
2009
2010 static void
2011 test_unknown_location ()
2012 {
2013   ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2014   ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2015   ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2016 }
2017
2018 /* Verify various properties of BUILTINS_LOCATION.  */
2019
2020 static void
2021 test_builtins ()
2022 {
2023   assert_loceq (_("<built-in>"), 0, 0, BUILTINS_LOCATION);
2024   ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2025 }
2026
2027 /* Regression test for make_location.
2028    Ensure that we use pure locations for the start/finish of the range,
2029    rather than storing a packed or ad-hoc range as the start/finish.  */
2030
2031 static void
2032 test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2033 {
2034   /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2035      with C++ frontend.
2036      ....................0000000001111111111222.
2037      ....................1234567890123456789012.  */
2038   const char *content = "     r += !aaa == bbb;\n";
2039   temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2040   line_table_test ltt (case_);
2041   linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
2042
2043   const location_t c11 = linemap_position_for_column (line_table, 11);
2044   const location_t c12 = linemap_position_for_column (line_table, 12);
2045   const location_t c13 = linemap_position_for_column (line_table, 13);
2046   const location_t c14 = linemap_position_for_column (line_table, 14);
2047   const location_t c21 = linemap_position_for_column (line_table, 21);
2048
2049   if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2050     return;
2051
2052   /* Use column 13 for the caret location, arbitrarily, to verify that we
2053      handle start != caret.  */
2054   const location_t aaa = make_location (c13, c12, c14);
2055   ASSERT_EQ (c13, get_pure_location (aaa));
2056   ASSERT_EQ (c12, get_start (aaa));
2057   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2058   ASSERT_EQ (c14, get_finish (aaa));
2059   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2060
2061   /* Make a location using a location with a range as the start-point.  */
2062   const location_t not_aaa = make_location (c11, aaa, c14);
2063   ASSERT_EQ (c11, get_pure_location (not_aaa));
2064   /* It should use the start location of the range, not store the range
2065      itself.  */
2066   ASSERT_EQ (c12, get_start (not_aaa));
2067   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2068   ASSERT_EQ (c14, get_finish (not_aaa));
2069   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2070
2071   /* Similarly, make a location with a range as the end-point.  */
2072   const location_t aaa_eq_bbb = make_location (c12, c12, c21);
2073   ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2074   ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2075   ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2076   ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2077   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2078   const location_t not_aaa_eq_bbb = make_location (c11, c12, aaa_eq_bbb);
2079   /* It should use the finish location of the range, not store the range
2080      itself.  */
2081   ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2082   ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2083   ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2084   ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2085   ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2086 }
2087
2088 /* Verify reading of input files (e.g. for caret-based diagnostics).  */
2089
2090 static void
2091 test_reading_source_line ()
2092 {
2093   /* Create a tempfile and write some text to it.  */
2094   temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2095                         "01234567890123456789\n"
2096                         "This is the test text\n"
2097                         "This is the 3rd line");
2098
2099   /* Read back a specific line from the tempfile.  */
2100   char_span source_line = location_get_source_line (tmp.get_filename (), 3);
2101   ASSERT_TRUE (source_line);
2102   ASSERT_TRUE (source_line.get_buffer () != NULL);
2103   ASSERT_EQ (20, source_line.length ());
2104   ASSERT_TRUE (!strncmp ("This is the 3rd line",
2105                          source_line.get_buffer (), source_line.length ()));
2106
2107   source_line = location_get_source_line (tmp.get_filename (), 2);
2108   ASSERT_TRUE (source_line);
2109   ASSERT_TRUE (source_line.get_buffer () != NULL);
2110   ASSERT_EQ (21, source_line.length ());
2111   ASSERT_TRUE (!strncmp ("This is the test text",
2112                          source_line.get_buffer (), source_line.length ()));
2113
2114   source_line = location_get_source_line (tmp.get_filename (), 4);
2115   ASSERT_FALSE (source_line);
2116   ASSERT_TRUE (source_line.get_buffer () == NULL);
2117 }
2118
2119 /* Tests of lexing.  */
2120
/* Assert that cpp_token_as_text on token TOK, as spelled by PARSER,
   gives EXPECTED_TEXT.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)             \
  SELFTEST_BEGIN_STMT                                                   \
    unsigned char *actual_txt                                           \
      = cpp_token_as_text ((PARSER), (TOK));                            \
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);           \
  SELFTEST_END_STMT
2129
2130 /* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2131    and ranges from EXP_START_COL to EXP_FINISH_COL.
2132    Use LOC as the effective location of the selftest.  */
2133
2134 static void
2135 assert_token_loc_eq (const location &loc,
2136                      const cpp_token *tok,
2137                      const char *exp_filename, int exp_linenum,
2138                      int exp_start_col, int exp_finish_col)
2139 {
2140   location_t tok_loc = tok->src_loc;
2141   ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2142   ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2143
2144   /* If location_t values are sufficiently high, then column numbers
2145      will be unavailable.  */
2146   if (!should_have_column_data_p (tok_loc))
2147     return;
2148
2149   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2150   source_range tok_range = get_range_from_loc (line_table, tok_loc);
2151   ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2152   ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2153 }
2154
/* Check TOK->src_loc via assert_token_loc_eq, passing
   SELFTEST_LOCATION as the effective location of the selftest.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM,             \
                            EXP_START_COL, EXP_FINISH_COL)              \
  assert_token_loc_eq (SELFTEST_LOCATION,                               \
                       (TOK),                                           \
                       (EXP_FILENAME),                                  \
                       (EXP_LINENUM),                                   \
                       (EXP_START_COL),                                 \
                       (EXP_FINISH_COL))
2162
2163 /* Test of lexing a file using libcpp, verifying tokens and their
2164    location information.  */
2165
2166 static void
2167 test_lexer (const line_table_case &case_)
2168 {
2169   /* Create a tempfile and write some text to it.  */
2170   const char *content =
2171     /*00000000011111111112222222222333333.3333444444444.455555555556
2172       12345678901234567890123456789012345.6789012345678.901234567890.  */
2173     ("test_name /* c-style comment */\n"
2174      "                                  \"test literal\"\n"
2175      " // test c++-style comment\n"
2176      "   42\n");
2177   temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2178
2179   line_table_test ltt (case_);
2180
2181   cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2182
2183   const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2184   ASSERT_NE (fname, NULL);
2185
2186   /* Verify that we get the expected tokens back, with the correct
2187      location information.  */
2188
2189   location_t loc;
2190   const cpp_token *tok;
2191   tok = cpp_get_token_with_location (parser, &loc);
2192   ASSERT_NE (tok, NULL);
2193   ASSERT_EQ (tok->type, CPP_NAME);
2194   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2195   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2196
2197   tok = cpp_get_token_with_location (parser, &loc);
2198   ASSERT_NE (tok, NULL);
2199   ASSERT_EQ (tok->type, CPP_STRING);
2200   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2201   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2202
2203   tok = cpp_get_token_with_location (parser, &loc);
2204   ASSERT_NE (tok, NULL);
2205   ASSERT_EQ (tok->type, CPP_NUMBER);
2206   ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2207   ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2208
2209   tok = cpp_get_token_with_location (parser, &loc);
2210   ASSERT_NE (tok, NULL);
2211   ASSERT_EQ (tok->type, CPP_EOF);
2212
2213   cpp_finish (parser, NULL);
2214   cpp_destroy (parser);
2215 }
2216
2217 /* Forward decls.  */
2218
2219 class lexer_test;
2220 class lexer_test_options;
2221
2222 /* A class for specifying options of a lexer_test.
2223    The "apply" vfunc is called during the lexer_test constructor.  */
2224
2225 class lexer_test_options
2226 {
2227  public:
2228   virtual void apply (lexer_test &) = 0;
2229 };
2230
2231 /* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2232    in its dtor.
2233
2234    This is needed by struct lexer_test to ensure that the cleanup of the
2235    cpp_reader happens *after* the cleanup of the temp_source_file.  */
2236
2237 class cpp_reader_ptr
2238 {
2239  public:
2240   cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2241
2242   ~cpp_reader_ptr ()
2243   {
2244     cpp_finish (m_ptr, NULL);
2245     cpp_destroy (m_ptr);
2246   }
2247
2248   operator cpp_reader * () const { return m_ptr; }
2249
2250  private:
2251   cpp_reader *m_ptr;
2252 };
2253
2254 /* A struct for writing lexer tests.  */
2255
2256 class lexer_test
2257 {
2258 public:
2259   lexer_test (const line_table_case &case_, const char *content,
2260               lexer_test_options *options);
2261   ~lexer_test ();
2262
2263   const cpp_token *get_token ();
2264
2265   /* The ordering of these fields matters.
2266      The line_table_test must be first, since the cpp_reader_ptr
2267      uses it.
2268      The cpp_reader must be cleaned up *after* the temp_source_file
2269      since the filenames in input.c's input cache are owned by the
2270      cpp_reader; in particular, when ~temp_source_file evicts the
2271      filename the filenames must still be alive.  */
2272   line_table_test m_ltt;
2273   cpp_reader_ptr m_parser;
2274   temp_source_file m_tempfile;
2275   string_concat_db m_concats;
2276   bool m_implicitly_expect_EOF;
2277 };
2278
2279 /* Use an EBCDIC encoding for the execution charset, specifically
2280    IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2281
2282    This exercises iconv integration within libcpp.
2283    Not every build of iconv supports the given charset,
2284    so we need to flag this error and handle it gracefully.  */
2285
2286 class ebcdic_execution_charset : public lexer_test_options
2287 {
2288  public:
2289   ebcdic_execution_charset () : m_num_iconv_errors (0)
2290     {
2291       gcc_assert (s_singleton == NULL);
2292       s_singleton = this;
2293     }
2294   ~ebcdic_execution_charset ()
2295     {
2296       gcc_assert (s_singleton == this);
2297       s_singleton = NULL;
2298     }
2299
2300   void apply (lexer_test &test) FINAL OVERRIDE
2301   {
2302     cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2303     cpp_opts->narrow_charset = "IBM1047";
2304
2305     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2306     callbacks->diagnostic = on_diagnostic;
2307   }
2308
2309   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2310                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2311                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2312                              rich_location *richloc ATTRIBUTE_UNUSED,
2313                              const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2314     ATTRIBUTE_FPTR_PRINTF(5,0)
2315   {
2316     gcc_assert (s_singleton);
2317     /* Avoid exgettext from picking this up, it is translated in libcpp.  */
2318     const char *msg = "conversion from %s to %s not supported by iconv";
2319 #ifdef ENABLE_NLS
2320     msg = dgettext ("cpplib", msg);
2321 #endif
2322     /* Detect and record errors emitted by libcpp/charset.c:init_iconv_desc
2323        when the local iconv build doesn't support the conversion.  */
2324     if (strcmp (msgid, msg) == 0)
2325       {
2326         s_singleton->m_num_iconv_errors++;
2327         return true;
2328       }
2329
2330     /* Otherwise, we have an unexpected error.  */
2331     abort ();
2332   }
2333
2334   bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2335
2336  private:
2337   static ebcdic_execution_charset *s_singleton;
2338   int m_num_iconv_errors;
2339 };
2340
2341 ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2342
2343 /* A lexer_test_options subclass that records a list of diagnostic
2344    messages emitted by the lexer.  */
2345
2346 class lexer_diagnostic_sink : public lexer_test_options
2347 {
2348  public:
2349   lexer_diagnostic_sink ()
2350   {
2351     gcc_assert (s_singleton == NULL);
2352     s_singleton = this;
2353   }
2354   ~lexer_diagnostic_sink ()
2355   {
2356     gcc_assert (s_singleton == this);
2357     s_singleton = NULL;
2358
2359     int i;
2360     char *str;
2361     FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2362       free (str);
2363   }
2364
2365   void apply (lexer_test &test) FINAL OVERRIDE
2366   {
2367     cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2368     callbacks->diagnostic = on_diagnostic;
2369   }
2370
2371   static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2372                              enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2373                              enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2374                              rich_location *richloc ATTRIBUTE_UNUSED,
2375                              const char *msgid, va_list *ap)
2376     ATTRIBUTE_FPTR_PRINTF(5,0)
2377   {
2378     char *msg = xvasprintf (msgid, *ap);
2379     s_singleton->m_diagnostics.safe_push (msg);
2380     return true;
2381   }
2382
2383   auto_vec<char *> m_diagnostics;
2384
2385  private:
2386   static lexer_diagnostic_sink *s_singleton;
2387 };
2388
2389 lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2390
2391 /* Constructor.  Override line_table with a new instance based on CASE_,
2392    and write CONTENT to a tempfile.  Create a cpp_reader, and use it to
2393    start parsing the tempfile.  */
2394
2395 lexer_test::lexer_test (const line_table_case &case_, const char *content,
2396                         lexer_test_options *options)
2397 : m_ltt (case_),
2398   m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2399   /* Create a tempfile and write the text to it.  */
2400   m_tempfile (SELFTEST_LOCATION, ".c", content),
2401   m_concats (),
2402   m_implicitly_expect_EOF (true)
2403 {
2404   if (options)
2405     options->apply (*this);
2406
2407   cpp_init_iconv (m_parser);
2408
2409   /* Parse the file.  */
2410   const char *fname = cpp_read_main_file (m_parser,
2411                                           m_tempfile.get_filename ());
2412   ASSERT_NE (fname, NULL);
2413 }
2414
2415 /* Destructor.  By default, verify that the next token in m_parser is EOF.  */
2416
2417 lexer_test::~lexer_test ()
2418 {
2419   location_t loc;
2420   const cpp_token *tok;
2421
2422   if (m_implicitly_expect_EOF)
2423     {
2424       tok = cpp_get_token_with_location (m_parser, &loc);
2425       ASSERT_NE (tok, NULL);
2426       ASSERT_EQ (tok->type, CPP_EOF);
2427     }
2428 }
2429
2430 /* Get the next token from m_parser.  */
2431
2432 const cpp_token *
2433 lexer_test::get_token ()
2434 {
2435   location_t loc;
2436   const cpp_token *tok;
2437
2438   tok = cpp_get_token_with_location (m_parser, &loc);
2439   ASSERT_NE (tok, NULL);
2440   return tok;
2441 }
2442
2443 /* Verify that locations within string literals are correctly handled.  */
2444
2445 /* Verify get_source_range_for_substring for token(s) at STRLOC,
2446    using the string concatenation database for TEST.
2447
2448    Assert that the character at index IDX is on EXPECTED_LINE,
2449    and that it begins at column EXPECTED_START_COL and ends at
2450    EXPECTED_FINISH_COL (unless the locations are beyond
2451    LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2452    columns).  */
2453
2454 static void
2455 assert_char_at_range (const location &loc,
2456                       lexer_test& test,
2457                       location_t strloc, enum cpp_ttype type, int idx,
2458                       int expected_line, int expected_start_col,
2459                       int expected_finish_col)
2460 {
2461   cpp_reader *pfile = test.m_parser;
2462   string_concat_db *concats = &test.m_concats;
2463
2464   source_range actual_range = source_range();
2465   const char *err
2466     = get_source_range_for_char (pfile, concats, strloc, type, idx,
2467                                  &actual_range);
2468   if (should_have_column_data_p (strloc))
2469     ASSERT_EQ_AT (loc, NULL, err);
2470   else
2471     {
2472       ASSERT_STREQ_AT (loc,
2473                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2474                        err);
2475       return;
2476     }
2477
2478   int actual_start_line = LOCATION_LINE (actual_range.m_start);
2479   ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2480   int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2481   ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2482
2483   if (should_have_column_data_p (actual_range.m_start))
2484     {
2485       int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2486       ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2487     }
2488   if (should_have_column_data_p (actual_range.m_finish))
2489     {
2490       int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2491       ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2492     }
2493 }
2494
/* Convenience macro around assert_char_at_range that passes
   SELFTEST_LOCATION as the effective location of any errors.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX,             \
                             EXPECTED_LINE, EXPECTED_START_COL,         \
                             EXPECTED_FINISH_COL)                       \
  assert_char_at_range (SELFTEST_LOCATION,                              \
                        (LEXER_TEST),                                   \
                        (STRLOC),                                       \
                        (TYPE),                                         \
                        (IDX),                                          \
                        (EXPECTED_LINE),                                \
                        (EXPECTED_START_COL),                           \
                        (EXPECTED_FINISH_COL))
2503
2504 /* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2505    using the string concatenation database for TEST.
2506
2507    Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES.  */
2508
2509 static void
2510 assert_num_substring_ranges (const location &loc,
2511                              lexer_test& test,
2512                              location_t strloc,
2513                              enum cpp_ttype type,
2514                              int expected_num_ranges)
2515 {
2516   cpp_reader *pfile = test.m_parser;
2517   string_concat_db *concats = &test.m_concats;
2518
2519   int actual_num_ranges = -1;
2520   const char *err
2521     = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2522                                            &actual_num_ranges);
2523   if (should_have_column_data_p (strloc))
2524     ASSERT_EQ_AT (loc, NULL, err);
2525   else
2526     {
2527       ASSERT_STREQ_AT (loc,
2528                        "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2529                        err);
2530       return;
2531     }
2532   ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2533 }
2534
/* Convenience wrapper around assert_num_substring_ranges which passes
   SELFTEST_LOCATION as the location to report for any failure.  */

#define ASSERT_NUM_SUBSTRING_RANGES(TEST, LOC, TTYPE, NUM_RANGES)       \
  assert_num_substring_ranges (SELFTEST_LOCATION, (TEST), (LOC),       \
                               (TTYPE), (NUM_RANGES))
2542
2543
2544 /* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2545    returns an error (using the string concatenation database for TEST).  */
2546
2547 static void
2548 assert_has_no_substring_ranges (const location &loc,
2549                                 lexer_test& test,
2550                                 location_t strloc,
2551                                 enum cpp_ttype type,
2552                                 const char *expected_err)
2553 {
2554   cpp_reader *pfile = test.m_parser;
2555   string_concat_db *concats = &test.m_concats;
2556   cpp_substring_ranges ranges;
2557   const char *actual_err
2558     = get_substring_ranges_for_loc (pfile, concats, strloc,
2559                                     type, ranges);
2560   if (should_have_column_data_p (strloc))
2561     ASSERT_STREQ_AT (loc, expected_err, actual_err);
2562   else
2563     ASSERT_STREQ_AT (loc,
2564                      "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2565                      actual_err);
2566 }
2567
/* Convenience wrapper around assert_has_no_substring_ranges which passes
   SELFTEST_LOCATION as the location to report for any failure.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(TEST, LOC, TTYPE, EXPECTED_ERR)  \
  assert_has_no_substring_ranges (SELFTEST_LOCATION, (TEST), (LOC),    \
                                  (TTYPE), (EXPECTED_ERR))
2571
2572 /* Lex a simple string literal.  Verify the substring location data, before
2573    and after running cpp_interpret_string on it.  */
2574
2575 static void
2576 test_lexer_string_locations_simple (const line_table_case &case_)
2577 {
2578   /* Digits 0-9 (with 0 at column 10), the simple way.
2579      ....................000000000.11111111112.2222222223333333333
2580      ....................123456789.01234567890.1234567890123456789
2581      We add a trailing comment to ensure that we correctly locate
2582      the end of the string literal token.  */
2583   const char *content = "        \"0123456789\" /* not a string */\n";
2584   lexer_test test (case_, content, NULL);
2585
2586   /* Verify that we get the expected token back, with the correct
2587      location information.  */
2588   const cpp_token *tok = test.get_token ();
2589   ASSERT_EQ (tok->type, CPP_STRING);
2590   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2591   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2592
2593   /* At this point in lexing, the quote characters are treated as part of
2594      the string (they are stripped off by cpp_interpret_string).  */
2595
2596   ASSERT_EQ (tok->val.str.len, 12);
2597
2598   /* Verify that cpp_interpret_string works.  */
2599   cpp_string dst_string;
2600   const enum cpp_ttype type = CPP_STRING;
2601   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2602                                       &dst_string, type);
2603   ASSERT_TRUE (result);
2604   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2605   free (const_cast <unsigned char *> (dst_string.text));
2606
2607   /* Verify ranges of individual characters.  This no longer includes the
2608      opening quote, but does include the closing quote.  */
2609   for (int i = 0; i <= 10; i++)
2610     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2611                           10 + i, 10 + i);
2612
2613   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2614 }
2615
/* As test_lexer_string_locations_simple, but use an EBCDIC execution
   encoding.  In this configuration no substring location data is
   expected: the lookup must fail with the "execution character set !=
   source character set" error.  */

static void
test_lexer_string_locations_ebcdic (const line_table_case &case_)
{
  /* EBCDIC support requires iconv.  */
  if (!HAVE_ICONV)
    return;

  /* Digits 0-9 (with 0 at column 10), the simple way.
     ....................000000000.11111111112.2222222223333333333
     ....................123456789.01234567890.1234567890123456789
     We add a trailing comment to ensure that we correctly locate
     the end of the string literal token.  */
  const char *content = "        \"0123456789\" /* not a string */\n";
  ebcdic_execution_charset use_ebcdic;
  lexer_test test (case_, content, &use_ebcdic);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
  ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);

  /* At this point in lexing, the quote characters are treated as part of
     the string (they are stripped off by cpp_interpret_string).  */

  ASSERT_EQ (tok->val.str.len, 12);

  /* The remainder of the test requires an iconv implementation that
     can convert from UTF-8 to the EBCDIC encoding requested above.
     If it cannot, stop here; the checks above have already run.  */
  if (use_ebcdic.iconv_errors_occurred_p ())
    return;

  /* Verify that cpp_interpret_string works.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);
  /* We should now have EBCDIC-encoded text, specifically
     IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
     The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9.  */
  ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
                (const char *)dst_string.text);
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify that we don't attempt to record substring location information
     for such cases.  */
  ASSERT_HAS_NO_SUBSTRING_RANGES
    (test, tok->src_loc, type,
     "execution character set != source character set");
}
2671
2672 /* Lex a string literal containing a hex-escaped character.
2673    Verify the substring location data, before and after running
2674    cpp_interpret_string on it.  */
2675
2676 static void
2677 test_lexer_string_locations_hex (const line_table_case &case_)
2678 {
2679   /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2680      and with a space in place of digit 6, to terminate the escaped
2681      hex code.
2682      ....................000000000.111111.11112222.
2683      ....................123456789.012345.67890123.  */
2684   const char *content = "        \"01234\\x35 789\"\n";
2685   lexer_test test (case_, content, NULL);
2686
2687   /* Verify that we get the expected token back, with the correct
2688      location information.  */
2689   const cpp_token *tok = test.get_token ();
2690   ASSERT_EQ (tok->type, CPP_STRING);
2691   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2692   ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2693
2694   /* At this point in lexing, the quote characters are treated as part of
2695      the string (they are stripped off by cpp_interpret_string).  */
2696   ASSERT_EQ (tok->val.str.len, 15);
2697
2698   /* Verify that cpp_interpret_string works.  */
2699   cpp_string dst_string;
2700   const enum cpp_ttype type = CPP_STRING;
2701   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2702                                       &dst_string, type);
2703   ASSERT_TRUE (result);
2704   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2705   free (const_cast <unsigned char *> (dst_string.text));
2706
2707   /* Verify ranges of individual characters.  This no longer includes the
2708      opening quote, but does include the closing quote.  */
2709   for (int i = 0; i <= 4; i++)
2710     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2711   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2712   for (int i = 6; i <= 10; i++)
2713     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2714
2715   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2716 }
2717
2718 /* Lex a string literal containing an octal-escaped character.
2719    Verify the substring location data after running cpp_interpret_string
2720    on it.  */
2721
2722 static void
2723 test_lexer_string_locations_oct (const line_table_case &case_)
2724 {
2725   /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2726      and with a space in place of digit 6, to terminate the escaped
2727      octal code.
2728      ....................000000000.111111.11112222.2222223333333333444
2729      ....................123456789.012345.67890123.4567890123456789012  */
2730   const char *content = "        \"01234\\065 789\" /* not a string */\n";
2731   lexer_test test (case_, content, NULL);
2732
2733   /* Verify that we get the expected token back, with the correct
2734      location information.  */
2735   const cpp_token *tok = test.get_token ();
2736   ASSERT_EQ (tok->type, CPP_STRING);
2737   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2738
2739   /* Verify that cpp_interpret_string works.  */
2740   cpp_string dst_string;
2741   const enum cpp_ttype type = CPP_STRING;
2742   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2743                                       &dst_string, type);
2744   ASSERT_TRUE (result);
2745   ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2746   free (const_cast <unsigned char *> (dst_string.text));
2747
2748   /* Verify ranges of individual characters.  This no longer includes the
2749      opening quote, but does include the closing quote.  */
2750   for (int i = 0; i < 5; i++)
2751     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2752   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2753   for (int i = 6; i <= 10; i++)
2754     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2755
2756   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2757 }
2758
2759 /* Test of string literal containing letter escapes.  */
2760
2761 static void
2762 test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2763 {
2764   /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2765      .....................000000000.1.11111.1.1.11222.22222223333333
2766      .....................123456789.0.12345.6.7.89012.34567890123456.  */
2767   const char *content = ("        \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2768   lexer_test test (case_, content, NULL);
2769
2770   /* Verify that we get the expected tokens back.  */
2771   const cpp_token *tok = test.get_token ();
2772   ASSERT_EQ (tok->type, CPP_STRING);
2773   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2774
2775   /* Verify ranges of individual characters. */
2776   /* "\t".  */
2777   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2778                         0, 1, 10, 11);
2779   /* "foo". */
2780   for (int i = 1; i <= 3; i++)
2781     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2782                           i, 1, 11 + i, 11 + i);
2783   /* "\\" and "\n".  */
2784   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2785                         4, 1, 15, 16);
2786   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2787                         5, 1, 17, 18);
2788
2789   /* "bar" and closing quote for nul-terminator.  */
2790   for (int i = 6; i <= 9; i++)
2791     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2792                           i, 1, 13 + i, 13 + i);
2793
2794   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
2795 }
2796
2797 /* Another test of a string literal containing a letter escape.
2798    Based on string seen in
2799      printf ("%-%\n");
2800    in gcc.dg/format/c90-printf-1.c.  */
2801
2802 static void
2803 test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
2804 {
2805   /* .....................000000000.1111.11.1111.22222222223.
2806      .....................123456789.0123.45.6789.01234567890.  */
2807   const char *content = ("        \"%-%\\n\" /* non-str */\n");
2808   lexer_test test (case_, content, NULL);
2809
2810   /* Verify that we get the expected tokens back.  */
2811   const cpp_token *tok = test.get_token ();
2812   ASSERT_EQ (tok->type, CPP_STRING);
2813   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
2814
2815   /* Verify ranges of individual characters. */
2816   /* "%-%".  */
2817   for (int i = 0; i < 3; i++)
2818     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2819                           i, 1, 10 + i, 10 + i);
2820   /* "\n".  */
2821   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2822                         3, 1, 13, 14);
2823
2824   /* Closing quote for nul-terminator.  */
2825   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2826                         4, 1, 15, 15);
2827
2828   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
2829 }
2830
/* Lex a string literal containing UCN 4 characters (four-hex-digit
   universal character names).
   Verify the result of cpp_interpret_string on it, and the substring
   location data: every byte produced by one UCN must map back to the
   whole of that UCN's six source columns.  */

static void
test_lexer_string_locations_ucn4 (const line_table_case &case_)
{
  /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
     as UCN 4.
     ....................000000000.111111.111122.222222223.33333333344444
     ....................123456789.012345.678901.234567890.12345678901234  */
  const char *content = "        \"01234\\u2174\\u2175789\" /* non-str */\n";
  lexer_test test (case_, content, NULL);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");

  /* Verify that cpp_interpret_string works.
     The string should be encoded in the execution character
     set.  Assuming that is UTF-8, we should have the following:
     -----------  ----  -----  -------  ----------------
     Byte offset  Byte  Octal  Unicode  Source Column(s)
     -----------  ----  -----  -------  ----------------
     0            0x30         '0'      10
     1            0x31         '1'      11
     2            0x32         '2'      12
     3            0x33         '3'      13
     4            0x34         '4'      14
     5            0xE2  \342   U+2174   15-20
     6            0x85  \205    (cont)  15-20
     7            0xB4  \264    (cont)  15-20
     8            0xE2  \342   U+2175   21-26
     9            0x85  \205    (cont)  21-26
     10           0xB5  \265    (cont)  21-26
     11           0x37         '7'      27
     12           0x38         '8'      28
     13           0x39         '9'      29
     14           0x00                  30 (closing quote)
     -----------  ----  -----  -------  ---------------.  */

  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);
  ASSERT_STREQ ("01234\342\205\264\342\205\265789",
                (const char *)dst_string.text);
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify ranges of individual characters.  This no longer includes the
     opening quote, but does include the closing quote.  The indices
     below are byte offsets into the interpreted string, as in the table
     above.
     '01234'.  */
  for (int i = 0; i <= 4; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
  /* U+2174: all three of its bytes map to columns 15-20.  */
  for (int i = 5; i <= 7; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
  /* U+2175: all three of its bytes map to columns 21-26.  */
  for (int i = 8; i <= 10; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
  /* '789' and nul terminator, at columns 27-30.  */
  for (int i = 11; i <= 14; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);

  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
}
2900
2901 /* Lex a string literal containing UCN 8 characters.
2902    Verify the substring location data after running cpp_interpret_string
2903    on it.  */
2904
2905 static void
2906 test_lexer_string_locations_ucn8 (const line_table_case &case_)
2907 {
2908   /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
2909      ....................000000000.111111.1111222222.2222333333333.344444
2910      ....................123456789.012345.6789012345.6789012345678.901234  */
2911   const char *content = "        \"01234\\U00002174\\U00002175789\" /* */\n";
2912   lexer_test test (case_, content, NULL);
2913
2914   /* Verify that we get the expected token back, with the correct
2915      location information.  */
2916   const cpp_token *tok = test.get_token ();
2917   ASSERT_EQ (tok->type, CPP_STRING);
2918   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
2919                            "\"01234\\U00002174\\U00002175789\"");
2920
2921   /* Verify that cpp_interpret_string works.
2922      The UTF-8 encoding of the string is identical to that from
2923      the ucn4 testcase above; the only difference is the column
2924      locations.  */
2925   cpp_string dst_string;
2926   const enum cpp_ttype type = CPP_STRING;
2927   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2928                                       &dst_string, type);
2929   ASSERT_TRUE (result);
2930   ASSERT_STREQ ("01234\342\205\264\342\205\265789",
2931                 (const char *)dst_string.text);
2932   free (const_cast <unsigned char *> (dst_string.text));
2933
2934   /* Verify ranges of individual characters.  This no longer includes the
2935      opening quote, but does include the closing quote.
2936      '01234'.  */
2937   for (int i = 0; i <= 4; i++)
2938     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2939   /* U+2174.  */
2940   for (int i = 5; i <= 7; i++)
2941     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
2942   /* U+2175.  */
2943   for (int i = 8; i <= 10; i++)
2944     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
2945   /* '789' at columns 35-37  */
2946   for (int i = 11; i <= 13; i++)
2947     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
2948   /* Closing quote/nul-terminator at column 38.  */
2949   ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
2950
2951   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
2952 }
2953
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   (most significant byte first) and return it as a host-endian
   value.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;

  /* Fold the bytes in from most to least significant.  */
  uint32_t host_value = 0;
  for (int i = 0; i < 4; i++)
    host_value = (host_value << 8) | bytes[i];
  return host_value;
}
2965
/* Lex a wide string literal (L"") and verify that attempts to read
   substring location data from it fail gracefully, i.e. with an error
   message rather than with location data.  */

static void
test_lexer_string_locations_wide_string (const line_table_case &case_)
{
  /* Digits 0-9.
     ....................000000000.11111111112.22222222233333
     ....................123456789.01234567890.12345678901234  */
  const char *content = "       L\"0123456789\" /* non-str */\n";
  lexer_test test (case_, content, NULL);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_WSTRING);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");

  /* Verify that cpp_interpret_string works, using CPP_WSTRING.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_WSTRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);
  /* The cpp_reader defaults to big-endian with
     CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
     now be encoded as UTF-32BE.
     Spot-check the first, a middle and the last digit, and the
     terminator that follows them.  */
  const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
  ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
  ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
  ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
  ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
  free (const_cast <unsigned char *> (dst_string.text));

  /* We don't yet support generating substring location information
     for L"" strings.  */
  ASSERT_HAS_NO_SUBSTRING_RANGES
    (test, tok->src_loc, type,
     "execution character set != source character set");
}
3006
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   (most significant byte first) and return it as a host-endian
   value.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes = (const unsigned char *)ptr_be_value;
  const uint16_t high = bytes[0];
  const uint16_t low = bytes[1];
  return (uint16_t) ((high << 8) | low);
}
3015
/* Lex a u"" string literal and verify that attempts to read substring
   location data from it fail gracefully, i.e. with an error message
   rather than with location data.  */

static void
test_lexer_string_locations_string16 (const line_table_case &case_)
{
  /* Digits 0-9.
     ....................000000000.11111111112.22222222233333
     ....................123456789.01234567890.12345678901234  */
  const char *content = "       u\"0123456789\" /* non-str */\n";
  lexer_test test (case_, content, NULL);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING16);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");

  /* Verify that cpp_interpret_string works, using CPP_STRING16.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING16;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);

  /* The cpp_reader defaults to big-endian, so dst_string should
     now be encoded as UTF-16BE.
     Spot-check the first, a middle and the last digit, and the
     terminator that follows them.  */
  const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
  ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
  ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
  ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
  ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
  free (const_cast <unsigned char *> (dst_string.text));

  /* We don't yet support generating substring location information
     for u"" strings.  */
  ASSERT_HAS_NO_SUBSTRING_RANGES
    (test, tok->src_loc, type,
     "execution character set != source character set");
}
3056
/* Lex a U"" string literal and verify that attempts to read substring
   location data from it fail gracefully, i.e. with an error message
   rather than with location data.  */

static void
test_lexer_string_locations_string32 (const line_table_case &case_)
{
  /* Digits 0-9.
     ....................000000000.11111111112.22222222233333
     ....................123456789.01234567890.12345678901234  */
  const char *content = "       U\"0123456789\" /* non-str */\n";
  lexer_test test (case_, content, NULL);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING32);
  ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");

  /* Verify that cpp_interpret_string works, using CPP_STRING32.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING32;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);

  /* The cpp_reader defaults to big-endian, so dst_string should
     now be encoded as UTF-32BE.
     Spot-check the first, a middle and the last digit, and the
     terminator that follows them.  */
  const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
  ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
  ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
  ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
  ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
  free (const_cast <unsigned char *> (dst_string.text));

  /* We don't yet support generating substring location information
     for U"" strings.  */
  ASSERT_HAS_NO_SUBSTRING_RANGES
    (test, tok->src_loc, type,
     "execution character set != source character set");
}
3097
3098 /* Lex a u8-string literal.
3099    Verify the substring location data after running cpp_interpret_string
3100    on it.  */
3101
3102 static void
3103 test_lexer_string_locations_u8 (const line_table_case &case_)
3104 {
3105   /* Digits 0-9.
3106      ....................000000000.11111111112.22222222233333
3107      ....................123456789.01234567890.12345678901234  */
3108   const char *content = "      u8\"0123456789\" /* non-str */\n";
3109   lexer_test test (case_, content, NULL);
3110
3111   /* Verify that we get the expected token back, with the correct
3112      location information.  */
3113   const cpp_token *tok = test.get_token ();
3114   ASSERT_EQ (tok->type, CPP_UTF8STRING);
3115   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3116
3117   /* Verify that cpp_interpret_string works.  */
3118   cpp_string dst_string;
3119   const enum cpp_ttype type = CPP_STRING;
3120   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3121                                       &dst_string, type);
3122   ASSERT_TRUE (result);
3123   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3124   free (const_cast <unsigned char *> (dst_string.text));
3125
3126   /* Verify ranges of individual characters.  This no longer includes the
3127      opening quote, but does include the closing quote.  */
3128   for (int i = 0; i <= 10; i++)
3129     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3130 }
3131
/* Lex a string literal containing UTF-8 source characters.
   Verify the result of cpp_interpret_string on it, and that the
   substring location data has one single-column range per byte of the
   literal's contents, plus one for the closing quote.  */

static void
test_lexer_string_locations_utf8_source (const line_table_case &case_)
{
  /* This string literal is written out to the source file as UTF-8,
     and is of the form "before mojibake after", where "mojibake"
     is written as the following four unicode code points:
        U+6587 CJK UNIFIED IDEOGRAPH-6587
        U+5B57 CJK UNIFIED IDEOGRAPH-5B57
        U+5316 CJK UNIFIED IDEOGRAPH-5316
        U+3051 HIRAGANA LETTER KE.
     Each of these is 3 bytes wide when encoded in UTF-8, whereas the
     "before" and "after" are 1 byte per unicode character.

     The numbering shown are "columns", which are *byte* numbers within
     the line, rather than unicode character numbers.

     .................... 000000000.1111111.
     .................... 123456789.0123456.  */
  const char *content = ("        \"before "
                         /* U+6587 CJK UNIFIED IDEOGRAPH-6587
                              UTF-8: 0xE6 0x96 0x87
                              C octal escaped UTF-8: \346\226\207
                            "column" numbers: 17-19.  */
                         "\346\226\207"

                         /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
                              UTF-8: 0xE5 0xAD 0x97
                              C octal escaped UTF-8: \345\255\227
                            "column" numbers: 20-22.  */
                         "\345\255\227"

                         /* U+5316 CJK UNIFIED IDEOGRAPH-5316
                              UTF-8: 0xE5 0x8C 0x96
                              C octal escaped UTF-8: \345\214\226
                            "column" numbers: 23-25.  */
                         "\345\214\226"

                         /* U+3051 HIRAGANA LETTER KE
                              UTF-8: 0xE3 0x81 0x91
                              C octal escaped UTF-8: \343\201\221
                            "column" numbers: 26-28.  */
                         "\343\201\221"

                         /* column numbers 29 onwards
                          2333333.33334444444444
                          9012345.67890123456789. */
                         " after\" /* non-str */\n");
  lexer_test test (case_, content, NULL);

  /* Verify that we get the expected token back, with the correct
     location information.  */
  const cpp_token *tok = test.get_token ();
  ASSERT_EQ (tok->type, CPP_STRING);
  ASSERT_TOKEN_AS_TEXT_EQ
    (test.m_parser, tok,
     "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");

  /* Verify that cpp_interpret_string works; the bytes are expected to
     come through unchanged.  */
  cpp_string dst_string;
  const enum cpp_ttype type = CPP_STRING;
  bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
                                      &dst_string, type);
  ASSERT_TRUE (result);
  ASSERT_STREQ
    ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
     (const char *)dst_string.text);
  free (const_cast <unsigned char *> (dst_string.text));

  /* Verify ranges of individual characters.  This no longer includes the
     opening quote, but does include the closing quote.
     Assuming that both source and execution encodings are UTF-8, we have
     a run of 25 octets in each (7 for "before ", 12 for the four 3-byte
     characters and 6 for " after"), plus the NUL terminator.  */
  for (int i = 0; i < 25; i++)
    ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
  /* NUL-terminator should use the closing quote at column 35.  */
  ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);

  ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
}
3215
3216 /* Test of string literal concatenation.  */
3217
3218 static void
3219 test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3220 {
3221   /* Digits 0-9.
3222      .....................000000000.111111.11112222222222
3223      .....................123456789.012345.67890123456789.  */
3224   const char *content = ("        \"01234\" /* non-str */\n"
3225                          "        \"56789\" /* non-str */\n");
3226   lexer_test test (case_, content, NULL);
3227
3228   location_t input_locs[2];
3229
3230   /* Verify that we get the expected tokens back.  */
3231   auto_vec <cpp_string> input_strings;
3232   const cpp_token *tok_a = test.get_token ();
3233   ASSERT_EQ (tok_a->type, CPP_STRING);
3234   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3235   input_strings.safe_push (tok_a->val.str);
3236   input_locs[0] = tok_a->src_loc;
3237
3238   const cpp_token *tok_b = test.get_token ();
3239   ASSERT_EQ (tok_b->type, CPP_STRING);
3240   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3241   input_strings.safe_push (tok_b->val.str);
3242   input_locs[1] = tok_b->src_loc;
3243
3244   /* Verify that cpp_interpret_string works.  */
3245   cpp_string dst_string;
3246   const enum cpp_ttype type = CPP_STRING;
3247   bool result = cpp_interpret_string (test.m_parser,
3248                                       input_strings.address (), 2,
3249                                       &dst_string, type);
3250   ASSERT_TRUE (result);
3251   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3252   free (const_cast <unsigned char *> (dst_string.text));
3253
3254   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3255   test.m_concats.record_string_concatenation (2, input_locs);
3256
3257   location_t initial_loc = input_locs[0];
3258
3259   /* "01234" on line 1.  */
3260   for (int i = 0; i <= 4; i++)
3261     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3262   /* "56789" in line 2, plus its closing quote for the nul terminator.  */
3263   for (int i = 5; i <= 10; i++)
3264     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3265
3266   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3267 }
3268
3269 /* Another test of string literal concatenation.  */
3270
3271 static void
3272 test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3273 {
3274   /* Digits 0-9.
3275      .....................000000000.111.11111112222222
3276      .....................123456789.012.34567890123456.  */
3277   const char *content = ("        \"01\" /* non-str */\n"
3278                          "        \"23\" /* non-str */\n"
3279                          "        \"45\" /* non-str */\n"
3280                          "        \"67\" /* non-str */\n"
3281                          "        \"89\" /* non-str */\n");
3282   lexer_test test (case_, content, NULL);
3283
3284   auto_vec <cpp_string> input_strings;
3285   location_t input_locs[5];
3286
3287   /* Verify that we get the expected tokens back.  */
3288   for (int i = 0; i < 5; i++)
3289     {
3290       const cpp_token *tok = test.get_token ();
3291       ASSERT_EQ (tok->type, CPP_STRING);
3292       input_strings.safe_push (tok->val.str);
3293       input_locs[i] = tok->src_loc;
3294     }
3295
3296   /* Verify that cpp_interpret_string works.  */
3297   cpp_string dst_string;
3298   const enum cpp_ttype type = CPP_STRING;
3299   bool result = cpp_interpret_string (test.m_parser,
3300                                       input_strings.address (), 5,
3301                                       &dst_string, type);
3302   ASSERT_TRUE (result);
3303   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3304   free (const_cast <unsigned char *> (dst_string.text));
3305
3306   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3307   test.m_concats.record_string_concatenation (5, input_locs);
3308
3309   location_t initial_loc = input_locs[0];
3310
3311   /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3312      detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3313      and expect get_source_range_for_substring to fail.
3314      However, for a string concatenation test, we can have a case
3315      where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3316      but subsequent strings can be after it.
3317      Attempting to detect this within assert_char_at_range
3318      would overcomplicate the logic for the common test cases, so
3319      we detect it here.  */
3320   if (should_have_column_data_p (input_locs[0])
3321       && !should_have_column_data_p (input_locs[4]))
3322     {
3323       /* Verify that get_source_range_for_substring gracefully rejects
3324          this case.  */
3325       source_range actual_range;
3326       const char *err
3327         = get_source_range_for_char (test.m_parser, &test.m_concats,
3328                                      initial_loc, type, 0, &actual_range);
3329       ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3330       return;
3331     }
3332
3333   for (int i = 0; i < 5; i++)
3334     for (int j = 0; j < 2; j++)
3335       ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3336                             i + 1, 10 + j, 10 + j);
3337
3338   /* NUL-terminator should use the final closing quote at line 5 column 12.  */
3339   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3340
3341   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3342 }
3343
3344 /* Another test of string literal concatenation, this time combined with
3345    various kinds of escaped characters.  */
3346
3347 static void
3348 test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3349 {
3350   /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3351      digit 6 in ASCII as octal "\066", concatenating multiple strings.  */
3352   const char *content
3353     /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3354        .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3355     = ("        \"01234\"  \"\\x35\"  \"\\066\"  \"789\" /* non-str */\n");
3356   lexer_test test (case_, content, NULL);
3357
3358   auto_vec <cpp_string> input_strings;
3359   location_t input_locs[4];
3360
3361   /* Verify that we get the expected tokens back.  */
3362   for (int i = 0; i < 4; i++)
3363     {
3364       const cpp_token *tok = test.get_token ();
3365       ASSERT_EQ (tok->type, CPP_STRING);
3366       input_strings.safe_push (tok->val.str);
3367       input_locs[i] = tok->src_loc;
3368     }
3369
3370   /* Verify that cpp_interpret_string works.  */
3371   cpp_string dst_string;
3372   const enum cpp_ttype type = CPP_STRING;
3373   bool result = cpp_interpret_string (test.m_parser,
3374                                       input_strings.address (), 4,
3375                                       &dst_string, type);
3376   ASSERT_TRUE (result);
3377   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3378   free (const_cast <unsigned char *> (dst_string.text));
3379
3380   /* Simulate c-lex.c's lex_string in order to record concatenation.  */
3381   test.m_concats.record_string_concatenation (4, input_locs);
3382
3383   location_t initial_loc = input_locs[0];
3384
3385   for (int i = 0; i <= 4; i++)
3386     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3387   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3388   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3389   for (int i = 7; i <= 9; i++)
3390     ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3391
3392   /* NUL-terminator should use the location of the final closing quote.  */
3393   ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3394
3395   ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3396 }
3397
3398 /* Test of string literal in a macro.  */
3399
3400 static void
3401 test_lexer_string_locations_macro (const line_table_case &case_)
3402 {
3403   /* Digits 0-9.
3404      .....................0000000001111111111.22222222223.
3405      .....................1234567890123456789.01234567890.  */
3406   const char *content = ("#define MACRO     \"0123456789\" /* non-str */\n"
3407                          "  MACRO");
3408   lexer_test test (case_, content, NULL);
3409
3410   /* Verify that we get the expected tokens back.  */
3411   const cpp_token *tok = test.get_token ();
3412   ASSERT_EQ (tok->type, CPP_PADDING);
3413
3414   tok = test.get_token ();
3415   ASSERT_EQ (tok->type, CPP_STRING);
3416   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3417
3418   /* Verify ranges of individual characters.  We ought to
3419      see columns within the macro definition.  */
3420   for (int i = 0; i <= 10; i++)
3421     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3422                           i, 1, 20 + i, 20 + i);
3423
3424   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3425
3426   tok = test.get_token ();
3427   ASSERT_EQ (tok->type, CPP_PADDING);
3428 }
3429
3430 /* Test of stringification of a macro argument.  */
3431
3432 static void
3433 test_lexer_string_locations_stringified_macro_argument
3434   (const line_table_case &case_)
3435 {
3436   /* .....................000000000111111111122222222223.
3437      .....................123456789012345678901234567890.  */
3438   const char *content = ("#define MACRO(X) #X /* non-str */\n"
3439                          "MACRO(foo)\n");
3440   lexer_test test (case_, content, NULL);
3441
3442   /* Verify that we get the expected token back.  */
3443   const cpp_token *tok = test.get_token ();
3444   ASSERT_EQ (tok->type, CPP_PADDING);
3445
3446   tok = test.get_token ();
3447   ASSERT_EQ (tok->type, CPP_STRING);
3448   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3449
3450   /* We don't support getting the location of a stringified macro
3451      argument.  Verify that it fails gracefully.  */
3452   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3453                                   "cpp_interpret_string_1 failed");
3454
3455   tok = test.get_token ();
3456   ASSERT_EQ (tok->type, CPP_PADDING);
3457
3458   tok = test.get_token ();
3459   ASSERT_EQ (tok->type, CPP_PADDING);
3460 }
3461
3462 /* Ensure that we are fail gracefully if something attempts to pass
3463    in a location that isn't a string literal token.  Seen on this code:
3464
3465      const char a[] = " %d ";
3466      __builtin_printf (a, 0.5);
3467                        ^
3468
3469    when c-format.c erroneously used the indicated one-character
3470    location as the format string location, leading to a read past the
3471    end of a string buffer in cpp_interpret_string_1.  */
3472
3473 static void
3474 test_lexer_string_locations_non_string (const line_table_case &case_)
3475 {
3476   /* .....................000000000111111111122222222223.
3477      .....................123456789012345678901234567890.  */
3478   const char *content = ("         a\n");
3479   lexer_test test (case_, content, NULL);
3480
3481   /* Verify that we get the expected token back.  */
3482   const cpp_token *tok = test.get_token ();
3483   ASSERT_EQ (tok->type, CPP_NAME);
3484   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3485
3486   /* At this point, libcpp is attempting to interpret the name as a
3487      string literal, despite it not starting with a quote.  We don't detect
3488      that, but we should at least fail gracefully.  */
3489   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3490                                   "cpp_interpret_string_1 failed");
3491 }
3492
3493 /* Ensure that we can read substring information for a token which
3494    starts in one linemap and ends in another .  Adapted from
3495    gcc.dg/cpp/pr69985.c.  */
3496
3497 static void
3498 test_lexer_string_locations_long_line (const line_table_case &case_)
3499 {
3500   /* .....................000000.000111111111
3501      .....................123456.789012346789.  */
3502   const char *content = ("/* A very long line, so that we start a new line map.  */\n"
3503                          "     \"0123456789012345678901234567890123456789"
3504                          "0123456789012345678901234567890123456789"
3505                          "0123456789012345678901234567890123456789"
3506                          "0123456789\"\n");
3507
3508   lexer_test test (case_, content, NULL);
3509
3510   /* Verify that we get the expected token back.  */
3511   const cpp_token *tok = test.get_token ();
3512   ASSERT_EQ (tok->type, CPP_STRING);
3513
3514   if (!should_have_column_data_p (line_table->highest_location))
3515     return;
3516
3517   /* Verify ranges of individual characters.  */
3518   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3519   for (int i = 0; i < 131; i++)
3520     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3521                           i, 2, 7 + i, 7 + i);
3522 }
3523
3524 /* Test of locations within a raw string that doesn't contain a newline.  */
3525
3526 static void
3527 test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3528 {
3529   /* .....................00.0000000111111111122.
3530      .....................12.3456789012345678901.  */
3531   const char *content = ("R\"foo(0123456789)foo\"\n");
3532   lexer_test test (case_, content, NULL);
3533
3534   /* Verify that we get the expected token back.  */
3535   const cpp_token *tok = test.get_token ();
3536   ASSERT_EQ (tok->type, CPP_STRING);
3537
3538   /* Verify that cpp_interpret_string works.  */
3539   cpp_string dst_string;
3540   const enum cpp_ttype type = CPP_STRING;
3541   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3542                                       &dst_string, type);
3543   ASSERT_TRUE (result);
3544   ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3545   free (const_cast <unsigned char *> (dst_string.text));
3546
3547   if (!should_have_column_data_p (line_table->highest_location))
3548     return;
3549
3550   /* 0-9, plus the nil terminator.  */
3551   ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3552   for (int i = 0; i < 11; i++)
3553     ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3554                           i, 1, 7 + i, 7 + i);
3555 }
3556
3557 /* Test of locations within a raw string that contains a newline.  */
3558
3559 static void
3560 test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3561 {
3562   /* .....................00.0000.
3563      .....................12.3456.  */
3564   const char *content = ("R\"foo(\n"
3565   /* .....................00000.
3566      .....................12345.  */
3567                          "hello\n"
3568                          "world\n"
3569   /* .....................00000.
3570      .....................12345.  */
3571                          ")foo\"\n");
3572   lexer_test test (case_, content, NULL);
3573
3574   /* Verify that we get the expected token back.  */
3575   const cpp_token *tok = test.get_token ();
3576   ASSERT_EQ (tok->type, CPP_STRING);
3577
3578   /* Verify that cpp_interpret_string works.  */
3579   cpp_string dst_string;
3580   const enum cpp_ttype type = CPP_STRING;
3581   bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3582                                       &dst_string, type);
3583   ASSERT_TRUE (result);
3584   ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3585   free (const_cast <unsigned char *> (dst_string.text));
3586
3587   if (!should_have_column_data_p (line_table->highest_location))
3588     return;
3589
3590   /* Currently we don't support locations within raw strings that
3591      contain newlines.  */
3592   ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3593                                   "range endpoints are on different lines");
3594 }
3595
3596 /* Test of parsing an unterminated raw string.  */
3597
3598 static void
3599 test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3600 {
3601   const char *content = "R\"ouch()ouCh\" /* etc */";
3602
3603   lexer_diagnostic_sink diagnostics;
3604   lexer_test test (case_, content, &diagnostics);
3605   test.m_implicitly_expect_EOF = false;
3606
3607   /* Attempt to parse the raw string.  */
3608   const cpp_token *tok = test.get_token ();
3609   ASSERT_EQ (tok->type, CPP_EOF);
3610
3611   ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3612   /* We expect the message "unterminated raw string"
3613      in the "cpplib" translation domain.
3614      It's not clear that dgettext is available on all supported hosts,
3615      so this assertion is commented-out for now.
3616        ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3617                      diagnostics.m_diagnostics[0]);
3618   */
3619 }
3620
3621 /* Test of lexing char constants.  */
3622
3623 static void
3624 test_lexer_char_constants (const line_table_case &case_)
3625 {
3626   /* Various char constants.
3627      .....................0000000001111111111.22222222223.
3628      .....................1234567890123456789.01234567890.  */
3629   const char *content = ("         'a'\n"
3630                          "        u'a'\n"
3631                          "        U'a'\n"
3632                          "        L'a'\n"
3633                          "         'abc'\n");
3634   lexer_test test (case_, content, NULL);
3635
3636   /* Verify that we get the expected tokens back.  */
3637   /* 'a'.  */
3638   const cpp_token *tok = test.get_token ();
3639   ASSERT_EQ (tok->type, CPP_CHAR);
3640   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3641
3642   unsigned int chars_seen;
3643   int unsignedp;
3644   cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3645                                           &chars_seen, &unsignedp);
3646   ASSERT_EQ (cc, 'a');
3647   ASSERT_EQ (chars_seen, 1);
3648
3649   /* u'a'.  */
3650   tok = test.get_token ();
3651   ASSERT_EQ (tok->type, CPP_CHAR16);
3652   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3653
3654   /* U'a'.  */
3655   tok = test.get_token ();
3656   ASSERT_EQ (tok->type, CPP_CHAR32);
3657   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3658
3659   /* L'a'.  */
3660   tok = test.get_token ();
3661   ASSERT_EQ (tok->type, CPP_WCHAR);
3662   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3663
3664   /* 'abc' (c-char-sequence).  */
3665   tok = test.get_token ();
3666   ASSERT_EQ (tok->type, CPP_CHAR);
3667   ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3668 }
3669 /* A table of interesting location_t values, giving one axis of our test
3670    matrix.  */
3671
3672 static const location_t boundary_locations[] = {
3673   /* Zero means "don't override the default values for a new line_table".  */
3674   0,
3675
3676   /* An arbitrary non-zero value that isn't close to one of
3677      the boundary values below.  */
3678   0x10000,
3679
3680   /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES.  */
3681   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
3682   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
3683   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
3684   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
3685   LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,
3686
3687   /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS.  */
3688   LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
3689   LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
3690   LINE_MAP_MAX_LOCATION_WITH_COLS,
3691   LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
3692   LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
3693 };
3694
3695 /* Run TESTCASE multiple times, once for each case in our test matrix.  */
3696
3697 void
3698 for_each_line_table_case (void (*testcase) (const line_table_case &))
3699 {
3700   /* As noted above in the description of struct line_table_case,
3701      we want to explore a test matrix of interesting line_table
3702      situations, running various selftests for each case within the
3703      matrix.  */
3704
3705   /* Run all tests with:
3706      (a) line_table->default_range_bits == 0, and
3707      (b) line_table->default_range_bits == 5.  */
3708   int num_cases_tested = 0;
3709   for (int default_range_bits = 0; default_range_bits <= 5;
3710        default_range_bits += 5)
3711     {
3712       /* ...and use each of the "interesting" location values as
3713          the starting location within line_table.  */
3714       const int num_boundary_locations
3715         = sizeof (boundary_locations) / sizeof (boundary_locations[0]);
3716       for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3717         {
3718           line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3719
3720           testcase (c);
3721
3722           num_cases_tested++;
3723         }
3724     }
3725
3726   /* Verify that we fully covered the test matrix.  */
3727   ASSERT_EQ (num_cases_tested, 2 * 12);
3728 }
3729
3730 /* Verify that when presented with a consecutive pair of locations with
3731    a very large line offset, we don't attempt to consolidate them into
3732    a single ordinary linemap where the line offsets within the line map
3733    would lead to overflow (PR lto/88147).  */
3734
3735 static void
3736 test_line_offset_overflow ()
3737 {
3738   line_table_test ltt (line_table_case (5, 0));
3739
3740   linemap_add (line_table, LC_ENTER, false, "foo.c", 0);
3741   linemap_line_start (line_table, 1, 100);
3742   location_t loc_a = linemap_line_start (line_table, 2578, 255);
3743   assert_loceq ("foo.c", 2578, 0, loc_a);
3744
3745   const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3746   ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3747   ASSERT_EQ (ordmap_a->m_range_bits, 5);
3748
3749   location_t loc_b = linemap_line_start (line_table, 404198, 512);
3750   assert_loceq ("foo.c", 404198, 0, loc_b);
3751
3752   /* We should have started a new linemap, rather than attempting to store
3753      a very large line offset.  */
3754   const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (line_table);
3755   ASSERT_NE (ordmap_a, ordmap_b);
3756 }
3757
3758 void test_cpp_utf8 ()
3759 {
3760   const int def_tabstop = 8;
3761   /* Verify that wcwidth of invalid UTF-8 or control bytes is 1.  */
3762   {
3763     int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8, def_tabstop);
3764     ASSERT_EQ (8, w_bad);
3765     int w_ctrl = cpp_display_width ("\r\n\v\0\1", 5, def_tabstop);
3766     ASSERT_EQ (5, w_ctrl);
3767   }
3768
3769   /* Verify that wcwidth of valid UTF-8 is as expected.  */
3770   {
3771     const int w_pi = cpp_display_width ("\xcf\x80", 2, def_tabstop);
3772     ASSERT_EQ (1, w_pi);
3773     const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4, def_tabstop);
3774     ASSERT_EQ (2, w_emoji);
3775     const int w_umlaut_precomposed = cpp_display_width ("\xc3\xbf", 2,
3776                                                         def_tabstop);
3777     ASSERT_EQ (1, w_umlaut_precomposed);
3778     const int w_umlaut_combining = cpp_display_width ("y\xcc\x88", 3,
3779                                                       def_tabstop);
3780     ASSERT_EQ (1, w_umlaut_combining);
3781     const int w_han = cpp_display_width ("\xe4\xb8\xba", 3, def_tabstop);
3782     ASSERT_EQ (2, w_han);
3783     const int w_ascii = cpp_display_width ("GCC", 3, def_tabstop);
3784     ASSERT_EQ (3, w_ascii);
3785     const int w_mixed = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
3786                                            "\x9f! \xe4\xb8\xba y\xcc\x88",
3787                                            24, def_tabstop);
3788     ASSERT_EQ (18, w_mixed);
3789   }
3790
3791   /* Verify that display width properly expands tabs.  */
3792   {
3793     const char *tstr = "\tabc\td";
3794     ASSERT_EQ (6, cpp_display_width (tstr, 6, 1));
3795     ASSERT_EQ (10, cpp_display_width (tstr, 6, 3));
3796     ASSERT_EQ (17, cpp_display_width (tstr, 6, 8));
3797     ASSERT_EQ (1, cpp_display_column_to_byte_column (tstr, 6, 7, 8));
3798   }
3799
3800   /* Verify that cpp_byte_column_to_display_column can go past the end,
3801      and similar edge cases.  */
3802   {
3803     const char *str
3804       /* Display columns.
3805          111111112345  */
3806       = "\xcf\x80 abc";
3807       /* 111122223456
3808          Byte columns.  */
3809
3810     ASSERT_EQ (5, cpp_display_width (str, 6, def_tabstop));
3811     ASSERT_EQ (105,
3812                cpp_byte_column_to_display_column (str, 6, 106, def_tabstop));
3813     ASSERT_EQ (10000,
3814                cpp_byte_column_to_display_column (NULL, 0, 10000, def_tabstop));
3815     ASSERT_EQ (0,
3816                cpp_byte_column_to_display_column (NULL, 10000, 0, def_tabstop));
3817   }
3818
3819   /* Verify that cpp_display_column_to_byte_column can go past the end,
3820      and similar edge cases, and check invertibility.  */
3821   {
3822     const char *str
3823       /* Display columns.
3824          000000000000000000000000000000000000011
3825          111111112222222234444444455555555678901  */
3826       = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
3827       /* 000000000000000000000000000000000111111
3828          111122223333444456666777788889999012345
3829          Byte columns.  */
3830     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, def_tabstop));
3831     ASSERT_EQ (15,
3832                cpp_display_column_to_byte_column (str, 15, 11, def_tabstop));
3833     ASSERT_EQ (115,
3834                cpp_display_column_to_byte_column (str, 15, 111, def_tabstop));
3835     ASSERT_EQ (10000,
3836                cpp_display_column_to_byte_column (NULL, 0, 10000, def_tabstop));
3837     ASSERT_EQ (0,
3838                cpp_display_column_to_byte_column (NULL, 10000, 0, def_tabstop));
3839
3840     /* Verify that we do not interrupt a UTF-8 sequence.  */
3841     ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, def_tabstop));
3842
3843     for (int byte_col = 1; byte_col <= 15; ++byte_col)
3844       {
3845         const int disp_col
3846           = cpp_byte_column_to_display_column (str, 15, byte_col, def_tabstop);
3847         const int byte_col2
3848           = cpp_display_column_to_byte_column (str, 15, disp_col, def_tabstop);
3849
3850         /* If we ask for the display column in the middle of a UTF-8
3851            sequence, it will return the length of the partial sequence,
3852            matching the behavior of GCC before display column support.
3853            Otherwise check the round trip was successful.  */
3854         if (byte_col < 4)
3855           ASSERT_EQ (byte_col, disp_col);
3856         else if (byte_col >= 6 && byte_col < 9)
3857           ASSERT_EQ (3 + (byte_col - 5), disp_col);
3858         else
3859           ASSERT_EQ (byte_col2, byte_col);
3860       }
3861   }
3862
3863 }
3864
3865 /* Run all of the selftests within this file.  */
3866
3867 void
3868 input_c_tests ()
3869 {
3870   test_linenum_comparisons ();
3871   test_should_have_column_data_p ();
3872   test_unknown_location ();
3873   test_builtins ();
3874   for_each_line_table_case (test_make_location_nonpure_range_endpoints);
3875
3876   for_each_line_table_case (test_accessing_ordinary_linemaps);
3877   for_each_line_table_case (test_lexer);
3878   for_each_line_table_case (test_lexer_string_locations_simple);
3879   for_each_line_table_case (test_lexer_string_locations_ebcdic);
3880   for_each_line_table_case (test_lexer_string_locations_hex);
3881   for_each_line_table_case (test_lexer_string_locations_oct);
3882   for_each_line_table_case (test_lexer_string_locations_letter_escape_1);
3883   for_each_line_table_case (test_lexer_string_locations_letter_escape_2);
3884   for_each_line_table_case (test_lexer_string_locations_ucn4);
3885   for_each_line_table_case (test_lexer_string_locations_ucn8);
3886   for_each_line_table_case (test_lexer_string_locations_wide_string);
3887   for_each_line_table_case (test_lexer_string_locations_string16);
3888   for_each_line_table_case (test_lexer_string_locations_string32);
3889   for_each_line_table_case (test_lexer_string_locations_u8);
3890   for_each_line_table_case (test_lexer_string_locations_utf8_source);
3891   for_each_line_table_case (test_lexer_string_locations_concatenation_1);
3892   for_each_line_table_case (test_lexer_string_locations_concatenation_2);
3893   for_each_line_table_case (test_lexer_string_locations_concatenation_3);
3894   for_each_line_table_case (test_lexer_string_locations_macro);
3895   for_each_line_table_case (test_lexer_string_locations_stringified_macro_argument);
3896   for_each_line_table_case (test_lexer_string_locations_non_string);
3897   for_each_line_table_case (test_lexer_string_locations_long_line);
3898   for_each_line_table_case (test_lexer_string_locations_raw_string_one_line);
3899   for_each_line_table_case (test_lexer_string_locations_raw_string_multiline);
3900   for_each_line_table_case (test_lexer_string_locations_raw_string_unterminated);
3901   for_each_line_table_case (test_lexer_char_constants);
3902
3903   test_reading_source_line ();
3904
3905   test_line_offset_overflow ();
3906
3907   test_cpp_utf8 ();
3908 }
3909
3910 } // namespace selftest
3911
3912 #endif /* CHECKING_P */