context->opt_permissive = OPT_fpermissive;
}
+/* Input charset configuration for diagnostics. */
+static const char *
+c_common_input_charset_cb (const char * /*filename*/)
+{
+ const char *cs = cpp_opts->input_charset;
+ return cpp_input_conversion_is_trivial (cs) ? nullptr : cs;
+}
+
/* Whether options from all C-family languages should be accepted
quietly. */
static bool accept_all_c_family_options = false;
cpp_post_options (parse_in);
init_global_opts_from_cpp (&global_options, cpp_get_options (parse_in));
+ /* Let the diagnostics infrastructure know how to convert input files the same
+ way libcpp will do it, namely using the configured input charset and
+ skipping a UTF-8 BOM if present. */
+ diagnostic_initialize_input_context (global_dc,
+ c_common_input_charset_cb, true);
input_location = UNKNOWN_LOCATION;
*pfilename = this_input_filename
struct diagnostic_context;
class pretty_printer;
class diagnostic_event_id_t;
+typedef const char * (*diagnostic_input_charset_callback)(const char *);
template<typename T> struct array_traits;
#include "output.h"
#include "print-tree.h"
#include "debug.h"
+#include "input.h"
#include "d-tree.h"
#include "id.h"
return CL_D;
}
+/* Implements input charset and BOM skipping configuration for
+ diagnostics. */
+static const char *
+d_input_charset_callback (const char * /*filename*/)
+{
+ /* TODO: The input charset is automatically determined by code in
+ dmd/dmodule.c based on the contents of the file. If this detection
+ logic were factored out and could be reused here, then we would be able
+ to return UTF-16 or UTF-32 as needed. For now, we always return
+ NULL, which means no conversion is necessary, i.e. the input is assumed
+ to be UTF-8 when diagnostics read this file. */
+ return nullptr;
+}
+
/* Implements the lang_hooks.init routine for language D. */
static bool
Expression::_init ();
Objc::_init ();
+ /* Diagnostics input init, to enable BOM skipping and
+ input charset conversion. */
+ diagnostic_initialize_input_context (global_dc,
+ d_input_charset_callback, true);
+
/* Back-end init. */
global_binding_level = ggc_cleared_alloc <binding_level> ();
current_binding_level = global_binding_level;
= determine_url_format ((diagnostic_url_rule_t) value);
}
+/* Create the file_cache, if not already created, and tell it how to
+ translate files on input. */
+void diagnostic_initialize_input_context (diagnostic_context *context,
+ diagnostic_input_charset_callback ccb,
+ bool should_skip_bom)
+{
+ if (!context->m_file_cache)
+ context->m_file_cache = new file_cache;
+ context->m_file_cache->initialize_input_context (ccb, should_skip_bom);
+}
+
/* Do any cleaning up required after the last diagnostic is emitted. */
void
diagnostic_t diagnostic_kind);
extern void diagnostic_show_any_path (diagnostic_context *, diagnostic_info *);
+/* Because the diagnostics infrastructure reads source files a second time,
+ after the frontend has already read them, it needs to know how the frontend
+ handled character set conversion and UTF-8 BOM stripping so that both reads
+ see the same content. Each frontend that requires non-default behavior must
+ call this function to tell the diagnostics infrastructure how input is to
+ be processed. The default behavior is to do no conversion and not to
+ strip a UTF-8 BOM.
+
+ The callback should return the input charset to be used to convert the given
+ file's contents to UTF-8, or it should return NULL if no conversion is needed
+ for this file. SHOULD_SKIP_BOM only applies in case no conversion was
+ performed, and if true, it will cause a UTF-8 BOM to be skipped at the
+ beginning of the file. (In case a conversion was performed, the BOM is
+ rather skipped as part of the conversion process.) */
+
+void diagnostic_initialize_input_context (diagnostic_context *context,
+ diagnostic_input_charset_callback ccb,
+ bool should_skip_bom);
+
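For illustration, a frontend whose input is already UTF-8 could register itself along these lines (a minimal sketch; the mylang_* names are hypothetical, while the C-family and D frontends in this patch follow the same pattern):

    /* Hypothetical callback: no conversion is ever needed, so return NULL for
       every file; only UTF-8 BOM skipping is requested at registration time.  */
    static const char *
    mylang_input_charset_cb (const char * /*filename*/)
    {
      return nullptr;
    }

    /* Called once from the frontend's post-options/init hook.  */
    static void
    mylang_register_input_context ()
    {
      diagnostic_initialize_input_context (global_dc,
                                           mylang_input_charset_cb, true);
    }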
/* Force diagnostics controlled by OPTIDX to be kind KIND. */
extern diagnostic_t diagnostic_classify_diagnostic (diagnostic_context *,
int /* optidx */,
cpp_post_options (cpp_in);
+
+ /* Let the diagnostics infrastructure know how to convert input files the
+ same way libcpp will do it, namely with no charset conversion but skipping
+ a UTF-8 BOM if present. */
+ diagnostic_initialize_input_context (global_dc, nullptr, true);
+
gfc_cpp_register_include_paths ();
}
#include "coretypes.h"
#include "intl.h"
#include "diagnostic.h"
-#include "diagnostic-core.h"
#include "selftest.h"
#include "cpplib.h"
#define HAVE_ICONV 0
#endif
+/* Input charset configuration. */
+static const char *
+default_charset_callback (const char *)
+{
+ return nullptr;
+}
+
+void
+file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
+ bool should_skip_bom)
+{
+ in_context.ccb = (ccb ? ccb : default_charset_callback);
+ in_context.should_skip_bom = should_skip_bom;
+}
+
/* This is a cache used by get_next_line to store the content of a
file to be searched for file lines. */
class file_cache_slot
void inc_use_count () { m_use_count++; }
- void create (const char *file_path, FILE *fp, unsigned highest_use_count);
+ bool create (const file_cache::input_context &in_context,
+ const char *file_path, FILE *fp, unsigned highest_use_count);
void evict ();
private:
far. */
char *m_data;
+ /* The allocated buffer to be freed may start a little earlier than DATA,
+ e.g. if a UTF-8 BOM was skipped at the beginning. */
+ int m_alloc_offset;
+
/* The size of the DATA array above.*/
size_t m_size;
doesn't explode. We thus scale total_lines down to
line_record_size. */
vec<line_info, va_heap> m_line_record;
+
+ void offset_buffer (int offset)
+ {
+ gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
+ : (size_t) offset <= m_size);
+ gcc_assert (m_data);
+ m_alloc_offset += offset;
+ m_data += offset;
+ m_size -= offset;
+ }
+
};
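To make the offset bookkeeping concrete, here is a sketch of how the slot uses offset_buffer when a 3-byte UTF-8 BOM is present (it mirrors the calls in create () and the free/resize paths below; the numbers assume a BOM was actually found):

    /* m_data holds a freshly read buffer of m_nb_read bytes.  */
    int offset = cpp_check_utf8_bom (m_data, m_nb_read);  /* 3 when a BOM is present.  */
    offset_buffer (offset);   /* m_data += 3, m_size -= 3, m_alloc_offset becomes 3.  */
    m_nb_read -= offset;

    /* ... later, before freeing or reallocating the underlying buffer ...  */
    offset_buffer (-m_alloc_offset);  /* m_data is again the pointer that was allocated.  */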
/* Current position in real source file. */
unsigned highest_use_count = 0;
file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
- r->create (file_path, fp, highest_use_count);
+ if (!r->create (in_context, file_path, fp, highest_use_count))
+ return NULL;
return r;
}
/* Populate this slot for use on FILE_PATH and FP, dropping any
existing cached content within it. */
-void
-file_cache_slot::create (const char *file_path, FILE *fp,
+bool
+file_cache_slot::create (const file_cache::input_context &in_context,
+ const char *file_path, FILE *fp,
unsigned highest_use_count)
{
m_file_path = file_path;
if (m_fp)
fclose (m_fp);
m_fp = fp;
+ if (m_alloc_offset)
+ offset_buffer (-m_alloc_offset);
m_nb_read = 0;
m_line_start_idx = 0;
m_line_num = 0;
m_use_count = ++highest_use_count;
m_total_lines = total_lines_num (file_path);
m_missing_trailing_newline = true;
+
+ /* Check the input configuration to determine if we need to do any
+ transformations, such as charset conversion or BOM skipping. */
+ if (const char *input_charset = in_context.ccb (file_path))
+ {
+ /* Need a full-blown conversion of the input charset. */
+ fclose (m_fp);
+ m_fp = NULL;
+ const cpp_converted_source cs
+ = cpp_get_converted_source (file_path, input_charset);
+ if (!cs.data)
+ return false;
+ if (m_data)
+ XDELETEVEC (m_data);
+ m_data = cs.data;
+ m_nb_read = m_size = cs.len;
+ m_alloc_offset = cs.data - cs.to_free;
+ }
+ else if (in_context.should_skip_bom)
+ {
+ if (read_data ())
+ {
+ const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
+ offset_buffer (offset);
+ m_nb_read -= offset;
+ }
+ }
+
+ return true;
}
/* file_cache's ctor. */
file_cache::file_cache ()
: m_file_slots (new file_cache_slot[num_file_slots])
{
+ initialize_input_context (nullptr, false);
}
/* file_cache's dtor. */
file_cache_slot::file_cache_slot ()
: m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
- m_size (0), m_nb_read (0), m_line_start_idx (0), m_line_num (0),
- m_total_lines (0), m_missing_trailing_newline (true)
+ m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
+ m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
{
m_line_record.create (0);
}
}
if (m_data)
{
+ offset_buffer (-m_alloc_offset);
XDELETEVEC (m_data);
m_data = 0;
}
bool
file_cache_slot::needs_read_p () const
{
- return (m_nb_read == 0
+ return m_fp && (m_nb_read == 0
|| m_nb_read == m_size
|| (m_line_start_idx >= m_nb_read - 1));
}
if (!needs_grow_p ())
return;
- size_t size = m_size == 0 ? buffer_size : m_size * 2;
- m_data = XRESIZEVEC (char, m_data, size);
- m_size = size;
+ if (!m_data)
+ {
+ gcc_assert (m_size == 0 && m_alloc_offset == 0);
+ m_size = buffer_size;
+ m_data = XNEWVEC (char, m_size);
+ }
+ else
+ {
+ const int offset = m_alloc_offset;
+ offset_buffer (-offset);
+ m_size *= 2;
+ m_data = XRESIZEVEC (char, m_data, m_size);
+ offset_buffer (offset);
+ }
}
/* Read more data into the cache. Extends the cache if need be.
m_missing_trailing_newline = false;
}
- if (ferror (m_fp))
+ if (m_fp && ferror (m_fp))
return false;
 /* At this point, we've found the end of the line. It either
file_cache_slot *lookup_or_add_file (const char *file_path);
void forcibly_evict_file (const char *file_path);
+ /* See comments in diagnostic.h about the input conversion context. */
+ struct input_context
+ {
+ diagnostic_input_charset_callback ccb;
+ bool should_skip_bom;
+ };
+ void initialize_input_context (diagnostic_input_charset_callback ccb,
+ bool should_skip_bom);
+
private:
file_cache_slot *evicted_cache_tab_entry (unsigned *highest_use_count);
file_cache_slot *add_file (const char *file_path);
private:
static const size_t num_file_slots = 16;
file_cache_slot *m_file_slots;
+ input_context in_context;
};
extern expanded_location
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-iconv "CP850" } */
+/* { dg-options "-finput-charset=CP850 -fdiagnostics-show-caret" } */
+
+/* Test that diagnostics are converted to UTF-8; this file is encoded in
+ CP850. Why CP850? -finput-charset only supports encodings that are a
+ superset of ASCII. But encodings that look like latin-1 are automatically
+ converted by expect to UTF-8, and hence by the time dg sees them, it can't
+ verify they were actually output in UTF-8. So codepage 850 was chosen as one
+ that is hopefully available and meets the requirements of matching ASCII and
+ not matching latin-1. */
+const char *section = "õ"
+/* { dg-error "expected .* at end of input" "" { target *-*-*} .-1 } */
+/* { dg-begin-multiline-output "" }
+ const char *section = "§"
+ ^~~~~
+ { dg-end-multiline-output "" } */
--- /dev/null
+int 1;
+/* { dg-do compile } */
+/* { dg-options "-fdiagnostics-show-caret" } */
+
+/* This file begins with a UTF-8 byte order mark. Verify that diagnostics
+ still point to the right place, since the stripping of the BOM happens twice,
+ once when libcpp reads the file, and once when diagnostics infrastucture
+ reads it. */
+
+/* { dg-error "expected .* before numeric constant" "" { target *-*-*} 1 } */
+/* { dg-begin-multiline-output "" }
+ int 1;
+ ^
+ { dg-end-multiline-output "" } */
cset_converter structure for conversion from FROM to TO. If
iconv_open() fails, issue an error and return an identity
converter. Silently return an identity converter if FROM and TO
- are identical. */
+ are identical.
+
+ PFILE is only used for generating diagnostics; setting it to NULL
+ suppresses diagnostics. */
+
static struct cset_converter
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
{
if (ret.cd == (iconv_t) -1)
{
- if (errno == EINVAL)
- cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
- "conversion from %s to %s not supported by iconv",
- from, to);
- else
- cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
-
+ if (pfile)
+ {
+ if (errno == EINVAL)
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
+ "conversion from %s to %s not supported by iconv",
+ from, to);
+ else
+ cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
+ }
ret.func = convert_no_conversion;
}
}
else
{
- cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
- "no iconv implementation, cannot convert from %s to %s",
- from, to);
+ if (pfile)
+ {
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
+ "no iconv implementation, cannot convert from %s to %s",
+ from, to);
+ }
ret.func = convert_no_conversion;
ret.cd = (iconv_t) -1;
ret.width = -1;
}
+
return ret;
}
buf, bufp - buf, HT_ALLOC));
}
\f
+
+/* Utility to strip a UTF-8 byte order mark (BOM) from the beginning
+ of a buffer. Returns the number of bytes to skip, which currently
+ will be either 0 or 3. */
+int
+cpp_check_utf8_bom (const char *data, size_t data_length)
+{
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+ const unsigned char *udata = (const unsigned char *) data;
+ if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb
+ && udata[2] == 0xbf)
+ return 3;
+#endif
+
+ return 0;
+}
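As a minimal usage sketch (the function below is hypothetical), a caller that already has the raw bytes of a UTF-8 file can skip a leading BOM like this:

    static void
    process_utf8_file (const char *buf, size_t len)
    {
      int bom = cpp_check_utf8_bom (buf, len);  /* 0 or 3.  */
      buf += bom;
      len -= bom;
      /* ... hand BUF/LEN to code that expects BOM-free UTF-8 ...  */
    }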
+
+
/* Convert an input buffer (containing the complete contents of one
source file) from INPUT_CHARSET to the source character set. INPUT
points to the input buffer, SIZE is its allocated size, and LEN is
INPUT is expected to have been allocated with xmalloc. This
function will either set *BUFFER_START to INPUT, or free it and set
*BUFFER_START to a pointer to another xmalloc-allocated block of
- memory. */
+ memory.
+
+ PFILE is only used to generate diagnostics; setting it to NULL suppresses
+ diagnostics, and instead causes NULL to be returned if any error occurs.
+
uchar *
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
uchar *input, size_t size, size_t len,
to.text = XNEWVEC (uchar, to.asize);
to.len = 0;
- if (!APPLY_CONVERSION (input_cset, input, len, &to))
- cpp_error (pfile, CPP_DL_ERROR,
- "failure to convert %s to %s",
- CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
-
+ const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
free (input);
- }
- /* Clean up the mess. */
- if (input_cset.func == convert_using_iconv)
- iconv_close (input_cset.cd);
+ /* Clean up the mess. */
+ if (input_cset.func == convert_using_iconv)
+ iconv_close (input_cset.cd);
+
+ /* Handle conversion failure. */
+ if (!ok)
+ {
+ if (!pfile)
+ {
+ XDELETEVEC (to.text);
+ *buffer_start = NULL;
+ *st_size = 0;
+ return NULL;
+ }
+ cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s",
+ input_charset, SOURCE_CHARSET);
+ }
+ }
/* Resize buffer if we allocated substantially too much, or if we
haven't enough space for the \n-terminator or following
buffer = to.text;
*st_size = to.len;
-#if HOST_CHARSET == HOST_CHARSET_ASCII
- /* The HOST_CHARSET test just above ensures that the source charset
- is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
- glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
+
+ /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note
+ that glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
BOM -- however, even if it did, we would still need this code due
to the 'convert_no_conversion' case. */
- if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
- && to.text[2] == 0xbf)
- {
- *st_size -= 3;
- buffer += 3;
- }
-#endif
+ const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len);
+ *st_size -= bom_len;
+ buffer += bom_len;
*buffer_start = to.text;
return buffer;
return current_encoding;
}
+/* Check if the configured input charset requires no conversion, other than
+ possibly stripping a UTF-8 BOM. */
+bool
+cpp_input_conversion_is_trivial (const char *input_charset)
+{
+ return !strcasecmp (input_charset, SOURCE_CHARSET);
+}
+
/* Implementation of class cpp_string_location_reader. */
/* Constructor for cpp_string_location_reader. */
static bool find_file_in_dir (cpp_reader *pfile, _cpp_file *file,
bool *invalid_pch, location_t loc);
static bool read_file_guts (cpp_reader *pfile, _cpp_file *file,
- location_t loc);
+ location_t loc, const char *input_charset);
static bool read_file (cpp_reader *pfile, _cpp_file *file,
location_t loc);
static struct cpp_dir *search_path_head (cpp_reader *, const char *fname,
Use LOC for any diagnostics.
+ PFILE may be NULL. In this case, no diagnostics are issued.
+
FIXME: Flush file cache and try again if we run out of memory. */
static bool
-read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
+read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
+ const char *input_charset)
{
ssize_t size, total, count;
uchar *buf;
if (S_ISBLK (file->st.st_mode))
{
- cpp_error_at (pfile, CPP_DL_ERROR, loc,
- "%s is a block device", file->path);
+ if (pfile)
+ cpp_error_at (pfile, CPP_DL_ERROR, loc,
+ "%s is a block device", file->path);
return false;
}
does not bite us. */
if (file->st.st_size > INTTYPE_MAXIMUM (ssize_t))
{
- cpp_error_at (pfile, CPP_DL_ERROR, loc,
- "%s is too large", file->path);
+ if (pfile)
+ cpp_error_at (pfile, CPP_DL_ERROR, loc,
+ "%s is too large", file->path);
return false;
}
if (count < 0)
{
- cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc);
+ if (pfile)
+ cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc);
free (buf);
return false;
}
- if (regular && total != size && STAT_SIZE_RELIABLE (file->st))
+ if (pfile && regular && total != size && STAT_SIZE_RELIABLE (file->st))
cpp_error_at (pfile, CPP_DL_WARNING, loc,
"%s is shorter than expected", file->path);
file->buffer = _cpp_convert_input (pfile,
- CPP_OPTION (pfile, input_charset),
+ input_charset,
buf, size + 16, total,
&file->buffer_start,
&file->st.st_size);
- file->buffer_valid = true;
-
- return true;
+ file->buffer_valid = file->buffer;
+ return file->buffer_valid;
}
/* Convenience wrapper around read_file_guts that opens the file if
necessary and closes the file descriptor after reading. FILE must
have been passed through find_file() at some stage. Use LOC for
- any diagnostics. */
+ any diagnostics. Unlike read_file_guts(), PFILE must not be NULL. */
static bool
read_file (cpp_reader *pfile, _cpp_file *file, location_t loc)
{
return false;
}
- file->dont_read = !read_file_guts (pfile, file, loc);
+ file->dont_read = !read_file_guts (pfile, file, loc,
+ CPP_OPTION (pfile, input_charset));
close (file->fd);
file->fd = -1;
return file->err_no != ENOENT;
}
+/* Read a file and convert it from the given input charset to the source
+ charset, the same as if it were being read by a cpp_reader. */
+
+cpp_converted_source
+cpp_get_converted_source (const char *fname, const char *input_charset)
+{
+ cpp_converted_source res = {};
+ _cpp_file file = {};
+ file.fd = -1;
+ file.name = lbasename (fname);
+ file.path = fname;
+ if (!open_file (&file))
+ return res;
+ const bool ok = read_file_guts (NULL, &file, 0, input_charset);
+ close (file.fd);
+ if (!ok)
+ return res;
+ res.to_free = (char *) file.buffer_start;
+ res.data = (char *) file.buffer;
+ res.len = file.st.st_size;
+ return res;
+}
extern cpp_buffer *cpp_get_prev (cpp_buffer *);
extern void cpp_clear_file_cache (cpp_reader *);
+/* cpp_get_converted_source returns the contents of the given file, as it exists
+ after cpplib has read it and converted it from the input charset to the
+ source charset. The returned struct is zero-filled if the data could not be
+ read for any reason. The data starts at the DATA pointer, but the TO_FREE
+ pointer is what should be passed to free(), as there may be an offset. */
+struct cpp_converted_source
+{
+ char *to_free;
+ char *data;
+ size_t len;
+};
+cpp_converted_source cpp_get_converted_source (const char *fname,
+ const char *input_charset);
+
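A hedged sketch of a caller (the file name, charset, and consume_utf8 helper are placeholders); the important detail is that TO_FREE, not DATA, is what gets passed to free ():

    cpp_converted_source src = cpp_get_converted_source ("test.c", "CP850");
    if (src.data)   /* A zero-filled struct means the file could not be read.  */
      {
        consume_utf8 (src.data, src.len);  /* Placeholder for the real consumer.  */
        free (src.to_free);                /* DATA may point into the middle of the allocation.  */
      }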
/* In pch.c */
struct save_macro_data;
extern int cpp_save_state (cpp_reader *, FILE *);
/* Convenience functions that are simple use cases for class
cpp_display_width_computation. Tab characters will be expanded to spaces
as determined by TABSTOP. */
+
int cpp_byte_column_to_display_column (const char *data, int data_length,
int column, int tabstop);
inline int cpp_display_width (const char *data, int data_length,
int display_col, int tabstop);
int cpp_wcwidth (cppchar_t c);
+bool cpp_input_conversion_is_trivial (const char *input_charset);
+int cpp_check_utf8_bom (const char *data, size_t data_length);
+
#endif /* ! LIBCPP_CPPLIB_H */