From 3ac6b5cff1eca4e1748c671960ef7b4ca5e47fd2 Mon Sep 17 00:00:00 2001 From: Lewis Hyatt Date: Tue, 24 Aug 2021 19:30:44 -0400 Subject: [PATCH] diagnostics: Support for -finput-charset [PR93067] Adds the logic to handle -finput-charset in layout_get_source_line(), so that source lines are converted from their input encodings prior to being output by diagnostics machinery. Also adds the ability to strip a UTF-8 BOM similarly. gcc/c-family/ChangeLog: PR other/93067 * c-opts.c (c_common_input_charset_cb): New function. (c_common_post_options): Call new function diagnostic_initialize_input_context(). gcc/d/ChangeLog: PR other/93067 * d-lang.cc (d_input_charset_callback): New function. (d_init): Call new function diagnostic_initialize_input_context(). gcc/fortran/ChangeLog: PR other/93067 * cpp.c (gfc_cpp_post_options): Call new function diagnostic_initialize_input_context(). gcc/ChangeLog: PR other/93067 * coretypes.h (typedef diagnostic_input_charset_callback): Declare. * diagnostic.c (diagnostic_initialize_input_context): New function. * diagnostic.h (diagnostic_initialize_input_context): Declare. * input.c (default_charset_callback): New function. (file_cache::initialize_input_context): New function. (file_cache_slot::create): Added ability to convert the input according to the input context. (file_cache::file_cache): Initialize the new input context. (class file_cache_slot): Added new m_alloc_offset member. (file_cache_slot::file_cache_slot): Initialize the new member. (file_cache_slot::~file_cache_slot): Handle potentially offset buffer. (file_cache_slot::maybe_grow): Likewise. (file_cache_slot::needs_read_p): Handle NULL fp, which is now possible. (file_cache_slot::get_next_line): Likewise. * input.h (class file_cache): Added input context member. libcpp/ChangeLog: PR other/93067 * charset.c (init_iconv_desc): Adapt to permit PFILE argument to be NULL. (_cpp_convert_input): Likewise. Also move UTF-8 BOM logic to... (cpp_check_utf8_bom): ...here. New function. (cpp_input_conversion_is_trivial): New function. * files.c (read_file_guts): Allow PFILE argument to be NULL. Add INPUT_CHARSET argument as an alternate source of this information. (read_file): Pass the new argument to read_file_guts. (cpp_get_converted_source): New function. * include/cpplib.h (struct cpp_converted_source): Declare. (cpp_get_converted_source): Declare. (cpp_input_conversion_is_trivial): Declare. (cpp_check_utf8_bom): Declare. gcc/testsuite/ChangeLog: PR other/93067 * gcc.dg/diagnostic-input-charset-1.c: New test. * gcc.dg/diagnostic-input-utf8-bom.c: New test. --- gcc/c-family/c-opts.c | 13 +++ gcc/coretypes.h | 1 + gcc/d/d-lang.cc | 19 +++ gcc/diagnostic.c | 11 ++ gcc/diagnostic.h | 19 +++ gcc/fortran/cpp.c | 6 + gcc/input.c | 100 ++++++++++++++-- gcc/input.h | 10 ++ .../gcc.dg/diagnostic-input-charset-1.c | 17 +++ .../gcc.dg/diagnostic-input-utf8-bom.c | 14 +++ libcpp/charset.c | 109 +++++++++++++----- libcpp/files.c | 56 ++++++--- libcpp/include/cpplib.h | 18 +++ 13 files changed, 335 insertions(+), 58 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c create mode 100644 gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c index 373af0cf06f1..fdde082158b3 100644 --- a/gcc/c-family/c-opts.c +++ b/gcc/c-family/c-opts.c @@ -188,6 +188,14 @@ c_common_diagnostics_set_defaults (diagnostic_context *context) context->opt_permissive = OPT_fpermissive; } +/* Input charset configuration for diagnostics. */ +static const char * +c_common_input_charset_cb (const char * /*filename*/) +{ + const char *cs = cpp_opts->input_charset; + return cpp_input_conversion_is_trivial (cs) ? nullptr : cs; +} + /* Whether options from all C-family languages should be accepted quietly. */ static bool accept_all_c_family_options = false; @@ -1136,6 +1144,11 @@ c_common_post_options (const char **pfilename) cpp_post_options (parse_in); init_global_opts_from_cpp (&global_options, cpp_get_options (parse_in)); + /* Let diagnostics infrastructure know how to convert input files the same + way libcpp will do it, namely using the configured input charset and + skipping a UTF-8 BOM if present. */ + diagnostic_initialize_input_context (global_dc, + c_common_input_charset_cb, true); input_location = UNKNOWN_LOCATION; *pfilename = this_input_filename diff --git a/gcc/coretypes.h b/gcc/coretypes.h index 406572e947d7..726fcaddda2a 100644 --- a/gcc/coretypes.h +++ b/gcc/coretypes.h @@ -154,6 +154,7 @@ struct cl_option_handlers; struct diagnostic_context; class pretty_printer; class diagnostic_event_id_t; +typedef const char * (*diagnostic_input_charset_callback)(const char *); template struct array_traits; diff --git a/gcc/d/d-lang.cc b/gcc/d/d-lang.cc index 4386a489ff2d..fa29a46ab1eb 100644 --- a/gcc/d/d-lang.cc +++ b/gcc/d/d-lang.cc @@ -50,6 +50,7 @@ along with GCC; see the file COPYING3. If not see #include "output.h" #include "print-tree.h" #include "debug.h" +#include "input.h" #include "d-tree.h" #include "id.h" @@ -362,6 +363,19 @@ d_option_lang_mask (void) return CL_D; } +/* Implements input charset and BOM skipping configuration for + diagnostics. */ +static const char *d_input_charset_callback (const char * /*filename*/) +{ + /* TODO: The input charset is automatically determined by code in + dmd/dmodule.c based on the contents of the file. If this detection + logic were factored out and could be reused here, then we would be able + to return UTF-16 or UTF-32 as needed here. For now, we return always + NULL, which means no conversion is necessary, i.e. the input is assumed + to be UTF-8 when diagnostics read this file. */ + return nullptr; +} + /* Implements the lang_hooks.init routine for language D. */ static bool @@ -373,6 +387,11 @@ d_init (void) Expression::_init (); Objc::_init (); + /* Diagnostics input init, to enable BOM skipping and + input charset conversion. */ + diagnostic_initialize_input_context (global_dc, + d_input_charset_callback, true); + /* Back-end init. */ global_binding_level = ggc_cleared_alloc (); current_binding_level = global_binding_level; diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c index 8361f68aaceb..b3afbeae6488 100644 --- a/gcc/diagnostic.c +++ b/gcc/diagnostic.c @@ -293,6 +293,17 @@ diagnostic_urls_init (diagnostic_context *context, int value /*= -1 */) = determine_url_format ((diagnostic_url_rule_t) value); } +/* Create the file_cache, if not already created, and tell it how to + translate files on input. */ +void diagnostic_initialize_input_context (diagnostic_context *context, + diagnostic_input_charset_callback ccb, + bool should_skip_bom) +{ + if (!context->m_file_cache) + context->m_file_cache = new file_cache; + context->m_file_cache->initialize_input_context (ccb, should_skip_bom); +} + /* Do any cleaning up required after the last diagnostic is emitted. */ void diff --git a/gcc/diagnostic.h b/gcc/diagnostic.h index 7227dae1b6be..f90d20a4f5ec 100644 --- a/gcc/diagnostic.h +++ b/gcc/diagnostic.h @@ -446,6 +446,25 @@ extern void diagnostic_show_locus (diagnostic_context *, diagnostic_t diagnostic_kind); extern void diagnostic_show_any_path (diagnostic_context *, diagnostic_info *); +/* Because we read source files a second time after the frontend did it the + first time, we need to know how the frontend handled things like character + set conversion and UTF-8 BOM stripping, in order to make everything + consistent. This function needs to be called by each frontend that requires + non-default behavior, to inform the diagnostics infrastructure how input is + to be processed. The default behavior is to do no conversion and not to + strip a UTF-8 BOM. + + The callback should return the input charset to be used to convert the given + file's contents to UTF-8, or it should return NULL if no conversion is needed + for this file. SHOULD_SKIP_BOM only applies in case no conversion was + performed, and if true, it will cause a UTF-8 BOM to be skipped at the + beginning of the file. (In case a conversion was performed, the BOM is + rather skipped as part of the conversion process.) */ + +void diagnostic_initialize_input_context (diagnostic_context *context, + diagnostic_input_charset_callback ccb, + bool should_skip_bom); + /* Force diagnostics controlled by OPTIDX to be kind KIND. */ extern diagnostic_t diagnostic_classify_diagnostic (diagnostic_context *, int /* optidx */, diff --git a/gcc/fortran/cpp.c b/gcc/fortran/cpp.c index 419cd6accbee..83c4517acdbb 100644 --- a/gcc/fortran/cpp.c +++ b/gcc/fortran/cpp.c @@ -493,6 +493,12 @@ gfc_cpp_post_options (void) cpp_post_options (cpp_in); + + /* Let diagnostics infrastructure know how to convert input files the same + way libcpp will do it, namely, with no charset conversion but with + skipping of a UTF-8 BOM if present. */ + diagnostic_initialize_input_context (global_dc, nullptr, true); + gfc_cpp_register_include_paths (); } diff --git a/gcc/input.c b/gcc/input.c index de20d983d2c4..4b809862e02d 100644 --- a/gcc/input.c +++ b/gcc/input.c @@ -22,7 +22,6 @@ along with GCC; see the file COPYING3. If not see #include "coretypes.h" #include "intl.h" #include "diagnostic.h" -#include "diagnostic-core.h" #include "selftest.h" #include "cpplib.h" @@ -30,6 +29,20 @@ along with GCC; see the file COPYING3. If not see #define HAVE_ICONV 0 #endif +/* Input charset configuration. */ +static const char *default_charset_callback (const char *) +{ + return nullptr; +} + +void +file_cache::initialize_input_context (diagnostic_input_charset_callback ccb, + bool should_skip_bom) +{ + in_context.ccb = (ccb ? ccb : default_charset_callback); + in_context.should_skip_bom = should_skip_bom; +} + /* This is a cache used by get_next_line to store the content of a file to be searched for file lines. */ class file_cache_slot @@ -51,7 +64,8 @@ public: void inc_use_count () { m_use_count++; } - void create (const char *file_path, FILE *fp, unsigned highest_use_count); + bool create (const file_cache::input_context &in_context, + const char *file_path, FILE *fp, unsigned highest_use_count); void evict (); private: @@ -110,6 +124,10 @@ public: far. */ char *m_data; + /* The allocated buffer to be freed may start a little earlier than DATA, + e.g. if a UTF8 BOM was skipped at the beginning. */ + int m_alloc_offset; + /* The size of the DATA array above.*/ size_t m_size; @@ -147,6 +165,17 @@ public: doesn't explode. We thus scale total_lines down to line_record_size. */ vec m_line_record; + + void offset_buffer (int offset) + { + gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0 + : (size_t) offset <= m_size); + gcc_assert (m_data); + m_alloc_offset += offset; + m_data += offset; + m_size -= offset; + } + }; /* Current position in real source file. */ @@ -419,21 +448,25 @@ file_cache::add_file (const char *file_path) unsigned highest_use_count = 0; file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count); - r->create (file_path, fp, highest_use_count); + if (!r->create (in_context, file_path, fp, highest_use_count)) + return NULL; return r; } /* Populate this slot for use on FILE_PATH and FP, dropping any existing cached content within it. */ -void -file_cache_slot::create (const char *file_path, FILE *fp, +bool +file_cache_slot::create (const file_cache::input_context &in_context, + const char *file_path, FILE *fp, unsigned highest_use_count) { m_file_path = file_path; if (m_fp) fclose (m_fp); m_fp = fp; + if (m_alloc_offset) + offset_buffer (-m_alloc_offset); m_nb_read = 0; m_line_start_idx = 0; m_line_num = 0; @@ -443,6 +476,36 @@ file_cache_slot::create (const char *file_path, FILE *fp, m_use_count = ++highest_use_count; m_total_lines = total_lines_num (file_path); m_missing_trailing_newline = true; + + + /* Check the input configuration to determine if we need to do any + transformations, such as charset conversion or BOM skipping. */ + if (const char *input_charset = in_context.ccb (file_path)) + { + /* Need a full-blown conversion of the input charset. */ + fclose (m_fp); + m_fp = NULL; + const cpp_converted_source cs + = cpp_get_converted_source (file_path, input_charset); + if (!cs.data) + return false; + if (m_data) + XDELETEVEC (m_data); + m_data = cs.data; + m_nb_read = m_size = cs.len; + m_alloc_offset = cs.data - cs.to_free; + } + else if (in_context.should_skip_bom) + { + if (read_data ()) + { + const int offset = cpp_check_utf8_bom (m_data, m_nb_read); + offset_buffer (offset); + m_nb_read -= offset; + } + } + + return true; } /* file_cache's ctor. */ @@ -450,6 +513,7 @@ file_cache_slot::create (const char *file_path, FILE *fp, file_cache::file_cache () : m_file_slots (new file_cache_slot[num_file_slots]) { + initialize_input_context (nullptr, false); } /* file_cache's dtor. */ @@ -478,8 +542,8 @@ file_cache::lookup_or_add_file (const char *file_path) file_cache_slot::file_cache_slot () : m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0), - m_size (0), m_nb_read (0), m_line_start_idx (0), m_line_num (0), - m_total_lines (0), m_missing_trailing_newline (true) + m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0), + m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true) { m_line_record.create (0); } @@ -495,6 +559,7 @@ file_cache_slot::~file_cache_slot () } if (m_data) { + offset_buffer (-m_alloc_offset); XDELETEVEC (m_data); m_data = 0; } @@ -509,7 +574,7 @@ file_cache_slot::~file_cache_slot () bool file_cache_slot::needs_read_p () const { - return (m_nb_read == 0 + return m_fp && (m_nb_read == 0 || m_nb_read == m_size || (m_line_start_idx >= m_nb_read - 1)); } @@ -531,9 +596,20 @@ file_cache_slot::maybe_grow () if (!needs_grow_p ()) return; - size_t size = m_size == 0 ? buffer_size : m_size * 2; - m_data = XRESIZEVEC (char, m_data, size); - m_size = size; + if (!m_data) + { + gcc_assert (m_size == 0 && m_alloc_offset == 0); + m_size = buffer_size; + m_data = XNEWVEC (char, m_size); + } + else + { + const int offset = m_alloc_offset; + offset_buffer (-offset); + m_size *= 2; + m_data = XRESIZEVEC (char, m_data, m_size); + offset_buffer (offset); + } } /* Read more data into the cache. Extends the cache if need be. @@ -632,7 +708,7 @@ file_cache_slot::get_next_line (char **line, ssize_t *line_len) m_missing_trailing_newline = false; } - if (ferror (m_fp)) + if (m_fp && ferror (m_fp)) return false; /* At this point, we've found the end of the of line. It either diff --git a/gcc/input.h b/gcc/input.h index bbcec84c521d..e6881072c5f5 100644 --- a/gcc/input.h +++ b/gcc/input.h @@ -111,6 +111,15 @@ class file_cache file_cache_slot *lookup_or_add_file (const char *file_path); void forcibly_evict_file (const char *file_path); + /* See comments in diagnostic.h about the input conversion context. */ + struct input_context + { + diagnostic_input_charset_callback ccb; + bool should_skip_bom; + }; + void initialize_input_context (diagnostic_input_charset_callback ccb, + bool should_skip_bom); + private: file_cache_slot *evicted_cache_tab_entry (unsigned *highest_use_count); file_cache_slot *add_file (const char *file_path); @@ -119,6 +128,7 @@ class file_cache private: static const size_t num_file_slots = 16; file_cache_slot *m_file_slots; + input_context in_context; }; extern expanded_location diff --git a/gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c b/gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c new file mode 100644 index 000000000000..4e56833162d8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/diagnostic-input-charset-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-require-iconv "CP850" } */ +/* { dg-options "-finput-charset=CP850 -fdiagnostics-show-caret" } */ + +/* Test that diagnostics are converted to UTF-8; this file is encoded in + CP850. Why CP850? -finput-charset only supports encodings that are a + superset of ASCII. But encodings that look like latin-1 are automatically + converted by expect to UTF-8, and hence by the time dg sees them, it can't + verify they were actually output in UTF-8. So codepage 850 was chosen as one + that is hopefully available and meets the requirements of matching ASCII and + not matching latin-1. */ +const char *section = "õ" +/* { dg-error "expected .* at end of input" "" { target *-*-*} .-1 } */ +/* { dg-begin-multiline-output "" } + const char *section = "§" + ^~~~~ + { dg-end-multiline-output "" } */ diff --git a/gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c b/gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c new file mode 100644 index 000000000000..1a3f35287da7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/diagnostic-input-utf8-bom.c @@ -0,0 +1,14 @@ +int 1; +/* { dg-do compile } */ +/* { dg-options "-fdiagnostics-show-caret" } */ + +/* This file begins with a UTF-8 byte order mark. Verify that diagnostics + still point to the right place, since the stripping of the BOM happens twice, + once when libcpp reads the file, and once when diagnostics infrastucture + reads it. */ + +/* { dg-error "expected .* before numeric constant" "" { target *-*-*} 1 } */ +/* { dg-begin-multiline-output "" } + int 1; + ^ + { dg-end-multiline-output "" } */ diff --git a/libcpp/charset.c b/libcpp/charset.c index 99a9b73e5ab9..61881f978a86 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -630,7 +630,11 @@ static const struct cpp_conversion conversion_tab[] = { cset_converter structure for conversion from FROM to TO. If iconv_open() fails, issue an error and return an identity converter. Silently return an identity converter if FROM and TO - are identical. */ + are identical. + + PFILE is only used for generating diagnostics; setting it to NULL + suppresses diagnostics. */ + static struct cset_converter init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) { @@ -672,25 +676,31 @@ init_iconv_desc (cpp_reader *pfile, const char *to, const char *from) if (ret.cd == (iconv_t) -1) { - if (errno == EINVAL) - cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ - "conversion from %s to %s not supported by iconv", - from, to); - else - cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); - + if (pfile) + { + if (errno == EINVAL) + cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */ + "conversion from %s to %s not supported by iconv", + from, to); + else + cpp_errno (pfile, CPP_DL_ERROR, "iconv_open"); + } ret.func = convert_no_conversion; } } else { - cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ - "no iconv implementation, cannot convert from %s to %s", - from, to); + if (pfile) + { + cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */ + "no iconv implementation, cannot convert from %s to %s", + from, to); + } ret.func = convert_no_conversion; ret.cd = (iconv_t) -1; ret.width = -1; } + return ret; } @@ -2122,6 +2132,25 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) buf, bufp - buf, HT_ALLOC)); } + +/* Utility to strip a UTF-8 byte order marking from the beginning + of a buffer. Returns the number of bytes to skip, which currently + will be either 0 or 3. */ +int +cpp_check_utf8_bom (const char *data, size_t data_length) +{ + +#if HOST_CHARSET == HOST_CHARSET_ASCII + const unsigned char *udata = (const unsigned char *) data; + if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb + && udata[2] == 0xbf) + return 3; +#endif + + return 0; +} + + /* Convert an input buffer (containing the complete contents of one source file) from INPUT_CHARSET to the source character set. INPUT points to the input buffer, SIZE is its allocated size, and LEN is @@ -2135,7 +2164,11 @@ _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) INPUT is expected to have been allocated with xmalloc. This function will either set *BUFFER_START to INPUT, or free it and set *BUFFER_START to a pointer to another xmalloc-allocated block of - memory. */ + memory. + + PFILE is only used to generate diagnostics; setting it to NULL suppresses + diagnostics, and causes a return of NULL if there was any error instead. */ + uchar * _cpp_convert_input (cpp_reader *pfile, const char *input_charset, uchar *input, size_t size, size_t len, @@ -2158,17 +2191,27 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, to.text = XNEWVEC (uchar, to.asize); to.len = 0; - if (!APPLY_CONVERSION (input_cset, input, len, &to)) - cpp_error (pfile, CPP_DL_ERROR, - "failure to convert %s to %s", - CPP_OPTION (pfile, input_charset), SOURCE_CHARSET); - + const bool ok = APPLY_CONVERSION (input_cset, input, len, &to); free (input); - } - /* Clean up the mess. */ - if (input_cset.func == convert_using_iconv) - iconv_close (input_cset.cd); + /* Clean up the mess. */ + if (input_cset.func == convert_using_iconv) + iconv_close (input_cset.cd); + + /* Handle conversion failure. */ + if (!ok) + { + if (!pfile) + { + XDELETEVEC (to.text); + *buffer_start = NULL; + *st_size = 0; + return NULL; + } + cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s", + input_charset, SOURCE_CHARSET); + } + } /* Resize buffer if we allocated substantially too much, or if we haven't enough space for the \n-terminator or following @@ -2192,19 +2235,14 @@ _cpp_convert_input (cpp_reader *pfile, const char *input_charset, buffer = to.text; *st_size = to.len; -#if HOST_CHARSET == HOST_CHARSET_ASCII - /* The HOST_CHARSET test just above ensures that the source charset - is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that - glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a + + /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note + that glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a BOM -- however, even if it did, we would still need this code due to the 'convert_no_conversion' case. */ - if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb - && to.text[2] == 0xbf) - { - *st_size -= 3; - buffer += 3; - } -#endif + const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len); + *st_size -= bom_len; + buffer += bom_len; *buffer_start = to.text; return buffer; @@ -2244,6 +2282,13 @@ _cpp_default_encoding (void) return current_encoding; } +/* Check if the configured input charset requires no conversion, other than + possibly stripping a UTF-8 BOM. */ +bool cpp_input_conversion_is_trivial (const char *input_charset) +{ + return !strcasecmp (input_charset, SOURCE_CHARSET); +} + /* Implementation of class cpp_string_location_reader. */ /* Constructor for cpp_string_location_reader. */ diff --git a/libcpp/files.c b/libcpp/files.c index 6e20fc5887f0..c93a03c69ef5 100644 --- a/libcpp/files.c +++ b/libcpp/files.c @@ -173,7 +173,7 @@ static bool pch_open_file (cpp_reader *pfile, _cpp_file *file, static bool find_file_in_dir (cpp_reader *pfile, _cpp_file *file, bool *invalid_pch, location_t loc); static bool read_file_guts (cpp_reader *pfile, _cpp_file *file, - location_t loc); + location_t loc, const char *input_charset); static bool read_file (cpp_reader *pfile, _cpp_file *file, location_t loc); static struct cpp_dir *search_path_head (cpp_reader *, const char *fname, @@ -671,9 +671,12 @@ _cpp_find_file (cpp_reader *pfile, const char *fname, cpp_dir *start_dir, Use LOC for any diagnostics. + PFILE may be NULL. In this case, no diagnostics are issued. + FIXME: Flush file cache and try again if we run out of memory. */ static bool -read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) +read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, + const char *input_charset) { ssize_t size, total, count; uchar *buf; @@ -681,8 +684,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) if (S_ISBLK (file->st.st_mode)) { - cpp_error_at (pfile, CPP_DL_ERROR, loc, - "%s is a block device", file->path); + if (pfile) + cpp_error_at (pfile, CPP_DL_ERROR, loc, + "%s is a block device", file->path); return false; } @@ -699,8 +703,9 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) does not bite us. */ if (file->st.st_size > INTTYPE_MAXIMUM (ssize_t)) { - cpp_error_at (pfile, CPP_DL_ERROR, loc, - "%s is too large", file->path); + if (pfile) + cpp_error_at (pfile, CPP_DL_ERROR, loc, + "%s is too large", file->path); return false; } @@ -733,29 +738,29 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc) if (count < 0) { - cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc); + if (pfile) + cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc); free (buf); return false; } - if (regular && total != size && STAT_SIZE_RELIABLE (file->st)) + if (pfile && regular && total != size && STAT_SIZE_RELIABLE (file->st)) cpp_error_at (pfile, CPP_DL_WARNING, loc, "%s is shorter than expected", file->path); file->buffer = _cpp_convert_input (pfile, - CPP_OPTION (pfile, input_charset), + input_charset, buf, size + 16, total, &file->buffer_start, &file->st.st_size); - file->buffer_valid = true; - - return true; + file->buffer_valid = file->buffer; + return file->buffer_valid; } /* Convenience wrapper around read_file_guts that opens the file if necessary and closes the file descriptor after reading. FILE must have been passed through find_file() at some stage. Use LOC for - any diagnostics. */ + any diagnostics. Unlike read_file_guts(), PFILE may not be NULL. */ static bool read_file (cpp_reader *pfile, _cpp_file *file, location_t loc) { @@ -773,7 +778,8 @@ read_file (cpp_reader *pfile, _cpp_file *file, location_t loc) return false; } - file->dont_read = !read_file_guts (pfile, file, loc); + file->dont_read = !read_file_guts (pfile, file, loc, + CPP_OPTION (pfile, input_charset)); close (file->fd); file->fd = -1; @@ -2145,3 +2151,25 @@ _cpp_has_header (cpp_reader *pfile, const char *fname, int angle_brackets, return file->err_no != ENOENT; } +/* Read a file and convert to input charset, the same as if it were being read + by a cpp_reader. */ + +cpp_converted_source +cpp_get_converted_source (const char *fname, const char *input_charset) +{ + cpp_converted_source res = {}; + _cpp_file file = {}; + file.fd = -1; + file.name = lbasename (fname); + file.path = fname; + if (!open_file (&file)) + return res; + const bool ok = read_file_guts (NULL, &file, 0, input_charset); + close (file.fd); + if (!ok) + return res; + res.to_free = (char *) file.buffer_start; + res.data = (char *) file.buffer; + res.len = file.st.st_size; + return res; +} diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 7e840635a380..af1429171ea8 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -1379,6 +1379,20 @@ extern struct _cpp_file *cpp_get_file (cpp_buffer *); extern cpp_buffer *cpp_get_prev (cpp_buffer *); extern void cpp_clear_file_cache (cpp_reader *); +/* cpp_get_converted_source returns the contents of the given file, as it exists + after cpplib has read it and converted it from the input charset to the + source charset. Return struct will be zero-filled if the data could not be + read for any reason. The data starts at the DATA pointer, but the TO_FREE + pointer is what should be passed to free(), as there may be an offset. */ +struct cpp_converted_source +{ + char *to_free; + char *data; + size_t len; +}; +cpp_converted_source cpp_get_converted_source (const char *fname, + const char *input_charset); + /* In pch.c */ struct save_macro_data; extern int cpp_save_state (cpp_reader *, FILE *); @@ -1449,6 +1463,7 @@ class cpp_display_width_computation { /* Convenience functions that are simple use cases for class cpp_display_width_computation. Tab characters will be expanded to spaces as determined by TABSTOP. */ + int cpp_byte_column_to_display_column (const char *data, int data_length, int column, int tabstop); inline int cpp_display_width (const char *data, int data_length, @@ -1461,4 +1476,7 @@ int cpp_display_column_to_byte_column (const char *data, int data_length, int display_col, int tabstop); int cpp_wcwidth (cppchar_t c); +bool cpp_input_conversion_is_trivial (const char *input_charset); +int cpp_check_utf8_bom (const char *data, size_t data_length); + #endif /* ! LIBCPP_CPPLIB_H */ -- 2.39.5