context->opt_permissive = OPT_fpermissive;
}
+/* Input charset configuration for diagnostics. */
+static const char *
+c_common_input_charset_cb (const char * /*filename*/)
+{
+ const char *cs = cpp_opts->input_charset;
+ return cpp_input_conversion_is_trivial (cs) ? nullptr : cs;
+}
+
/* Whether options from all C-family languages should be accepted
quietly. */
static bool accept_all_c_family_options = false;
cpp_post_options (parse_in);
init_global_opts_from_cpp (&global_options, cpp_get_options (parse_in));
+ /* Let the diagnostics infrastructure know how to convert input files the same
+ way libcpp will do it, namely using the configured input charset and
+ skipping a UTF-8 BOM if present. */
+ diagnostic_initialize_input_context (global_dc,
+ c_common_input_charset_cb, true);
input_location = UNKNOWN_LOCATION;
*pfilename = this_input_filename
struct diagnostic_context;
class pretty_printer;
class diagnostic_event_id_t;
+typedef const char * (*diagnostic_input_charset_callback)(const char *);
template<typename T> struct array_traits;
#include "output.h"
#include "print-tree.h"
#include "debug.h"
+#include "input.h"
#include "d-tree.h"
#include "id.h"
return CL_D;
}
+/* Implements input charset and BOM skipping configuration for
+ diagnostics. */
+static const char *
+d_input_charset_callback (const char * /*filename*/)
+{
+ /* TODO: The input charset is automatically determined by code in
+ dmd/dmodule.c based on the contents of the file. If this detection
+ logic were factored out and could be reused here, then we would be able
+ to return UTF-16 or UTF-32 as needed. For now, we always return
+ NULL, which means no conversion is necessary, i.e. the input is assumed
+ to be UTF-8 when diagnostics read this file. */
+ return nullptr;
+}
+
/* Implements the lang_hooks.init routine for language D. */
static bool
Expression::_init ();
Objc::_init ();
+ /* Diagnostics input init, to enable BOM skipping and
+ input charset conversion. */
+ diagnostic_initialize_input_context (global_dc,
+ d_input_charset_callback, true);
+
/* Back-end init. */
global_binding_level = ggc_cleared_alloc <binding_level> ();
current_binding_level = global_binding_level;
= determine_url_format ((diagnostic_url_rule_t) value);
}
+/* Create the file_cache, if not already created, and tell it how to
+ translate files on input. */
+void diagnostic_initialize_input_context (diagnostic_context *context,
+ diagnostic_input_charset_callback ccb,
+ bool should_skip_bom)
+{
+ if (!context->m_file_cache)
+ context->m_file_cache = new file_cache;
+ context->m_file_cache->initialize_input_context (ccb, should_skip_bom);
+}
+
/* Do any cleaning up required after the last diagnostic is emitted. */
void
diagnostic_t diagnostic_kind);
extern void diagnostic_show_any_path (diagnostic_context *, diagnostic_info *);
+/* Because the diagnostics infrastructure reads source files a second time,
+ after the frontend has already read them, it needs to know how the frontend
+ handled character set conversion and UTF-8 BOM stripping so that both reads
+ see the same content. Each frontend that requires non-default behavior must
+ call this function to tell the diagnostics infrastructure how input is to
+ be processed. The default behavior is to do no conversion and not to
+ strip a UTF-8 BOM.
+
+ The callback should return the input charset to be used to convert the given
+ file's contents to UTF-8, or it should return NULL if no conversion is needed
+ for this file. SHOULD_SKIP_BOM only applies in case no conversion was
+ performed, and if true, it will cause a UTF-8 BOM to be skipped at the
+ beginning of the file. (In case a conversion was performed, the BOM is
+ rather skipped as part of the conversion process.) */
+
+void diagnostic_initialize_input_context (diagnostic_context *context,
+ diagnostic_input_charset_callback ccb,
+ bool should_skip_bom);
+
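For illustration, a frontend whose input is already UTF-8 could register itself along these lines (a minimal sketch; the mylang_* names are hypothetical, while the C-family and D frontends in this patch follow the same pattern):

    /* Hypothetical callback: no conversion is ever needed, so return NULL for
       every file; only UTF-8 BOM skipping is requested at registration time.  */
    static const char *
    mylang_input_charset_cb (const char * /*filename*/)
    {
      return nullptr;
    }

    /* Called once from the frontend's post-options/init hook.  */
    static void
    mylang_register_input_context ()
    {
      diagnostic_initialize_input_context (global_dc,
                                           mylang_input_charset_cb, true);
    }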
/* Force diagnostics controlled by OPTIDX to be kind KIND. */
extern diagnostic_t diagnostic_classify_diagnostic (diagnostic_context *,
int /* optidx */,
cpp_post_options (cpp_in);
+
+ /* Let the diagnostics infrastructure know how to convert input files the
+ same way libcpp will do it, namely with no charset conversion but skipping
+ a UTF-8 BOM if present. */
+ diagnostic_initialize_input_context (global_dc, nullptr, true);
+
gfc_cpp_register_include_paths ();
}
#include "coretypes.h"
#include "intl.h"
#include "diagnostic.h"
-#include "diagnostic-core.h"
#include "selftest.h"
#include "cpplib.h"
#define HAVE_ICONV 0
#endif
+/* Input charset configuration. */
+static const char *
+default_charset_callback (const char *)
+{
+ return nullptr;
+}
+
+void
+file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
+ bool should_skip_bom)
+{
+ in_context.ccb = (ccb ? ccb : default_charset_callback);
+ in_context.should_skip_bom = should_skip_bom;
+}
+
/* This is a cache used by get_next_line to store the content of a
file to be searched for file lines. */
class file_cache_slot
void inc_use_count () { m_use_count++; }
- void create (const char *file_path, FILE *fp, unsigned highest_use_count);
+ bool create (const file_cache::input_context &in_context,
+ const char *file_path, FILE *fp, unsigned highest_use_count);
void evict ();
private:
far. */
char *m_data;
+ /* The allocated buffer to be freed may start a little earlier than DATA,
+ e.g. if a UTF-8 BOM was skipped at the beginning. */
+ int m_alloc_offset;
+
/* The size of the DATA array above.*/
size_t m_size;
doesn't explode. We thus scale total_lines down to
line_record_size. */
vec<line_info, va_heap> m_line_record;
+
+ void offset_buffer (int offset)
+ {
+ gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
+ : (size_t) offset <= m_size);
+ gcc_assert (m_data);
+ m_alloc_offset += offset;
+ m_data += offset;
+ m_size -= offset;
+ }
+
};
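To make the offset bookkeeping concrete, here is a sketch of how the slot uses offset_buffer when a 3-byte UTF-8 BOM is present (it mirrors the calls in create () and the free/resize paths below; the numbers assume a BOM was actually found):

    /* m_data holds a freshly read buffer of m_nb_read bytes.  */
    int offset = cpp_check_utf8_bom (m_data, m_nb_read);  /* 3 when a BOM is present.  */
    offset_buffer (offset);   /* m_data += 3, m_size -= 3, m_alloc_offset becomes 3.  */
    m_nb_read -= offset;

    /* ... later, before freeing or reallocating the underlying buffer ...  */
    offset_buffer (-m_alloc_offset);  /* m_data is again the pointer that was allocated.  */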
/* Current position in real source file. */
unsigned highest_use_count = 0;
file_cache_slot *r = evicted_cache_tab_entry (&highest_use_count);
- r->create (file_path, fp, highest_use_count);
+ if (!r->create (in_context, file_path, fp, highest_use_count))
+ return NULL;
return r;
}
/* Populate this slot for use on FILE_PATH and FP, dropping any
existing cached content within it. */
-void
-file_cache_slot::create (const char *file_path, FILE *fp,
+bool
+file_cache_slot::create (const file_cache::input_context &in_context,
+ const char *file_path, FILE *fp,
unsigned highest_use_count)
{
m_file_path = file_path;
if (m_fp)
fclose (m_fp);
m_fp = fp;
+ if (m_alloc_offset)
+ offset_buffer (-m_alloc_offset);
m_nb_read = 0;
m_line_start_idx = 0;
m_line_num = 0;
m_use_count = ++highest_use_count;
m_total_lines = total_lines_num (file_path);
m_missing_trailing_newline = true;
+
+ /* Check the input configuration to determine if we need to do any
+ transformations, such as charset conversion or BOM skipping. */
+ if (const char *input_charset = in_context.ccb (file_path))
+ {
+ /* Need a full-blown conversion of the input charset. */
+ fclose (m_fp);
+ m_fp = NULL;
+ const cpp_converted_source cs
+ = cpp_get_converted_source (file_path, input_charset);
+ if (!cs.data)
+ return false;
+ if (m_data)
+ XDELETEVEC (m_data);
+ m_data = cs.data;
+ m_nb_read = m_size = cs.len;
+ m_alloc_offset = cs.data - cs.to_free;
+ }
+ else if (in_context.should_skip_bom)
+ {
+ if (read_data ())
+ {
+ const int offset = cpp_check_utf8_bom (m_data, m_nb_read);
+ offset_buffer (offset);
+ m_nb_read -= offset;
+ }
+ }
+
+ return true;
}
/* file_cache's ctor. */
file_cache::file_cache ()
: m_file_slots (new file_cache_slot[num_file_slots])
{
+ initialize_input_context (nullptr, false);
}
/* file_cache's dtor. */
file_cache_slot::file_cache_slot ()
: m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
- m_size (0), m_nb_read (0), m_line_start_idx (0), m_line_num (0),
- m_total_lines (0), m_missing_trailing_newline (true)
+ m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
+ m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
{
m_line_record.create (0);
}
}
if (m_data)
{
+ offset_buffer (-m_alloc_offset);
XDELETEVEC (m_data);
m_data = 0;
}
bool
file_cache_slot::needs_read_p () const
{
- return (m_nb_read == 0
+ return m_fp && (m_nb_read == 0
|| m_nb_read == m_size
|| (m_line_start_idx >= m_nb_read - 1));
}
if (!needs_grow_p ())
return;
- size_t size = m_size == 0 ? buffer_size : m_size * 2;
- m_data = XRESIZEVEC (char, m_data, size);
- m_size = size;
+ if (!m_data)
+ {
+ gcc_assert (m_size == 0 && m_alloc_offset == 0);
+ m_size = buffer_size;
+ m_data = XNEWVEC (char, m_size);
+ }
+ else
+ {
+ const int offset = m_alloc_offset;
+ offset_buffer (-offset);
+ m_size *= 2;
+ m_data = XRESIZEVEC (char, m_data, m_size);
+ offset_buffer (offset);
+ }
}
/* Read more data into the cache. Extends the cache if need be.
m_missing_trailing_newline = false;
}
- if (ferror (m_fp))
+ if (m_fp && ferror (m_fp))
return false;
 /* At this point, we've found the end of the line. It either
file_cache_slot *lookup_or_add_file (const char *file_path);
void forcibly_evict_file (const char *file_path);
+ /* See comments in diagnostic.h about the input conversion context. */
+ struct input_context
+ {
+ diagnostic_input_charset_callback ccb;
+ bool should_skip_bom;
+ };
+ void initialize_input_context (diagnostic_input_charset_callback ccb,
+ bool should_skip_bom);
+
private:
file_cache_slot *evicted_cache_tab_entry (unsigned *highest_use_count);
file_cache_slot *add_file (const char *file_path);
private:
static const size_t num_file_slots = 16;
file_cache_slot *m_file_slots;
+ input_context in_context;
};
extern expanded_location
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-iconv "CP850" } */
+/* { dg-options "-finput-charset=CP850 -fdiagnostics-show-caret" } */
+
+/* Test that diagnostics are converted to UTF-8; this file is encoded in
+ CP850. Why CP850? -finput-charset only supports encodings that are a
+ superset of ASCII. But encodings that look like latin-1 are automatically
+ converted by expect to UTF-8, and hence by the time dg sees them, it can't
+ verify they were actually output in UTF-8. So codepage 850 was chosen as one
+ that is hopefully available and meets the requirements of matching ASCII and
+ not matching latin-1. */
+const char *section = "õ"
+/* { dg-error "expected .* at end of input" "" { target *-*-*} .-1 } */
+/* { dg-begin-multiline-output "" }
+ const char *section = "§"
+ ^~~~~
+ { dg-end-multiline-output "" } */
--- /dev/null
+int 1;
+/* { dg-do compile } */
+/* { dg-options "-fdiagnostics-show-caret" } */
+
+/* This file begins with a UTF-8 byte order mark. Verify that diagnostics
+ still point to the right place, since the stripping of the BOM happens twice,
+ once when libcpp reads the file, and once when diagnostics infrastucture
+ reads it. */
+
+/* { dg-error "expected .* before numeric constant" "" { target *-*-*} 1 } */
+/* { dg-begin-multiline-output "" }
+ int 1;
+ ^
+ { dg-end-multiline-output "" } */
cset_converter structure for conversion from FROM to TO. If
iconv_open() fails, issue an error and return an identity
converter. Silently return an identity converter if FROM and TO
- are identical. */
+ are identical.
+
+ PFILE is only used for generating diagnostics; setting it to NULL
+ suppresses diagnostics. */
+
static struct cset_converter
init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
{
if (ret.cd == (iconv_t) -1)
{
- if (errno == EINVAL)
- cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
- "conversion from %s to %s not supported by iconv",
- from, to);
- else
- cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
-
+ if (pfile)
+ {
+ if (errno == EINVAL)
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
+ "conversion from %s to %s not supported by iconv",
+ from, to);
+ else
+ cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
+ }
ret.func = convert_no_conversion;
}
}
else
{
- cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
- "no iconv implementation, cannot convert from %s to %s",
- from, to);
+ if (pfile)
+ {
+ cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
+ "no iconv implementation, cannot convert from %s to %s",
+ from, to);
+ }
ret.func = convert_no_conversion;
ret.cd = (iconv_t) -1;
ret.width = -1;
}
+
return ret;
}
buf, bufp - buf, HT_ALLOC));
}
\f
+
+/* Utility to strip a UTF-8 byte order mark (BOM) from the beginning
+ of a buffer. Returns the number of bytes to skip, which currently
+ will be either 0 or 3. */
+int
+cpp_check_utf8_bom (const char *data, size_t data_length)
+{
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+ const unsigned char *udata = (const unsigned char *) data;
+ if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb
+ && udata[2] == 0xbf)
+ return 3;
+#endif
+
+ return 0;
+}
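As a minimal usage sketch (the function below is hypothetical), a caller that already has the raw bytes of a UTF-8 file can skip a leading BOM like this:

    static void
    process_utf8_file (const char *buf, size_t len)
    {
      int bom = cpp_check_utf8_bom (buf, len);  /* 0 or 3.  */
      buf += bom;
      len -= bom;
      /* ... hand BUF/LEN to code that expects BOM-free UTF-8 ...  */
    }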
+
+
/* Convert an input buffer (containing the complete contents of one
source file) from INPUT_CHARSET to the source character set. INPUT
points to the input buffer, SIZE is its allocated size, and LEN is
INPUT is expected to have been allocated with xmalloc. This
function will either set *BUFFER_START to INPUT, or free it and set
*BUFFER_START to a pointer to another xmalloc-allocated block of
- memory. */
+ memory.
+
+ PFILE is only used to generate diagnostics; setting it to NULL suppresses
+ diagnostics, and instead causes NULL to be returned if any error occurs.
+
uchar *
_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
uchar *input, size_t size, size_t len,
to.text = XNEWVEC (uchar, to.asize);
to.len = 0;
- if (!APPLY_CONVERSION (input_cset, input, len, &to))
- cpp_error (pfile, CPP_DL_ERROR,
- "failure to convert %s to %s",
- CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
-
+ const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
free (input);
- }
- /* Clean up the mess. */
- if (input_cset.func == convert_using_iconv)
- iconv_close (input_cset.cd);
+ /* Clean up the mess. */
+ if (input_cset.func == convert_using_iconv)
+ iconv_close (input_cset.cd);
+
+ /* Handle conversion failure. */
+ if (!ok)
+ {
+ if (!pfile)
+ {
+ XDELETEVEC (to.text);
+ *buffer_start = NULL;
+ *st_size = 0;
+ return NULL;
+ }
+ cpp_error (pfile, CPP_DL_ERROR, "failure to convert %s to %s",
+ input_charset, SOURCE_CHARSET);
+ }
+ }
/* Resize buffer if we allocated substantially too much, or if we
haven't enough space for the \n-terminator or following
buffer = to.text;
*st_size = to.len;
-#if HOST_CHARSET == HOST_CHARSET_ASCII
- /* The HOST_CHARSET test just above ensures that the source charset
- is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
- glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
+
+ /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note
+ that glibc's UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
BOM -- however, even if it did, we would still need this code due
to the 'convert_no_conversion' case. */
- if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
- && to.text[2] == 0xbf)
- {
- *st_size -= 3;
- buffer += 3;
- }
-#endif
+ const int bom_len = cpp_check_utf8_bom ((const char *) to.text, to.len);
+ *st_size -= bom_len;
+ buffer += bom_len;
*buffer_start = to.text;
return buffer;
return current_encoding;
}
+/* Check if the configured input charset requires no conversion, other than
+ possibly stripping a UTF-8 BOM. */
+bool
+cpp_input_conversion_is_trivial (const char *input_charset)
+{
+ return !strcasecmp (input_charset, SOURCE_CHARSET);
+}
+
/* Implementation of class cpp_string_location_reader. */
/* Constructor for cpp_string_location_reader. */
static bool find_file_in_dir (cpp_reader *pfile, _cpp_file *file,
bool *invalid_pch, location_t loc);
static bool read_file_guts (cpp_reader *pfile, _cpp_file *file,
- location_t loc);
+ location_t loc, const char *input_charset);
static bool read_file (cpp_reader *pfile, _cpp_file *file,
location_t loc);
static struct cpp_dir *search_path_head (cpp_reader *, const char *fname,
Use LOC for any diagnostics.
+ PFILE may be NULL. In this case, no diagnostics are issued.
+
FIXME: Flush file cache and try again if we run out of memory. */
static bool
-read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc)
+read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc,
+ const char *input_charset)
{
ssize_t size, total, count;
uchar *buf;
if (S_ISBLK (file->st.st_mode))
{
- cpp_error_at (pfile, CPP_DL_ERROR, loc,
- "%s is a block device", file->path);
+ if (pfile)
+ cpp_error_at (pfile, CPP_DL_ERROR, loc,
+ "%s is a block device", file->path);
return false;
}
does not bite us. */
if (file->st.st_size > INTTYPE_MAXIMUM (ssize_t))
{
- cpp_error_at (pfile, CPP_DL_ERROR, loc,
- "%s is too large", file->path);
+ if (pfile)
+ cpp_error_at (pfile, CPP_DL_ERROR, loc,
+ "%s is too large", file->path);
return false;
}
if (count < 0)
{
- cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc);
+ if (pfile)
+ cpp_errno_filename (pfile, CPP_DL_ERROR, file->path, loc);
free (buf);
return false;
}
- if (regular && total != size && STAT_SIZE_RELIABLE (file->st))
+ if (pfile && regular && total != size && STAT_SIZE_RELIABLE (file->st))
cpp_error_at (pfile, CPP_DL_WARNING, loc,
"%s is shorter than expected", file->path);
file->buffer = _cpp_convert_input (pfile,
- CPP_OPTION (pfile, input_charset),
+ input_charset,
buf, size + 16, total,
&file->buffer_start,
&file->st.st_size);
- file->buffer_valid = true;
-
- return true;
+ file->buffer_valid = file->buffer;
+ return file->buffer_valid;
}
/* Convenience wrapper around read_file_guts that opens the file if
necessary and closes the file descriptor after reading. FILE must
have been passed through find_file() at some stage. Use LOC for
- any diagnostics. */
+ any diagnostics. Unlike read_file_guts(), PFILE must not be NULL. */
static bool
read_file (cpp_reader *pfile, _cpp_file *file, location_t loc)
{
return false;
}
- file->dont_read = !read_file_guts (pfile, file, loc);
+ file->dont_read = !read_file_guts (pfile, file, loc,
+ CPP_OPTION (pfile, input_charset));
close (file->fd);
file->fd = -1;
return file->err_no != ENOENT;
}
+/* Read a file and convert it from the given input charset to the source
+ charset, the same as if it were being read by a cpp_reader. */
+
+cpp_converted_source
+cpp_get_converted_source (const char *fname, const char *input_charset)
+{
+ cpp_converted_source res = {};
+ _cpp_file file = {};
+ file.fd = -1;
+ file.name = lbasename (fname);
+ file.path = fname;
+ if (!open_file (&file))
+ return res;
+ const bool ok = read_file_guts (NULL, &file, 0, input_charset);
+ close (file.fd);
+ if (!ok)
+ return res;
+ res.to_free = (char *) file.buffer_start;
+ res.data = (char *) file.buffer;
+ res.len = file.st.st_size;
+ return res;
+}
extern cpp_buffer *cpp_get_prev (cpp_buffer *);
extern void cpp_clear_file_cache (cpp_reader *);
+/* cpp_get_converted_source returns the contents of the given file, as it exists
+ after cpplib has read it and converted it from the input charset to the
+ source charset. The returned struct is zero-filled if the data could not be
+ read for any reason. The data starts at the DATA pointer, but the TO_FREE
+ pointer is what should be passed to free(), as there may be an offset. */
+struct cpp_converted_source
+{
+ char *to_free;
+ char *data;
+ size_t len;
+};
+cpp_converted_source cpp_get_converted_source (const char *fname,
+ const char *input_charset);
+
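A hedged sketch of a caller (the file name, charset, and consume_utf8 helper are placeholders); the important detail is that TO_FREE, not DATA, is what gets passed to free ():

    cpp_converted_source src = cpp_get_converted_source ("test.c", "CP850");
    if (src.data)   /* A zero-filled struct means the file could not be read.  */
      {
        consume_utf8 (src.data, src.len);  /* Placeholder for the real consumer.  */
        free (src.to_free);                /* DATA may point into the middle of the allocation.  */
      }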
/* In pch.c */
struct save_macro_data;
extern int cpp_save_state (cpp_reader *, FILE *);
/* Convenience functions that are simple use cases for class
cpp_display_width_computation. Tab characters will be expanded to spaces
as determined by TABSTOP. */
+
int cpp_byte_column_to_display_column (const char *data, int data_length,
int column, int tabstop);
inline int cpp_display_width (const char *data, int data_length,
int display_col, int tabstop);
int cpp_wcwidth (cppchar_t c);
+bool cpp_input_conversion_is_trivial (const char *input_charset);
+int cpp_check_utf8_bom (const char *data, size_t data_length);
+
#endif /* ! LIBCPP_CPPLIB_H */