return artifact_obj;
}
-/* Read all data from F_IN until EOF.
- Return a NULL-terminated buffer containing the data, which must be
- freed by the caller.
- Return NULL on errors. */
-
-static char *
-read_until_eof (FILE *f_in)
-{
- /* Read content, allocating a buffer for it. */
- char *result = NULL;
- size_t total_sz = 0;
- size_t alloc_sz = 0;
- char buf[4096];
- size_t iter_sz_in;
-
- while ( (iter_sz_in = fread (buf, 1, sizeof (buf), f_in)) )
- {
- gcc_assert (alloc_sz >= total_sz);
- size_t old_total_sz = total_sz;
- total_sz += iter_sz_in;
- /* Allow 1 extra byte for 0-termination. */
- if (alloc_sz < (total_sz + 1))
- {
- size_t new_alloc_sz = alloc_sz ? alloc_sz * 2: total_sz + 1;
- result = (char *)xrealloc (result, new_alloc_sz);
- alloc_sz = new_alloc_sz;
- }
- memcpy (result + old_total_sz, buf, iter_sz_in);
- }
-
- if (!feof (f_in))
- return NULL;
-
- /* 0-terminate the buffer. */
- gcc_assert (total_sz < alloc_sz);
- result[total_sz] = '\0';
-
- return result;
-}
-
-/* Read all data from FILENAME until EOF.
- Return a NULL-terminated buffer containing the data, which must be
- freed by the caller.
- Return NULL on errors. */
-
-static char *
-maybe_read_file (const char *filename)
-{
- FILE *f_in = fopen (filename, "r");
- if (!f_in)
- return NULL;
- char *result = read_until_eof (f_in);
- fclose (f_in);
- return result;
-}
-
/* Make an artifactContent object (SARIF v2.1.0 section 3.3) for the
full contents of FILENAME. */
json::object *
sarif_builder::maybe_make_artifact_content_object (const char *filename) const
{
- char *text_utf8 = maybe_read_file (filename);
- if (!text_utf8)
+ /* Borrow the file's content as UTF-8; input.cc does any conversion. */
+ char_span utf8_content = get_source_file_content (filename);
+ if (!utf8_content)
return NULL;
- json::object *artifact_content_obj = new json::object ();
- artifact_content_obj->set ("text", new json::string (text_utf8));
- free (text_utf8);
+ /* Omit the content rather than emit a "text" that isn't valid UTF-8. */
+ if (!cpp_valid_utf8_p(utf8_content.get_buffer (), utf8_content.length ()))
+ return NULL;
+ json::object *artifact_content_obj = new json::object ();
+ artifact_content_obj->set ("text",
+ new json::string (utf8_content.get_buffer (),
+ utf8_content.length ()));
return artifact_content_obj;
}
if (!text_utf8)
return NULL;
+ /* Don't add it if it's not valid UTF-8. */
+ if (!cpp_valid_utf8_p(text_utf8, strlen(text_utf8)))
+ {
+ free (text_utf8);
+ return NULL;
+ }
+
json::object *artifact_content_obj = new json::object ();
artifact_content_obj->set ("text", new json::string (text_utf8));
free (text_utf8);
{
return m_missing_trailing_newline;
}
+ char_span get_full_file_content ();
void inc_use_count () { m_use_count++; }
return r;
}
+/* Return a borrowed char_span over the full content of this file,
+   as decoded according to the input charset and encoded as UTF-8.
+   The span is built from m_data and m_nb_read once every line has
+   been requested.  */
+
+char_span
+file_cache_slot::get_full_file_content ()
+{
+  /* Keep asking for lines until get_next_line reports there are no
+     more; the individual lines are not used here.  */
+  char *unused_line;
+  ssize_t unused_line_len;
+  while (get_next_line (&unused_line, &unused_line_len))
+    continue;
+
+  char_span whole_file (m_data, m_nb_read);
+  return whole_file;
+}
+
/* Populate this slot for use on FILE_PATH and FP, dropping any
existing cached content within it. */
return xstrdup (buf);
}
+/* Get a borrowed char_span to the full content of FILE_PATH
+   as decoded according to the input charset, encoded as UTF-8.
+
+   If no cache slot can be obtained for FILE_PATH, return a
+   (NULL, 0) span instead; callers are expected to test the result
+   before using it.  */
+
+char_span
+get_source_file_content (const char *file_path)
+{
+  diagnostic_file_cache_init ();
+
+  file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
+  /* NOTE(review): lookup_or_add_file is presumed to return NULL when
+     FILE_PATH cannot be loaded (e.g. it is unreadable); don't
+     dereference the slot in that case.  */
+  if (c == NULL)
+    return char_span (NULL, 0);
+  return c->get_full_file_content ();
+}
+
/* Determine if FILE_PATH missing a trailing newline on its final line.
Only valid to call once all of the file has been loaded, by
requesting a line number beyond the end of the file. */
ASSERT_EQ (byte_col2, byte_col);
}
}
+}
+
+/* Convenience wrapper for the selftests below: run cpp_valid_utf8_p
+   over the whole of the NUL-terminated string STR (terminator
+   excluded).  */
+
+static bool
+check_cpp_valid_utf8_p (const char *str)
+{
+  const size_t num_bytes = strlen (str);
+  return cpp_valid_utf8_p (str, num_bytes);
+}
+
+/* Selftest for cpp_valid_utf8_p: well-formed strings must be accepted,
+   and stray, truncated or impossible bytes must be rejected.  */
+
+static void
+test_cpp_valid_utf8_p ()
+{
+  /* Plain ASCII.  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
+
+  /* 2-byte char (pi).  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p ("\xcf\x80"));
+
+  /* 3-byte chars (the Japanese word "mojibake").  */
+  {
+    const char *mojibake
+      = (/* U+6587 CJK UNIFIED IDEOGRAPH-6587
+            UTF-8: 0xE6 0x96 0x87
+            C octal escaped UTF-8: \346\226\207.  */
+         "\346\226\207"
+         /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
+            UTF-8: 0xE5 0xAD 0x97
+            C octal escaped UTF-8: \345\255\227.  */
+         "\345\255\227"
+         /* U+5316 CJK UNIFIED IDEOGRAPH-5316
+            UTF-8: 0xE5 0x8C 0x96
+            C octal escaped UTF-8: \345\214\226.  */
+         "\345\214\226"
+         /* U+3051 HIRAGANA LETTER KE
+            UTF-8: 0xE3 0x81 0x91
+            C octal escaped UTF-8: \343\201\221.  */
+         "\343\201\221");
+    ASSERT_TRUE (check_cpp_valid_utf8_p (mojibake));
+  }
+
+  /* 4-byte char: an emoji.  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
+
+  /* Control codes, including the NUL byte.  */
+  ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
+
+  /* The emoji's bytes with a '!' after each one.  */
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
+
+  /* Unexpected continuation bytes.  */
+  for (int byte_val = 0x80; byte_val <= 0xbf; byte_val++)
+    {
+      const unsigned char lone_byte = (unsigned char) byte_val;
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *) &lone_byte, 1));
+    }
+
+  /* "Lonely start characters" for 2-byte sequences: each start byte
+     is followed by a space rather than a continuation byte.  */
+  for (int lead = 0xc0; lead <= 0xdf; lead++)
+    {
+      const unsigned char pair[2] = { (unsigned char) lead, ' ' };
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *) pair, 2));
+    }
+
+  /* "Lonely start characters" for 3-byte sequences.  */
+  for (int lead = 0xe0; lead <= 0xef; lead++)
+    {
+      const unsigned char pair[2] = { (unsigned char) lead, ' ' };
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *) pair, 2));
+    }
+
+  /* "Lonely start characters" for 4-byte sequences.  */
+  for (int lead = 0xf0; lead <= 0xf4; lead++)
+    {
+      const unsigned char pair[2] = { (unsigned char) lead, ' ' };
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *) pair, 2));
+    }
+
+  /* Invalid start characters (formerly valid for 5-byte and 6-byte
+     sequences).  */
+  for (int lead = 0xf5; lead <= 0xfd; lead++)
+    {
+      const unsigned char pair[2] = { (unsigned char) lead, ' ' };
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *) pair, 2));
+    }
+
+  /* Impossible bytes.  */
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
}
/* Run all of the selftests within this file. */
test_line_offset_overflow ();
test_cpp_utf8 ();
+ test_cpp_valid_utf8_p ();
}
} // namespace selftest
extern char_span location_get_source_line (const char *file_path, int line);
extern char *get_source_text_between (location_t, location_t);
+extern char_span get_source_file_content (const char *file_path);
extern bool location_missing_trailing_newline (const char *file_path);
#warning message
/* Verify that some JSON was written to a file with the expected name. */
+/* { dg-final { verify-sarif-file } } */
/* We expect various properties.
The indentation here reflects the expected hierarchy, though these tests
}
/*
+ { dg-final { verify-sarif-file } }
+
{ dg-final { scan-sarif-file "\"level\": \"warning\"" } }
{ dg-final { scan-sarif-file "\"ruleId\": \"-Wmisleading-indentation\"" } }
{ dg-final { scan-sarif-file "\"text\": \" if " } }
}
/*
+ { dg-final { verify-sarif-file } }
+
{ dg-final { scan-sarif-file "\"level\": \"error\"" } }
We expect a logical location for the error (within fn "test"):
}
/*
+ { dg-final { verify-sarif-file } }
+
{ dg-final { scan-sarif-file "\"level\": \"error\"" } }
We expect the region expressed in display columns:
--- /dev/null
+/* Adapted from Wbidi-chars-1.c */
+
+/* PR preprocessor/103026 */
+/* { dg-do compile } */
+/* { dg-options "-fdiagnostics-format=sarif-file" } */
+
+int main() {
+ int isAdmin = 0;
+ /* } if (isAdmin) begin admins only */
+ __builtin_printf("You are an admin.\n");
+ /* end admins only { */
+ return 0;
+}
+
+/* Verify that we generate a valid UTF-8 .sarif file.
+
+ { dg-final { verify-sarif-file } }
+
+ Verify that we captured the expected warnings.
+
+ { dg-final { scan-sarif-file {"text": "unpaired UTF-8 bidirectional control characters detected"} } }
+ { dg-final { scan-sarif-file {"text": "unpaired UTF-8 bidirectional control characters detected"} } }
+*/
--- /dev/null
+/* Try to process this explicitly as UTF-8.
+
+ { dg-do preprocess }
+ { dg-options "-finput-charset=UTF-8 -Winvalid-utf8 -fdiagnostics-format=sarif-file" } */
+
+/* This comment intentionally contains non-UTF-8 bytes:
+ * \80\98<unknown>\80\99 may be used uninitialized
+ */
+
+/*
+ { dg-final { verify-sarif-file } }
+
+ Verify that we captured the expected warnings.
+
+ { dg-final { scan-sarif-file "\"results\": \\\[" } }
+ { dg-final { scan-sarif-file "\"level\": \"warning\"" } }
+ { dg-final { scan-sarif-file "\"ruleId\": \"-Winvalid-utf8\"" } }
+ { dg-final { scan-sarif-file "\"message\": " } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <98>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <99>"} } }
+*/
--- /dev/null
+/* Try to process this explicitly as UTF-8. */
+
+/* { dg-do compile } */
+/* { dg-options "-finput-charset=utf-8 -fdiagnostics-format=sarif-file" } */
+/* { dg-excess-errors "The error is sent to the SARIF file, rather than stderr" } */
+
+const char *section = "þ"
+
+/* The above in quotes is byte 0xFE which is not valid in UTF-8.
+ Verify that we can generate a valid UTF-8 .sarif file complaining
+ about the missing semicolon above. */
+
+/* { dg-final { verify-sarif-file } }
+
+ { dg-final { scan-sarif-file {"text": "expected ',' or ';' at end of input"} } }
+*/
--- /dev/null
+/* Adapted from cpp/Winvalid-utf8-1.c
+
+ P2295R6 - Support for UTF-8 as a portable source file encoding
+ This test intentionally contains various byte sequences which are not valid UTF-8
+ { dg-do preprocess }
+ { dg-options "-finput-charset=UTF-8 -Winvalid-utf8 -fdiagnostics-format=sarif-file" } */
+
+// a\80߿ࠀ𐀀a
+// a\80a
+// a¿a
+// aÀa
+// aÁa
+// aõa
+// aÿa
+// aÂa
+// aàa
+// aà\80¿a
+// aà\9f\80a
+// aà¿a
+// aì\80a
+// aa
+// að\80\80\80a
+// að\8f¿¿a
+// aa
+// a
+/* a\80߿ࠀ𐀀a */
+/* a\80a */
+/* a¿a */
+/* aÀa */
+/* aÁa */
+/* aõa */
+/* aÿa */
+/* aÂa */
+/* aàa */
+/* aà\80¿a */
+/* aà\9f\80a */
+/* aà¿a */
+/* aì\80a */
+/* aa */
+/* að\80\80\80a */
+/* að\8f¿¿a */
+/* aa */
+/* aa */
+
+
+
+/* Verify that we generate a valid UTF-8 .sarif file.
+
+ { dg-final { verify-sarif-file } }
+
+ Verify that we captured the expected warnings.
+
+ { dg-final { scan-sarif-file "\"results\": \\\[" } }
+ { dg-final { scan-sarif-file "\"level\": \"warning\"" } }
+ { dg-final { scan-sarif-file "\"ruleId\": \"-Winvalid-utf8\"" } }
+ { dg-final { scan-sarif-file "\"message\": " } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c0>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c1>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f5>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ff>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c2>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><80><bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><9f><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ec><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ed><a0><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><80><80><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><8f><bf><bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f4><90><80><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <fd><bf><bf><bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c0>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c1>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f5>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ff>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c2>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><80><bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><9f><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ec><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ed><a0><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><80><80><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><8f><bf><bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f4><90><80><80>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <fd><bf><bf><bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+ { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+*/
--- /dev/null
+/* Adapted from gcc.dg/diagnostic-input-charset-1.c */
+/* { dg-do compile } */
+/* { dg-require-iconv "CP850" } */
+/* { dg-options "-finput-charset=CP850 -fdiagnostics-format=sarif-file" } */
+/* { dg-excess-errors "The error is sent to the SARIF file, rather than stderr" } */
+
+/* Test that diagnostics are converted to UTF-8; this file is encoded in
+ CP850.
+
+ The non-ASCII byte here is 0xf5, which when decoded as CP850
+ is U+00A7 SECTION SIGN */
+const char *section = "õ"
+
+/*
+ { dg-final { verify-sarif-file } }
+
+ Verify that we captured the expected warning, and converted the snippet to
+ UTF-8 on output.
+
+ { dg-final { scan-sarif-file {"text": "expected ',' or ';' at end of input"} } }
+ { dg-final { scan-sarif-file {"text": "const char .section = \\"\u00a7\\"} } }
+*/
}
/* Verify that some JSON was written to a file with the expected name. */
+/* { dg-final { verify-sarif-file } } */
/* We expect various properties.
The indentation here reflects the expected hierarchy, though these tests
}
/* Verify that some JSON was written to a file with the expected name. */
+/* { dg-final { verify-sarif-file } } */
/* We expect various properties.
The indentation here reflects the expected hierarchy, though these tests
}
/*
+ { dg-final { verify-sarif-file } }
+
{ dg-final { scan-sarif-file "\"tool\": " } }
We expect info about the plugin:
dg-scan "scan-sarif-file-not" 0 $testcase $output_file $args
}
+
+# Run validity checks on the .sarif file written for the current testcase.
+#
+# If check_effective_target_recent_python3 succeeds, run
+# lib/verify-sarif-file.py on the file to check that it is UTF-8 encoded
+# and parseable as JSON; otherwise report the check as unsupported.
+
+proc verify-sarif-file { args } {
+    global srcdir subdir
+
+    set testcase [testname-for-summary]
+    set source_name [lindex $testcase 0]
+    set sarif_path "[file tail $source_name].sarif"
+
+    if { ![check_effective_target_recent_python3] } {
+        unsupported "$testcase verify-sarif-file: python3 is missing"
+        return
+    }
+
+    # The checker script exits with an error if the file is not valid;
+    # catch turns that into a FAIL for this testcase.
+    set checker $srcdir/lib/verify-sarif-file.py
+    set what "$testcase (test .sarif output for UTF-8-encoded parseable JSON)"
+    if { [catch {exec python3 $checker $sarif_path} res] } {
+        verbose "verify-sarif-file: res: $res" 2
+        fail "$what"
+        return
+    }
+    pass "$what"
+}
--- /dev/null
+# Verify that ARGV[1] is UTF-8 encoded and parseable as JSON
+# For use by the verify-sarif-file directive
+
+import json
+import sys
+
+sys.tracebacklimit = 0
+
+fname = sys.argv[1]
+with open(fname, encoding="utf-8") as f:
+ json.load(f)
return true;
}
+/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8,
+   in the RFC 3629 sense: sequences of 1 to 4 bytes encoding code
+   points no greater than U+10FFFF.  An empty buffer is valid.  */
+
+extern bool
+cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
+{
+  const uchar *iter = (const uchar *)buffer;
+  size_t bytesleft = num_bytes;
+  while (bytesleft > 0)
+    {
+      /* one_utf8_to_cppchar implements 5-byte and 6 byte sequences as per
+         RFC 2279, but this has been superseded by RFC 3629, which
+         restricts UTF-8 to 1-byte through 4-byte sequences, and
+         states "the octet values C0, C1, F5 to FF never appear".
+
+         Reject such values.  Note that 0xF4 is *not* one of them: it is
+         the lead byte for U+100000 through U+10FFFF.  */
+      if (*iter == 0xc0 || *iter == 0xc1 || *iter > 0xf4)
+        return false;
+
+      cppchar_t cp;
+      int err = one_utf8_to_cppchar (&iter, &bytesleft, &cp);
+      if (err)
+        return false;
+
+      /* A lead byte of 0xF4 can also introduce values beyond the
+         RFC 3629 limit (e.g. F4 90 80 80); make sure those are
+         rejected even if the decoder accepted them.  */
+      if (cp > 0x10ffff)
+        return false;
+    }
+  /* No problems encountered.  */
+  return true;
+}
+
/* Subroutine of convert_hex and convert_oct. N is the representation
in the execution character set of a numeric escape; write it into the
string buffer TBUF and update the end-of-string pointer therein. WIDE
bool cpp_input_conversion_is_trivial (const char *input_charset);
int cpp_check_utf8_bom (const char *data, size_t data_length);
+bool cpp_valid_utf8_p (const char *data, size_t num_bytes);
#endif /* ! LIBCPP_CPPLIB_H */