diagnostics: ensure that .sarif files are UTF-8 encoded [PR109098]

author David Malcolm <dmalcolm@redhat.com>

Sat, 25 Mar 2023 00:52:34 +0000 (20:52 -0400)

committer David Malcolm <dmalcolm@redhat.com>

Sat, 25 Mar 2023 00:52:34 +0000 (20:52 -0400)
author David Malcolm <dmalcolm@redhat.com>
Sat, 25 Mar 2023 00:52:34 +0000 (20:52 -0400)
committer David Malcolm <dmalcolm@redhat.com>
Sat, 25 Mar 2023 00:52:34 +0000 (20:52 -0400)
diff --git a/gcc/diagnostic-format-sarif.cc b/gcc/diagnostic-format-sarif.cc

index 2c48cbd46e23d62c8e688ec533308232026500a6..fd29ac2ca3b4ad9e6f3438ac5c9a8ec189a31c47 100644 (file)
--- a/gcc/diagnostic-format-sarif.cc
+++ b/gcc/diagnostic-format-sarif.cc
@@ -1390,76 +1390,25 @@ sarif_builder::make_artifact_object (const char *filename)
    return artifact_obj;
  }
  
-/* Read all data from F_IN until EOF.
-   Return a NULL-terminated buffer containing the data, which must be
-   freed by the caller.
-   Return NULL on errors.  */
-
-static char *
-read_until_eof (FILE *f_in)
-{
-  /* Read content, allocating a buffer for it.  */
-  char *result = NULL;
-  size_t total_sz = 0;
-  size_t alloc_sz = 0;
-  char buf[4096];
-  size_t iter_sz_in;
-
-  while ( (iter_sz_in = fread (buf, 1, sizeof (buf), f_in)) )
-    {
-      gcc_assert (alloc_sz >= total_sz);
-      size_t old_total_sz = total_sz;
-      total_sz += iter_sz_in;
-      /* Allow 1 extra byte for 0-termination.  */
-      if (alloc_sz < (total_sz + 1))
-       {
-         size_t new_alloc_sz = alloc_sz ? alloc_sz * 2: total_sz + 1;
-         result = (char *)xrealloc (result, new_alloc_sz);
-         alloc_sz = new_alloc_sz;
-       }
-      memcpy (result + old_total_sz, buf, iter_sz_in);
-    }
-
-  if (!feof (f_in))
-    return NULL;
-
-  /* 0-terminate the buffer.  */
-  gcc_assert (total_sz < alloc_sz);
-  result[total_sz] = '\0';
-
-  return result;
-}
-
-/* Read all data from FILENAME until EOF.
-   Return a NULL-terminated buffer containing the data, which must be
-   freed by the caller.
-   Return NULL on errors.  */
-
-static char *
-maybe_read_file (const char *filename)
-{
-  FILE *f_in = fopen (filename, "r");
-  if (!f_in)
-    return NULL;
-  char *result = read_until_eof (f_in);
-  fclose (f_in);
-  return result;
-}
-
  /* Make an artifactContent object (SARIF v2.1.0 section 3.3) for the
     full contents of FILENAME.  */
  
  json::object *
  sarif_builder::maybe_make_artifact_content_object (const char *filename) const
  {
-  char *text_utf8 = maybe_read_file (filename);
-  if (!text_utf8)
+  /* Let input.cc handle any charset conversion.  */
+  char_span utf8_content = get_source_file_content (filename);
+  if (!utf8_content)
      return NULL;
  
-  json::object *artifact_content_obj = new json::object ();
-  artifact_content_obj->set ("text", new json::string (text_utf8));
-  free (text_utf8);
+  /* Don't add it if it's not valid UTF-8.  */
+  if (!cpp_valid_utf8_p(utf8_content.get_buffer (), utf8_content.length ()))
+    return NULL;
  
+  json::object *artifact_content_obj = new json::object ();
+  artifact_content_obj->set ("text",
+                            new json::string (utf8_content.get_buffer (),
+                                              utf8_content.length ()));
    return artifact_content_obj;
  }
  
@@ -1501,6 +1450,13 @@ sarif_builder::maybe_make_artifact_content_object (const char *filename,
    if (!text_utf8)
      return NULL;
  
+  /* Don't add it if it's not valid UTF-8.  */
+  if (!cpp_valid_utf8_p(text_utf8, strlen(text_utf8)))
+    {
+      free (text_utf8);
+      return NULL;
+    }
+
    json::object *artifact_content_obj = new json::object ();
    artifact_content_obj->set ("text", new json::string (text_utf8));
    free (text_utf8);
diff --git a/gcc/input.cc b/gcc/input.cc

index 9702dbd39f0cedae0f1c865d62cfb181ffd6c5b4..eaf301ec7c15075d53f801e85180a91bcacc05d2 100644 (file)
--- a/gcc/input.cc
+++ b/gcc/input.cc
@@ -67,6 +67,7 @@ public:
    {
      return m_missing_trailing_newline;
    }
+  char_span get_full_file_content ();
  
    void inc_use_count () { m_use_count++; }
  
@@ -459,6 +460,20 @@ file_cache::add_file (const char *file_path)
    return r;
  }
  
+/* Return a borrowed char_span covering the whole of this file's content,
+   decoded according to the input charset and encoded as UTF-8.  */
+
+char_span
+file_cache_slot::get_full_file_content ()
+{
+  /* Call get_next_line until it returns false; the lines are unused.  */
+  char *ignored_line;
+  ssize_t ignored_len;
+  while (get_next_line (&ignored_line, &ignored_len))
+    continue;
+  return char_span (m_data, m_nb_read);
+}
+
  /* Populate this slot for use on FILE_PATH and FP, dropping any
     existing cached content within it.  */
  
@@ -1047,6 +1062,18 @@ get_source_text_between (location_t start, location_t end)
    return xstrdup (buf);
  }
  
+/* Get a borrowed char_span to the full content of FILE_PATH
+   as decoded according to the input charset, encoded as UTF-8.
+   Return an empty char_span if lookup_or_add_file gives back NULL.  */
+
+char_span
+get_source_file_content (const char *file_path)
+{
+  diagnostic_file_cache_init ();
+  file_cache_slot *c = global_dc->m_file_cache->lookup_or_add_file (file_path);
+  return c ? c->get_full_file_content () : char_span (NULL, 0);
+}
+
  /* Determine if FILE_PATH missing a trailing newline on its final line.
     Only valid to call once all of the file has been loaded, by
     requesting a line number beyond the end of the file.  */
@@ -4045,7 +4072,104 @@ void test_cpp_utf8 ()
           ASSERT_EQ (byte_col2, byte_col);
        }
    }
+}
+
+/* Helper for test_cpp_valid_utf8_p: run cpp_valid_utf8_p on the
+   NUL-terminated string STR, using strlen for its length (so STR is
+   only checked up to its first NUL byte).  */
+
+static bool
+check_cpp_valid_utf8_p (const char *str)
+{
+  return cpp_valid_utf8_p (str, strlen (str));
+}
+
+/* Check that cpp_valid_utf8_p works as expected: well-formed 1-byte
+   through 4-byte sequences and control codes must be accepted, whereas
+   stray continuation bytes, start bytes that are not followed by a
+   continuation byte, and the bytes 0xf5 through 0xff must be rejected.  */
+
+static void
+test_cpp_valid_utf8_p ()
+{
+  /* Plain ASCII.  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
+
+  /* 2-byte char (pi).  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80"));
+
+  /* 3-byte chars (the Japanese word "mojibake").  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p
+              (
+               /* U+6587 CJK UNIFIED IDEOGRAPH-6587
+                  UTF-8: 0xE6 0x96 0x87
+                  C octal escaped UTF-8: \346\226\207.  */
+               "\346\226\207"
+               /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
+                  UTF-8: 0xE5 0xAD 0x97
+                  C octal escaped UTF-8: \345\255\227.  */
+               "\345\255\227"
+               /* U+5316 CJK UNIFIED IDEOGRAPH-5316
+                  UTF-8: 0xE5 0x8C 0x96
+                  C octal escaped UTF-8: \345\214\226.  */
+               "\345\214\226"
+               /* U+3051 HIRAGANA LETTER KE
+                  UTF-8: 0xE3 0x81 0x91
+                  C octal escaped UTF-8: \343\201\221.  */
+               "\343\201\221"));
+
+  /* 4-byte char: an emoji.  */
+  ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
+
+  /* Control codes, including the NUL byte.  The length is passed
+     explicitly here, since strlen would stop at the embedded NUL.  */
+  ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
+
+  /* The four bytes of the emoji above, each followed by an ASCII '!',
+     so that no byte of the sequence is adjacent to the next one.  */
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
+
+  /* Unexpected continuation bytes.  */
+  for (unsigned char continuation_byte = 0x80;
+       continuation_byte <= 0xbf;
+       continuation_byte++)
+    ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1));
+
+  /* "Lonely start characters" for 2-byte sequences.  */
+  {
+    unsigned char buf[2];
+    /* Follow the start byte with an ASCII space, not a continuation byte.  */
+    buf[1] = ' ';
+    for (buf[0] = 0xc0;
+        buf[0] <= 0xdf;
+        buf[0]++)
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
+  }
+
+  /* "Lonely start characters" for 3-byte sequences.  */
+  {
+    unsigned char buf[2];
+    /* Follow the start byte with an ASCII space, not a continuation byte.  */
+    buf[1] = ' ';
+    for (buf[0] = 0xe0;
+        buf[0] <= 0xef;
+        buf[0]++)
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
+  }
+
+  /* "Lonely start characters" for 4-byte sequences.  */
+  {
+    unsigned char buf[2];
+    /* Follow the start byte with an ASCII space, not a continuation byte.  */
+    buf[1] = ' ';
+    for (buf[0] = 0xf0;
+        buf[0] <= 0xf4;
+        buf[0]++)
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
+  }
+
+  /* Invalid start characters (formerly valid for 5-byte and 6-byte
+     sequences).  */
+  {
+    unsigned char buf[2];
+    /* Follow the start byte with an ASCII space, not a continuation byte.  */
+    buf[1] = ' ';
+    for (buf[0] = 0xf5;
+        buf[0] <= 0xfd;
+        buf[0]++)
+      ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
+  }
  
+  /* Impossible bytes, each tested as a one-byte string.  */
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
+  ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
  }
  
  /* Run all of the selftests within this file.  */
@@ -4091,6 +4215,7 @@ input_cc_tests ()
    test_line_offset_overflow ();
  
    test_cpp_utf8 ();
+  test_cpp_valid_utf8_p ();
  }
  
  } // namespace selftest
diff --git a/gcc/input.h b/gcc/input.h

index 9d68648bb3c54df6ca5d91afa231351ad5780e4e..d1087b7a9e8bf24e2bcb9b926b5c2d260b28e01a 100644 (file)
--- a/gcc/input.h
+++ b/gcc/input.h
@@ -115,6 +115,7 @@ class char_span
  
  extern char_span location_get_source_line (const char *file_path, int line);
  extern char *get_source_text_between (location_t, location_t);
+extern char_span get_source_file_content (const char *file_path);
  
  extern bool location_missing_trailing_newline (const char *file_path);
  
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c

index cad530954784dc0ef97f98b86996475ed916b1d9..f0dcaa705ca6c44b5177dabc40a9790ea18629a3 100644 (file)
--- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c
@@ -4,6 +4,7 @@
  #warning message
  
  /* Verify that some JSON was written to a file with the expected name.  */
+/* { dg-final { verify-sarif-file } } */
  
  /* We expect various properties.
     The indentation here reflects the expected hierarchy, though these tests
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c

index 8f5814d894eb01b067edcb533babeb1a40425152..02ee33f5265293a3563e4c66e389e719dc976f5e 100644 (file)
--- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c
@@ -10,6 +10,8 @@ int test (void)
  }
  
  /* 
+   { dg-final { verify-sarif-file } }
+
         { dg-final { scan-sarif-file "\"level\": \"warning\"" } }
         { dg-final { scan-sarif-file "\"ruleId\": \"-Wmisleading-indentation\"" } }
           { dg-final { scan-sarif-file "\"text\": \"  if " } }
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c

index 3856782b5ea9a7218028d2bcb4eb2191b487ea94..80954711db286b1108f44457c714e79f30f057ee 100644 (file)
--- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c
@@ -10,6 +10,8 @@ int test (struct s *ptr)
  }
  
  /* 
+   { dg-final { verify-sarif-file } }
+
         { dg-final { scan-sarif-file "\"level\": \"error\"" } }
  
         We expect a logical location for the error (within fn "test"):
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c

index 2d22f54037ca7ff76072728fd31641f33fd45b42..bd13da714250e1fd49637d1585b21dfd672430fe 100644 (file)
--- a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c
@@ -8,6 +8,8 @@ int test (void)
  }
  
  /* 
+   { dg-final { verify-sarif-file } }
+
         { dg-final { scan-sarif-file "\"level\": \"error\"" } }
  
         We expect the region expressed in display columns:
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c

new file mode 100644 (file)

index 0000000..283df75
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c
@@ -0,0 +1,23 @@
+/* Adapted from Wbidi-chars-1.c */
+
+/* PR preprocessor/103026 */
+/* { dg-do compile } */
+/* { dg-options "-fdiagnostics-format=sarif-file" } */
+
+int main() {
+    int isAdmin = 0;
+    /*‮ } ⁦if (isAdmin)⁩ ⁦ begin admins only */
+        __builtin_printf("You are an admin.\n");
+    /* end admins only ‮ { ⁦*/
+    return 0;
+}
+
+/* Verify that we generate a valid UTF-8 .sarif file.
+
+     { dg-final { verify-sarif-file } }
+
+   Verify that we captured the expected warnings.
+
+     { dg-final { scan-sarif-file {"text": "unpaired UTF-8 bidirectional control characters detected"} } }
+     { dg-final { scan-sarif-file {"text": "unpaired UTF-8 bidirectional control characters detected"} } }
+*/
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c

new file mode 100644 (file)

index 0000000..47f8923
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c
@@ -0,0 +1,23 @@
+/* Try to process this explicitly as UTF-8.
+
+   { dg-do preprocess }
+   { dg-options "-finput-charset=UTF-8 -Winvalid-utf8 -fdiagnostics-format=sarif-file" } */
+
+/* This comment intentionally contains non-UTF-8 bytes:
+ *   \80\98<unknown>\80\99 may be used uninitialized
+ */
+
+/* 
+   { dg-final { verify-sarif-file } }
+
+   Verify that we captured the expected warnings.
+
+     { dg-final { scan-sarif-file "\"results\": \\\[" } }
+       { dg-final { scan-sarif-file "\"level\": \"warning\"" } }
+       { dg-final { scan-sarif-file "\"ruleId\": \"-Winvalid-utf8\"" } }
+       { dg-final { scan-sarif-file "\"message\": " } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <98>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <99>"} } }
+*/
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c

new file mode 100644 (file)

index 0000000..8395f1d
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c
@@ -0,0 +1,16 @@
+/* Try to process this explicitly as UTF-8.  */
+
+/* { dg-do compile } */
+/* { dg-options "-finput-charset=utf-8 -fdiagnostics-format=sarif-file" } */
+/* { dg-excess-errors "The error is sent to the SARIF file, rather than stderr" } */
+
+const char *section = "þ"
+
+/* The above in quotes is byte 0xFE which is not valid in UTF-8.
+   Verify that we can generate a valid UTF-8 .sarif file complaining
+   about the missing semicolon above.  */
+
+/* { dg-final { verify-sarif-file } }
+
+     { dg-final { scan-sarif-file {"text": "expected ',' or ';' at end of input"} } }
+*/
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c

new file mode 100644 (file)

index 0000000..ead03a5
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c
@@ -0,0 +1,95 @@
+/* Adapted from cpp/Winvalid-utf8-1.c
+
+   P2295R6 - Support for UTF-8 as a portable source file encoding
+   This test intentionally contains various byte sequences which are not valid UTF-8
+   { dg-do preprocess }
+   { dg-options "-finput-charset=UTF-8 -Winvalid-utf8 -fdiagnostics-format=sarif-file" } */
+
+// a\80߿ࠀ퟿𐀀􏿿a
+// a\80a
+// a¿a
+// aÀa
+// aÁa
+// aõa
+// aÿa
+// aÂa
+// aàa
+// aà\80¿a
+// aà\9f\80a
+// aà¿a
+// aì\80a
+// a���a
+// að\80\80\80a
+// að\8f¿¿a
+// a����a
+// a������
+/* a\80߿ࠀ퟿𐀀􏿿a */
+/* a\80a */
+/* a¿a */
+/* aÀa */
+/* aÁa */
+/* aõa */
+/* aÿa */
+/* aÂa */
+/* aàa */
+/* aà\80¿a */
+/* aà\9f\80a */
+/* aà¿a */
+/* aì\80a */
+/* a���a */
+/* að\80\80\80a */
+/* að\8f¿¿a */
+/* a����a */
+/* a������a */
+
+
+
+/* Verify that we generate a valid UTF-8 .sarif file.
+
+     { dg-final { verify-sarif-file } }
+
+   Verify that we captured the expected warnings.
+
+     { dg-final { scan-sarif-file "\"results\": \\\[" } }
+       { dg-final { scan-sarif-file "\"level\": \"warning\"" } }
+       { dg-final { scan-sarif-file "\"ruleId\": \"-Winvalid-utf8\"" } }
+       { dg-final { scan-sarif-file "\"message\": " } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c0>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c1>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f5>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ff>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c2>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><80><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><9f><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ec><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ed><a0><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><80><80><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><8f><bf><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f4><90><80><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <fd><bf><bf><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c0>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c1>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f5>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ff>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <c2>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><80><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><9f><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <e0><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ec><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <ed><a0><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><80><80><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f0><8f><bf><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <f4><90><80><80>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <fd><bf><bf><bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+         { dg-final { scan-sarif-file {"text": "invalid UTF-8 character <bf>"} } }
+*/
diff --git a/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c

new file mode 100644 (file)

index 0000000..a189274
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c
@@ -0,0 +1,22 @@
+/* Adapted from gcc.dg/diagnostic-input-charset-1.c  */
+/* { dg-do compile } */
+/* { dg-require-iconv "CP850" } */
+/* { dg-options "-finput-charset=CP850 -fdiagnostics-format=sarif-file" } */
+/* { dg-excess-errors "The error is sent to the SARIF file, rather than stderr" } */
+
+/* Test that diagnostics are converted to UTF-8; this file is encoded in
+   CP850.
+
+   The non-ASCII byte here is 0xf5, which when decoded as CP850
+   is U+00A7 SECTION SIGN  */
+const char *section = "õ"
+
+/* 
+   { dg-final { verify-sarif-file } }
+
+   Verify that we captured the expected warning, and converted the snippet to
+   UTF-8 on output.
+
+   { dg-final { scan-sarif-file {"text": "expected ',' or ';' at end of input"} } }
+   { dg-final { scan-sarif-file {"text": "const char .section = \\"\u00a7\\"} } }
+*/
diff --git a/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c b/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c

index 9186a3262ca451e92653163740d21f02a55c382b..3b773a9a84c857e35f34300bd7c17abd3cd1b126 100644 (file)
--- a/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c
+++ b/gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c
@@ -10,6 +10,7 @@ void test_inject_ice (void)
  }
  
  /* Verify that some JSON was written to a file with the expected name.  */
+/* { dg-final { verify-sarif-file } } */
  
  /* We expect various properties.
     The indentation here reflects the expected hierarchy, though these tests
diff --git a/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c b/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c

index 99de3f888d435a0170d6ae636a00fad09205a027..57caa20155fa4730dac72a299b13e0f77b8350e9 100644 (file)
--- a/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c
+++ b/gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c
@@ -10,6 +10,7 @@ void test_inject_write_through_null (void)
  }
  
  /* Verify that some JSON was written to a file with the expected name.  */
+/* { dg-final { verify-sarif-file } } */
  
  /* We expect various properties.
     The indentation here reflects the expected hierarchy, though these tests
diff --git a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c

index bd09391a8b28159a3b65f7c76e0ebe3c466647e6..27851d1b9f78c64aea3e3e1d828d44a01244b046 100644 (file)
--- a/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c
+++ b/gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c
@@ -34,6 +34,8 @@ make_a_list_of_random_ints_badly(PyObject *self,
  }
  
  /* 
+   { dg-final { verify-sarif-file } }
+
     { dg-final { scan-sarif-file "\"tool\": " } }
  
       We expect info about the plugin:
diff --git a/gcc/testsuite/lib/scansarif.exp b/gcc/testsuite/lib/scansarif.exp

index 05524aa6e7a0c8ad74bee739206db243e2ab788f..83e76c31de7df0bb711c88f62f03ac16869b3a43 100644 (file)
--- a/gcc/testsuite/lib/scansarif.exp
+++ b/gcc/testsuite/lib/scansarif.exp
@@ -51,3 +51,32 @@ proc scan-sarif-file-not { args } {
  
      dg-scan "scan-sarif-file-not" 0 $testcase $output_file $args
  }
+
+# Check that the .sarif file written for the current testcase is well-formed.
+#
+# If a recent python3 is available, run verify-sarif-file.py on it to confirm
+# that it is UTF-8 encoded and parses as JSON; otherwise report "unsupported".
+
+proc verify-sarif-file { args } {
+    global srcdir subdir
+
+    set testcase [testname-for-summary]
+    set sarif_path "[file tail [lindex $testcase 0]].sarif"
+
+    if { ![check_effective_target_recent_python3] } {
+       unsupported "$testcase verify-sarif-file: python3 is missing"
+       return
+    }
+
+    # "catch" yields 0 when the script ran without raising a Tcl error.
+    set checker $srcdir/lib/verify-sarif-file.py
+    set what "$testcase (test .sarif output for UTF-8-encoded parseable JSON)"
+    if { ![catch {exec python3 $checker $sarif_path} res] } {
+       pass "$what"
+       return
+    }
+
+    # Record the script's output at verbosity 2 to help diagnose the failure.
+    verbose "verify-sarif-file: res: $res" 2
+    fail "$what"
+}
diff --git a/gcc/testsuite/lib/verify-sarif-file.py b/gcc/testsuite/lib/verify-sarif-file.py

new file mode 100644 (file)

index 0000000..f1833f3
--- /dev/null
+++ b/gcc/testsuite/lib/verify-sarif-file.py
@@ -0,0 +1,11 @@
+# Verify that ARGV[1] is UTF-8 encoded and parseable as JSON
+# For use by the verify-sarif-file directive
+
+import json
+import sys
+
+sys.tracebacklimit = 0
+
+fname = sys.argv[1]
+with open(fname, encoding="utf-8") as f:
+    json.load(f)
diff --git a/libcpp/charset.cc b/libcpp/charset.cc

index 3c47d4f868b1b0da14e651abbb261d25762cc80c..d7f323b2cd526f3a42b5007f2037432ebadef585 100644 (file)
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1864,6 +1864,33 @@ _cpp_valid_utf8 (cpp_reader *pfile,
    return true;
  }
  
+/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8.  */
+
+extern bool
+cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
+{
+  const uchar *iter = (const uchar *)buffer;
+  size_t bytesleft = num_bytes;
+  while (bytesleft > 0)
+    {
+      /* one_utf8_to_cppchar implements 5-byte and 6 byte sequences as per
+        RFC 2279, but this has been superceded by RFC 3629, which
+        restricts UTF-8 to 1-byte through 4-byte sequences, and
+        states "the octet values C0, C1, F5 to FF never appear".
+
+        Reject such values.  */
+      if (*iter >= 0xf4)
+       return false;
+
+      cppchar_t cp;
+      int err = one_utf8_to_cppchar (&iter, &bytesleft, &cp);
+      if (err)
+       return false;
+    }
+  /* No problems encountered.  */
+  return true;
+}
+
  /* Subroutine of convert_hex and convert_oct.  N is the representation
     in the execution character set of a numeric escape; write it into the
     string buffer TBUF and update the end-of-string pointer therein.  WIDE
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h

index 8df071e1587641ea81eadb28515a997aa15cd3dd..a6f0abd894c2af9fa3bfb4e3f6cfec70dad652b0 100644 (file)
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -1600,5 +1600,6 @@ int cpp_wcwidth (cppchar_t c);
  
  bool cpp_input_conversion_is_trivial (const char *input_charset);
  int cpp_check_utf8_bom (const char *data, size_t data_length);
+bool cpp_valid_utf8_p (const char *data, size_t num_bytes);
  
  #endif /* ! LIBCPP_CPPLIB_H */
author	David Malcolm <dmalcolm@redhat.com>
	Sat, 25 Mar 2023 00:52:34 +0000 (20:52 -0400)
committer	David Malcolm <dmalcolm@redhat.com>
	Sat, 25 Mar 2023 00:52:34 +0000 (20:52 -0400)
gcc/diagnostic-format-sarif.cc		patch \| blob \| blame \| history
gcc/input.cc		patch \| blob \| blame \| history
gcc/input.h		patch \| blob \| blame \| history
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-1.c		patch \| blob \| blame \| history
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-2.c		patch \| blob \| blame \| history
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-3.c		patch \| blob \| blame \| history
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-4.c		patch \| blob \| blame \| history
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-Wbidi-chars.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-bad-utf8-pr109098-3.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/c-c++-common/diagnostic-format-sarif-file-valid-CP850.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.dg/plugin/crash-test-ice-sarif.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/plugin/crash-test-write-though-null-sarif.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/plugin/diagnostic-test-paths-5.c		patch \| blob \| blame \| history
gcc/testsuite/lib/scansarif.exp		patch \| blob \| blame \| history
gcc/testsuite/lib/verify-sarif-file.py	[new file with mode: 0644]	patch \| blob
libcpp/charset.cc		patch \| blob \| blame \| history
libcpp/include/cpplib.h		patch \| blob \| blame \| history