gccrs: add utf-8 validation for input source

author Raiki Tamura <tamaron1203@gmail.com>

Tue, 4 Jul 2023 09:21:48 +0000 (18:21 +0900)

committer Arthur Cohen <arthur.cohen@embecosm.com>

Tue, 16 Jan 2024 17:49:31 +0000 (18:49 +0100)
author Raiki Tamura <tamaron1203@gmail.com>
Tue, 4 Jul 2023 09:21:48 +0000 (18:21 +0900)
committer Arthur Cohen <arthur.cohen@embecosm.com>
Tue, 16 Jan 2024 17:49:31 +0000 (18:49 +0100)
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc

index f14b96c7af5ac5159ff0a0bd5934a436dc4c266a..e5d318d5af90d91001876eeb037fb98be53f3fbb 100644 (file)
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -167,6 +167,12 @@ Lexer::~Lexer ()
    // line_map->stop();
  }
  
+bool
+Lexer::input_source_is_valid_utf8 ()
+{
+  return raw_input_source->is_valid ();
+}
+
  /* TODO: need to optimise somehow to avoid the virtual function call in the
   * tight loop. Best idea at the moment is CRTP, but that might make lexer
   * implementation annoying when storing the "base class" (i.e. would need
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h

index 2b86239c957ab9c165389a6e940d4ae813b2ac8d..0c7b998feec67dd35ea70bae71dd38f94e30c606 100644 (file)
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -175,6 +175,8 @@ public:
    Lexer (Lexer &&other) = default;
    Lexer &operator= (Lexer &&other) = default;
  
+  bool input_source_is_valid_utf8 ();
+
    // Returns token n tokens ahead of current position.
    const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
    // Peeks the current token.
@@ -217,9 +219,9 @@ public:
  
      Codepoint next_codepoint ()
      {
-      uint8_t input = next_byte ();
+      uint32_t input = next_byte ();
  
-      if ((int8_t) input == EOF)
+      if ((int32_t) input == EOF)
         return Codepoint::eof ();
        else if (input < 128)
         {
@@ -246,11 +248,13 @@ public:
           // 3 bytes or UTF-8 BOM
           uint8_t input2 = next_byte ();
           // If the second byte is equal to 0xBB then the input is no longer a
-         // valid UTF-8 char.
+         // valid UTF-8 char. Then, we check if the third byte makes up a
+         // UTF-8 BOM.
           if (input == 0xEF && input2 == 0xBB)
             {
               uint8_t input3 = next_byte ();
               if (input3 == 0xBF)
+               // found BOM
                 return next_codepoint ();
               else
                 return {0xFFFE};
@@ -289,8 +293,6 @@ public:
         }
        else
         {
-         // rust_error_at (get_current_location (),
-         //   "invalid UTF-8 [SECND] (too long)");
           return {0xFFFE};
         }
      }
@@ -362,8 +364,7 @@ public:
      {
        if (offs >= buffer.size ())
         return EOF;
-
-      return buffer.at (offs++);
+      return (uint8_t) buffer.at (offs++);
      }
  
    public:
diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc

index 8613c1c9cc8e2e2dcb76aafe4adc869641ddee0b..aaf19f380fc9ea58a2db753d32177851f75b7a3b 100644 (file)
--- a/gcc/rust/rust-session-manager.cc
+++ b/gcc/rust/rust-session-manager.cc
@@ -497,6 +497,14 @@ Session::compile_crate (const char *filename)
  
    Lexer lex (filename, std::move (file_wrap), linemap, dump_lex_opt);
  
+  if (!lex.input_source_is_valid_utf8 ())
+    {
+      rust_error_at (Linemap::unknown_location (),
+                    "cannot read %s; stream did not contain valid UTF-8",
+                    filename);
+      return;
+    }
+
    Parser<Lexer> parser (lex);
  
    // generate crate from parser
diff --git a/gcc/testsuite/rust/compile/broken_utf8.rs b/gcc/testsuite/rust/compile/broken_utf8.rs

new file mode 100644 (file)

index 0000000..8053b83
--- /dev/null
+++ b/gcc/testsuite/rust/compile/broken_utf8.rs
@@ -0,0 +1,2 @@
+// { dg-excess-errors "stream did not contain valid UTF-8" }
+ÿ
+\ No newline at end of file
author	Raiki Tamura <tamaron1203@gmail.com>
	Tue, 4 Jul 2023 09:21:48 +0000 (18:21 +0900)
committer	Arthur Cohen <arthur.cohen@embecosm.com>
	Tue, 16 Jan 2024 17:49:31 +0000 (18:49 +0100)
gcc/rust/lex/rust-lex.cc		patch \| blob \| blame \| history
gcc/rust/lex/rust-lex.h		patch \| blob \| blame \| history
gcc/rust/rust-session-manager.cc		patch \| blob \| blame \| history
gcc/testsuite/rust/compile/broken_utf8.rs	[new file with mode: 0644]	patch \| blob