gccrs: Normalize all identifier tokens

author Raiki Tamura <tamaron1203@gmail.com>

Sun, 30 Jul 2023 10:54:36 +0000 (19:54 +0900)

committer Arthur Cohen <arthur.cohen@embecosm.com>

Tue, 16 Jan 2024 18:00:28 +0000 (19:00 +0100)
author Raiki Tamura <tamaron1203@gmail.com>
Sun, 30 Jul 2023 10:54:36 +0000 (19:54 +0900)
committer Arthur Cohen <arthur.cohen@embecosm.com>
Tue, 16 Jan 2024 18:00:28 +0000 (19:00 +0100)
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h

new file mode 100644 (file)

index 0000000..07137de
--- /dev/null
+++ b/gcc/rust/lex/rust-input-source.h
@@ -0,0 +1,193 @@
+#ifndef RUST_INPUT_SOURCE_H
+#define RUST_INPUT_SOURCE_H
+
+#include "rust-codepoint.h"
+#include "optional.h"
+
+namespace Rust {
+// Input source wrapper thing.
+class InputSource
+{
+private:
+  // position of current character
+  unsigned int pos;
+  std::vector<Codepoint> chars;
+  bool is_valid_utf8;
+
+  // Returns the next raw byte from the input stream, or EOF at end of input.
+  virtual int next_byte () = 0;
+
+  Codepoint next_codepoint ()
+  {
+    uint32_t input = next_byte ();
+
+    if ((int32_t) input == EOF)
+      return Codepoint::eof ();
+    else if (input < 128)
+      {
+       // ascii -- 1 byte
+       return {input};
+      }
+    else if ((input & 0xC0) == 0x80)
+      {
+       // invalid (continuation; can't be first char)
+       return {0xFFFE};
+      }
+    else if ((input & 0xE0) == 0xC0)
+      {
+       // 2 bytes
+       uint8_t input2 = next_byte ();
+       if ((input2 & 0xC0) != 0x80)
+         return {0xFFFE};
+
+       uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+       return output;
+      }
+    else if ((input & 0xF0) == 0xE0)
+      {
+       // 3 bytes or UTF-8 BOM
+       uint8_t input2 = next_byte ();
+       // The bytes 0xEF 0xBB may start a UTF-8 BOM (0xEF 0xBB 0xBF). If the
+       // third byte completes the BOM, skip it and decode the next codepoint;
+       // otherwise this code reports the sequence as invalid.
+       if (input == 0xEF && input2 == 0xBB)
+         {
+           uint8_t input3 = next_byte ();
+           if (input3 == 0xBF)
+             // found BOM
+             return next_codepoint ();
+           else
+             return {0xFFFE};
+         }
+
+       if ((input2 & 0xC0) != 0x80)
+         return {0xFFFE};
+
+       uint8_t input3 = next_byte ();
+
+       if ((input3 & 0xC0) != 0x80)
+         return {0xFFFE};
+
+       uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+                         | ((input3 & 0x3F) << 0);
+       return {output};
+      }
+    else if ((input & 0xF8) == 0xF0)
+      {
+       // 4 bytes
+       uint8_t input2 = next_byte ();
+       if ((input2 & 0xC0) != 0x80)
+         return {0xFFFE};
+
+       uint8_t input3 = next_byte ();
+       if ((input3 & 0xC0) != 0x80)
+         return {0xFFFE};
+
+       uint8_t input4 = next_byte ();
+       if ((input4 & 0xC0) != 0x80)
+         return {0xFFFE};
+
+       uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+                         | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+       return {output};
+      }
+    else
+      {
+       return {0xFFFE};
+      }
+  }
+
+protected:
+  // Check if the input source is valid as utf-8 and copy all characters to
+  // `chars`.
+  void init ()
+  {
+    Codepoint char32 = next_codepoint ();
+    while (!char32.is_eof () && char32 != 0xFFFE)
+      {
+       chars.push_back (char32);
+       char32 = next_codepoint ();
+      }
+
+    if (char32 == 0xFFFE)
+      {
+       // Input source is not valid as utf-8.
+       is_valid_utf8 = false;
+      }
+  }
+
+public:
+  InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
+
+  virtual ~InputSource () {}
+
+  // Checks if input source is a valid UTF-8 string
+  bool is_valid () { return is_valid_utf8; }
+
+  // get the next UTF-8 character
+  Codepoint next ()
+  {
+    if (pos >= chars.size ())
+      return Codepoint::eof ();
+    else
+      {
+       Codepoint c = chars[pos];
+       pos++;
+       return c;
+      }
+  }
+
+  // Returns the codepoints if the input source is valid UTF-8. Returns
+  // nullopt otherwise.
+  tl::optional<std::vector<Codepoint>> get_chars ()
+  {
+    if (is_valid ())
+      return {chars};
+    else
+      return tl::nullopt;
+  }
+};
+
+class FileInputSource : public InputSource
+{
+private:
+  // Input source file.
+  FILE *input;
+
+  int next_byte () override { return fgetc (input); }
+
+public:
+  // Create new input source from file.
+  FileInputSource (FILE *input) : InputSource (), input (input)
+  {
+    // TODO make this better?
+    init ();
+  }
+};
+
+class BufferInputSource : public InputSource
+{
+private:
+  const std::string &buffer;
+  size_t offs;
+
+  int next_byte () override
+  {
+    if (offs >= buffer.size ())
+      return EOF;
+    return (uint8_t) buffer.at (offs++);
+  }
+
+public:
+  // Create new input source from a string buffer, starting at `offset`.
+  BufferInputSource (const std::string &b, size_t offset)
+    : InputSource (), buffer (b), offs (offset)
+  {
+    // TODO make this better?
+    init ();
+  }
+};
+
+} // namespace Rust
+
+#endif
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc

index 7eb9142322ec992aaa42506050c133ff6a8c95ae..53895c1419921fabaaa962eaed787d986191efd4 100644 (file)
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -2534,8 +2534,7 @@ namespace selftest {
  
  // Checks if `src` has the same contents as the given characters
  void
-assert_source_content (Rust::Lexer::InputSource &src,
-                      std::vector<uint32_t> expected)
+assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
  {
    Rust::Codepoint src_char = src.next ();
    for (auto expected_char : expected)
@@ -2553,7 +2552,7 @@ assert_source_content (Rust::Lexer::InputSource &src,
  void
  test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
  {
-  Rust::Lexer::BufferInputSource source (str, 0);
+  Rust::BufferInputSource source (str, 0);
    assert_source_content (source, expected);
  }
  
@@ -2564,7 +2563,7 @@ test_file_input_source (std::string str, std::vector<uint32_t> expected)
    // Moves to the first character
    fputs (str.c_str (), tmpf);
    std::rewind (tmpf);
-  Rust::Lexer::FileInputSource source (tmpf);
+  Rust::FileInputSource source (tmpf);
    assert_source_content (source, expected);
  }
  
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h

index 91e814b76f3d1f71ffd85667a25e0f1afce9b46e..683e8c67a89f7016fc48063a427a58ed64514343 100644 (file)
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -24,6 +24,7 @@
  #include "rust-token.h"
  #include "optional.h"
  #include "selftest.h"
+#include "rust-input-source.h"
  
  namespace Rust {
  // Simple wrapper for FILE* that simplifies destruction.
@@ -204,186 +205,6 @@ public:
    Linemap *get_line_map () { return line_map; }
    std::string get_filename () { return std::string (input.get_filename ()); }
  
-  // Input source wrapper thing.
-  class InputSource
-  {
-  private:
-    // position of current character
-    unsigned int pos;
-    std::vector<Codepoint> chars;
-    bool is_valid_utf8;
-
-    // Overload operator () to return next char from input stream.
-    virtual int next_byte () = 0;
-
-    Codepoint next_codepoint ()
-    {
-      uint32_t input = next_byte ();
-
-      if ((int32_t) input == EOF)
-       return Codepoint::eof ();
-      else if (input < 128)
-       {
-         // ascii -- 1 byte
-         return {input};
-       }
-      else if ((input & 0xC0) == 0x80)
-       {
-         // invalid (continuation; can't be first char)
-         return {0xFFFE};
-       }
-      else if ((input & 0xE0) == 0xC0)
-       {
-         // 2 bytes
-         uint8_t input2 = next_byte ();
-         if ((input2 & 0xC0) != 0x80)
-           return {0xFFFE};
-
-         uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-         return output;
-       }
-      else if ((input & 0xF0) == 0xE0)
-       {
-         // 3 bytes or UTF-8 BOM
-         uint8_t input2 = next_byte ();
-         // If the second byte is equal to 0xBB then the input is no longer a
-         // valid UTF-8 char. Then, we check if the third byte makes up a UTF
-         // BOM.
-         if (input == 0xEF && input2 == 0xBB)
-           {
-             uint8_t input3 = next_byte ();
-             if (input3 == 0xBF)
-               // found BOM
-               return next_codepoint ();
-             else
-               return {0xFFFE};
-           }
-
-         if ((input2 & 0xC0) != 0x80)
-           return {0xFFFE};
-
-         uint8_t input3 = next_byte ();
-
-         if ((input3 & 0xC0) != 0x80)
-           return {0xFFFE};
-
-         uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
-                           | ((input3 & 0x3F) << 0);
-         return {output};
-       }
-      else if ((input & 0xF8) == 0xF0)
-       {
-         // 4 bytes
-         uint8_t input2 = next_byte ();
-         if ((input2 & 0xC0) != 0x80)
-           return {0xFFFE};
-
-         uint8_t input3 = next_byte ();
-         if ((input3 & 0xC0) != 0x80)
-           return {0xFFFE};
-
-         uint8_t input4 = next_byte ();
-         if ((input4 & 0xC0) != 0x80)
-           return {0xFFFE};
-
-         uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                           | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-         return {output};
-       }
-      else
-       {
-         return {0xFFFE};
-       }
-    }
-
-  protected:
-    // Check if the input source is valid as utf-8 and copy all characters to
-    // `chars`.
-    void init ()
-    {
-      Codepoint char32 = next_codepoint ();
-      while (!char32.is_eof () && char32 != 0xFFFE)
-       {
-         chars.push_back (char32);
-         char32 = next_codepoint ();
-       }
-
-      if (char32 == 0xFFFE)
-       {
-         // Input source is not valid as utf-8.
-         is_valid_utf8 = false;
-       }
-    }
-
-  public:
-    InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
-
-    virtual ~InputSource () {}
-
-    bool is_valid () { return is_valid_utf8; }
-
-    // get the next UTF-8 character
-    Codepoint next ()
-    {
-      if (pos >= chars.size ())
-       return Codepoint::eof ();
-      else
-       {
-         Codepoint c = chars[pos];
-         pos++;
-         return c;
-       }
-    }
-
-    tl::optional<std::vector<Codepoint>> get_chars ()
-    {
-      if (is_valid ())
-       return {chars};
-      else
-       return tl::nullopt;
-    }
-  };
-
-  class FileInputSource : public InputSource
-  {
-  private:
-    // Input source file.
-    FILE *input;
-
-    int next_byte () override { return fgetc (input); }
-
-  public:
-    // Create new input source from file.
-    FileInputSource (FILE *input) : InputSource (), input (input)
-    {
-      // TODO make this better?
-      init ();
-    }
-  };
-
-  class BufferInputSource : public InputSource
-  {
-  private:
-    const std::string &buffer;
-    size_t offs;
-
-    int next_byte () override
-    {
-      if (offs >= buffer.size ())
-       return EOF;
-      return (uint8_t) buffer.at (offs++);
-    }
-
-  public:
-    // Create new input source from file.
-    BufferInputSource (const std::string &b, size_t offset)
-      : InputSource (), buffer (b), offs (offset)
-    {
-      // TODO make this better?
-      init ();
-    }
-  };
-
  private:
    void start_line (int current_line, int current_column);
  
diff --git a/gcc/rust/lex/rust-token.cc b/gcc/rust/lex/rust-token.cc

index 8807017bdbf3f898588d9b4c6c5f9f8819dde654..868946f65f3ca13373d75829accef331b4cfd146 100644 (file)
--- a/gcc/rust/lex/rust-token.cc
+++ b/gcc/rust/lex/rust-token.cc
@@ -19,6 +19,7 @@
  #include "rust-system.h"
  #include "rust-token.h"
  #include "rust-diagnostics.h"
+#include "rust-unicode.h"
  
  namespace Rust {
  // Hackily defined way to get token description for enum value using x-macros
@@ -150,6 +151,23 @@ Token::get_type_hint_str () const
    return get_type_hint_string (type_hint);
  }
  
+std::string
+nfc_normalize_token_string (location_t loc, TokenId id, const std::string &str)
+{
+  if (id == IDENTIFIER || id == LIFETIME)
+    {
+      tl::optional<Utf8String> ustring = Utf8String::make_utf8_string (str);
+      if (ustring.has_value ())
+       return ustring.value ().nfc_normalize ().as_string ();
+      else
+       rust_internal_error_at (loc,
+                               "identifier '%s' is not a valid UTF-8 string",
+                               str.c_str ());
+    }
+  else
+    return str;
+}
+
  const std::string &
  Token::get_str () const
  {
diff --git a/gcc/rust/lex/rust-token.h b/gcc/rust/lex/rust-token.h

index acbbeb010a14489fd05fc20f589a097ad94b6397..5da8b6cd2002a6ba5ebeb34d6a63558a8b510806 100644 (file)
--- a/gcc/rust/lex/rust-token.h
+++ b/gcc/rust/lex/rust-token.h
@@ -21,7 +21,8 @@
  
  #include "rust-system.h"
  #include "rust-linemap.h"
-#include "rust-codepoint.h"
+#include "rust-make-unique.h"
+#include "rust-unicode.h"
  
  namespace Rust {
  // "Primitive core types" in Rust - the different int and float types, as well
@@ -236,6 +237,10 @@ token_id_keyword_string (TokenId id);
  const char *
  get_type_hint_string (PrimitiveCoreType type);
  
+/* Normalize string if a token is an identifier or a lifetime */
+std::string
+nfc_normalize_token_string (location_t loc, TokenId id, const std::string &str);
+
  // Represents a single token. Create using factory static methods.
  class Token
  {
@@ -259,29 +264,40 @@ private:
  
    // Token constructor from token id, location, and a string.
    Token (TokenId token_id, location_t location, std::string &&paramStr)
-    : token_id (token_id), locus (location),
-      str (new std::string (std::move (paramStr))), type_hint (CORETYPE_UNKNOWN)
-  {}
+    : token_id (token_id), locus (location), type_hint (CORETYPE_UNKNOWN)
+  {
+    // Normalize identifier tokens
+    str = Rust::make_unique<std::string> (
+      nfc_normalize_token_string (location, token_id, paramStr));
+  }
  
    // Token constructor from token id, location, and a char.
    Token (TokenId token_id, location_t location, char paramChar)
      : token_id (token_id), locus (location),
        str (new std::string (1, paramChar)), type_hint (CORETYPE_UNKNOWN)
-  {}
+  {
+    // No need to normalize a single-byte char
+  }
  
    // Token constructor from token id, location, and a "codepoint".
    Token (TokenId token_id, location_t location, Codepoint paramCodepoint)
-    : token_id (token_id), locus (location),
-      str (new std::string (paramCodepoint.as_string ())),
-      type_hint (CORETYPE_UNKNOWN)
-  {}
+    : token_id (token_id), locus (location), type_hint (CORETYPE_UNKNOWN)
+  {
+    // Normalize identifier tokens
+    str = Rust::make_unique<std::string> (
+      nfc_normalize_token_string (location, token_id,
+                                 paramCodepoint.as_string ()));
+  }
  
    // Token constructor from token id, location, a string, and type hint.
    Token (TokenId token_id, location_t location, std::string &&paramStr,
          PrimitiveCoreType parType)
-    : token_id (token_id), locus (location),
-      str (new std::string (std::move (paramStr))), type_hint (parType)
-  {}
+    : token_id (token_id), locus (location), type_hint (parType)
+  {
+    // Normalize identifier tokens
+    str = Rust::make_unique<std::string> (
+      nfc_normalize_token_string (location, token_id, paramStr));
+  }
  
  public:
    // No default constructor.
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc

index 1679d5598efa5deeaff19f9adb6fb1186699b128..157d83f506f4f7631ebc8dd6facf9cb48c72bb46 100644 (file)
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -452,11 +452,11 @@ run_rust_tests ()
  {
    // Call tests for the rust frontend here
    rust_input_source_test ();
+  rust_utf8_normalize_test ();
    rust_cfg_parser_test ();
    rust_privacy_ctx_test ();
    rust_crate_name_validation_test ();
    rust_simple_path_resolve_test ();
-  rust_utf8_normalize_test ();
  }
  } // namespace selftest
  
diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc

index 401133c14dc8504bdfb43eea7701c051ae0e5701..55f6ec5585bb821d9af7579e889a7ea024549f69 100644 (file)
--- a/gcc/rust/rust-session-manager.cc
+++ b/gcc/rust/rust-session-manager.cc
@@ -115,16 +115,15 @@ infer_crate_name (const std::string &filename)
  static bool
  validate_crate_name (const std::string &crate_name, Error &error)
  {
-  Utf8String utf8_name = {crate_name};
-  tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars ();
-
-  if (!uchars_opt.has_value ())
+  tl::optional<Utf8String> utf8_name_opt
+    = Utf8String::make_utf8_string (crate_name);
+  if (!utf8_name_opt.has_value ())
      {
        error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string");
        return false;
      }
  
-  std::vector<Codepoint> uchars = uchars_opt.value ();
+  std::vector<Codepoint> uchars = utf8_name_opt->get_chars ();
    if (uchars.empty ())
      {
        error = Error (UNDEF_LOCATION, "crate name cannot be empty");
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc

index c6aa063c4c546035927f200b16c9873097e9c59b..b2ddaf0b9cec05bc62ce6e62cbad7ed34cec932c 100644 (file)
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -1,12 +1,14 @@
  #include "rust-system.h"
  #include "optional.h"
  #include "selftest.h"
+#include "rust-lex.h"
+#include "rust-unicode.h"
  
  #include "rust-unicode-data.h"
  
  namespace Rust {
  
-typedef uint32_t codepoint_t;
+typedef Codepoint codepoint_t;
  typedef std::vector<codepoint_t> string_t;
  
  // These constants are used to compose and decompose of Hangul syllables.
@@ -85,7 +87,7 @@ binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
  int
  lookup_cc (codepoint_t c)
  {
-  auto it = Rust::CCC_TABLE.find (c);
+  auto it = Rust::CCC_TABLE.find (c.value);
    if (it != Rust::CCC_TABLE.end ())
      return it->second;
    else
@@ -96,11 +98,11 @@ lookup_cc (codepoint_t c)
  tl::optional<codepoint_t>
  lookup_recomp (codepoint_t starter, codepoint_t c)
  {
-  auto it = Rust::RECOMPOSITION_MAP.find ({starter, c});
+  auto it = Rust::RECOMPOSITION_MAP.find ({starter.value, c.value});
    if (it != Rust::RECOMPOSITION_MAP.end ())
      return {it->second};
  
-  it = Rust::RECOMPOSITION_MAP.find ({starter, 0});
+  it = Rust::RECOMPOSITION_MAP.find ({starter.value, 0});
    if (it != Rust::RECOMPOSITION_MAP.end ())
      return {it->second};
  
@@ -110,11 +112,11 @@ lookup_recomp (codepoint_t starter, codepoint_t c)
  void
  recursive_decomp_cano (codepoint_t c, string_t &buf)
  {
-  auto it = Rust::DECOMPOSITION_MAP.find (c);
+  auto it = Rust::DECOMPOSITION_MAP.find (c.value);
    if (it != Rust::DECOMPOSITION_MAP.end ())
      {
-      string_t decomped = it->second;
-      for (codepoint_t cp : decomped)
+      std::vector<uint32_t> decomped = it->second;
+      for (uint32_t cp : decomped)
         recursive_decomp_cano (cp, buf);
      }
    else
@@ -127,7 +129,7 @@ decomp_cano (string_t s)
    string_t buf;
    for (codepoint_t c : s)
      {
-      int64_t s_index = c - S_BASE;
+      int64_t s_index = c.value - S_BASE;
        if (0 <= s_index && s_index < S_COUNT)
         {
           // decompose Hangul argorithmically
@@ -160,7 +162,7 @@ sort_cano (string_t &s)
        if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
         {
           // swap
-         int tmp = s[i];
+         codepoint_t tmp = s[i];
           s[i] = s[i - 1];
           s[i - 1] = tmp;
           if (i > 1)
@@ -183,10 +185,10 @@ compose_hangul (string_t s)
        codepoint_t ch = s[src_pos];
  
        // L V => LV
-      int64_t l_index = last - L_BASE;
+      int64_t l_index = last.value - L_BASE;
        if (0 <= l_index && l_index < L_COUNT)
         {
-         int64_t v_index = ch - V_BASE;
+         int64_t v_index = ch.value - V_BASE;
           if (0 <= v_index && v_index < V_COUNT)
             {
               last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
@@ -198,13 +200,13 @@ compose_hangul (string_t s)
         }
  
        // LV T => LVT
-      int64_t s_index = last - S_BASE;
+      int64_t s_index = last.value - S_BASE;
        if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
         {
-         int64_t t_index = ch - T_BASE;
+         int64_t t_index = ch.value - T_BASE;
           if (0 < t_index && t_index < T_COUNT)
             {
-             last += t_index;
+             last.value += t_index;
               // pop LV
               buf.pop_back ();
               buf.push_back (last);
@@ -282,6 +284,12 @@ nfc_normalize (string_t s)
    return r;
  }
  
+Utf8String
+Utf8String::nfc_normalize () const
+{
+  return Utf8String (Rust::nfc_normalize (chars));
+}
+
  bool
  is_alphabetic (uint32_t codepoint)
  {
@@ -309,9 +317,10 @@ is_numeric (uint32_t codepoint)
  namespace selftest {
  
  void
-assert_normalize (std::vector<uint32_t> origin, std::vector<uint32_t> expected)
+assert_normalize (const std::vector<Rust::Codepoint> origin,
+                 const std::vector<Rust::Codepoint> expected)
  {
-  std::vector<uint32_t> actual = Rust::nfc_normalize (origin);
+  std::vector<Rust::Codepoint> actual = Rust::nfc_normalize (origin);
  
    ASSERT_EQ (actual.size (), expected.size ());
    for (unsigned int i = 0; i < actual.size (); i++)
diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h

index 68005587d0afab583a50c40cc5c2653be2262482..becf6fb6a0c76b0c2dca3c4b2c77327f79488402 100644 (file)
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@@ -21,28 +21,43 @@
  
  #include "optional.h"
  #include "rust-system.h"
-#include "rust-lex.h"
+#include "rust-input-source.h"
  
  namespace Rust {
  
  class Utf8String
  {
  private:
-  tl::optional<std::vector<Codepoint>> chars;
+  std::vector<Codepoint> chars;
  
  public:
-  Utf8String (const std::string &maybe_utf8)
+  static tl::optional<Utf8String>
+  make_utf8_string (const std::string &maybe_utf8)
    {
-    Lexer::BufferInputSource input_source = {maybe_utf8, 0};
-    chars = input_source.get_chars ();
+    BufferInputSource input_source = {maybe_utf8, 0};
+    tl::optional<std::vector<Codepoint>> chars_opt = input_source.get_chars ();
+    if (chars_opt.has_value ())
+      return {Utf8String (chars_opt.value ())};
+    else
+      return tl::nullopt;
    }
  
-  // Returns UTF codepoints when string is valid as UTF-8, returns nullopt
-  // otherwise.
-  tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
-};
+  Utf8String (const std::vector<Codepoint> codepoints) : chars ({codepoints}) {}
+
+  std::string as_string () const
+  {
+    std::stringstream ss;
+    for (Codepoint c : chars)
+      ss << c.as_string ();
  
-// TODO: add function nfc_normalize
+    return ss.str ();
+  };
+
+  // Returns a copy of the codepoints
+  std::vector<Codepoint> get_chars () const { return chars; }
+
+  Utf8String nfc_normalize () const;
+};
  
  bool
  is_alphabetic (uint32_t codepoint);
diff --git a/gcc/testsuite/rust/compile/unicode_norm1.rs b/gcc/testsuite/rust/compile/unicode_norm1.rs

new file mode 100644 (file)

index 0000000..d496054
--- /dev/null
+++ b/gcc/testsuite/rust/compile/unicode_norm1.rs
@@ -0,0 +1,6 @@
+fn main() {
+    // U+304C
+    let が = ();
+    // U+304B + U+3099
+    let _ = が;
+}
author	Raiki Tamura <tamaron1203@gmail.com>
	Sun, 30 Jul 2023 10:54:36 +0000 (19:54 +0900)
committer	Arthur Cohen <arthur.cohen@embecosm.com>
	Tue, 16 Jan 2024 18:00:28 +0000 (19:00 +0100)
gcc/rust/lex/rust-input-source.h	[new file with mode: 0644]	patch \| blob
gcc/rust/lex/rust-lex.cc		patch \| blob \| blame \| history
gcc/rust/lex/rust-lex.h		patch \| blob \| blame \| history
gcc/rust/lex/rust-token.cc		patch \| blob \| blame \| history
gcc/rust/lex/rust-token.h		patch \| blob \| blame \| history
gcc/rust/rust-lang.cc		patch \| blob \| blame \| history
gcc/rust/rust-session-manager.cc		patch \| blob \| blame \| history
gcc/rust/util/rust-unicode.cc		patch \| blob \| blame \| history
gcc/rust/util/rust-unicode.h		patch \| blob \| blame \| history
gcc/testsuite/rust/compile/unicode_norm1.rs	[new file with mode: 0644]	patch \| blob