git.ipfire.org Git - thirdparty/gcc.git/commitdiff
gccrs: Normalize Hangul to NFC
authorRaiki Tamura <tamaron1203@gmail.com>
Mon, 24 Jul 2023 08:29:20 +0000 (17:29 +0900)
committerArthur Cohen <arthur.cohen@embecosm.com>
Tue, 16 Jan 2024 18:00:26 +0000 (19:00 +0100)
gcc/rust/ChangeLog:

* util/rust-unicode.cc (decomp_cano): Decompose Hangul.
(sort_cano): Fix bounds check.
(recomp): use `compose_hangul`.
(compose_hangul): Compose Hangul.
(rust_utf8_normalize_test): Add tests.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
gcc/rust/util/rust-unicode.cc

index 73e1abd9980b3bca6ce73c9d203d5f467d90a8f9..c6aa063c4c546035927f200b16c9873097e9c59b 100644 (file)
@@ -9,6 +9,15 @@ namespace Rust {
 typedef uint32_t codepoint_t;
 typedef std::vector<codepoint_t> string_t;
 
+// These constants are used to compose and decompose of Hangul syllables.
+// See `Sample Code for Hangul Algorithms` in 3.1.2
+// unicode.org/versions/Unicode15.0.0/ch03.pdf
+const uint32_t S_BASE = 0xAC00;
+const uint32_t L_BASE = 0x1100, V_BASE = 0x1161, T_BASE = 0x11A7;
+const uint32_t L_COUNT = 19, V_COUNT = 21, T_COUNT = 28;
+const uint32_t N_COUNT = V_COUNT * T_COUNT;
+const uint32_t S_COUNT = L_COUNT * N_COUNT;
+
 template <std::size_t SIZE>
 int64_t
 binary_search_ranges (
@@ -115,10 +124,26 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
 string_t
 decomp_cano (string_t s)
 {
-  // TODO: Algorithmic lookup for Hangul
   string_t buf;
   for (codepoint_t c : s)
-    recursive_decomp_cano (c, buf);
+    {
+      int64_t s_index = c - S_BASE;
+      if (0 <= s_index && s_index < S_COUNT)
+       {
+         // decompose Hangul algorithmically
+         uint32_t l = L_BASE + s_index / N_COUNT;
+         uint32_t v = V_BASE + (s_index % N_COUNT) / T_COUNT;
+         uint32_t t = T_BASE + s_index % T_COUNT;
+         buf.push_back (l);
+         buf.push_back (v);
+         if (t != T_BASE)
+           buf.push_back (t);
+         continue;
+       }
+
+      // Current character is not Hangul
+      recursive_decomp_cano (c, buf);
+    }
   return buf;
 }
 
@@ -132,7 +157,7 @@ sort_cano (string_t &s)
     {
       cc_here = lookup_cc (s[i]);
       cc_prev = lookup_cc (s[i - 1]);
-      if (cc_here >= 0 && cc_prev > cc_here)
+      if (cc_here > 0 && cc_prev > 0 && cc_prev > cc_here)
        {
          // swap
          int tmp = s[i];
@@ -145,45 +170,100 @@ sort_cano (string_t &s)
 }
 
 string_t
-recomp (string_t s)
+compose_hangul (string_t s)
 {
-  // TODO: Algorithmic lookup for Hangul
   string_t buf;
-  if (s.size () > 0)
+  if (s.size () < 2)
+    return s;
+
+  codepoint_t last = s[0];
+  buf.push_back (last);
+  for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
     {
-      int last_class = -1;
-      // Assume the first character is Starter.
-      codepoint_t starter_ch = s[0];
-      for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
+      codepoint_t ch = s[src_pos];
+
+      // L V => LV
+      int64_t l_index = last - L_BASE;
+      if (0 <= l_index && l_index < L_COUNT)
        {
-         // get current character
-         codepoint_t ch = s[src_pos];
-         int ch_class = lookup_cc (ch);
-         tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
-         if (composite.has_value () && last_class < ch_class)
+         int64_t v_index = ch - V_BASE;
+         if (0 <= v_index && v_index < V_COUNT)
            {
-             // ch can be composed
-             buf.push_back (composite.value ());
-             starter_ch = composite.value ();
+             last = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT;
+             // pop L
+             buf.pop_back ();
+             buf.push_back (last);
+             continue;
            }
-         else if (ch_class == 0)
-           {
-             // ch is Starter and cannot be composed.
-             if (src_pos == 1)
-               // FIXME: buggy?
-               buf.push_back (starter_ch);
-             // starter_pos = target_pos;
-             starter_ch = ch;
-             last_class = -1;
-             buf.push_back (ch);
-           }
-         else
+       }
+
+      // LV T => LVT
+      int64_t s_index = last - S_BASE;
+      if (0 <= s_index && s_index < S_COUNT && (s_index % T_COUNT) == 0)
+       {
+         int64_t t_index = ch - T_BASE;
+         if (0 < t_index && t_index < T_COUNT)
            {
-             // ch is not Starter.
-             last_class = ch_class;
-             buf.push_back (ch);
+             last += t_index;
+             // pop LV
+             buf.pop_back ();
+             buf.push_back (last);
+             continue;
            }
        }
+      last = ch;
+      buf.push_back (last);
+    }
+  return buf;
+}
+
+string_t
+recomp (string_t s)
+{
+  // compose hangul first
+  s = compose_hangul (s);
+
+  string_t buf;
+  if (s.size () < 2)
+    return s;
+
+  int last_class = -1;
+  // int starter_pos = 0; // Assume the first character is Starter. Correct?
+  // int target_pos = 1;
+  codepoint_t starter_ch = s[0];
+
+  for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
+    {
+      // get current character
+      codepoint_t ch = s[src_pos];
+
+      int ch_class = lookup_cc (ch);
+      tl::optional<codepoint_t> composite = lookup_recomp (starter_ch, ch);
+      if (composite.has_value () && last_class < ch_class)
+       {
+         // ch can be composed
+         buf.push_back (composite.value ());
+         starter_ch = composite.value ();
+       }
+      else if (ch_class == 0)
+       {
+         // ch is Starter and cannot be composed.
+         if (src_pos == 1)
+           // FIXME: buggy?
+           buf.push_back (starter_ch);
+         starter_ch = ch;
+         last_class = -1;
+         buf.push_back (ch);
+       }
+      else
+       {
+         if (src_pos == 1)
+           // FIXME: buggy?
+           buf.push_back (starter_ch);
+         // ch is not Starter.
+         last_class = ch_class;
+         buf.push_back (ch);
+       }
     }
   return buf;
 }
@@ -256,6 +336,16 @@ rust_utf8_normalize_test ()
   assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
   assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
 
+  // testcases for Hangul from Part0
+  assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
+  assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
+  // testcases for Hangul from Part1
+  assert_normalize ({0x3131}, {0x3131});
+  assert_normalize ({0x3132}, {0x3132});
+  // testcases for Hangul from Part3
+  assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
+  assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
+
   // TODO: add more testcases in
   // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
 }