]>
git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/rust/util/rust-unicode.cc
6bd2db550a183065ff2dacbc8587b4506b6b9c1e
1 // Copyright (C) 2020-2023 Free Software Foundation, Inc.
3 // This file is part of GCC.
5 // GCC is free software; you can redistribute it and/or modify it under
6 // the terms of the GNU General Public License as published by the Free
7 // Software Foundation; either version 3, or (at your option) any later
10 // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 // You should have received a copy of the GNU General Public License
16 // along with GCC; see the file COPYING3. If not see
17 // <http://www.gnu.org/licenses/>.
19 #include "rust-input-source.h"
20 #include "rust-system.h"
24 #include "rust-unicode.h"
26 #include "rust-unicode-data.h"
30 typedef Codepoint codepoint_t
;
31 typedef std::vector
<codepoint_t
> string_t
;
// These constants are used to compose and decompose Hangul syllables.
// See `Sample Code for Hangul Algorithms` in section 3.12 of
// unicode.org/versions/Unicode15.0.0/ch03.pdf

// Base code points of the syllable block and of the leading (L), vowel (V)
// and trailing (T) jamo.  T_BASE is one below the first trailing jamo, so a
// trailing index of zero stands for "no trailing consonant".
const uint32_t S_BASE = 0xAC00;
const uint32_t L_BASE = 0x1100;
const uint32_t V_BASE = 0x1161;
const uint32_t T_BASE = 0x11A7;

// Number of leading, vowel and trailing slots.
const uint32_t L_COUNT = 19;
const uint32_t V_COUNT = 21;
const uint32_t T_COUNT = 28;

// Syllables per leading jamo, and total number of syllables.
const uint32_t N_COUNT = V_COUNT * T_COUNT;
const uint32_t S_COUNT = L_COUNT * N_COUNT;
// Check if the codepoint is in any of the ranges (half-open intervals [a,b)).
//
// RANGES is searched with std::lower_bound, so it has to be sorted in
// ascending order.  The search finds the first range whose end lies beyond
// TARGET_CP; TARGET_CP is a member only if that range also starts at or
// before it.  Returns false when no such range exists (including when
// RANGES is empty).
template <std::size_t SIZE>
bool
binary_search_ranges (
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
{
  auto it = std::lower_bound (ranges.begin (), ranges.end (), target_cp,
                              [] (const std::pair<uint32_t, uint32_t> &a,
                                  uint32_t b) { return a.second <= b; });
  if (it == ranges.end ())
    return false;

  return it->first <= target_cp && target_cp < it->second;
}
59 lookup_cc (codepoint_t c
)
61 auto it
= CCC_TABLE
.find (c
.value
);
62 if (it
!= CCC_TABLE
.end ())
65 // Starter. Returns zero.
69 tl::optional
<codepoint_t
>
70 lookup_recomp (codepoint_t starter
, codepoint_t c
)
72 auto it
= Rust::RECOMPOSITION_MAP
.find ({starter
.value
, c
.value
});
73 if (it
!= Rust::RECOMPOSITION_MAP
.end ())
76 it
= Rust::RECOMPOSITION_MAP
.find ({starter
.value
, 0});
77 if (it
!= Rust::RECOMPOSITION_MAP
.end ())
84 recursive_decomp_cano (codepoint_t c
, string_t
&buf
)
86 auto it
= Rust::DECOMPOSITION_MAP
.find (c
.value
);
87 if (it
!= Rust::DECOMPOSITION_MAP
.end ())
89 std::vector
<uint32_t> decomped
= it
->second
;
90 for (uint32_t cp
: decomped
)
91 recursive_decomp_cano (cp
, buf
);
98 decomp_cano (string_t s
)
101 for (codepoint_t c
: s
)
103 int64_t s_index
= c
.value
- S_BASE
;
104 if (0 <= s_index
&& s_index
< S_COUNT
)
106 // decompose Hangul algorithmically
107 uint32_t l
= L_BASE
+ s_index
/ N_COUNT
;
108 uint32_t v
= V_BASE
+ (s_index
% N_COUNT
) / T_COUNT
;
109 uint32_t t
= T_BASE
+ s_index
% T_COUNT
;
117 // Current character is not hangul
118 recursive_decomp_cano (c
, buf
);
124 sort_cano (string_t
&s
)
126 int cc_here
, cc_prev
;
129 for (unsigned int i
= 1; i
< s
.size (); i
++)
131 cc_here
= lookup_cc (s
[i
]);
132 cc_prev
= lookup_cc (s
[i
- 1]);
133 if (cc_here
> 0 && cc_prev
> 0 && cc_prev
> cc_here
)
136 codepoint_t tmp
= s
[i
];
146 compose_hangul (string_t s
)
152 codepoint_t last
= s
[0];
153 buf
.push_back (last
);
154 for (unsigned int src_pos
= 1; src_pos
< s
.size (); src_pos
++)
156 codepoint_t ch
= s
[src_pos
];
159 int64_t l_index
= last
.value
- L_BASE
;
160 if (0 <= l_index
&& l_index
< L_COUNT
)
162 int64_t v_index
= ch
.value
- V_BASE
;
163 if (0 <= v_index
&& v_index
< V_COUNT
)
165 last
= S_BASE
+ (l_index
* V_COUNT
+ v_index
) * T_COUNT
;
168 buf
.push_back (last
);
174 int64_t s_index
= last
.value
- S_BASE
;
175 if (0 <= s_index
&& s_index
< S_COUNT
&& (s_index
% T_COUNT
) == 0)
177 int64_t t_index
= ch
.value
- T_BASE
;
178 if (0 < t_index
&& t_index
< T_COUNT
)
180 last
.value
+= t_index
;
183 buf
.push_back (last
);
188 buf
.push_back (last
);
196 // compose hangul first
197 s
= compose_hangul (s
);
204 // int starter_pos = 0; // Assume the first character is Starter. Correct?
205 // int target_pos = 1;
206 codepoint_t starter_ch
= s
[0];
208 for (unsigned int src_pos
= 1; src_pos
< s
.size (); src_pos
++)
210 // get current character
211 codepoint_t ch
= s
[src_pos
];
213 int ch_class
= lookup_cc (ch
);
214 tl::optional
<codepoint_t
> composite
= lookup_recomp (starter_ch
, ch
);
215 if (composite
.has_value () && last_class
< ch_class
)
217 // ch can be composed
218 buf
.push_back (composite
.value ());
219 starter_ch
= composite
.value ();
221 else if (ch_class
== 0)
223 // ch is Starter and cannot be composed.
226 buf
.push_back (starter_ch
);
235 buf
.push_back (starter_ch
);
236 // ch is not Starter.
237 last_class
= ch_class
;
244 // see https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
246 nfc_quick_check (const string_t
&s
)
248 int last_canonical_class
= 0;
249 QuickCheckResult res
= QuickCheckResult::YES
;
251 for (unsigned long i
= 0; i
< s
.size (); i
++)
253 codepoint_t c
= s
[i
];
255 if (c
.is_supplementary_character ())
258 int canonical_class
= lookup_cc (c
);
259 if (last_canonical_class
> canonical_class
&& canonical_class
!= 0)
260 return QuickCheckResult::NO
;
262 if (is_nfc_qc_no (c
.value
))
263 return QuickCheckResult::NO
;
265 if (is_nfc_qc_maybe (c
.value
))
266 res
= QuickCheckResult::MAYBE
;
268 last_canonical_class
= canonical_class
;
274 nfc_normalize (const string_t
&s
)
276 if (nfc_quick_check (s
) == QuickCheckResult::YES
)
279 // TODO: optimize normalization.
280 // i.e. only normalize a limited area around MAYBE character, instead of
281 // performing complete normalization of the entire string
284 string_t d
= decomp_cano (s
);
288 string_t r
= recomp (d
);
293 Utf8String::nfc_normalize () const
295 return Utf8String (Rust::nfc_normalize (chars
));
299 is_alphabetic (uint32_t codepoint
)
301 return binary_search_ranges (ALPHABETIC_RANGES
, codepoint
);
305 is_numeric (uint32_t codepoint
)
307 return std::binary_search (NUMERIC_CODEPOINTS
.begin (),
308 NUMERIC_CODEPOINTS
.end (), codepoint
);
312 is_nfc_qc_maybe (uint32_t codepoint
)
314 return binary_search_ranges (NFC_QC_MAYBE_RANGES
, codepoint
);
318 is_nfc_qc_no (uint32_t codepoint
)
320 return binary_search_ranges (NFC_QC_NO_RANGES
, codepoint
);
324 is_ascii_only (const std::string
&str
)
327 if (static_cast<uint32_t> (c
) > MAX_ASCII_CODEPOINT
)
341 ASSERT_EQ (Rust::nfc_quick_check ({0x1e0a /* NFC_QC_YES */}),
342 Rust::QuickCheckResult::YES
);
343 ASSERT_EQ (Rust::nfc_quick_check (
344 {0x1e0a /* NFC_QC_YES */, 0x0323 /* NFC_QC_MAYBE */}),
345 Rust::QuickCheckResult::MAYBE
);
346 ASSERT_EQ (Rust::nfc_quick_check ({0x0340 /* NFC_QC_NO */}),
347 Rust::QuickCheckResult::NO
);
351 assert_normalize (const std::vector
<Rust::Codepoint
> origin
,
352 const std::vector
<Rust::Codepoint
> expected
)
354 std::vector
<Rust::Codepoint
> actual
= Rust::nfc_normalize (origin
);
356 ASSERT_EQ (actual
.size (), expected
.size ());
357 for (unsigned int i
= 0; i
< actual
.size (); i
++)
359 ASSERT_EQ (actual
[i
], expected
[i
]);
364 rust_utf8_normalize_test ()
367 assert_normalize ({'h', 'e', 'l', 'l', 'o'}, {'h', 'e', 'l', 'l', 'o'});
369 assert_normalize ({'/', '\\', '.', ':', '*'}, {'/', '\\', '.', ':', '*'});
371 // testcases retrieved from Part0 of
372 // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
373 assert_normalize ({0x1e0a}, {0x1e0a});
374 assert_normalize ({0x1e0c}, {0x1e0c});
375 assert_normalize ({0x1e0a, 0x0323}, {0x1e0c, 0x0307});
376 assert_normalize ({0x1e0c, 0x0307}, {0x1e0c, 0x0307});
377 assert_normalize ({0x0044, 0x0307, 0x0323}, {0x1e0c, 0x0307});
379 // testcases for Hangul from Part0
380 assert_normalize ({0x1100, 0xac00, 0x11a8}, {0x1100, 0xac01});
381 assert_normalize ({0x1100, 0xac00, 0x11a8, 0x11a8}, {0x1100, 0xac01, 0x11a8});
382 // testcases for Hangul from Part1
383 assert_normalize ({0x3131}, {0x3131});
384 assert_normalize ({0x3132}, {0x3132});
385 // testcases for Hangul from Part3
386 assert_normalize ({0x1100, 0x0334, 0x1161}, {0x1100, 0x0334, 0x1161});
387 assert_normalize ({0xac54, 0x0334, 0x11ae}, {0xac54, 0x0334, 0x11ae});
389 // TODO: add more testcases in
390 // https://unicode.org/Public/UNIDATA/NormalizationTest.txt
394 rust_utf8_property_test ()
396 ASSERT_TRUE (Rust::is_alphabetic ('A'));
397 ASSERT_TRUE (Rust::is_alphabetic ('B'));
398 ASSERT_TRUE (Rust::is_alphabetic ('x'));
399 ASSERT_TRUE (Rust::is_alphabetic ('z'));
400 ASSERT_TRUE (Rust::is_alphabetic (0x00b5)); // µ
401 ASSERT_TRUE (Rust::is_alphabetic (0x3093)); // ん
402 ASSERT_TRUE (Rust::is_alphabetic (0xa8f2)); // ꣲ
403 ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃
405 ASSERT_FALSE (Rust::is_alphabetic ('\v'));
406 ASSERT_FALSE (Rust::is_alphabetic ('-'));
407 ASSERT_FALSE (Rust::is_alphabetic ('_'));
408 ASSERT_FALSE (Rust::is_alphabetic ('+'));
409 ASSERT_FALSE (Rust::is_alphabetic ('0'));
410 ASSERT_FALSE (Rust::is_alphabetic ('1'));
411 ASSERT_FALSE (Rust::is_alphabetic ('2'));
412 ASSERT_FALSE (Rust::is_alphabetic ('9'));
413 ASSERT_FALSE (Rust::is_alphabetic (0xa720)); // ◌
414 ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁
417 ASSERT_TRUE (Rust::is_numeric ('0'));
418 ASSERT_TRUE (Rust::is_numeric ('1'));
419 ASSERT_TRUE (Rust::is_numeric ('7'));
420 ASSERT_TRUE (Rust::is_numeric ('9'));
421 ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
422 ASSERT_TRUE (Rust::is_numeric (0x096d)); // ७
424 ASSERT_TRUE (Rust::is_numeric (0x16e6)); // ᛮ
425 ASSERT_TRUE (Rust::is_numeric (0xa6e6)); // ꛦ
426 ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
427 ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
429 ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
430 ASSERT_TRUE (Rust::is_numeric (0x32b1)); // ㊱
432 ASSERT_FALSE (Rust::is_numeric ('\n'));
433 ASSERT_FALSE (Rust::is_numeric ('-'));
434 ASSERT_FALSE (Rust::is_numeric ('_'));
435 ASSERT_FALSE (Rust::is_numeric ('('));
436 ASSERT_FALSE (Rust::is_numeric ('z'));
437 ASSERT_FALSE (Rust::is_numeric (';'));
438 ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
439 ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
440 ASSERT_FALSE (Rust::is_numeric (0x0975)); // ॵ
441 ASSERT_FALSE (Rust::is_numeric (0x18f0)); // ᣰ
442 ASSERT_FALSE (Rust::is_numeric (0x2f30)); // ⼰
445 } // namespace selftest