From: Stephan Bosch Date: Fri, 21 Mar 2025 15:06:05 +0000 (+0100) Subject: lib: unichar - Implement uni_utf8_to_decomposed_titlecase() using the new Unicode... X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=33ccffd73ee89d041ec1deba721815d0e3d7200d;p=thirdparty%2Fdovecot%2Fcore.git lib: unichar - Implement uni_utf8_to_decomposed_titlecase() using the new Unicode character database --- diff --git a/src/lib/test-unichar.c b/src/lib/test-unichar.c index 5026d3fefb..b0f39743a9 100644 --- a/src/lib/test-unichar.c +++ b/src/lib/test-unichar.c @@ -124,10 +124,28 @@ static void test_unichar_surrogates(void) static void test_unichar_collation(void) { const char *in[] = { + /* Plain ASCII letters will be upper-cased */ + "Plain!", + /* U+00FC " " U+00B3 */ "\xc3\xbc \xc2\xb3", + /* U+01C4 "enan" */ + "\xC7\x84\x65\x6E\x61\x6E", + /* Bad characters will be substituted with replacement character */ + "Bad \xFF Characters", + /* "c" U+00F4 "t" U+00E9 */ + "c\xC3\xB4t\xC3\xA9", }; const char *exp[] = { + /* Plain ASCII letters are upper-cased */ + "PLAIN!", + /* "U" U+0308 " 3" */ "U\xcc\x88 3", + /* "Dz" U+030C "ENAN" */ + "\x44\x7A\xCC\x8C\x45\x4E\x41\x4E", + /* Bad characters are substituted with replacement character */ + "BAD \xEF\xBF\xBD CHARACTERS", + /* "CO" U+0302 "TE" U+0301 */ + "CO\xCC\x82TE\xCC\x81", }; unsigned int n_in = N_ELEMENTS(in), n_exp = N_ELEMENTS(exp), i; diff --git a/src/lib/unichar.c b/src/lib/unichar.c index 2a59169413..8248d5c0d4 100644 --- a/src/lib/unichar.c +++ b/src/lib/unichar.c @@ -6,8 +6,6 @@ #include "unicode-data.h" #include "unichar.h" -#include "unicodemap.c" - #define HANGUL_FIRST 0xac00 #define HANGUL_LAST 0xd7a3 @@ -232,18 +230,6 @@ unsigned int uni_utf8_partial_strlen_n(const void *_input, size_t size, return len; } -static bool uint16_find(const uint16_t *data, unsigned int count, - uint16_t value, unsigned int *idx_r) -{ - BINARY_NUMBER_SEARCH(data, count, value, idx_r); -} - -static bool uint32_find(const uint32_t *data, unsigned int count, - uint32_t value, unsigned int *idx_r) -{ - BINARY_NUMBER_SEARCH(data, count, value, idx_r); -} - unichar_t uni_ucs4_to_titlecase(unichar_t chr) { const struct unicode_code_point_data *cp_data = @@ -254,31 +240,6 @@ unichar_t uni_ucs4_to_titlecase(unichar_t chr) return chr; } -static bool uni_ucs4_decompose_uni(unichar_t *chr) -{ - unsigned int idx; - - if (*chr <= 0xff) { - if (uni8_decomp_map[*chr] == *chr) - return FALSE; - *chr = uni8_decomp_map[*chr]; - } else if (*chr <= 0xffff) { - if (*chr < uni16_decomp_keys[0]) - return FALSE; - - if (!uint16_find(uni16_decomp_keys, - N_ELEMENTS(uni16_decomp_keys), *chr, &idx)) - return FALSE; - *chr = uni16_decomp_values[idx]; - } else { - if (!uint32_find(uni32_decomp_keys, - N_ELEMENTS(uni32_decomp_keys), *chr, &idx)) - return FALSE; - *chr = uni32_decomp_values[idx]; - } - return TRUE; -} - static size_t uni_ucs4_decompose_hangul(unichar_t chr, unichar_t buf[3]) { /* The Unicode Standard, Section 3.12.2: @@ -325,22 +286,26 @@ static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output) uni_ucs4_to_utf8_c(buf[i], output); } -static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output) +static void +uni_ucs4_decompose_one_utf8(unichar_t chr, bool canonical, buffer_t *output) { - const uint32_t *value; - unsigned int idx; + const unichar_t *decomp; + size_t len, i; - if (chr < multidecomp_keys[0] || chr > 0xffff) - return FALSE; + if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) { + uni_ucs4_decompose_hangul_utf8(chr, output); + return; + } - if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys), - chr, &idx)) - return FALSE; + len = unicode_code_point_get_full_decomposition(chr, canonical, + &decomp); + if (len == 0) { + uni_ucs4_to_utf8_c(chr, output); + return; + } - value = &multidecomp_values[multidecomp_offsets[idx]]; - for (; *value != 0; value++) - uni_ucs4_to_utf8_c(*value, output); - return TRUE; + for (i = 0; i < len; i++) + uni_ucs4_to_utf8_c(decomp[i], output); } static void output_add_replacement_char(buffer_t *output) @@ -375,11 +340,7 @@ int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size, size -= bytes; chr = uni_ucs4_to_titlecase(chr); - if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) - uni_ucs4_decompose_hangul_utf8(chr, output); - else if (uni_ucs4_decompose_uni(&chr) || - !uni_ucs4_decompose_multi_utf8(chr, output)) - uni_ucs4_to_utf8_c(chr, output); + uni_ucs4_decompose_one_utf8(chr, FALSE, output); } return ret; }