lib: unichar - Implement uni_utf8_to_decomposed_titlecase() using the new Unicode...

author Stephan Bosch <stephan.bosch@open-xchange.com>

Fri, 21 Mar 2025 15:06:05 +0000 (16:06 +0100)

committer Stephan Bosch <stephan.bosch@open-xchange.com>

Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
author Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 21 Mar 2025 15:06:05 +0000 (16:06 +0100)
committer Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
diff --git a/src/lib/test-unichar.c b/src/lib/test-unichar.c

index 5026d3fefbc47b7a2f9576367ded4b0c53dffdf5..b0f39743a9263f96b732cdc6dc840086de25aa4d 100644 (file)
--- a/src/lib/test-unichar.c
+++ b/src/lib/test-unichar.c
@@ -124,10 +124,28 @@ static void test_unichar_surrogates(void)
  static void test_unichar_collation(void)
  {
         const char *in[] = {
+               /* Plain ASCII letters will be upper-cased */
+               "Plain!",
+               /* U+00FC " " U+00B3 */
                 "\xc3\xbc \xc2\xb3",
+               /* U+01C4 "enan" */
+               "\xC7\x84\x65\x6E\x61\x6E",
+               /* Bad characters will be substituted with replacement character */
+               "Bad \xFF Characters",
+               /* "c" U+00F4 "t" U+00E9 */
+               "c\xC3\xB4t\xC3\xA9",
         };
         const char *exp[] = {
+               /* Plain ASCII letters are upper-cased */
+               "PLAIN!",
+               /* "U" U+0308 " 3" */
                 "U\xcc\x88 3",
+               /* "Dz" U+030C "ENAN" */
+               "\x44\x7A\xCC\x8C\x45\x4E\x41\x4E",
+               /* Bad characters are substituted with replacement character */
+               "BAD \xEF\xBF\xBD CHARACTERS",
+               /* "CO" U+0302 "TE" U+0301 */
+               "CO\xCC\x82TE\xCC\x81",
         };
  
         unsigned int n_in = N_ELEMENTS(in), n_exp = N_ELEMENTS(exp), i;
diff --git a/src/lib/unichar.c b/src/lib/unichar.c

index 2a591694135fe3463faf624e8c8f9902cd7cb56b..8248d5c0d45873fd9b6f567f9d91068a1b64e8ea 100644 (file)
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -6,8 +6,6 @@
  #include "unicode-data.h"
  #include "unichar.h"
  
-#include "unicodemap.c"
-
  #define HANGUL_FIRST 0xac00
  #define HANGUL_LAST 0xd7a3
  
@@ -232,18 +230,6 @@ unsigned int uni_utf8_partial_strlen_n(const void *_input, size_t size,
         return len;
  }
  
-static bool uint16_find(const uint16_t *data, unsigned int count,
-                       uint16_t value, unsigned int *idx_r)
-{
-       BINARY_NUMBER_SEARCH(data, count, value, idx_r);
-}
-
-static bool uint32_find(const uint32_t *data, unsigned int count,
-                       uint32_t value, unsigned int *idx_r)
-{
-       BINARY_NUMBER_SEARCH(data, count, value, idx_r);
-}
-
  unichar_t uni_ucs4_to_titlecase(unichar_t chr)
  {
         const struct unicode_code_point_data *cp_data =
@@ -254,31 +240,6 @@ unichar_t uni_ucs4_to_titlecase(unichar_t chr)
         return chr;
  }
  
-static bool uni_ucs4_decompose_uni(unichar_t *chr)
-{
-       unsigned int idx;
-
-       if (*chr <= 0xff) {
-               if (uni8_decomp_map[*chr] == *chr)
-                       return FALSE;
-               *chr = uni8_decomp_map[*chr];
-       } else if (*chr <= 0xffff) {
-               if (*chr < uni16_decomp_keys[0])
-                       return FALSE;
-
-               if (!uint16_find(uni16_decomp_keys,
-                                N_ELEMENTS(uni16_decomp_keys), *chr, &idx))
-                       return FALSE;
-               *chr = uni16_decomp_values[idx];
-       } else {
-               if (!uint32_find(uni32_decomp_keys,
-                                N_ELEMENTS(uni32_decomp_keys), *chr, &idx))
-                       return FALSE;
-               *chr = uni32_decomp_values[idx];
-       }
-       return TRUE;
-}
-
  static size_t uni_ucs4_decompose_hangul(unichar_t chr, unichar_t buf[3])
  {
         /* The Unicode Standard, Section 3.12.2:
@@ -325,22 +286,26 @@ static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
                 uni_ucs4_to_utf8_c(buf[i], output);
  }
  
-static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
+static void
+uni_ucs4_decompose_one_utf8(unichar_t chr, bool canonical, buffer_t *output)
  {
-       const uint32_t *value;
-       unsigned int idx;
+       const unichar_t *decomp;
+       size_t len, i;
  
-       if (chr < multidecomp_keys[0] || chr > 0xffff)
-               return FALSE;
+       if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) {
+               uni_ucs4_decompose_hangul_utf8(chr, output);
+               return;
+       }
  
-       if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
-                        chr, &idx))
-               return FALSE;
+       len = unicode_code_point_get_full_decomposition(chr, canonical,
+                                                       &decomp);
+       if (len == 0) {
+               uni_ucs4_to_utf8_c(chr, output);
+               return;
+       }
  
-       value = &multidecomp_values[multidecomp_offsets[idx]];
-       for (; *value != 0; value++)
-               uni_ucs4_to_utf8_c(*value, output);
-       return TRUE;
+       for (i = 0; i < len; i++)
+               uni_ucs4_to_utf8_c(decomp[i], output);
  }
  
  static void output_add_replacement_char(buffer_t *output)
@@ -375,11 +340,7 @@ int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
                 size -= bytes;
  
                 chr = uni_ucs4_to_titlecase(chr);
-               if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST)
-                       uni_ucs4_decompose_hangul_utf8(chr, output);
-               else if (uni_ucs4_decompose_uni(&chr) ||
-                        !uni_ucs4_decompose_multi_utf8(chr, output))
-                       uni_ucs4_to_utf8_c(chr, output);
+               uni_ucs4_decompose_one_utf8(chr, FALSE, output);
         }
         return ret;
  }
author	Stephan Bosch <stephan.bosch@open-xchange.com>
	Fri, 21 Mar 2025 15:06:05 +0000 (16:06 +0100)
committer	Stephan Bosch <stephan.bosch@open-xchange.com>
	Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/test-unichar.c		patch | blob | blame | history
src/lib/unichar.c		patch | blob | blame | history