From: Stephan Bosch <stephan.bosch@open-xchange.com>
Date: Fri, 21 Mar 2025 15:06:05 +0000 (+0100)
Subject: lib: unichar - Implement uni_utf8_to_decomposed_titlecase() using the new Unicode character database
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=33ccffd73ee89d041ec1deba721815d0e3d7200d;p=thirdparty%2Fdovecot%2Fcore.git

lib: unichar - Implement uni_utf8_to_decomposed_titlecase() using the new Unicode character database
---

diff --git a/src/lib/test-unichar.c b/src/lib/test-unichar.c
index 5026d3fefb..b0f39743a9 100644
--- a/src/lib/test-unichar.c
+++ b/src/lib/test-unichar.c
@@ -124,10 +124,28 @@ static void test_unichar_surrogates(void)
 static void test_unichar_collation(void)
 {
 	const char *in[] = {
+		/* Plain ASCII letters will be upper-cased */
+		"Plain!",
+		/* U+00FC " " U+00B3 */
 		"\xc3\xbc \xc2\xb3",
+		/* U+01C4 "enan" */
+		"\xC7\x84\x65\x6E\x61\x6E",
+		/* Bad characters will be substituted with replacement character */
+		"Bad \xFF Characters",
+		/* "c" U+00F4 "t" U+00E9 */
+		"c\xC3\xB4t\xC3\xA9",
 	};
 	const char *exp[] = {
+		/* Plain ASCII letters are upper-cased */
+		"PLAIN!",
+		/* "U" U+0308 " 3" */
 		"U\xcc\x88 3",
+		/* "Dz" U+030C "ENAN" */
+		"\x44\x7A\xCC\x8C\x45\x4E\x41\x4E",
+		/* Bad characters are substituted with replacement character */
+		"BAD \xEF\xBF\xBD CHARACTERS",
+		/* "CO" U+0302 "TE" U+0301 */
+		"CO\xCC\x82TE\xCC\x81",
 	};
 
 	unsigned int n_in = N_ELEMENTS(in), n_exp = N_ELEMENTS(exp), i;
diff --git a/src/lib/unichar.c b/src/lib/unichar.c
index 2a59169413..8248d5c0d4 100644
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -6,8 +6,6 @@
 #include "unicode-data.h"
 #include "unichar.h"
 
-#include "unicodemap.c"
-
 #define HANGUL_FIRST 0xac00
 #define HANGUL_LAST 0xd7a3
 
@@ -232,18 +230,6 @@ unsigned int uni_utf8_partial_strlen_n(const void *_input, size_t size,
 	return len;
 }
 
-static bool uint16_find(const uint16_t *data, unsigned int count,
-			uint16_t value, unsigned int *idx_r)
-{
-	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
-}
-
-static bool uint32_find(const uint32_t *data, unsigned int count,
-			uint32_t value, unsigned int *idx_r)
-{
-	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
-}
-
 unichar_t uni_ucs4_to_titlecase(unichar_t chr)
 {
 	const struct unicode_code_point_data *cp_data =
@@ -254,31 +240,6 @@ unichar_t uni_ucs4_to_titlecase(unichar_t chr)
 	return chr;
 }
 
-static bool uni_ucs4_decompose_uni(unichar_t *chr)
-{
-	unsigned int idx;
-
-	if (*chr <= 0xff) {
-		if (uni8_decomp_map[*chr] == *chr)
-			return FALSE;
-		*chr = uni8_decomp_map[*chr];
-	} else if (*chr <= 0xffff) {
-		if (*chr < uni16_decomp_keys[0])
-			return FALSE;
-
-		if (!uint16_find(uni16_decomp_keys,
-				 N_ELEMENTS(uni16_decomp_keys), *chr, &idx))
-			return FALSE;
-		*chr = uni16_decomp_values[idx];
-	} else {
-		if (!uint32_find(uni32_decomp_keys,
-				 N_ELEMENTS(uni32_decomp_keys), *chr, &idx))
-			return FALSE;
-		*chr = uni32_decomp_values[idx];
-	}
-	return TRUE;
-}
-
 static size_t uni_ucs4_decompose_hangul(unichar_t chr, unichar_t buf[3])
 {
 	/* The Unicode Standard, Section 3.12.2:
@@ -325,22 +286,26 @@ static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
 		uni_ucs4_to_utf8_c(buf[i], output);
 }
 
-static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
+static void
+uni_ucs4_decompose_one_utf8(unichar_t chr, bool canonical, buffer_t *output)
 {
-	const uint32_t *value;
-	unsigned int idx;
+	const unichar_t *decomp;
+	size_t len, i;
 
-	if (chr < multidecomp_keys[0] || chr > 0xffff)
-		return FALSE;
+	if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) {
+		uni_ucs4_decompose_hangul_utf8(chr, output);
+		return;
+	}
 
-	if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
-			 chr, &idx))
-		return FALSE;
+	len = unicode_code_point_get_full_decomposition(chr, canonical,
+							&decomp);
+	if (len == 0) {
+		uni_ucs4_to_utf8_c(chr, output);
+		return;
+	}
 
-	value = &multidecomp_values[multidecomp_offsets[idx]];
-	for (; *value != 0; value++)
-		uni_ucs4_to_utf8_c(*value, output);
-	return TRUE;
+	for (i = 0; i < len; i++)
+		uni_ucs4_to_utf8_c(decomp[i], output);
 }
 
 static void output_add_replacement_char(buffer_t *output)
@@ -375,11 +340,7 @@ int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
 		size -= bytes;
 
 		chr = uni_ucs4_to_titlecase(chr);
-		if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST)
-			uni_ucs4_decompose_hangul_utf8(chr, output);
-		else if (uni_ucs4_decompose_uni(&chr) ||
-			 !uni_ucs4_decompose_multi_utf8(chr, output))
-			uni_ucs4_to_utf8_c(chr, output);
+		uni_ucs4_decompose_one_utf8(chr, FALSE, output);
 	}
 	return ret;
 }