From: Stephan Bosch
Date: Fri, 21 Mar 2025 04:59:48 +0000 (+0100)
Subject: lib: unicode-transform - Implement RFC5051 API in UTF32 and use it
X-Git-Tag: 2.4.2~615
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=54ad4a9b2788a116631b897614cdf18fc0fe572a;p=thirdparty%2Fdovecot%2Fcore.git

lib: unicode-transform - Implement RFC5051 API in UTF32 and use it
---

Review notes (not part of the commit; patch tools ignore this section):

  * Line structure was restored from a whitespace-collapsed copy. Blank
    context lines were re-inserted so that every hunk matches the line
    counts in its @@ header; tab indentation is reconstructed and should be
    checked against the upstream commit before applying.
  * unichar.c no longer textually includes unicode-transform.c. That file
    is added to liblib_la_SOURCES and unicode-transform.h to headers.
  * unicode_rfc5051_normalize() first replaces cp with its
    simple_titlecase_mapping when that is non-zero. It then returns the
    Hangul decomposition for 0xac00..0xd7a3, otherwise the full
    decomposition requested with canonical=FALSE, otherwise cp itself.
  * In the Hangul and "no decomposition" cases *norm_r points into
    ctx->buffer, which the same function writes on later calls, so the
    result must be consumed before the next call on that context.
  * NOTE(review): the include guard in unicode-transform.h is UNICODE_NF_H,
    which does not match the file name - confirm whether that is intended.
  * NOTE(review): unicode-transform.h uses uint32_t and size_t without
    including anything; presumably callers include lib.h first, as
    unicode-transform.c does - confirm.

diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index 3d848e37c4..647e45c673 100644
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -219,6 +219,7 @@ liblib_la_SOURCES = \
 	unicode-data-types.c \
 	unicode-data-tables.c \
 	unicode-data.c \
+	unicode-transform.c \
 	uri-util.c \
 	utc-offset.c \
 	utc-mktime.c \
@@ -383,6 +384,7 @@ headers = \
 	unicode-data-types.h \
 	unicode-data-tables.h \
 	unicode-data.h \
+	unicode-transform.h \
 	uri-util.h \
 	utc-offset.h \
 	utc-mktime.h \
diff --git a/src/lib/unichar.c b/src/lib/unichar.c
index e1aec6371e..209d1b1e9d 100644
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -4,6 +4,7 @@
 #include "array.h"
 #include "bsearch-insert-pos.h"
 #include "unicode-data.h"
+#include "unicode-transform.h"
 #include "unichar.h"
 
 const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
@@ -237,8 +238,6 @@ unichar_t uni_ucs4_to_titlecase(unichar_t chr)
 	return chr;
 }
 
-#include "unicode-transform.c"
-
 static void output_add_replacement_char(buffer_t *output)
 {
 	if (output->used >= UTF8_REPLACEMENT_CHAR_LEN &&
@@ -254,10 +253,13 @@ static void output_add_replacement_char(buffer_t *output)
 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
 				     buffer_t *output)
 {
+	struct unicode_rfc5051_context ctx;
 	const unsigned char *input = _input;
 	unichar_t chr;
 	int ret = 0;
 
+	unicode_rfc5051_init(&ctx);
+
 	while (size > 0) {
 		int bytes = uni_utf8_get_char_n(input, size, &chr);
 		if (bytes <= 0) {
@@ -270,8 +272,11 @@ int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
 		input += bytes;
 		size -= bytes;
 
-		chr = uni_ucs4_to_titlecase(chr);
-		uni_ucs4_decompose_one_utf8(chr, FALSE, output);
+		const unichar_t *norm;
+		size_t norm_len;
+
+		norm_len = unicode_rfc5051_normalize(&ctx, chr, &norm);
+		uni_ucs4_to_utf8(norm, norm_len, output);
 	}
 	return ret;
 }
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c
index 529a721299..723da5bec8 100644
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -1,5 +1,9 @@
 /* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
 
+#include "lib.h"
+#include "unicode-data.h"
+#include "unicode-transform.h"
+
 #define HANGUL_FIRST 0xac00
 #define HANGUL_LAST 0xd7a3
 
@@ -42,35 +46,35 @@ static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
 	return 3;
 }
 
-static void uni_ucs4_decompose_hangul_utf8(uint32_t cp, buffer_t *output)
-{
-	uint32_t buf[3];
-	size_t len, i;
-
-	len = unicode_hangul_decompose(cp, buf);
+/*
+ * RFC 5051 - Simple Unicode Collation Algorithm
+ */
 
-	for (i = 0; i < len; i++)
-		uni_ucs4_to_utf8_c(buf[i], output);
+void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx)
+{
+	i_zero(ctx);
 }
 
-static void
-uni_ucs4_decompose_one_utf8(uint32_t cp, bool canonical, buffer_t *output)
+size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
+				 uint32_t cp, const uint32_t **norm_r)
 {
-	const uint32_t *decomp;
-	size_t len, i;
+	const struct unicode_code_point_data *cpd;
+	size_t len;
+
+	cpd = unicode_code_point_get_data(cp);
+	if (cpd->simple_titlecase_mapping != 0x0000)
+		cp = cpd->simple_titlecase_mapping;
 
 	if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
-		uni_ucs4_decompose_hangul_utf8(cp, output);
-		return;
+		*norm_r = ctx->buffer;
+		return unicode_hangul_decompose(cp, ctx->buffer);
 	}
 
-	len = unicode_code_point_get_full_decomposition(cp, canonical,
-							&decomp);
+	len = unicode_code_point_get_full_decomposition(cp, FALSE, norm_r);
 	if (len == 0) {
-		uni_ucs4_to_utf8_c(cp, output);
-		return;
+		ctx->buffer[0] = cp;
+		*norm_r = ctx->buffer;
+		return 1;
 	}
-
-	for (i = 0; i < len; i++)
-		uni_ucs4_to_utf8_c(decomp[i], output);
+	return len;
 }
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h
new file mode 100644
index 0000000000..ce407a74de
--- /dev/null
+++ b/src/lib/unicode-transform.h
@@ -0,0 +1,16 @@
+#ifndef UNICODE_NF_H
+#define UNICODE_NF_H
+
+/*
+ * RFC 5051 - Simple Unicode Collation Algorithm
+ */
+
+struct unicode_rfc5051_context {
+	uint32_t buffer[3];
+};
+
+void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx);
+size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
+				 uint32_t cp, const uint32_t **norm_r);
+
+#endif