git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib: unicode-transform - Implement RFC5051 API in UTF32 and use it
author: Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 21 Mar 2025 04:59:48 +0000 (05:59 +0100)
committer: Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/Makefile.am
src/lib/unichar.c
src/lib/unicode-transform.c
src/lib/unicode-transform.h [new file with mode: 0644]

diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index 3d848e37c40622b87f65053ec942eaa07a243005..647e45c6737cceac887fe1d8ce2bc3fb5e5ca793 100644 (file)
@@ -219,6 +219,7 @@ liblib_la_SOURCES = \
        unicode-data-types.c \
        unicode-data-tables.c \
        unicode-data.c \
+       unicode-transform.c \
        uri-util.c \
        utc-offset.c \
        utc-mktime.c \
@@ -383,6 +384,7 @@ headers = \
        unicode-data-types.h \
        unicode-data-tables.h \
        unicode-data.h \
+       unicode-transform.h \
        uri-util.h \
        utc-offset.h \
        utc-mktime.h \
diff --git a/src/lib/unichar.c b/src/lib/unichar.c
index e1aec6371e005cec98e5c67b4fa94f6242c1cd8e..209d1b1e9d1bba16d8b26354de74d956f72aac0c 100644 (file)
@@ -4,6 +4,7 @@
 #include "array.h"
 #include "bsearch-insert-pos.h"
 #include "unicode-data.h"
+#include "unicode-transform.h"
 #include "unichar.h"
 
 const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
@@ -237,8 +238,6 @@ unichar_t uni_ucs4_to_titlecase(unichar_t chr)
        return chr;
 }
 
-#include "unicode-transform.c"
-
 static void output_add_replacement_char(buffer_t *output)
 {
        if (output->used >= UTF8_REPLACEMENT_CHAR_LEN &&
@@ -254,10 +253,13 @@ static void output_add_replacement_char(buffer_t *output)
 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
                                     buffer_t *output)
 {
+       struct unicode_rfc5051_context ctx;
        const unsigned char *input = _input;
        unichar_t chr;
        int ret = 0;
 
+       unicode_rfc5051_init(&ctx);
+
        while (size > 0) {
                int bytes = uni_utf8_get_char_n(input, size, &chr);
                if (bytes <= 0) {
@@ -270,8 +272,11 @@ int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
                input += bytes;
                size -= bytes;
 
-               chr = uni_ucs4_to_titlecase(chr);
-               uni_ucs4_decompose_one_utf8(chr, FALSE, output);
+               const unichar_t *norm;
+               size_t norm_len;
+
+               norm_len = unicode_rfc5051_normalize(&ctx, chr, &norm);
+               uni_ucs4_to_utf8(norm, norm_len, output);
        }
        return ret;
 }
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c
index 529a7212996c84d7ebbc7ba00d811a5c3ee70133..723da5bec82d600ff312d75ff9e9299b887fb8d6 100644 (file)
@@ -1,5 +1,9 @@
 /* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
 
+#include "lib.h"
+#include "unicode-data.h"
+#include "unicode-transform.h"
+
 #define HANGUL_FIRST 0xac00
 #define HANGUL_LAST 0xd7a3
 
@@ -42,35 +46,35 @@ static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
        return 3;
 }
 
-static void uni_ucs4_decompose_hangul_utf8(uint32_t cp, buffer_t *output)
-{
-       uint32_t buf[3];
-       size_t len, i;
-
-       len = unicode_hangul_decompose(cp, buf);
+/*
+ * RFC 5051 - Simple Unicode Collation Algorithm
+ */
 
-       for (i = 0; i < len; i++)
-               uni_ucs4_to_utf8_c(buf[i], output);
+void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx)
+{
+       i_zero(ctx);
 }
 
-static void
-uni_ucs4_decompose_one_utf8(uint32_t cp, bool canonical, buffer_t *output)
+size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
+                                uint32_t cp, const uint32_t **norm_r)
 {
-       const uint32_t *decomp;
-       size_t len, i;
+       const struct unicode_code_point_data *cpd;
+       size_t len;
+
+       cpd = unicode_code_point_get_data(cp);
+       if (cpd->simple_titlecase_mapping != 0x0000)
+               cp = cpd->simple_titlecase_mapping;
 
        if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
-               uni_ucs4_decompose_hangul_utf8(cp, output);
-               return;
+               *norm_r = ctx->buffer;
+               return unicode_hangul_decompose(cp, ctx->buffer);
        }
 
-       len = unicode_code_point_get_full_decomposition(cp, canonical,
-                                                       &decomp);
+       len = unicode_code_point_get_full_decomposition(cp, FALSE, norm_r);
        if (len == 0) {
-               uni_ucs4_to_utf8_c(cp, output);
-               return;
+               ctx->buffer[0] = cp;
+               *norm_r = ctx->buffer;
+               return 1;
        }
-
-       for (i = 0; i < len; i++)
-               uni_ucs4_to_utf8_c(decomp[i], output);
+       return len;
 }
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h
new file mode 100644 (file)
index 0000000..ce407a7
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef UNICODE_NF_H
+#define UNICODE_NF_H
+
+/*
+ * RFC 5051 - Simple Unicode Collation Algorithm
+ */
+
+struct unicode_rfc5051_context {
+       uint32_t buffer[3];
+};
+
+void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx);
+size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
+                                uint32_t cp, const uint32_t **norm_r);
+
+#endif