lib: unicode-transform - Implement RFC5051 API in UTF32 and use it

author Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 21 Mar 2025 04:59:48 +0000 (05:59 +0100)
committer Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am

index 3d848e37c40622b87f65053ec942eaa07a243005..647e45c6737cceac887fe1d8ce2bc3fb5e5ca793 100644 (file)
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -219,6 +219,7 @@ liblib_la_SOURCES = \
         unicode-data-types.c \
         unicode-data-tables.c \
         unicode-data.c \
+       unicode-transform.c \
         uri-util.c \
         utc-offset.c \
         utc-mktime.c \
@@ -383,6 +384,7 @@ headers = \
         unicode-data-types.h \
         unicode-data-tables.h \
         unicode-data.h \
+       unicode-transform.h \
         uri-util.h \
         utc-offset.h \
         utc-mktime.h \
diff --git a/src/lib/unichar.c b/src/lib/unichar.c

index e1aec6371e005cec98e5c67b4fa94f6242c1cd8e..209d1b1e9d1bba16d8b26354de74d956f72aac0c 100644 (file)
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -4,6 +4,7 @@
  #include "array.h"
  #include "bsearch-insert-pos.h"
  #include "unicode-data.h"
+#include "unicode-transform.h"
  #include "unichar.h"
  
  const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
@@ -237,8 +238,6 @@ unichar_t uni_ucs4_to_titlecase(unichar_t chr)
         return chr;
  }
  
-#include "unicode-transform.c"
-
  static void output_add_replacement_char(buffer_t *output)
  {
         if (output->used >= UTF8_REPLACEMENT_CHAR_LEN &&
@@ -254,10 +253,13 @@ static void output_add_replacement_char(buffer_t *output)
  int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
                                      buffer_t *output)
  {
+       struct unicode_rfc5051_context ctx;
         const unsigned char *input = _input;
         unichar_t chr;
         int ret = 0;
  
+       unicode_rfc5051_init(&ctx);
+
         while (size > 0) {
                 int bytes = uni_utf8_get_char_n(input, size, &chr);
                 if (bytes <= 0) {
@@ -270,8 +272,11 @@ int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
                 input += bytes;
                 size -= bytes;
  
-               chr = uni_ucs4_to_titlecase(chr);
-               uni_ucs4_decompose_one_utf8(chr, FALSE, output);
+               const unichar_t *norm;
+               size_t norm_len;
+
+               norm_len = unicode_rfc5051_normalize(&ctx, chr, &norm);
+               uni_ucs4_to_utf8(norm, norm_len, output);
         }
         return ret;
  }
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c

index 529a7212996c84d7ebbc7ba00d811a5c3ee70133..723da5bec82d600ff312d75ff9e9299b887fb8d6 100644 (file)
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -1,5 +1,9 @@
  /* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
  
+#include "lib.h"
+#include "unicode-data.h"
+#include "unicode-transform.h"
+
  #define HANGUL_FIRST 0xac00
  #define HANGUL_LAST 0xd7a3
  
@@ -42,35 +46,35 @@ static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
         return 3;
  }
  
-static void uni_ucs4_decompose_hangul_utf8(uint32_t cp, buffer_t *output)
-{
-       uint32_t buf[3];
-       size_t len, i;
-
-       len = unicode_hangul_decompose(cp, buf);
+/*
+ * RFC 5051 - Simple Unicode Collation Algorithm
+ */
  
-       for (i = 0; i < len; i++)
-               uni_ucs4_to_utf8_c(buf[i], output);
+void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx)
+{
+       i_zero(ctx);
  }
  
-static void
-uni_ucs4_decompose_one_utf8(uint32_t cp, bool canonical, buffer_t *output)
+size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
+                                uint32_t cp, const uint32_t **norm_r)
  {
-       const uint32_t *decomp;
-       size_t len, i;
+       const struct unicode_code_point_data *cpd;
+       size_t len;
+
+       cpd = unicode_code_point_get_data(cp);
+       if (cpd->simple_titlecase_mapping != 0x0000)
+               cp = cpd->simple_titlecase_mapping;
  
         if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
-               uni_ucs4_decompose_hangul_utf8(cp, output);
-               return;
+               *norm_r = ctx->buffer;
+               return unicode_hangul_decompose(cp, ctx->buffer);
         }
  
-       len = unicode_code_point_get_full_decomposition(cp, canonical,
-                                                       &decomp);
+       len = unicode_code_point_get_full_decomposition(cp, FALSE, norm_r);
         if (len == 0) {
-               uni_ucs4_to_utf8_c(cp, output);
-               return;
+               ctx->buffer[0] = cp;
+               *norm_r = ctx->buffer;
+               return 1;
         }
-
-       for (i = 0; i < len; i++)
-               uni_ucs4_to_utf8_c(decomp[i], output);
+       return len;
  }
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h

new file mode 100644 (file)

index 0000000..ce407a7
--- /dev/null
+++ b/src/lib/unicode-transform.h
@@ -0,0 +1,16 @@
+#ifndef UNICODE_NF_H
+#define UNICODE_NF_H
+
+/*
+ * RFC 5051 - Simple Unicode Collation Algorithm
+ */
+
+struct unicode_rfc5051_context {
+       uint32_t buffer[3];
+};
+
+void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx);
+size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
+                                uint32_t cp, const uint32_t **norm_r);
+
+#endif
author	Stephan Bosch <stephan.bosch@open-xchange.com>
	Fri, 21 Mar 2025 04:59:48 +0000 (05:59 +0100)
committer	Stephan Bosch <stephan.bosch@open-xchange.com>
	Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/Makefile.am		patch | blob | blame | history
src/lib/unichar.c		patch | blob | blame | history
src/lib/unicode-transform.c		patch | blob | blame | history
src/lib/unicode-transform.h	[new file with mode: 0644]	patch | blob