#include "array.h"
#include "bsearch-insert-pos.h"
#include "unicode-data.h"
+#include "unicode-transform.h"
#include "unichar.h"
const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
return chr;
}
-#include "unicode-transform.c"
-
static void output_add_replacement_char(buffer_t *output)
{
if (output->used >= UTF8_REPLACEMENT_CHAR_LEN &&
int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
buffer_t *output)
{
+ struct unicode_rfc5051_context ctx;
const unsigned char *input = _input;
unichar_t chr;
int ret = 0;
+ unicode_rfc5051_init(&ctx);
+
while (size > 0) {
int bytes = uni_utf8_get_char_n(input, size, &chr);
if (bytes <= 0) {
input += bytes;
size -= bytes;
- chr = uni_ucs4_to_titlecase(chr);
- uni_ucs4_decompose_one_utf8(chr, FALSE, output);
+ const unichar_t *norm;
+ size_t norm_len;
+
+ norm_len = unicode_rfc5051_normalize(&ctx, chr, &norm);
+ uni_ucs4_to_utf8(norm, norm_len, output);
}
return ret;
}
/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+#include "lib.h"
+#include "unicode-data.h"
+#include "unicode-transform.h"
+
#define HANGUL_FIRST 0xac00
#define HANGUL_LAST 0xd7a3
return 3;
}
-static void uni_ucs4_decompose_hangul_utf8(uint32_t cp, buffer_t *output)
-{
- uint32_t buf[3];
- size_t len, i;
-
- len = unicode_hangul_decompose(cp, buf);
+/*
+ * RFC 5051 - Simple Unicode Collation Algorithm
+ */
- for (i = 0; i < len; i++)
- uni_ucs4_to_utf8_c(buf[i], output);
+void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx)
+{
+ i_zero(ctx); /* Put ctx into a known state before it is handed to
+  * unicode_rfc5051_normalize(). i_zero() is assumed to zero-fill *ctx
+  * - TODO confirm against the lib macro. */
}
-static void
-uni_ucs4_decompose_one_utf8(uint32_t cp, bool canonical, buffer_t *output)
+size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
+ uint32_t cp, const uint32_t **norm_r)
{
- const uint32_t *decomp;
- size_t len, i;
+ const struct unicode_code_point_data *cpd;
+ size_t len;
+ /* Normalize one code point: apply its simple titlecase mapping, then
+  * decompose it. Returns the number of resulting code points and stores
+  * a pointer to them in *norm_r. That pointer may refer to ctx->buffer,
+  * which a later call with the same ctx overwrites.
+  *
+  * A titlecase mapping of 0x0000 is treated as "no mapping" and leaves
+  * cp unchanged. */
+ cpd = unicode_code_point_get_data(cp);
+ if (cpd->simple_titlecase_mapping != 0x0000)
+ cp = cpd->simple_titlecase_mapping;
if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
- uni_ucs4_decompose_hangul_utf8(cp, output);
- return;
+ *norm_r = ctx->buffer; /* Hangul result lives in the scratch buffer */
+ return unicode_hangul_decompose(cp, ctx->buffer); /* fills the buffer
+  * and returns how many code points it wrote */
}
- len = unicode_code_point_get_full_decomposition(cp, canonical,
- &decomp);
+ len = unicode_code_point_get_full_decomposition(cp, FALSE, norm_r); /*
+  * FALSE fills the parameter the replaced code forwarded as
+  * "canonical", so a non-canonical decomposition is requested.
+  * *norm_r is presumably set by the callee when len > 0
+  * - TODO confirm. */
if (len == 0) {
- uni_ucs4_to_utf8_c(cp, output);
- return;
+ ctx->buffer[0] = cp; /* no decomposition: cp stands for itself */
+ *norm_r = ctx->buffer;
+ return 1;
}
-
- for (i = 0; i < len; i++)
- uni_ucs4_to_utf8_c(decomp[i], output);
+ return len;
}
--- /dev/null
+#ifndef UNICODE_NF_H
+#define UNICODE_NF_H
+/* NOTE(review): the guard is named UNICODE_NF_H while this header appears
+ * to be the one included as "unicode-transform.h" - confirm the name
+ * cannot collide with another header's guard. */
+/*
+ * RFC 5051 - Simple Unicode Collation Algorithm
+ *
+ * Per-code-point helper: unicode_rfc5051_init() prepares a context, and
+ * unicode_rfc5051_normalize() maps one code point to its simple titlecase
+ * form and then decomposes it. normalize() returns the number of
+ * resulting code points and stores a pointer to them in *norm_r; that
+ * pointer may refer to the context's own buffer, so copy or consume the
+ * result before calling normalize() again with the same context.
+ */
+
+struct unicode_rfc5051_context {
+ uint32_t buffer[3]; /* scratch output of unicode_rfc5051_normalize():
+  * holds a Hangul decomposition or the single
+  * code point that has no decomposition */
+};
+
+void unicode_rfc5051_init(struct unicode_rfc5051_context *ctx);
+size_t unicode_rfc5051_normalize(struct unicode_rfc5051_context *ctx,
+ uint32_t cp, const uint32_t **norm_r);
+
+#endif