buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN);
}
+/* Decode the UTF-8 input one code point at a time, feed each code point into
+   the transform chain that starts at trans, and append whatever reaches the
+   end of the chain to output, encoded as UTF-8 again.
+
+   A temporary buffer sink is chained after the last transform for the
+   duration of the call and is detached again before returning, so the caller's
+   chain never keeps a pointer to this function's stack.
+
+   Invalid input bytes are skipped one at a time. Each run of consecutive
+   invalid bytes is represented by a single UNICODE_REPLACEMENT_CHAR.
+
+   Returns 0 when all input decoded cleanly. Returns -1 when invalid input was
+   encountered (output is still produced and *error_r is NULL), or when a
+   transform reported an input error (*error_r is set). A failing final flush
+   is treated as a programming error and panics. */
+int uni_utf8_run_transform(const void *_input, size_t size,
+			   struct unicode_transform *trans, buffer_t *output,
+			   const char **error_r)
+{
+	struct unicode_transform *trans_last =
+		unicode_transform_get_last(trans);
+	struct unicode_buffer_sink sink;
+	const unsigned char *input = _input;
+	unichar_t chr = 0;
+	ssize_t sret;
+	bool got_chr = FALSE, bad_cp = FALSE;
+	int ret = 0, fret;
+
+	unicode_buffer_sink_init(&sink, output);
+	unicode_transform_chain(trans_last, &sink.transform);
+
+	while (size > 0 || got_chr) {
+		if (!got_chr) {
+			int bytes = uni_utf8_get_char_n(input, size, &chr);
+			if (bytes > 0) {
+				input += bytes;
+				size -= bytes;
+				bad_cp = FALSE;
+			} else {
+				/* Invalid input. try the next byte. */
+				ret = -1;
+				input++; size--;
+				if (bad_cp) {
+					/* This run of invalid bytes already
+					   produced its replacement character;
+					   there is no new code point to feed. */
+					continue;
+				}
+				chr = UNICODE_REPLACEMENT_CHAR;
+				bad_cp = TRUE;
+			}
+			/* chr now holds a code point that must reach the
+			   transform before more input is decoded. */
+			got_chr = TRUE;
+		}
+
+		sret = unicode_transform_input(trans, &chr, 1, error_r);
+		if (sret < 0) {
+			/* Detach the stack-allocated sink before leaving. */
+			trans_last->next = NULL;
+			return -1;
+		}
+		if (sret > 0)
+			got_chr = FALSE;
+		/* sret == 0: the code point was not accepted; keep it and
+		   offer it again on the next iteration. */
+	}
+
+	fret = unicode_transform_flush(trans, error_r);
+	if (fret < 0)
+		i_panic("unicode_transform_flush(): %s", *error_r);
+	i_assert(fret == 1);
+
+	/* Detach the stack-allocated sink so the caller's chain does not keep
+	   a dangling pointer and can be chained again later. */
+	trans_last->next = NULL;
+	return ret;
+}
+
int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
buffer_t *output)
{
#define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
#define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)
+struct unicode_transform;
+
typedef uint32_t unichar_t;
ARRAY_DEFINE_TYPE(unichars, unichar_t);
/* Return given character in titlecase. */
unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
+/* Run the UTF-8 string through the provided Unicode transform chain and
+   append the result to the output buffer, again encoded as UTF-8. Invalid
+   input bytes are skipped and represented by the Unicode replacement
+   character. Returns 0 if the input was valid, or -1 if the input was invalid
+   or a transform failed; in the latter case *error_r is set. */
+int uni_utf8_run_transform(const void *_input, size_t size,
+ struct unicode_transform *trans, buffer_t *output,
+ const char **error_r);
+
/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
output buffer. Returns 0 if ok, -1 if input was invalid. This generates
output that's compatible with i;unicode-casemap comparator. Invalid input
/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
#include "lib.h"
+#include "unichar.h"
#include "unicode-data.h"
#include "unicode-transform.h"
#define HANGUL_FIRST 0xac00
#define HANGUL_LAST 0xd7a3
+/*
+ * Transform
+ */
+
+/* Pass out_len code points from out (together with out_data, presumably the
+   matching per-code-point data - confirm with callers) to the input handler
+   of the transform chained directly after trans. A next transform with an
+   input handler must exist.
+
+   Returns whatever the next transform's input handler returned: the number
+   of code points it accepted (never more than out_len), or a negative value
+   with *error_r set. */
+ssize_t uniform_transform_forward(
+	struct unicode_transform *trans, const uint32_t *out,
+	const struct unicode_code_point_data *const *out_data, size_t out_len,
+	const char **error_r)
+{
+	struct unicode_transform *next = trans->next;
+	struct unicode_transform_buffer fwd = {
+		.cp = out,
+		.cp_data = out_data,
+		.cp_count = out_len,
+	};
+	ssize_t accepted;
+
+	i_assert(next != NULL);
+	i_assert(next->def != NULL);
+	i_assert(next->def->input != NULL);
+	accepted = next->def->input(next, &fwd, error_r);
+
+	/* A failure must come with an error message, and a handler can never
+	   accept more than it was offered. */
+	i_assert(accepted >= 0 || *error_r != NULL);
+	i_assert(accepted <= (ssize_t)out_len);
+	return accepted;
+}
+
+/* Feed in_len code points from in into the transform trans, which is the
+   head of a chain.
+
+   The head's input handler is called repeatedly until everything has been
+   accepted. When it accepts nothing, every transform in the chain except the
+   last one gets a non-final flush (finished=FALSE) and the input is offered
+   once more; if it is still refused, the loop gives up.
+
+   Returns the number of code points accepted (possibly fewer than in_len),
+   or -1 with *error_r set when an input or flush handler fails. *error_r is
+   reset to NULL on entry. */
+ssize_t unicode_transform_input(struct unicode_transform *trans,
+				const uint32_t *in, size_t in_len,
+				const char **error_r)
+{
+	struct unicode_transform_buffer pending = {
+		.cp = in,
+		.cp_count = in_len,
+	};
+	struct unicode_transform *tp;
+	size_t accepted_total = 0;
+	bool flushed = FALSE;
+
+	*error_r = NULL;
+
+	while (pending.cp_count > 0) {
+		ssize_t sret;
+
+		i_assert(trans->def->input != NULL);
+		sret = trans->def->input(trans, &pending, error_r);
+		if (sret < 0) {
+			i_assert(*error_r != NULL);
+			return -1;
+		}
+		if (sret > 0) {
+			/* Progress: skip what was accepted and allow another
+			   flush attempt later on. */
+			i_assert((size_t)sret <= pending.cp_count);
+			pending.cp += sret;
+			pending.cp_count -= sret;
+			accepted_total += sret;
+			flushed = FALSE;
+			continue;
+		}
+
+		/* Nothing was accepted. If flushing was already tried since
+		   the last progress, stop here. */
+		if (flushed)
+			break;
+
+		/* Give every transform except the last one a chance to push
+		   buffered data downstream. */
+		for (tp = trans; tp->next != NULL; tp = tp->next) {
+			if (tp->def->flush == NULL)
+				continue;
+			if (tp->def->flush(tp, FALSE, error_r) < 0) {
+				i_assert(*error_r != NULL);
+				return -1;
+			}
+		}
+		flushed = TRUE;
+	}
+
+	return accepted_total;
+}
+
+/* Perform the final flush of the transform chain starting at trans.
+
+   The chain is walked repeatedly. The first transform that is not yet
+   finished is flushed with finished=TRUE, all transforms after it with
+   finished=FALSE. A transform without a flush handler, or whose flush handler
+   returns a positive value, counts as progress; when that transform is the
+   current head, the head moves on to the next transform.
+
+   Returns 1 once the head has moved past the end of the chain, 0 when a full
+   pass over the chain made no progress, and -1 with *error_r set when a flush
+   handler fails. *error_r is reset to NULL on entry. */
+int unicode_transform_flush(struct unicode_transform *trans,
+			    const char **error_r)
+{
+	*error_r = NULL;
+
+	while (trans != NULL) {
+		struct unicode_transform *tp;
+		bool progress = FALSE;
+
+		for (tp = trans; tp != NULL; tp = tp->next) {
+			bool is_head = (tp == trans);
+			bool done = TRUE;
+
+			if (tp->def->flush != NULL) {
+				int ret = tp->def->flush(tp, is_head, error_r);
+
+				if (ret < 0) {
+					i_assert(*error_r != NULL);
+					return -1;
+				}
+				done = (ret > 0);
+			}
+			if (!done)
+				continue;
+
+			progress = TRUE;
+			/* Advancing the head here makes the following
+			   transform the head for the rest of this pass. */
+			if (is_head)
+				trans = trans->next;
+		}
+		if (!progress)
+			return 0;
+	}
+	return 1;
+}
+
+/* Buffer Sink */
+
+/* Input handler of the buffer sink; defined below. */
+static ssize_t
+unicode_buffer_sink_input(struct unicode_transform *trans,
+ const struct unicode_transform_buffer *buf,
+ const char **error_r);
+
+/* Transform definition of the buffer sink. It only has an input handler;
+   no flush handler is provided. */
+static const struct unicode_transform_def unicode_buffer_sink_def = {
+ .input = unicode_buffer_sink_input,
+};
+
+/* Initialize sink as a transform whose input handler writes the code points
+   it receives to buffer as UTF-8. The sink is zeroed first, so it starts out
+   without a next transform. */
+void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
+			      buffer_t *buffer)
+{
+	i_zero(sink);
+	sink->buffer = buffer;
+	unicode_transform_init(&sink->transform, &unicode_buffer_sink_def);
+}
+
+/* Input handler of the buffer sink: pass all code points in buf to
+   uni_ucs4_to_utf8() together with the sink's buffer and report every one of
+   them as accepted. Never sets *error_r. */
+static ssize_t
+unicode_buffer_sink_input(struct unicode_transform *trans,
+			  const struct unicode_transform_buffer *buf,
+			  const char **error_r ATTR_UNUSED)
+{
+	struct unicode_buffer_sink *sink =
+		container_of(trans, struct unicode_buffer_sink, transform);
+	size_t count = buf->cp_count;
+
+	uni_ucs4_to_utf8(buf->cp, count, sink->buffer);
+	return count;
+}
+
+/* Static Array Sink */
+
+/* Input handler of the static array sink; defined below. */
+static ssize_t
+unicode_static_array_sink_input(struct unicode_transform *trans,
+ const struct unicode_transform_buffer *buf,
+ const char **error_r);
+
+/* Transform definition of the static array sink. It only has an input
+   handler; no flush handler is provided. */
+static const struct unicode_transform_def unicode_static_array_sink_def = {
+ .input = unicode_static_array_sink_input,
+};
+
+/* Initialize sink as a transform that copies the code points it receives
+   into the caller-provided array of array_size elements. *array_pos is the
+   write position: it is read and advanced by the sink's input handler and is
+   not modified here, so the caller must set it up. */
+void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
+				    uint32_t *array, size_t array_size,
+				    size_t *array_pos)
+{
+	i_zero(sink);
+	sink->array = array;
+	sink->array_size = array_size;
+	sink->array_pos = array_pos;
+	unicode_transform_init(&sink->transform,
+			       &unicode_static_array_sink_def);
+}
+
+/* Input handler of the static array sink: copy all code points in buf to the
+   array at the current write position and advance that position. When the
+   code points do not all fit, nothing is copied and -1 is returned with
+   *error_r set to "Output overflow". Otherwise all code points are reported
+   as accepted. */
+static ssize_t
+unicode_static_array_sink_input(struct unicode_transform *trans,
+				const struct unicode_transform_buffer *buf,
+				const char **error_r)
+{
+	struct unicode_static_array_sink *sink =
+		container_of(trans, struct unicode_static_array_sink,
+			     transform);
+	size_t *pos = sink->array_pos;
+	size_t count = buf->cp_count;
+
+	if (*pos + count > sink->array_size) {
+		*error_r = "Output overflow";
+		return -1;
+	}
+	memcpy(sink->array + *pos, buf->cp, count * sizeof(*buf->cp));
+	*pos += count;
+	return count;
+}
+
/*
* Hangul syllable (de)composition
*/
#ifndef UNICODE_NF_H
#define UNICODE_NF_H
+/*
+ * Transform API
+ */
+
+struct unicode_transform;
+
+/* A run of code points handed to a transform's input handler. */
+struct unicode_transform_buffer {
+ /* The code points themselves */
+ const uint32_t *cp;
+ /* Code point data pointers as provided to uniform_transform_forward();
+    NULL when the input comes from unicode_transform_input().
+    NOTE(review): presumably one entry per code point in cp - confirm. */
+ const struct unicode_code_point_data *const *cp_data;
+ /* Number of code points in cp */
+ size_t cp_count;
+};
+
+/* Handlers that implement a transform. */
+struct unicode_transform_def {
+ /* Consume code points from buf. Returns the number of code points
+    accepted (0 up to buf->cp_count), or a negative value with *error_r
+    set on failure. */
+ ssize_t (*input)(struct unicode_transform *trans,
+ const struct unicode_transform_buffer *buf,
+ const char **error_r);
+ /* Optional (may be NULL). unicode_transform_input() calls this with
+    finished=FALSE; unicode_transform_flush() passes finished=TRUE only
+    to the first transform in the chain that has not finished yet.
+    Returns a negative value with *error_r set on failure. In
+    unicode_transform_flush() a positive return value counts as the
+    transform being finished and 0 as not finished yet. */
+ int (*flush)(struct unicode_transform *trans, bool finished,
+ const char **error_r);
+};
+
+/* One element of a transform chain. The sinks in this file embed it as
+   their `transform` member and recover themselves with container_of(). */
+struct unicode_transform {
+ /* Handlers implementing this transform */
+ const struct unicode_transform_def *def;
+ /* Next transform in the chain; NULL for the last one */
+ struct unicode_transform *next;
+};
+
+/* Reset trans to an unchained transform (next == NULL) implemented by def. */
+static inline void
+unicode_transform_init(struct unicode_transform *trans,
+		       const struct unicode_transform_def *def)
+{
+	*trans = (struct unicode_transform){ .def = def };
+}
+
+/* Make next the successor of trans. trans must not have a successor yet. */
+static inline void
+unicode_transform_chain(struct unicode_transform *trans,
+			struct unicode_transform *next)
+{
+	i_assert(trans->next == NULL);
+
+	trans->next = next;
+}
+
+/* Return the last transform of the chain that trans belongs to, i.e. the
+   first one reachable from trans that has no successor. trans must not be
+   NULL. */
+static inline struct unicode_transform *
+unicode_transform_get_last(struct unicode_transform *trans)
+{
+	struct unicode_transform *last;
+
+	for (last = trans; last->next != NULL; last = last->next)
+		;
+	return last;
+}
+
+/* Pass out_len code points in out, along with out_data, to the input handler
+   of the transform chained after trans; a next transform must exist. Returns
+   the number of code points it accepted (at most out_len), or a negative
+   value with *error_r set. */
+ssize_t uniform_transform_forward(
+ struct unicode_transform *trans, const uint32_t *out,
+ const struct unicode_code_point_data *const *out_data, size_t out_len,
+ const char **error_r);
+
+/* Feed in_len code points from in into the head transform of a chain.
+   Returns the number of code points accepted, which can be less than in_len
+   when the transform refuses more input even after the chain was flushed, or
+   -1 with *error_r set on failure. *error_r is reset to NULL on entry. */
+ssize_t unicode_transform_input(struct unicode_transform *trans,
+ const uint32_t *in, size_t in_len,
+ const char **error_r);
+/* Perform the final flush of the transform chain starting at trans. Returns
+   1 when every transform finished, 0 when a pass over the chain made no
+   progress, or -1 with *error_r set on failure. *error_r is reset to NULL on
+   entry. */
+int unicode_transform_flush(struct unicode_transform *trans,
+ const char **error_r);
+
+/* Buffer Sink */
+
+/* Transform that writes the code points it receives to a buffer as UTF-8. */
+struct unicode_buffer_sink {
+ /* Chain element; chain this after the last real transform */
+ struct unicode_transform transform;
+ /* Destination buffer the UTF-8 output is written to */
+ buffer_t *buffer;
+};
+
+/* Initialize sink so that its input is written to buffer as UTF-8. */
+void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
+ buffer_t *buffer);
+
+/* Static Array Sink */
+
+/* Transform that copies the code points it receives into a fixed-size,
+   caller-provided array. */
+struct unicode_static_array_sink {
+ /* Chain element; chain this after the last real transform */
+ struct unicode_transform transform;
+ /* Destination array */
+ uint32_t *array;
+ /* Capacity of array in code points */
+ size_t array_size;
+ /* Caller-owned write position in array; advanced as input arrives */
+ size_t *array_pos;
+};
+
+/* Initialize sink so that its input is copied to array (array_size elements)
+   starting at *array_pos. *array_pos is not modified by this call; input
+   that does not fit fails with "Output overflow". */
+void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
+ uint32_t *array, size_t array_size,
+ size_t *array_pos);
+
/*
* RFC 5051 - Simple Unicode Collation Algorithm
*/