From: Stephan Bosch <stephan.bosch@open-xchange.com>
Date: Sun, 30 Mar 2025 01:02:58 +0000 (+0200)
Subject: lib: unicode-transform - Add generic Unicode string transformation API
X-Git-Tag: 2.4.2~612
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3cc0190ef90056926e748b7fc82e79a8940732d4;p=thirdparty%2Fdovecot%2Fcore.git

lib: unicode-transform - Add generic Unicode string transformation API
---

diff --git a/src/lib/unichar.c b/src/lib/unichar.c
index 209d1b1e9d..0a7405d2cc 100644
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -250,6 +250,70 @@ static void output_add_replacement_char(buffer_t *output)
 	buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN);
 }
 
+/* Decode the UTF-8 input and feed it, one code point at a time, into the
+   transform chain starting at trans; a buffer sink chained after the last
+   transform for the duration of the call writes the result to output. Each
+   run of undecodable bytes is fed as a single UNICODE_REPLACEMENT_CHAR.
+   Returns 0 on success and -1 if the input was invalid or a transform
+   failed (the transform sets *error_r in that case). */
+int uni_utf8_run_transform(const void *_input, size_t size,
+			   struct unicode_transform *trans, buffer_t *output,
+			   const char **error_r)
+{
+	struct unicode_transform *trans_last =
+		unicode_transform_get_last(trans);
+	struct unicode_buffer_sink sink;
+	const unsigned char *input = _input;
+	unichar_t chr;
+	ssize_t sret;
+	bool got_chr = FALSE, bad_cp = FALSE;
+	int ret = 0;
+
+	unicode_buffer_sink_init(&sink, output);
+	unicode_transform_chain(trans_last, &sink.transform);
+
+	while (size > 0 || got_chr) {
+		if (!got_chr) {
+			int bytes = uni_utf8_get_char_n(input, size, &chr);
+			if (bytes <= 0) {
+				/* Invalid input. try the next byte. */
+				ret = -1;
+				input++; size--;
+				if (bad_cp) {
+					/* This run of bad bytes already
+					   produced a replacement. */
+					continue;
+				}
+				chr = UNICODE_REPLACEMENT_CHAR;
+				bad_cp = TRUE;
+			} else {
+				input += bytes;
+				size -= bytes;
+				bad_cp = FALSE;
+			}
+			/* Keep chr pending until the transform accepts it. */
+			got_chr = TRUE;
+		}
+
+		sret = unicode_transform_input(trans, &chr, 1, error_r);
+		if (sret < 0) {
+			trans_last->next = NULL;
+			return -1;
+		}
+		if (sret > 0)
+			got_chr = FALSE;
+	}
+
+	int fret = unicode_transform_flush(trans, error_r);
+	/* The sink lives in this stack frame: unlink it so that the caller's
+	   chain does not keep a dangling pointer after we return. */
+	trans_last->next = NULL;
+	if (fret < 0)
+		i_panic("unicode_transform_flush(): %s", *error_r);
+	i_assert(fret == 1);
+	return ret;
+}
+
 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
 				     buffer_t *output)
 {
diff --git a/src/lib/unichar.h b/src/lib/unichar.h
index e26acaf298..31741a8ba9 100644
--- a/src/lib/unichar.h
+++ b/src/lib/unichar.h
@@ -42,6 +42,8 @@
 #define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
 #define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)
 
+struct unicode_transform;
+
 typedef uint32_t unichar_t;
 ARRAY_DEFINE_TYPE(unichars, unichar_t);
 
@@ -116,6 +118,12 @@ uni_utf8_char_bytes(unsigned char chr)
 /* Return given character in titlecase. */
 unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
 
+/* Run the UTF-8 string through the provided Unicode transform and write the
+   result, again UTF-8, into the buffer. Returns 0 if ok, -1 on bad input/error. */
+int uni_utf8_run_transform(const void *_input, size_t size,
+			   struct unicode_transform *trans, buffer_t *output,
+			   const char **error_r);
+
 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
    output buffer. Returns 0 if ok, -1 if input was invalid. This generates
    output that's compatible with i;unicode-casemap comparator. Invalid input
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c
index cc94c8c359..401e74ec9c 100644
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -1,12 +1,204 @@
 /* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
+#include "unichar.h"
 #include "unicode-data.h"
 #include "unicode-transform.h"
 
 #define HANGUL_FIRST 0xac00
 #define HANGUL_LAST 0xd7a3
 
+/*
+ * Transform
+ */
+
+/* Pass out_len code points to the transform chained after trans and return
+   that transform's input() result. NOTE(review): the "uniform_" prefix looks
+   like a typo for "unicode_"; it is kept because callers use this name. */
+ssize_t uniform_transform_forward(
+	struct unicode_transform *trans, const uint32_t *out,
+	const struct unicode_code_point_data *const *out_data, size_t out_len,
+	const char **error_r)
+{
+	struct unicode_transform *next = trans->next;
+	const struct unicode_transform_buffer fwd = {
+		.cp = out, .cp_data = out_data, .cp_count = out_len,
+	};
+	ssize_t accepted;
+
+	i_assert(next != NULL);
+	i_assert(next->def != NULL);
+	i_assert(next->def->input != NULL);
+	accepted = next->def->input(next, &fwd, error_r);
+	i_assert(accepted >= 0 || *error_r != NULL);
+	i_assert(accepted <= (ssize_t)out_len);
+	return accepted;
+}
+
+/* Feed in_len code points from in into the first transform of the chain.
+   When that transform accepts nothing, every transform except the last in
+   the chain is flushed (finished=FALSE) and the input is retried once; if it
+   is still not accepted, the call stops early. Returns the number of code
+   points consumed, or -1 with *error_r set. *error_r is cleared on entry. */
+ssize_t unicode_transform_input(struct unicode_transform *trans,
+				const uint32_t *in, size_t in_len,
+				const char **error_r)
+{
+	struct unicode_transform_buffer pending;
+	size_t consumed = 0;
+	bool just_flushed = FALSE;
+
+	*error_r = NULL;
+
+	i_zero(&pending);
+	pending.cp = in;
+	pending.cp_count = in_len;
+
+	while (pending.cp_count > 0) {
+		i_assert(trans->def->input != NULL);
+		ssize_t n = trans->def->input(trans, &pending, error_r);
+		if (n < 0) {
+			i_assert(*error_r != NULL);
+			return -1;
+		}
+		if (n > 0) {
+			i_assert((size_t)n <= pending.cp_count);
+			pending.cp += n;
+			pending.cp_count -= n;
+			consumed += n;
+			just_flushed = FALSE;
+			continue;
+		}
+		if (just_flushed) {
+			/* Nothing accepted even right after a flush. */
+			break;
+		}
+
+		/* Flush all transforms but the last, then retry the input. */
+		for (struct unicode_transform *tp = trans; tp->next != NULL;
+		     tp = tp->next) {
+			if (tp->def->flush == NULL)
+				continue;
+			if (tp->def->flush(tp, FALSE, error_r) < 0) {
+				i_assert(*error_r != NULL);
+				return -1;
+			}
+		}
+		just_flushed = TRUE;
+	}
+
+	return consumed;
+}
+
+/* Flush the whole chain starting at trans in repeated front-to-back passes.
+   Only the frontmost unfinished transform is flushed with finished=TRUE; when
+   its flush() returns > 0 (or it has no flush()) it is dropped from further
+   passes. Returns 1 once all transforms are finished, 0 if a pass made no
+   progress at all, or -1 with *error_r set. *error_r is cleared on entry. */
+int unicode_transform_flush(struct unicode_transform *trans,
+			    const char **error_r)
+{
+	*error_r = NULL;
+	while (trans != NULL) {
+		bool progress = FALSE;
+
+		for (struct unicode_transform *tp = trans; tp != NULL;
+		     tp = tp->next) {
+			bool first = (tp == trans);
+			int ret = 1; /* no flush(): nothing left to do */
+
+			if (tp->def->flush != NULL) {
+				ret = tp->def->flush(tp, first, error_r);
+				if (ret < 0) {
+					i_assert(*error_r != NULL);
+					return -1;
+				}
+			}
+			if (ret > 0) {
+				progress = TRUE;
+				if (first)
+					trans = tp->next;
+			}
+		}
+		if (!progress)
+			return 0;
+	}
+	return 1;
+}
+
+/* Buffer Sink */
+
+/* Sink input(): append the received code points to sink->buffer as UTF-8
+   and report all of them as consumed. */
+static ssize_t
+unicode_buffer_sink_input(struct unicode_transform *trans,
+			  const struct unicode_transform_buffer *buf,
+			  const char **error_r ATTR_UNUSED)
+{
+	struct unicode_buffer_sink *self =
+		container_of(trans, struct unicode_buffer_sink, transform);
+	size_t count = buf->cp_count;
+
+	uni_ucs4_to_utf8(buf->cp, count, self->buffer);
+	return count;
+}
+
+/* The sink only implements input(); it has no flush(). */
+static const struct unicode_transform_def unicode_buffer_sink_def = {
+	.input = unicode_buffer_sink_input,
+};
+
+/* Initialize sink as a transform that writes its input into buffer. */
+void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
+			      buffer_t *buffer)
+{
+	i_zero(sink);
+	unicode_transform_init(&sink->transform, &unicode_buffer_sink_def);
+	sink->buffer = buffer;
+}
+
+/* Static Array Sink */
+
+/* Sink input(): copy the code points to the caller's array at *array_pos and
+   advance *array_pos; fails with "Output overflow" if they do not all fit. */
+static ssize_t
+unicode_static_array_sink_input(struct unicode_transform *trans,
+				const struct unicode_transform_buffer *buf,
+				const char **error_r)
+{
+	struct unicode_static_array_sink *sink =
+		container_of(trans, struct unicode_static_array_sink,
+			     transform);
+	size_t pos = *sink->array_pos;
+
+	/* Check the remaining space; pos + cp_count could wrap around. */
+	if (pos > sink->array_size || buf->cp_count > sink->array_size - pos) {
+		*error_r = "Output overflow";
+		return -1;
+	}
+	memcpy(sink->array + pos, buf->cp, buf->cp_count * sizeof(*buf->cp));
+	*sink->array_pos = pos + buf->cp_count;
+	return buf->cp_count;
+}
+
+static const struct unicode_transform_def unicode_static_array_sink_def = {
+	.input = unicode_static_array_sink_input,
+};
+
+/* Initialize sink as a transform that stores its input in array (capacity
+   array_size code points), writing at and advancing *array_pos. */
+void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
+				    uint32_t *array, size_t array_size,
+				    size_t *array_pos)
+{
+	i_zero(sink);
+	unicode_transform_init(&sink->transform,
+			       &unicode_static_array_sink_def);
+	sink->array = array;
+	sink->array_size = array_size;
+	sink->array_pos = array_pos;
+}
+
 /*
  * Hangul syllable (de)composition
  */
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h
index ce407a74de..e6e18d3177 100644
--- a/src/lib/unicode-transform.h
+++ b/src/lib/unicode-transform.h
@@ -1,6 +1,89 @@
 #ifndef UNICODE_NF_H
 #define UNICODE_NF_H
 
+/*
+ * Transform API
+ */
+
+struct unicode_transform;
+
+struct unicode_transform_buffer {
+	const uint32_t *cp; /* code points */
+	const struct unicode_code_point_data *const *cp_data; /* may be NULL */
+	size_t cp_count; /* number of code points in cp */
+};
+
+struct unicode_transform_def {
+	ssize_t (*input)(struct unicode_transform *trans,
+			 const struct unicode_transform_buffer *buf,
+			 const char **error_r); /* required */
+	int (*flush)(struct unicode_transform *trans, bool finished,
+		     const char **error_r); /* optional (may be NULL) */
+};
+
+struct unicode_transform {
+	const struct unicode_transform_def *def; /* callbacks */
+	struct unicode_transform *next; /* next in chain; NULL for the last */
+};
+
+/* Reset trans and bind it to def; it starts out with no next transform. */
+static inline void
+unicode_transform_init(struct unicode_transform *trans,
+		       const struct unicode_transform_def *def)
+{
+	*trans = (struct unicode_transform){ .def = def };
+}
+
+/* Link next directly after trans, which must not have a successor yet. */
+static inline void unicode_transform_chain(struct unicode_transform *trans,
+					   struct unicode_transform *next)
+{
+	i_assert(trans->next == NULL);
+	trans->next = next;
+}
+
+/* Return the final transform of the chain that starts at trans. */
+static inline struct unicode_transform *
+unicode_transform_get_last(struct unicode_transform *trans)
+{
+	for (; trans->next != NULL; trans = trans->next) {}
+	return trans;
+}
+
+ssize_t uniform_transform_forward(
+	struct unicode_transform *trans, const uint32_t *out,
+	const struct unicode_code_point_data *const *out_data, size_t out_len,
+	const char **error_r);
+
+ssize_t unicode_transform_input(struct unicode_transform *trans,
+				const uint32_t *in, size_t in_len,
+				const char **error_r);
+int unicode_transform_flush(struct unicode_transform *trans,
+			    const char **error_r);
+
+/* Buffer Sink */
+
+struct unicode_buffer_sink {
+	struct unicode_transform transform; /* chain link for this sink */
+	buffer_t *buffer; /* output; receives the code points as UTF-8 */
+};
+
+void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
+			      buffer_t *buffer);
+
+/* Static Array Sink */
+
+struct unicode_static_array_sink {
+	struct unicode_transform transform; /* chain link for this sink */
+	uint32_t *array; /* output storage */
+	size_t array_size; /* capacity of array, in code points */
+	size_t *array_pos; /* caller-provided write position in array */
+};
+
+void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
+				    uint32_t *array, size_t array_size,
+				    size_t *array_pos);
+
 /*
  * RFC 5051 - Simple Unicode Collation Algorithm
  */