lib: unicode-transform - Add generic Unicode string transformation API

author Stephan Bosch <stephan.bosch@open-xchange.com>

Sun, 30 Mar 2025 01:02:58 +0000 (03:02 +0200)

committer Stephan Bosch <stephan.bosch@open-xchange.com>

Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
author Stephan Bosch <stephan.bosch@open-xchange.com>
Sun, 30 Mar 2025 01:02:58 +0000 (03:02 +0200)
committer Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
diff --git a/src/lib/unichar.c b/src/lib/unichar.c

index 209d1b1e9d1bba16d8b26354de74d956f72aac0c..0a7405d2cc50ebff1e7c219595897a6b65f39d02 100644 (file)
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -250,6 +250,54 @@ static void output_add_replacement_char(buffer_t *output)
         buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN);
  }
  
+/* Decode the UTF-8 input one code point at a time, push every code point
+   through the transform chain that starts at trans and append the UTF-8
+   encoded result to output. A buffer sink living on this function's stack is
+   chained behind the last transform while the function runs and is unchained
+   again before returning.
+
+   Each run of consecutive undecodable bytes is substituted by a single
+   UNICODE_REPLACEMENT_CHAR.
+
+   Returns 0 when all input decoded cleanly. Returns -1 when the input
+   contained invalid UTF-8 (the output is still produced) or when a transform
+   reported an error through *error_r. A failing final flush is treated as a
+   programming error and panics. */
+int uni_utf8_run_transform(const void *_input, size_t size,
+                          struct unicode_transform *trans, buffer_t *output,
+                          const char **error_r)
+{
+       struct unicode_transform *trans_last =
+               unicode_transform_get_last(trans);
+       struct unicode_buffer_sink sink;
+       const unsigned char *input = _input;
+       unichar_t chr = 0;
+       ssize_t sret;
+       bool got_chr = FALSE, bad_cp = FALSE;
+       int ret = 0;
+
+       unicode_buffer_sink_init(&sink, output);
+       unicode_transform_chain(trans_last, &sink.transform);
+
+       while (size > 0 || got_chr) {
+               if (!got_chr) {
+                       int bytes = uni_utf8_get_char_n(input, size, &chr);
+                       if (bytes <= 0) {
+                               /* Invalid input. try the next byte. */
+                               ret = -1;
+                               input++; size--;
+                               if (bad_cp) {
+                                       /* The replacement for this run of
+                                          bad bytes was already queued. */
+                                       continue;
+                               }
+                               chr = UNICODE_REPLACEMENT_CHAR;
+                               bad_cp = TRUE;
+                       } else {
+                               input += bytes;
+                               size -= bytes;
+                               bad_cp = FALSE;
+                       }
+                       /* chr stays pending until the transform accepts it;
+                          otherwise a refused code point would be dropped. */
+                       got_chr = TRUE;
+               }
+
+               sret = unicode_transform_input(trans, &chr, 1, error_r);
+               if (sret < 0) {
+                       /* Do not leave the chain pointing at our stack. */
+                       trans_last->next = NULL;
+                       return -1;
+               }
+               if (sret > 0)
+                       got_chr = FALSE;
+       }
+
+       int fret = unicode_transform_flush(trans, error_r);
+       if (fret < 0)
+               i_panic("unicode_transform_flush(): %s", *error_r);
+       i_assert(fret == 1);
+       /* The sink goes out of scope now; detach it from the caller's chain. */
+       trans_last->next = NULL;
+       return ret;
+}
+
  int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
                                      buffer_t *output)
  {
diff --git a/src/lib/unichar.h b/src/lib/unichar.h

index e26acaf298f86ca5e2d54c5fff0ac63e8539d9b7..31741a8ba9dc6da7106aa0eb258c1cdaa35645a6 100644 (file)
--- a/src/lib/unichar.h
+++ b/src/lib/unichar.h
@@ -42,6 +42,8 @@
  #define UTF16_VALID_HIGH_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_HIGH_FIRST)
  #define UTF16_VALID_LOW_SURROGATE(chr) (((chr) & 0xfffc00) == UTF16_SURROGATE_LOW_FIRST)
  
+struct unicode_transform;
+
  typedef uint32_t unichar_t;
  ARRAY_DEFINE_TYPE(unichars, unichar_t);
  
@@ -116,6 +118,12 @@ uni_utf8_char_bytes(unsigned char chr)
  /* Return given character in titlecase. */
  unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
  
+/* Run the UTF8 string through the provided Unicode transform and write the
+   result into the buffer again encoded in UTF8. A buffer sink is appended to
+   the end of the transform chain for this purpose. Input that cannot be
+   decoded is substituted using UNICODE_REPLACEMENT_CHAR. Returns 0 if the
+   input was valid UTF8, or -1 if the input contained invalid UTF8 or the
+   transform failed (in which case the transform sets *error_r). */
+int uni_utf8_run_transform(const void *_input, size_t size,
+                          struct unicode_transform *trans, buffer_t *output,
+                          const char **error_r);
+
  /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
     output buffer. Returns 0 if ok, -1 if input was invalid. This generates
     output that's compatible with i;unicode-casemap comparator. Invalid input
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c

index cc94c8c3590e0b7772c9143ba797688c2c231c5c..401e74ec9c891615d40605e6757b1e8c0eaf79b8 100644 (file)
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -1,12 +1,204 @@
  /* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
  
  #include "lib.h"
+#include "unichar.h"
  #include "unicode-data.h"
  #include "unicode-transform.h"
  
  #define HANGUL_FIRST 0xac00
  #define HANGUL_LAST 0xd7a3
  
+/*
+ * Transform
+ */
+
+/* Pass out_len code points (with their accompanying out_data pointers) to
+   the input handler of the transform chained directly after trans. The next
+   transform and its input handler must exist. Returns whatever the handler
+   returned: a count no larger than out_len, or a negative value with
+   *error_r set. */
+ssize_t uniform_transform_forward(
+       struct unicode_transform *trans, const uint32_t *out,
+       const struct unicode_code_point_data *const *out_data, size_t out_len,
+       const char **error_r)
+{
+       struct unicode_transform *next = trans->next;
+       struct unicode_transform_buffer fwd = {
+               .cp = out,
+               .cp_data = out_data,
+               .cp_count = out_len,
+       };
+       ssize_t consumed;
+
+       i_assert(next != NULL);
+       i_assert(next->def != NULL);
+       i_assert(next->def->input != NULL);
+       consumed = next->def->input(next, &fwd, error_r);
+
+       /* A failure must come with an error message, and the handler cannot
+          have consumed more than it was offered. */
+       i_assert(consumed >= 0 || *error_r != NULL);
+       i_assert(consumed <= (ssize_t)out_len);
+       return consumed;
+}
+
+/* Feed in_len code points from in to the first transform of the chain.
+   Whenever the transform accepts nothing, every transform that has a
+   successor is flushed once (non-final) and the input is retried; if it still
+   accepts nothing right after such a flush, the function stops. Returns the
+   number of code points consumed (possibly fewer than in_len), or -1 with
+   *error_r set on failure. *error_r is reset to NULL on entry. */
+ssize_t unicode_transform_input(struct unicode_transform *trans,
+                               const uint32_t *in, size_t in_len,
+                               const char **error_r)
+{
+       struct unicode_transform_buffer pending = {
+               .cp = in,
+               .cp_count = in_len,
+       };
+       struct unicode_transform *tp;
+       size_t consumed = 0;
+       bool just_flushed = FALSE;
+
+       *error_r = NULL;
+
+       while (pending.cp_count > 0) {
+               ssize_t sret;
+
+               i_assert(trans->def->input != NULL);
+               sret = trans->def->input(trans, &pending, error_r);
+               if (sret < 0) {
+                       i_assert(*error_r != NULL);
+                       return -1;
+               }
+               if (sret > 0) {
+                       /* Some input was taken; advance and try for more. */
+                       i_assert((size_t)sret <= pending.cp_count);
+                       pending.cp += sret;
+                       pending.cp_count -= sret;
+                       consumed += sret;
+                       just_flushed = FALSE;
+                       continue;
+               }
+               /* Nothing accepted even directly after a flush: give up. */
+               if (just_flushed)
+                       break;
+
+               /* Make room downstream. The last element of the chain (the
+                  one without a successor) is not flushed here. */
+               for (tp = trans; tp->next != NULL; tp = tp->next) {
+                       if (tp->def->flush == NULL)
+                               continue;
+                       if (tp->def->flush(tp, FALSE, error_r) < 0) {
+                               i_assert(*error_r != NULL);
+                               return -1;
+                       }
+               }
+               just_flushed = TRUE;
+       }
+
+       return consumed;
+}
+
+/* Flush the whole transform chain starting at trans. The chain is walked
+   repeatedly; on each pass every remaining transform gets its flush handler
+   called, with finished=TRUE only for the current head of the remaining
+   chain. Once the head reports completion (flush result > 0) or has no flush
+   handler, the head moves on to the next transform. Returns 1 when the head
+   has moved past the end of the chain, 0 when a full pass made no progress,
+   and -1 with *error_r set when a flush handler fails. *error_r is reset to
+   NULL on entry. */
+int unicode_transform_flush(struct unicode_transform *trans,
+                           const char **error_r)
+{
+       int ret;
+
+       *error_r = NULL;
+
+       while (trans != NULL) {
+               struct unicode_transform *tp = trans;
+               bool progress = FALSE;
+
+               while (tp != NULL) {
+                       if (tp->def->flush == NULL) {
+                               /* Nothing to flush for this transform.
+                                  NOTE(review): this marks progress on every
+                                  pass even when tp is not the head, so the
+                                  no-progress exit below cannot trigger while
+                                  such a transform is in the chain - confirm
+                                  this is intended. */
+                               progress = TRUE;
+                               if (tp == trans)
+                                       trans = trans->next;
+                       } else {
+                               ret = tp->def->flush(tp, (tp == trans), error_r);
+                               if (ret < 0) {
+                                       i_assert(*error_r != NULL);
+                                       return -1;
+                               }
+                               if (ret > 0) {
+                                       progress = TRUE;
+                                       /* Head is done; the next transform
+                                          becomes the head and is flushed as
+                                          finished in this same pass. */
+                                       if (tp == trans)
+                                               trans = trans->next;
+                               }
+                       }
+                       tp = tp->next;
+               }
+               if (!progress)
+                       return 0;
+       }
+       return 1;
+}
+
+/* Buffer Sink */
+
+static ssize_t
+unicode_buffer_sink_input(struct unicode_transform *trans,
+                         const struct unicode_transform_buffer *buf,
+                         const char **error_r);
+
+/* Transform definition for the buffer sink. Only an input handler is
+   provided; there is no flush handler. */
+static const struct unicode_transform_def unicode_buffer_sink_def = {
+       .input = unicode_buffer_sink_input,
+};
+
+/* Prepare sink as a chain element that writes everything it receives into
+   buffer. Any previous contents of *sink are cleared. */
+void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
+                             buffer_t *buffer)
+{
+       i_zero(sink);
+       sink->buffer = buffer;
+       unicode_transform_init(&sink->transform, &unicode_buffer_sink_def);
+}
+
+/* Input handler of the buffer sink: converts all offered code points with
+   uni_ucs4_to_utf8() into the sink's buffer and reports them all as
+   consumed. Never sets *error_r. */
+static ssize_t
+unicode_buffer_sink_input(struct unicode_transform *trans,
+                         const struct unicode_transform_buffer *buf,
+                         const char **error_r ATTR_UNUSED)
+{
+       struct unicode_buffer_sink *sink =
+               container_of(trans, struct unicode_buffer_sink, transform);
+       size_t count = buf->cp_count;
+
+       uni_ucs4_to_utf8(buf->cp, count, sink->buffer);
+       return count;
+}
+
+/* Static Array Sink */
+
+static ssize_t
+unicode_static_array_sink_input(struct unicode_transform *trans,
+                               const struct unicode_transform_buffer *buf,
+                               const char **error_r);
+
+/* Transform definition for the static array sink. Only an input handler is
+   provided; there is no flush handler. */
+static const struct unicode_transform_def unicode_static_array_sink_def = {
+       .input = unicode_static_array_sink_input,
+};
+
+/* Prepare sink as a chain element that stores received code points into the
+   caller-provided array of array_size elements. *array_pos is the caller's
+   write position; this function does not modify it. */
+void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
+                                   uint32_t *array, size_t array_size,
+                                   size_t *array_pos)
+{
+       i_zero(sink);
+       sink->array_pos = array_pos;
+       sink->array_size = array_size;
+       sink->array = array;
+       unicode_transform_init(&sink->transform,
+                              &unicode_static_array_sink_def);
+}
+
+/* Input handler of the static array sink: copies all offered code points to
+   the array at *array_pos and advances *array_pos. The input is taken
+   all-or-nothing: if it does not fit in the space left, nothing is copied,
+   *error_r is set to "Output overflow" and -1 is returned. Otherwise returns
+   the number of code points stored. */
+static ssize_t
+unicode_static_array_sink_input(struct unicode_transform *trans,
+                               const struct unicode_transform_buffer *buf,
+                               const char **error_r)
+{
+       struct unicode_static_array_sink *sink =
+               container_of(trans, struct unicode_static_array_sink,
+                            transform);
+       size_t pos = *sink->array_pos;
+
+       /* Compare against the remaining space rather than computing
+          pos + cp_count, which could wrap around for huge counts. */
+       if (pos > sink->array_size ||
+           buf->cp_count > sink->array_size - pos) {
+               *error_r = "Output overflow";
+               return -1;
+       }
+       /* Skip memcpy() for empty input; buf->cp may not be a valid pointer
+          then. */
+       if (buf->cp_count > 0) {
+               memcpy(sink->array + pos, buf->cp,
+                      buf->cp_count * sizeof(*buf->cp));
+               *sink->array_pos = pos + buf->cp_count;
+       }
+       return buf->cp_count;
+}
+
  /*
   * Hangul syllable (de)composition
   */
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h

index ce407a74de6119e3dabea12b2fb06dafe2f4d68b..e6e18d3177f8ce67951a30b1e176fb53a4a0d49d 100644 (file)
--- a/src/lib/unicode-transform.h
+++ b/src/lib/unicode-transform.h
@@ -1,6 +1,89 @@
  #ifndef UNICODE_NF_H
  #define UNICODE_NF_H
  
+/*
+ * Transform API
+ */
+
+struct unicode_transform;
+
+/* A run of code points handed from one transform to the next. */
+struct unicode_transform_buffer {
+       /* The code points themselves */
+       const uint32_t *cp;
+       /* Code point data pointers accompanying cp; left NULL by
+          unicode_transform_input(). Presumably one entry per code point -
+          TODO confirm. */
+       const struct unicode_code_point_data *const *cp_data;
+       /* Number of code points available at cp */
+       size_t cp_count;
+};
+
+/* Handlers that implement a particular kind of transform. */
+struct unicode_transform_def {
+       /* Offer the code points in buf to the transform. Returns the number
+          of code points consumed (may be 0), or a negative value with
+          *error_r set on failure. */
+       ssize_t (*input)(struct unicode_transform *trans,
+                        const struct unicode_transform_buffer *buf,
+                        const char **error_r);
+       /* Optional (may be NULL). Called by unicode_transform_input() with
+          finished=FALSE and by unicode_transform_flush() with finished=TRUE
+          once the transform is the head of the remaining chain. A result
+          > 0 is treated as flush completed; < 0 is failure with *error_r
+          set. */
+       int (*flush)(struct unicode_transform *trans, bool finished,
+                    const char **error_r);
+};
+
+/* One element of a singly-linked transform chain. */
+struct unicode_transform {
+       /* Handlers implementing this transform */
+       const struct unicode_transform_def *def;
+       /* Next transform in the chain, or NULL for the last one */
+       struct unicode_transform *next;
+};
+
+/* Clear *trans and bind it to the given definition. The transform starts out
+   unchained (next == NULL). */
+static inline void
+unicode_transform_init(struct unicode_transform *trans,
+                      const struct unicode_transform_def *def)
+{
+       i_zero(trans);
+       trans->def = def;
+}
+
+/* Link next directly behind trans. trans must not already have a successor. */
+static inline void
+unicode_transform_chain(struct unicode_transform *trans,
+                       struct unicode_transform *next)
+{
+       i_assert(trans->next == NULL);
+       trans->next = next;
+}
+
+/* Follow the chain from trans and return its final element, i.e. the one
+   without a successor. trans itself is returned when it is unchained. */
+static inline struct unicode_transform *
+unicode_transform_get_last(struct unicode_transform *trans)
+{
+       struct unicode_transform *last;
+
+       for (last = trans; last->next != NULL; last = last->next)
+               ;
+       return last;
+}
+
+/* Pass out_len code points (and the accompanying out_data) to the input
+   handler of the transform chained after trans, which must exist. Returns the
+   handler's result: a count of at most out_len, or a negative value with
+   *error_r set. */
+ssize_t uniform_transform_forward(
+       struct unicode_transform *trans, const uint32_t *out,
+       const struct unicode_code_point_data *const *out_data, size_t out_len,
+       const char **error_r);
+
+/* Feed in_len code points to the first transform of the chain. Returns the
+   number of code points consumed, which may be less than in_len (including
+   0), or -1 with *error_r set on failure. */
+ssize_t unicode_transform_input(struct unicode_transform *trans,
+                               const uint32_t *in, size_t in_len,
+                               const char **error_r);
+/* Flush the whole chain starting at trans. Returns 1 when every transform
+   was flushed, 0 when no progress could be made, or -1 with *error_r set on
+   failure. */
+int unicode_transform_flush(struct unicode_transform *trans,
+                           const char **error_r);
+
+/* Buffer Sink */
+
+/* Chain element that writes the code points it receives to a buffer as
+   UTF-8. */
+struct unicode_buffer_sink {
+       /* Embedded chain element; this is what gets chained */
+       struct unicode_transform transform;
+       /* Destination buffer */
+       buffer_t *buffer;
+};
+
+/* Initialize sink so that it writes into buffer. */
+void unicode_buffer_sink_init(struct unicode_buffer_sink *sink,
+                             buffer_t *buffer);
+
+/* Static Array Sink */
+
+/* Chain element that stores the code points it receives in a fixed-size,
+   caller-provided array. */
+struct unicode_static_array_sink {
+       /* Embedded chain element; this is what gets chained */
+       struct unicode_transform transform;
+       /* Destination array */
+       uint32_t *array;
+       /* Capacity of array, in code points */
+       size_t array_size;
+       /* Caller-owned write position in array; advanced as code points are
+          stored */
+       size_t *array_pos;
+};
+
+/* Initialize sink so that it stores code points in array (capacity
+   array_size) at the position tracked by *array_pos. Input that does not fit
+   makes the sink fail with "Output overflow". */
+void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
+                                   uint32_t *array, size_t array_size,
+                                   size_t *array_pos);
+
  /*
   * RFC 5051 - Simple Unicode Collation Algorithm
   */
author	Stephan Bosch <stephan.bosch@open-xchange.com>
	Sun, 30 Mar 2025 01:02:58 +0000 (03:02 +0200)
committer	Stephan Bosch <stephan.bosch@open-xchange.com>
	Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/unichar.c		patch \| blob \| blame \| history
src/lib/unichar.h		patch \| blob \| blame \| history
src/lib/unicode-transform.c		patch \| blob \| blame \| history
src/lib/unicode-transform.h		patch \| blob \| blame \| history