lib: unicode-transform - Implement streaming Unicode Normalization

author Stephan Bosch <stephan.bosch@open-xchange.com>

Wed, 27 Nov 2024 00:36:43 +0000 (01:36 +0100)

committer Stephan Bosch <stephan.bosch@open-xchange.com>

Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
author Stephan Bosch <stephan.bosch@open-xchange.com>
Wed, 27 Nov 2024 00:36:43 +0000 (01:36 +0100)
committer Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
diff --git a/src/lib/unichar.c b/src/lib/unichar.c

index 0a7405d2cc50ebff1e7c219595897a6b65f39d02..a8a60e92da27c260e67991fd725baa8af24e7541 100644 (file)
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -2,6 +2,7 @@
  
  #include "lib.h"
  #include "array.h"
+#include "str.h"
  #include "bsearch-insert-pos.h"
  #include "unicode-data.h"
  #include "unicode-transform.h"
@@ -298,6 +299,125 @@ int uni_utf8_run_transform(const void *_input, size_t size,
         return ret;
  }
  
+static inline int
+uni_utf8_write_nf_common(const void *_input, size_t size,
+                        enum unicode_nf_type nf_type, buffer_t *output)
+{
+       static struct unicode_nf_context ctx;
+       const char *error;
+
+       unicode_nf_init(&ctx, nf_type);
+
+       return uni_utf8_run_transform(_input, size, &ctx.transform, output,
+                                     &error);
+}
+
+int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output)
+{
+       return uni_utf8_write_nf_common(input, size, UNICODE_NFD, output);
+}
+
+int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output)
+{
+       return uni_utf8_write_nf_common(input, size, UNICODE_NFKD, output);
+}
+
+int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output)
+{
+       return uni_utf8_write_nf_common(input, size, UNICODE_NFC, output);
+}
+
+int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output)
+{
+       return uni_utf8_write_nf_common(input, size, UNICODE_NFKC, output);
+}
+
+int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r)
+{
+       buffer_t *output = t_buffer_create(size);
+
+       if (uni_utf8_write_nf_common(input, size, UNICODE_NFD, output) < 0)
+               return -1;
+       *output_r = str_c(output);
+       return 0;
+}
+
+int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r)
+{
+       buffer_t *output = t_buffer_create(size);
+
+       if (uni_utf8_write_nf_common(input, size, UNICODE_NFKD, output) < 0)
+               return -1;
+       *output_r = str_c(output);
+       return 0;
+}
+
+int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r)
+{
+       buffer_t *output = t_buffer_create(size);
+
+       if (uni_utf8_write_nf_common(input, size, UNICODE_NFC, output) < 0)
+               return -1;
+       *output_r = str_c(output);
+       return 0;
+}
+
+int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r)
+{
+       buffer_t *output = t_buffer_create(size);
+
+       if (uni_utf8_write_nf_common(input, size, UNICODE_NFKC, output) < 0)
+               return -1;
+       *output_r = str_c(output);
+       return 0;
+}
+
+static int
+uni_utf8_is_nf(const void *_input, size_t size, enum unicode_nf_type type)
+{
+       static struct unicode_nf_checker unc;
+       const unsigned char *input = _input;
+       unichar_t chr;
+       int ret;
+
+       unicode_nf_checker_init(&unc, type);
+
+       while (size > 0) {
+               const struct unicode_code_point_data *cp_data = NULL;
+               int bytes = uni_utf8_get_char_n(input, size, &chr);
+               if (bytes <= 0)
+                       return -1;
+               input += bytes;
+               size -= bytes;
+
+               ret = unicode_nf_checker_input(&unc, chr, &cp_data);
+               if (ret <= 0)
+                       return ret;
+       }
+
+       return unicode_nf_checker_finish(&unc);
+}
+
+int uni_utf8_is_nfd(const void *input, size_t size)
+{
+       return uni_utf8_is_nf(input, size, UNICODE_NFD);
+}
+
+int uni_utf8_is_nfkd(const void *input, size_t size)
+{
+       return uni_utf8_is_nf(input, size, UNICODE_NFKD);
+}
+
+int uni_utf8_is_nfc(const void *input, size_t size)
+{
+       return uni_utf8_is_nf(input, size, UNICODE_NFC);
+}
+
+int uni_utf8_is_nfkc(const void *input, size_t size)
+{
+       return uni_utf8_is_nf(input, size, UNICODE_NFKC);
+}
+
  int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
                                      buffer_t *output)
  {
diff --git a/src/lib/unichar.h b/src/lib/unichar.h

index 31741a8ba9dc6da7106aa0eb258c1cdaa35645a6..1e1ef09eceeb33e2f4311174924d2eb07c094e1f 100644 (file)
--- a/src/lib/unichar.h
+++ b/src/lib/unichar.h
@@ -124,6 +124,39 @@ int uni_utf8_run_transform(const void *_input, size_t size,
                            struct unicode_transform *trans, buffer_t *output,
                            const char **error_r);
  
+/* Normalize the UTF-8 input in Unicode NFD, NFKD, NFC or NFKC form and write
+   the result to the output buffer.
+
+   Refer to Unicode Standard Annex #15, Section 1.2 for more information. An
+   excerpt can be found in unicode-nf.h.
+
+   NOTE: Do not blindly use this function to write and append several values
+   together expecting the result to be NF* normalized as well. This function
+   does not check whether concatenation preserves the desired normalization nor
+   does it endeavour to achieve this result. Blind concatination works only in
+   very specific cases, so make sure you know what you are doing.
+ */
+int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output);
+int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output);
+int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output);
+int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output);
+
+/* Same as the write variants, but return the normalized input in the
+   output_r argument as a C string.
+ */
+int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r);
+int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r);
+int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r);
+int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r);
+
+/* Check whether the input is normalized in the indicated form. Returns -1 if
+   the input is not even valid UTF8 or contains invalid code points. Returns 1
+   if the input adheres to the requested normalization form and 0 otherwise. */
+int uni_utf8_is_nfd(const void *input, size_t size);
+int uni_utf8_is_nfkd(const void *input, size_t size);
+int uni_utf8_is_nfc(const void *input, size_t size);
+int uni_utf8_is_nfkc(const void *input, size_t size);
+
  /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
     output buffer. Returns 0 if ok, -1 if input was invalid. This generates
     output that's compatible with i;unicode-casemap comparator. Invalid input
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c

index 401e74ec9c891615d40605e6757b1e8c0eaf79b8..8274f1ffd5f46ddc248e3cdaf31a25596eaf518a 100644 (file)
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -35,9 +35,9 @@ ssize_t uniform_transform_forward(
         return sret;
  }
  
-ssize_t unicode_transform_input(struct unicode_transform *trans,
-                               const uint32_t *in, size_t in_len,
-                               const char **error_r)
+ssize_t unicode_transform_input_buf(struct unicode_transform *trans,
+                                   const struct unicode_transform_buffer *buf,
+                                   const char **error_r)
  {
         struct unicode_transform_buffer in_buf;
         size_t input_total = 0;
@@ -47,9 +47,7 @@ ssize_t unicode_transform_input(struct unicode_transform *trans,
  
         *error_r = NULL;
  
-       i_zero(&in_buf);
-       in_buf.cp = in;
-       in_buf.cp_count = in_len;
+       in_buf = *buf;
  
         while (in_buf.cp_count > 0) {
                 if (in_buf.cp_count > 0) {
@@ -207,10 +205,15 @@ static const uint16_t uni_hangul_s_base = 0xac00;
  static const uint16_t uni_hangul_l_base = 0x1100;
  static const uint16_t uni_hangul_v_base = 0x1161;
  static const uint16_t uni_hangul_t_base = 0x11a7;
+static const unsigned int uni_hangul_l_count = 19;
  static const unsigned int uni_hangul_v_count = 21;
  static const unsigned int uni_hangul_t_count = 28;
  static const unsigned int uni_hangul_n_count =
         uni_hangul_v_count * uni_hangul_t_count;
+static const uint16_t uni_hangul_l_end = uni_hangul_l_base + uni_hangul_l_count;
+static const uint16_t uni_hangul_v_end = uni_hangul_v_base + uni_hangul_v_count;
+static const uint16_t uni_hangul_t_end = uni_hangul_t_base + uni_hangul_t_count;
+static const uint16_t uni_hangul_s_end = 0xD7A4;
  
  static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
  {
@@ -240,6 +243,661 @@ static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
         return 3;
  }
  
+static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r)
+{
+       /* The Unicode Standard, Section 3.12.3:
+          Hangul Syllable Composition
+        */
+
+       /* <LPart, VPart> */
+       if (l >= uni_hangul_l_base && l < uni_hangul_l_end &&
+           r >= uni_hangul_v_base && r < uni_hangul_v_end) {
+               uint32_t l_part = l, v_part = r;
+
+               unsigned int l_index = l_part - uni_hangul_l_base;
+               unsigned int v_index = v_part - uni_hangul_v_base;
+               unsigned int lv_index = l_index * uni_hangul_n_count +
+                                       v_index * uni_hangul_t_count;
+               return uni_hangul_s_base + lv_index;
+       }
+       /* A sequence <LVPart, TPart> */
+       if (l >= uni_hangul_s_base && l < uni_hangul_s_end &&
+           r >= (uni_hangul_t_base + 1) && r < uni_hangul_t_end &&
+           ((l - uni_hangul_s_base) % uni_hangul_t_count) == 0) {
+               uint32_t lv_part = l, t_part = r;
+
+               unsigned int t_index = t_part - uni_hangul_t_base;
+               return lv_part + t_index;
+       }
+       return 0x0000;
+}
+
+/*
+ * Normalization transform: NFD, NFKD, NFC, NFKC
+ */
+
+static ssize_t
+unicode_nf_input(struct unicode_transform *trans,
+                const struct unicode_transform_buffer *buf,
+                const char **error_r);
+static int
+unicode_nf_flush(struct unicode_transform *trans, bool finished,
+                const char **error_r);
+
+static const struct unicode_transform_def unicode_nf_def = {
+       .input = unicode_nf_input,
+       .flush = unicode_nf_flush,
+};
+
+void unicode_nf_init(struct unicode_nf_context *ctx_r,
+                    enum unicode_nf_type type)
+{
+       i_zero(ctx_r);
+       unicode_transform_init(&ctx_r->transform, &unicode_nf_def);
+
+       switch (type) {
+       case UNICODE_NFD:
+               ctx_r->canonical = TRUE;
+               ctx_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
+               break;
+       case UNICODE_NFKD:
+               ctx_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
+               break;
+       case UNICODE_NFC:
+               ctx_r->compose = TRUE;
+               ctx_r->canonical = TRUE;
+               ctx_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
+               break;
+       case UNICODE_NFKC:
+               ctx_r->compose = TRUE;
+               ctx_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
+               break;
+       }
+}
+
+void unicode_nf_reset(struct unicode_nf_context *ctx)
+{
+       enum unicode_nf_type type =
+               (ctx->compose ? (ctx->canonical ? UNICODE_NFC : UNICODE_NFKC) :
+                               (ctx->canonical ? UNICODE_NFD : UNICODE_NFKD));
+       struct unicode_transform *next = ctx->transform.next;
+
+       unicode_nf_init(ctx, type);
+       unicode_transform_chain(&ctx->transform, next);
+}
+
+static void
+unicode_nf_buffer_delete(struct unicode_nf_context *ctx, size_t offset,
+                        size_t count)
+{
+       if (count == 0)
+               return;
+
+       i_assert(offset < ctx->buffer_len);
+       i_assert(count <= ctx->buffer_len);
+       i_assert(offset <= (ctx->buffer_len - count));
+
+       if (count == ctx->buffer_len) {
+               ctx->buffer_len = 0;
+               return;
+       }
+
+       size_t trailer = ctx->buffer_len - (offset + count);
+       if (trailer > 0) {
+               memmove(&ctx->cp_buffer[offset],
+                       &ctx->cp_buffer[offset + count],
+                       trailer * sizeof(ctx->cp_buffer[0]));
+               memmove(&ctx->cpd_buffer[offset],
+                       &ctx->cpd_buffer[offset + count],
+                       trailer * sizeof(ctx->cpd_buffer[0]));
+       }
+       ctx->buffer_len -= count;
+}
+
+static void
+unicode_nf_buffer_swap(struct unicode_nf_context *ctx,
+                      size_t idx1, size_t idx2)
+{
+       uint32_t tmp_cp = ctx->cp_buffer[idx2];
+       const struct unicode_code_point_data *tmp_cpd = ctx->cpd_buffer[idx2];
+
+       ctx->cp_buffer[idx2] = ctx->cp_buffer[idx1];
+       ctx->cpd_buffer[idx2] = ctx->cpd_buffer[idx1];
+       ctx->cp_buffer[idx1] = tmp_cp;
+       ctx->cpd_buffer[idx1] = tmp_cpd;
+}
+
+static void
+unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp,
+             const struct unicode_code_point_data *cpd)
+{
+       static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE;
+       uint8_t nf_qc_mask = ctx->nf_qc_mask;
+       size_t i;
+
+       /*
+        * Decompose the code point
+        */
+
+       const uint32_t *decomp, *decomp_k;
+       uint32_t decomp_hangul[3];
+       size_t len, len_k;
+
+       if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
+               len = len_k = unicode_hangul_decompose(cp, decomp_hangul);
+               decomp = decomp_k = decomp_hangul;
+       } else {
+               if (cpd == NULL)
+                       cpd = unicode_code_point_get_data(cp);
+               len = unicode_code_point_data_get_full_decomposition(
+                       cpd, ctx->canonical, &decomp);
+               if (len == 0) {
+                       decomp = &cp;
+                       len = 1;
+               }
+               len_k = len;
+               decomp_k = decomp;
+               if (ctx->canonical) {
+                       len_k = unicode_code_point_data_get_full_decomposition(
+                               cpd, ctx->canonical, &decomp_k);
+                       if (len_k == 0) {
+                               decomp_k = decomp;
+                               len_k = len;
+                       }
+               }
+               if (len > 0)
+                       cpd = NULL;
+       }
+
+       i_assert(len <= UNICODE_DECOMPOSITION_MAX_LENGTH);
+       i_assert(len_k <= UNICODE_DECOMPOSITION_MAX_LENGTH);
+
+       if ((ctx->buffer_len + len) > buffer_size) {
+               /* Decomposition overflows the buffer. Record and mark it as
+                  pending and come back to it once the buffer is sufficiently
+                  drained. */
+               i_assert(ctx->pending_decomp == 0);
+               ctx->pending_decomp = len;
+               ctx->pending_cp = cp;
+               ctx->pending_cpd = cpd;
+               return;
+       }
+
+       /* UAX15-D4: Stream-Safe Text Process is the process of producing a
+          Unicode string in Stream-Safe Text Format by processing that string
+          from start to finish, inserting U+034F COMBINING GRAPHEME JOINER
+          (CGJ) within long sequences of non-starters. The exact position o
+          the inserted CGJs are determined according to the following
+          algorithm, which describes the generation of an output string from an
+          input string:
+
+          1. If the input string is empty, return an empty output string.
+          2. Set nonStarterCount to zero.
+          3. For each code point C in the input string:
+               a. Produce the NFKD decomposition S.
+               b. If nonStarterCount plus the number of initial non-starters in
+                  S is greater than 30, append a CGJ to the output string and
+                  set the nonStarterCount to zero.
+               c. Append C to the output string.
+               d. If there are no starters in S, increment nonStarterCount by
+                  the number of code points in S; otherwise, set
+                  nonStarterCount to the number of trailing non-starters in S
+                  (which may be zero).
+          4. Return the output string.
+        */
+
+       /* Determine number of leading and trailing non-starters in full NFKD
+          decomposition. */
+       const struct unicode_code_point_data *
+               decomp_cpd[UNICODE_DECOMPOSITION_MAX_LENGTH];
+       size_t ns_lead = 0, ns_trail = 0;
+       bool seen_starter = FALSE;
+       for (i = 0; i < len_k; i++) {
+               if (cpd == NULL)
+                       cpd = unicode_code_point_get_data(decomp[i]);
+
+               uint8_t ccc = cpd->canonical_combining_class;
+
+               if (decomp == decomp_k) {
+                       decomp_cpd[i] = cpd;
+                       cpd = NULL;
+               }
+
+               if (ccc == 0)
+                       seen_starter = TRUE;
+               else if (!seen_starter)
+                       ns_lead++;
+               else
+                       ns_trail++;
+       }
+
+       /* Lookup canonical decomposed code points if necessary (avoid double
+          lookups). */
+       if (decomp != decomp_k) {
+               for (i = 0; i < len; i++) {
+                       if (cpd == NULL)
+                               cpd = unicode_code_point_get_data(decomp[i]);
+                       decomp_cpd[i] = cpd;
+                       cpd = NULL;
+               }
+       }
+
+       ctx->nonstarter_count += ns_lead;
+       if (ctx->nonstarter_count > 30) {
+               ctx->nonstarter_count = ns_trail;
+
+               /* Write U+034F COMBINING GRAPHEME JOINER (CGJ)
+                */
+               ctx->cp_buffer[ctx->buffer_len] = 0x034F;
+               ctx->cpd_buffer[ctx->buffer_len] =
+                       unicode_code_point_get_data(0x034F);
+               ctx->buffer_len++;
+       }
+
+       /*
+        * Buffer the requested decomposition for COA sorting
+        */
+
+       i_assert(ctx->buffer_len <= buffer_size);
+       if ((ctx->buffer_len + len) > buffer_size) {
+               /* Decomposition now overflows the buffer. Record and mark it as
+                  pending and come back to it once the buffer is sufficiently
+                  drained. */
+               i_assert(ctx->pending_decomp == 0);
+               ctx->pending_decomp = len;
+               ctx->pending_cp = cp;
+               ctx->pending_cpd = cpd;
+       } else {
+               for (i = 0; i < len; i++) {
+                       ctx->cp_buffer[ctx->buffer_len] = decomp[i];
+                       ctx->cpd_buffer[ctx->buffer_len] = decomp_cpd[i];
+                       ctx->buffer_len++;
+               }
+               i_assert(ctx->buffer_len <= buffer_size);
+       }
+
+       /*
+        * Apply the Canonical Ordering Algorithm (COA)
+        */
+
+       bool changed = TRUE;
+       size_t last_qc_y;
+       size_t last_starter;
+
+       while (changed) {
+               changed = FALSE;
+               last_qc_y = 0;
+               last_starter = 0;
+
+               for (i = I_MAX(1, ctx->buffer_output_max);
+                    i < ctx->buffer_len; i++) {
+                       const struct unicode_code_point_data
+                               *cpd_i = ctx->cpd_buffer[i],
+                               *cpd_im1 = ctx->cpd_buffer[i - 1];
+                       uint8_t ccc_i = cpd_i->canonical_combining_class;
+                       uint8_t ccc_im1 = cpd_im1->canonical_combining_class;
+                       bool nqc = ((cpd_i->nf_quick_check & nf_qc_mask) == 0);
+
+                       if (ccc_i == 0) {
+                               last_starter = i;
+                               if (nqc)
+                                       last_qc_y = i;
+                       } else if (ccc_im1 > ccc_i) {
+                               unicode_nf_buffer_swap(ctx, i - 1, i);
+                               changed = TRUE;
+                       }
+               }
+       }
+       ctx->buffer_output_max = I_MIN(last_qc_y, last_starter);
+}
+
+static bool
+unicode_nf_input_cp(struct unicode_nf_context *ctx, uint32_t cp,
+                   const struct unicode_code_point_data *cpd)
+{
+       static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE;
+
+       i_assert(ctx->buffer_len <= buffer_size);
+       if (ctx->buffer_len == buffer_size ||
+           (ctx->pending_decomp > 0 &&
+            ctx->buffer_len > (buffer_size - ctx->pending_decomp))) {
+               /* Buffer is (still too) full. */
+               return FALSE;
+       }
+
+       if (ctx->pending_decomp > 0) {
+               /* Earlier, the buffer was too full for the next decomposition
+                  and it was recorded and marked as pending. Now, we have the
+                  opportunity to continue. */
+               unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd);
+               ctx->pending_decomp = 0;
+
+               i_assert(ctx->buffer_len <= buffer_size);
+               if (ctx->buffer_output_max > 0 &&
+                   ctx->buffer_len == buffer_size) {
+                       /* Pending decomposition filled the buffer completely.
+                        */
+                       return FALSE;
+               }
+       }
+
+       /* Normal input of next code point */
+       unicode_nf_cp(ctx, cp, cpd);
+       return TRUE;
+}
+
+static ssize_t
+unicode_nf_input(struct unicode_transform *trans,
+                const struct unicode_transform_buffer *buf,
+                const char **error_r ATTR_UNUSED)
+{
+       struct unicode_nf_context *ctx =
+               container_of(trans, struct unicode_nf_context, transform);
+       size_t n;
+
+       for (n = 0; n < buf->cp_count; n++) {
+               if (!unicode_nf_input_cp(ctx, buf->cp[n],
+                                        (buf->cp_data == NULL ?
+                                         NULL : buf->cp_data[n])))
+                       break;
+       }
+       return n;
+}
+
+static uint32_t
+unicode_nf_compose_pair(uint32_t l, uint32_t r,
+                       const struct unicode_code_point_data **l_data)
+{
+       uint32_t comp = unicode_hangul_compose_pair(l, r);
+
+       if (comp > 0x0000)
+               return comp;
+
+       if (*l_data == NULL)
+               *l_data = unicode_code_point_get_data(l);
+       return unicode_code_point_data_find_composition(*l_data, r);
+}
+
+static int
+unicode_nf_flush_more(struct unicode_nf_context *ctx, bool finished,
+                     const char **error_r)
+{
+       struct unicode_transform *trans = &ctx->transform;
+
+       ctx->finished = finished;
+
+       if (ctx->buffer_len == 0)
+               return 1;
+       if (!finished && ctx->buffer_output_max == 0)
+               return 0;
+
+       /*
+        * Apply the Canonical Composition Algorithm
+        */
+
+       if (ctx->finished)
+               ctx->buffer_output_max = ctx->buffer_len;
+       i_assert(ctx->buffer_processed <= ctx->buffer_output_max);
+       if (ctx->compose && ctx->buffer_len > 1) {
+               size_t in_pos, out_pos, starter;
+               int last_ccc;
+
+               out_pos = 1;
+               last_ccc = -1;
+               starter = 0;
+               for (in_pos = I_MAX(1, ctx->buffer_processed);
+                    in_pos < ctx->buffer_output_max; in_pos++) {
+                       uint32_t cp = ctx->cp_buffer[in_pos];
+                       const struct unicode_code_point_data *cpd =
+                               ctx->cpd_buffer[in_pos];
+
+                       if (cpd == NULL) {
+                               ctx->cpd_buffer[in_pos] = cpd =
+                                       unicode_code_point_get_data(cp);
+                       }
+
+                       uint8_t ccc = cpd->canonical_combining_class;
+                       uint32_t comp = 0x0000;
+                       if (last_ccc < (int)ccc) {
+                               comp = unicode_nf_compose_pair(
+                                       ctx->cp_buffer[starter], cp,
+                                       &ctx->cpd_buffer[starter]);
+                       }
+                       if (comp > 0x0000) {
+                               ctx->cp_buffer[starter] = comp;
+                               ctx->cpd_buffer[starter] = NULL;
+                       } else if (ccc == 0) {
+                               starter = out_pos;
+                               last_ccc = -1;
+                               ctx->cp_buffer[out_pos] = cp;
+                               ctx->cpd_buffer[out_pos] = cpd;
+                               out_pos++;
+                       } else {
+                               last_ccc = ccc;
+                               ctx->cp_buffer[out_pos] = cp;
+                               ctx->cpd_buffer[out_pos] = cpd;
+                               out_pos++;
+                       }
+               }
+               if (finished) {
+                       ctx->buffer_len = ctx->buffer_output_max = out_pos;
+               } else if (in_pos > out_pos) {
+                       unicode_nf_buffer_delete(ctx, out_pos,
+                                                (in_pos - out_pos));
+                       ctx->buffer_output_max = out_pos;
+               }
+       }
+       ctx->buffer_processed = ctx->buffer_output_max;
+
+       /*
+        * Forward output
+        */
+
+       size_t output_len = ctx->buffer_processed;
+       ssize_t sret;
+
+       sret = uniform_transform_forward(trans, ctx->cp_buffer, ctx->cpd_buffer,
+                                        output_len, error_r);
+       if (sret < 0)
+               return -1;
+
+       i_assert((size_t)sret <= ctx->buffer_processed);
+       unicode_nf_buffer_delete(ctx, 0, sret);
+       ctx->buffer_processed -= sret;
+       ctx->buffer_output_max -= sret;
+       if ((size_t)sret < output_len)
+               return 0;
+       return 1;
+}
+
+static int
+unicode_nf_flush(struct unicode_transform *trans, bool finished,
+                const char **error_r)
+{
+       struct unicode_nf_context *ctx =
+               container_of(trans, struct unicode_nf_context, transform);
+       int ret;
+
+       ret = unicode_nf_flush_more(ctx, finished, error_r);
+       if (ret <= 0)
+               return ret;
+
+       if (finished && ctx->pending_decomp > 0) {
+               unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd);
+               ctx->pending_decomp = 0;
+       }
+
+       return unicode_nf_flush_more(ctx, finished, error_r);
+}
+
+/*
+ * Normalization check
+ */
+
+static ssize_t
+unicode_nf_check_sink_input(struct unicode_transform *trans,
+                           const struct unicode_transform_buffer *buf,
+                           const char **error_r);
+
+static const struct unicode_transform_def unicode_nf_check_sink_def = {
+       .input = unicode_nf_check_sink_input,
+};
+
+void unicode_nf_checker_init(struct unicode_nf_checker *unc_r,
+                            enum unicode_nf_type type)
+{
+       i_zero(unc_r);
+
+       switch (type) {
+       case UNICODE_NFD:
+               unc_r->canonical = TRUE;
+               unc_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
+               unc_r->nf_qc_yes = UNICODE_NFD_QUICK_CHECK_YES;
+               unc_r->nf_qc_no = UNICODE_NFD_QUICK_CHECK_NO;
+               break;
+       case UNICODE_NFKD:
+               unc_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
+               unc_r->nf_qc_yes = UNICODE_NFKD_QUICK_CHECK_YES;
+               unc_r->nf_qc_no = UNICODE_NFKD_QUICK_CHECK_NO;
+               break;
+       case UNICODE_NFC:
+               unc_r->compose = TRUE;
+               unc_r->canonical = TRUE;
+               unc_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
+               unc_r->nf_qc_yes = UNICODE_NFC_QUICK_CHECK_YES;
+               unc_r->nf_qc_no = UNICODE_NFC_QUICK_CHECK_NO;
+               break;
+       case UNICODE_NFKC:
+               unc_r->compose = TRUE;
+               unc_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
+               unc_r->nf_qc_yes = UNICODE_NFKC_QUICK_CHECK_YES;
+               unc_r->nf_qc_no = UNICODE_NFKC_QUICK_CHECK_NO;
+               break;
+       }
+
+       unicode_nf_init(&unc_r->nf, type);
+       unicode_transform_init(&unc_r->sink, &unicode_nf_check_sink_def);
+       unicode_transform_chain(&unc_r->nf.transform, &unc_r->sink);
+}
+
+void unicode_nf_checker_reset(struct unicode_nf_checker *unc)
+{
+       enum unicode_nf_type type =
+               (unc->compose ? (unc->canonical ? UNICODE_NFC : UNICODE_NFKC) :
+                               (unc->canonical ? UNICODE_NFD : UNICODE_NFKD));
+
+       unicode_nf_checker_init(unc, type);
+}
+
+static ssize_t
+unicode_nf_check_sink_input(struct unicode_transform *trans,
+                           const struct unicode_transform_buffer *buf,
+                           const char **error_r)
+{
+       struct unicode_nf_checker *unc =
+               container_of(trans, struct unicode_nf_checker, sink);
+       size_t n;
+
+       i_assert(unc->buffer_len > 0);
+       i_assert(buf->cp_count <= unc->buffer_len);
+       for (n = 0; n < buf->cp_count; n++) {
+               if (buf->cp[n] != unc->cp_buffer[n]) {
+                       *error_r = "Not normalized";
+                       return -1;
+               }
+       }
+       if (buf->cp_count == unc->buffer_len)
+               unc->buffer_len = 0;
+       else {
+               unc->buffer_len -= buf->cp_count;
+               memmove(&unc->cp_buffer[0], &unc->cp_buffer[buf->cp_count],
+                       unc->buffer_len);
+       }
+       return buf->cp_count;
+}
+
+int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp,
+                            const struct unicode_code_point_data **_cp_data)
+{
+       const struct unicode_code_point_data *cpd_last = unc->cpd_last;
+
+       if (*_cp_data == NULL)
+               *_cp_data = unicode_code_point_get_data(cp);
+
+       const struct unicode_code_point_data *cp_data = *_cp_data;
+       const char *error;
+       int ret;
+
+       unc->cpd_last = cp_data;
+
+       if (cp_data->general_category == UNICODE_GENERAL_CATEGORY_INVALID)
+               return -1;
+       if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_no)
+               return 0;
+       if (cpd_last != NULL && cp_data->canonical_combining_class != 0 &&
+           cpd_last->canonical_combining_class >
+               cp_data->canonical_combining_class)
+               return 0;
+       if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_yes &&
+           cp_data->canonical_combining_class == 0) {
+               if (unc->buffer_len > 0) {
+                       ret = unicode_transform_flush(&unc->nf.transform,
+                                                     &error);
+                       i_assert(ret != 0);
+                       if (ret < 0)
+                               return 0;
+                       unicode_nf_reset(&unc->nf);
+               }
+               i_assert(unc->buffer_len == 0);
+               unc->cp_buffer[0] = cp;
+               return 1;
+       }
+
+       struct unicode_transform_buffer buf;
+       ssize_t sret;
+
+       if (unc->buffer_len == 0 && cpd_last != NULL) {
+               i_zero(&buf);
+               buf.cp = &unc->cp_buffer[0];
+               buf.cp_data = &cpd_last;
+               buf.cp_count = 1;
+
+               unc->buffer_len++;
+               sret = unicode_transform_input_buf(&unc->nf.transform, &buf,
+                                                  &error);
+               i_assert(sret != 0);
+               if (sret < 0)
+                       return 0;
+       }
+
+       i_assert(unc->buffer_len < UNICODE_NF_BUFFER_SIZE);
+       unc->cp_buffer[unc->buffer_len] = cp;
+       unc->buffer_len++;
+
+       i_zero(&buf);
+       buf.cp = &cp;
+       buf.cp_data = &cp_data;
+       buf.cp_count = 1;
+       sret = unicode_transform_input_buf(&unc->nf.transform, &buf, &error);
+       i_assert(sret != 0);
+       if (sret < 0)
+               return 0;
+       return 1;
+}
+
+int unicode_nf_checker_finish(struct unicode_nf_checker *unc)
+{
+       if (unc->buffer_len == 0)
+               return 1;
+
+       const char *error;
+       int ret;
+
+       ret = unicode_transform_flush(&unc->nf.transform, &error);
+       i_assert(ret != 0);
+       return (ret > 0 ? 1 : 0);
+}
+
  /*
   * RFC 5051 - Simple Unicode Collation Algorithm
   */
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h

index e6e18d3177f8ce67951a30b1e176fb53a4a0d49d..3e3499f0e5a68b735e8cefbb8416cae22bf9d129 100644 (file)
--- a/src/lib/unicode-transform.h
+++ b/src/lib/unicode-transform.h
@@ -1,6 +1,11 @@
  #ifndef UNICODE_NF_H
  #define UNICODE_NF_H
  
+#define UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN 30
+#define UNICODE_NF_BUFFER_SIZE (UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN + 2)
+
+struct unicode_code_point_data;
+
  /*
   * Transform API
   */
@@ -55,9 +60,21 @@ ssize_t uniform_transform_forward(
         const struct unicode_code_point_data *const *out_data, size_t out_len,
         const char **error_r);
  
-ssize_t unicode_transform_input(struct unicode_transform *trans,
-                               const uint32_t *in, size_t in_len,
-                               const char **error_r);
+ssize_t unicode_transform_input_buf(struct unicode_transform *trans,
+                                   const struct unicode_transform_buffer *buf,
+                                   const char **error_r);
+static inline ssize_t
+unicode_transform_input(struct unicode_transform *trans,
+                       const uint32_t *in, size_t in_len, const char **error_r)
+{
+       struct unicode_transform_buffer buf = {
+               .cp = in,
+               .cp_count = in_len,
+       };
+
+       return unicode_transform_input_buf(trans, &buf, error_r);
+}
+
  int unicode_transform_flush(struct unicode_transform *trans,
                             const char **error_r);
  
@@ -84,6 +101,101 @@ void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
                                     uint32_t *array, size_t array_size,
                                     size_t *array_pos);
  
+/*
+ * NFD, NFKD, NFC, NFKC
+ */
+
+/* Unicode Standard Annex #15, Section 1.2:
+
+   Unicode Normalization Forms are formally defined normalizations of Unicode
+   strings which make it possible to determine whether any two Unicode strings
+   are equivalent to each other. Depending on the particular Unicode
+   Normalization Form, that equivalence can either be a canonical equivalence or
+   a compatibility equivalence.
+
+   Essentially, the Unicode Normalization Algorithm puts all combining marks in
+   a specified order, and uses rules for decomposition and composition to
+   transform each string into one of the Unicode Normalization Forms. A binary
+   comparison of the transformed strings will then determine equivalence.
+
+   The four Unicode Normalization Forms are summarized as follows:
+
+     Normalization Form D  (NFD)   - Canonical Decomposition
+     Normalization Form KD (NFKD)  - Compatibility Decomposition
+     Normalization Form C  (NFC)   - Canonical Decomposition, followed by
+                                     Canonical Composition
+     Normalization Form KC (NFKC)  - Compatibility Decomposition, followed by
+                                     Canonical Composition
+
+   There are two forms of normalization that convert to composite characters:
+   Normalization Form C and Normalization Form KC. The difference between these
+   depends on whether the resulting text is to be a canonical equivalent to the
+   original unnormalized text or a compatibility equivalent to the original
+   unnormalized text. (In NFKC and NFKD, a K is used to stand for compatibility
+   to avoid confusion with the C standing for composition.) Both types of
+   normalization can be useful in different circumstances.
+ */
+
+enum unicode_nf_type {
+       UNICODE_NFD,
+       UNICODE_NFKD,
+       UNICODE_NFC,
+       UNICODE_NFKC,
+};
+
+struct unicode_nf_context {
+       struct unicode_transform transform;
+
+       size_t nonstarter_count;
+       uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE];
+       const struct unicode_code_point_data *
+               cpd_buffer[UNICODE_NF_BUFFER_SIZE];
+       size_t buffer_len, buffer_processed, buffer_output_max;
+
+       size_t pending_decomp;
+       uint32_t pending_cp;
+       const struct unicode_code_point_data *pending_cpd;
+
+       uint8_t nf_qc_mask;
+
+       bool compose:1;
+       bool canonical:1;
+       bool finished:1;
+};
+
+void unicode_nf_init(struct unicode_nf_context *ctx_r,
+                    enum unicode_nf_type type);
+void unicode_nf_reset(struct unicode_nf_context *ctx);
+
+/*
+ * Normalization check
+ */
+
+struct unicode_nf_checker {
+       const struct unicode_code_point_data *cpd_last;
+
+       uint8_t nf_qc_mask;
+       uint8_t nf_qc_yes;
+       uint8_t nf_qc_no;
+
+       uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE];
+       size_t buffer_len;
+       struct unicode_nf_context nf;
+       struct unicode_transform sink;
+
+       bool not_first_cp;
+       bool compose:1;
+       bool canonical:1;
+};
+
+void unicode_nf_checker_init(struct unicode_nf_checker *unc_r,
+                            enum unicode_nf_type type);
+void unicode_nf_checker_reset(struct unicode_nf_checker *unc);
+
+int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp,
+                            const struct unicode_code_point_data **cp_data);
+int unicode_nf_checker_finish(struct unicode_nf_checker *unc);
+
  /*
   * RFC 5051 - Simple Unicode Collation Algorithm
   */
author	Stephan Bosch <stephan.bosch@open-xchange.com>
	Wed, 27 Nov 2024 00:36:43 +0000 (01:36 +0100)
committer	Stephan Bosch <stephan.bosch@open-xchange.com>
	Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/unichar.c		patch \| blob \| blame \| history
src/lib/unichar.h		patch \| blob \| blame \| history
src/lib/unicode-transform.c		patch \| blob \| blame \| history
src/lib/unicode-transform.h		patch \| blob \| blame \| history