From: Stephan Bosch <stephan.bosch@open-xchange.com>
Date: Wed, 27 Nov 2024 00:36:43 +0000 (+0100)
Subject: lib: unicode-transform - Implement streaming Unicode Normalization
X-Git-Tag: 2.4.2~611
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0ce5059db585830ea39adca938339a907b6f0789;p=thirdparty%2Fdovecot%2Fcore.git

lib: unicode-transform - Implement streaming Unicode Normalization

All standard forms are supported: NFD, NFKD, NFC, NFKC.
---

diff --git a/src/lib/unichar.c b/src/lib/unichar.c
index 0a7405d2cc..a8a60e92da 100644
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -2,6 +2,7 @@
 
 #include "lib.h"
 #include "array.h"
+#include "str.h"
 #include "bsearch-insert-pos.h"
 #include "unicode-data.h"
 #include "unicode-transform.h"
@@ -298,6 +299,125 @@ int uni_utf8_run_transform(const void *_input, size_t size,
 	return ret;
 }
 
+static inline int
+uni_utf8_write_nf_common(const void *_input, size_t size,
+			 enum unicode_nf_type nf_type, buffer_t *output)
+{
+	static struct unicode_nf_context ctx;
+	const char *error;
+
+	unicode_nf_init(&ctx, nf_type);
+
+	return uni_utf8_run_transform(_input, size, &ctx.transform, output,
+				      &error);
+}
+
+int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output)
+{
+	return uni_utf8_write_nf_common(input, size, UNICODE_NFD, output);
+}
+
+int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output)
+{
+	return uni_utf8_write_nf_common(input, size, UNICODE_NFKD, output);
+}
+
+int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output)
+{
+	return uni_utf8_write_nf_common(input, size, UNICODE_NFC, output);
+}
+
+int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output)
+{
+	return uni_utf8_write_nf_common(input, size, UNICODE_NFKC, output);
+}
+
+int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r)
+{
+	buffer_t *output = t_buffer_create(size);
+
+	if (uni_utf8_write_nf_common(input, size, UNICODE_NFD, output) < 0)
+		return -1;
+	*output_r = str_c(output);
+	return 0;
+}
+
+int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r)
+{
+	buffer_t *output = t_buffer_create(size);
+
+	if (uni_utf8_write_nf_common(input, size, UNICODE_NFKD, output) < 0)
+		return -1;
+	*output_r = str_c(output);
+	return 0;
+}
+
+int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r)
+{
+	buffer_t *output = t_buffer_create(size);
+
+	if (uni_utf8_write_nf_common(input, size, UNICODE_NFC, output) < 0)
+		return -1;
+	*output_r = str_c(output);
+	return 0;
+}
+
+int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r)
+{
+	buffer_t *output = t_buffer_create(size);
+
+	if (uni_utf8_write_nf_common(input, size, UNICODE_NFKC, output) < 0)
+		return -1;
+	*output_r = str_c(output);
+	return 0;
+}
+
+static int
+uni_utf8_is_nf(const void *_input, size_t size, enum unicode_nf_type type)
+{
+	static struct unicode_nf_checker unc;
+	const unsigned char *input = _input;
+	unichar_t chr;
+	int ret;
+
+	unicode_nf_checker_init(&unc, type);
+
+	while (size > 0) {
+		const struct unicode_code_point_data *cp_data = NULL;
+		int bytes = uni_utf8_get_char_n(input, size, &chr);
+		if (bytes <= 0)
+			return -1;
+		input += bytes;
+		size -= bytes;
+
+		ret = unicode_nf_checker_input(&unc, chr, &cp_data);
+		if (ret <= 0)
+			return ret;
+	}
+
+	return unicode_nf_checker_finish(&unc);
+}
+
+int uni_utf8_is_nfd(const void *input, size_t size)
+{
+	return uni_utf8_is_nf(input, size, UNICODE_NFD);
+}
+
+int uni_utf8_is_nfkd(const void *input, size_t size)
+{
+	return uni_utf8_is_nf(input, size, UNICODE_NFKD);
+}
+
+int uni_utf8_is_nfc(const void *input, size_t size)
+{
+	return uni_utf8_is_nf(input, size, UNICODE_NFC);
+}
+
+int uni_utf8_is_nfkc(const void *input, size_t size)
+{
+	return uni_utf8_is_nf(input, size, UNICODE_NFKC);
+}
+
 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t size,
 				     buffer_t *output)
 {
diff --git a/src/lib/unichar.h b/src/lib/unichar.h
index 31741a8ba9..1e1ef09ece 100644
--- a/src/lib/unichar.h
+++ b/src/lib/unichar.h
@@ -124,6 +124,39 @@ int uni_utf8_run_transform(const void *_input, size_t size,
 			   struct unicode_transform *trans, buffer_t *output,
 			   const char **error_r);
 
+/* Normalize the UTF-8 input in Unicode NFD, NFKD, NFC or NFKC form and write
+   the result to the output buffer.
+
+   Refer to Unicode Standard Annex #15, Section 1.2 for more information. An
+   excerpt can be found in unicode-nf.h.
+
+   NOTE: Do not blindly use this function to write and append several values
+   together expecting the result to be NF* normalized as well. This function
+   does not check whether concatenation preserves the desired normalization nor
+   does it endeavour to achieve this result. Blind concatination works only in
+   very specific cases, so make sure you know what you are doing.
+ */
+int uni_utf8_write_nfd(const void *input, size_t size, buffer_t *output);
+int uni_utf8_write_nfkd(const void *input, size_t size, buffer_t *output);
+int uni_utf8_write_nfc(const void *input, size_t size, buffer_t *output);
+int uni_utf8_write_nfkc(const void *input, size_t size, buffer_t *output);
+
+/* Same as the write variants, but return the normalized input in the
+   output_r argument as a C string.
+ */
+int uni_utf8_to_nfd(const void *input, size_t size, const char **output_r);
+int uni_utf8_to_nfkd(const void *input, size_t size, const char **output_r);
+int uni_utf8_to_nfc(const void *input, size_t size, const char **output_r);
+int uni_utf8_to_nfkc(const void *input, size_t size, const char **output_r);
+
+/* Check whether the input is normalized in the indicated form. Returns -1 if
+   the input is not even valid UTF8 or contains invalid code points. Returns 1
+   if the input adheres to the requested normalization form and 0 otherwise. */
+int uni_utf8_is_nfd(const void *input, size_t size);
+int uni_utf8_is_nfkd(const void *input, size_t size);
+int uni_utf8_is_nfc(const void *input, size_t size);
+int uni_utf8_is_nfkc(const void *input, size_t size);
+
 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
    output buffer. Returns 0 if ok, -1 if input was invalid. This generates
    output that's compatible with i;unicode-casemap comparator. Invalid input
diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c
index 401e74ec9c..8274f1ffd5 100644
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -35,9 +35,9 @@ ssize_t uniform_transform_forward(
 	return sret;
 }
 
-ssize_t unicode_transform_input(struct unicode_transform *trans,
-				const uint32_t *in, size_t in_len,
-				const char **error_r)
+ssize_t unicode_transform_input_buf(struct unicode_transform *trans,
+				    const struct unicode_transform_buffer *buf,
+				    const char **error_r)
 {
 	struct unicode_transform_buffer in_buf;
 	size_t input_total = 0;
@@ -47,9 +47,7 @@ ssize_t unicode_transform_input(struct unicode_transform *trans,
 
 	*error_r = NULL;
 
-	i_zero(&in_buf);
-	in_buf.cp = in;
-	in_buf.cp_count = in_len;
+	in_buf = *buf;
 
 	while (in_buf.cp_count > 0) {
 		if (in_buf.cp_count > 0) {
@@ -207,10 +205,15 @@ static const uint16_t uni_hangul_s_base = 0xac00;
 static const uint16_t uni_hangul_l_base = 0x1100;
 static const uint16_t uni_hangul_v_base = 0x1161;
 static const uint16_t uni_hangul_t_base = 0x11a7;
+static const unsigned int uni_hangul_l_count = 19;
 static const unsigned int uni_hangul_v_count = 21;
 static const unsigned int uni_hangul_t_count = 28;
 static const unsigned int uni_hangul_n_count =
 	uni_hangul_v_count * uni_hangul_t_count;
+static const uint16_t uni_hangul_l_end = uni_hangul_l_base + uni_hangul_l_count;
+static const uint16_t uni_hangul_v_end = uni_hangul_v_base + uni_hangul_v_count;
+static const uint16_t uni_hangul_t_end = uni_hangul_t_base + uni_hangul_t_count;
+static const uint16_t uni_hangul_s_end = 0xD7A4;
 
 static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
 {
@@ -240,6 +243,661 @@ static size_t unicode_hangul_decompose(uint32_t cp, uint32_t buf[3])
 	return 3;
 }
 
+static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r)
+{
+	/* The Unicode Standard, Section 3.12.3:
+	   Hangul Syllable Composition
+	 */
+
+	/* <LPart, VPart> */
+	if (l >= uni_hangul_l_base && l < uni_hangul_l_end &&
+	    r >= uni_hangul_v_base && r < uni_hangul_v_end) {
+		uint32_t l_part = l, v_part = r;
+
+		unsigned int l_index = l_part - uni_hangul_l_base;
+		unsigned int v_index = v_part - uni_hangul_v_base;
+		unsigned int lv_index = l_index * uni_hangul_n_count +
+					v_index * uni_hangul_t_count;
+		return uni_hangul_s_base + lv_index;
+	}
+	/* A sequence <LVPart, TPart> */
+	if (l >= uni_hangul_s_base && l < uni_hangul_s_end &&
+	    r >= (uni_hangul_t_base + 1) && r < uni_hangul_t_end &&
+	    ((l - uni_hangul_s_base) % uni_hangul_t_count) == 0) {
+		uint32_t lv_part = l, t_part = r;
+
+		unsigned int t_index = t_part - uni_hangul_t_base;
+		return lv_part + t_index;
+	}
+	return 0x0000;
+}
+
+/*
+ * Normalization transform: NFD, NFKD, NFC, NFKC
+ */
+
+static ssize_t
+unicode_nf_input(struct unicode_transform *trans,
+		 const struct unicode_transform_buffer *buf,
+		 const char **error_r);
+static int
+unicode_nf_flush(struct unicode_transform *trans, bool finished,
+		 const char **error_r);
+
+static const struct unicode_transform_def unicode_nf_def = {
+	.input = unicode_nf_input,
+	.flush = unicode_nf_flush,
+};
+
+void unicode_nf_init(struct unicode_nf_context *ctx_r,
+		     enum unicode_nf_type type)
+{
+	i_zero(ctx_r);
+	unicode_transform_init(&ctx_r->transform, &unicode_nf_def);
+
+	switch (type) {
+	case UNICODE_NFD:
+		ctx_r->canonical = TRUE;
+		ctx_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
+		break;
+	case UNICODE_NFKD:
+		ctx_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
+		break;
+	case UNICODE_NFC:
+		ctx_r->compose = TRUE;
+		ctx_r->canonical = TRUE;
+		ctx_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
+		break;
+	case UNICODE_NFKC:
+		ctx_r->compose = TRUE;
+		ctx_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
+		break;
+	}
+}
+
+void unicode_nf_reset(struct unicode_nf_context *ctx)
+{
+	enum unicode_nf_type type =
+		(ctx->compose ? (ctx->canonical ? UNICODE_NFC : UNICODE_NFKC) :
+				(ctx->canonical ? UNICODE_NFD : UNICODE_NFKD));
+	struct unicode_transform *next = ctx->transform.next;
+
+	unicode_nf_init(ctx, type);
+	unicode_transform_chain(&ctx->transform, next);
+}
+
+static void
+unicode_nf_buffer_delete(struct unicode_nf_context *ctx, size_t offset,
+			 size_t count)
+{
+	if (count == 0)
+		return;
+
+	i_assert(offset < ctx->buffer_len);
+	i_assert(count <= ctx->buffer_len);
+	i_assert(offset <= (ctx->buffer_len - count));
+
+	if (count == ctx->buffer_len) {
+		ctx->buffer_len = 0;
+		return;
+	}
+
+	size_t trailer = ctx->buffer_len - (offset + count);
+	if (trailer > 0) {
+		memmove(&ctx->cp_buffer[offset],
+			&ctx->cp_buffer[offset + count],
+			trailer * sizeof(ctx->cp_buffer[0]));
+		memmove(&ctx->cpd_buffer[offset],
+			&ctx->cpd_buffer[offset + count],
+			trailer * sizeof(ctx->cpd_buffer[0]));
+	}
+	ctx->buffer_len -= count;
+}
+
+static void
+unicode_nf_buffer_swap(struct unicode_nf_context *ctx,
+		       size_t idx1, size_t idx2)
+{
+	uint32_t tmp_cp = ctx->cp_buffer[idx2];
+	const struct unicode_code_point_data *tmp_cpd = ctx->cpd_buffer[idx2];
+
+	ctx->cp_buffer[idx2] = ctx->cp_buffer[idx1];
+	ctx->cpd_buffer[idx2] = ctx->cpd_buffer[idx1];
+	ctx->cp_buffer[idx1] = tmp_cp;
+	ctx->cpd_buffer[idx1] = tmp_cpd;
+}
+
+static void
+unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp,
+	      const struct unicode_code_point_data *cpd)
+{
+	static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE;
+	uint8_t nf_qc_mask = ctx->nf_qc_mask;
+	size_t i;
+
+	/*
+	 * Decompose the code point
+	 */
+
+	const uint32_t *decomp, *decomp_k;
+	uint32_t decomp_hangul[3];
+	size_t len, len_k;
+
+	if (cp >= HANGUL_FIRST && cp <= HANGUL_LAST) {
+		len = len_k = unicode_hangul_decompose(cp, decomp_hangul);
+		decomp = decomp_k = decomp_hangul;
+	} else {
+		if (cpd == NULL)
+			cpd = unicode_code_point_get_data(cp);
+		len = unicode_code_point_data_get_full_decomposition(
+			cpd, ctx->canonical, &decomp);
+		if (len == 0) {
+			decomp = &cp;
+			len = 1;
+		}
+		len_k = len;
+		decomp_k = decomp;
+		if (ctx->canonical) {
+			len_k = unicode_code_point_data_get_full_decomposition(
+				cpd, ctx->canonical, &decomp_k);
+			if (len_k == 0) {
+				decomp_k = decomp;
+				len_k = len;
+			}
+		}
+		if (len > 0)
+			cpd = NULL;
+	}
+
+	i_assert(len <= UNICODE_DECOMPOSITION_MAX_LENGTH);
+	i_assert(len_k <= UNICODE_DECOMPOSITION_MAX_LENGTH);
+
+	if ((ctx->buffer_len + len) > buffer_size) {
+		/* Decomposition overflows the buffer. Record and mark it as
+		   pending and come back to it once the buffer is sufficiently
+		   drained. */
+		i_assert(ctx->pending_decomp == 0);
+		ctx->pending_decomp = len;
+		ctx->pending_cp = cp;
+		ctx->pending_cpd = cpd;
+		return;
+	}
+
+	/* UAX15-D4: Stream-Safe Text Process is the process of producing a
+	   Unicode string in Stream-Safe Text Format by processing that string
+	   from start to finish, inserting U+034F COMBINING GRAPHEME JOINER
+	   (CGJ) within long sequences of non-starters. The exact position o
+	   the inserted CGJs are determined according to the following
+	   algorithm, which describes the generation of an output string from an
+	   input string:
+
+	   1. If the input string is empty, return an empty output string.
+	   2. Set nonStarterCount to zero.
+	   3. For each code point C in the input string:
+		a. Produce the NFKD decomposition S.
+		b. If nonStarterCount plus the number of initial non-starters in
+		   S is greater than 30, append a CGJ to the output string and
+		   set the nonStarterCount to zero.
+		c. Append C to the output string.
+		d. If there are no starters in S, increment nonStarterCount by
+		   the number of code points in S; otherwise, set
+		   nonStarterCount to the number of trailing non-starters in S
+		   (which may be zero).
+	   4. Return the output string.
+	 */
+
+	/* Determine number of leading and trailing non-starters in full NFKD
+	   decomposition. */
+	const struct unicode_code_point_data *
+		decomp_cpd[UNICODE_DECOMPOSITION_MAX_LENGTH];
+	size_t ns_lead = 0, ns_trail = 0;
+	bool seen_starter = FALSE;
+	for (i = 0; i < len_k; i++) {
+		if (cpd == NULL)
+			cpd = unicode_code_point_get_data(decomp[i]);
+
+		uint8_t ccc = cpd->canonical_combining_class;
+
+		if (decomp == decomp_k) {
+			decomp_cpd[i] = cpd;
+			cpd = NULL;
+		}
+
+		if (ccc == 0)
+			seen_starter = TRUE;
+		else if (!seen_starter)
+			ns_lead++;
+		else
+			ns_trail++;
+	}
+
+	/* Lookup canonical decomposed code points if necessary (avoid double
+	   lookups). */
+	if (decomp != decomp_k) {
+		for (i = 0; i < len; i++) {
+			if (cpd == NULL)
+				cpd = unicode_code_point_get_data(decomp[i]);
+			decomp_cpd[i] = cpd;
+			cpd = NULL;
+		}
+	}
+
+	ctx->nonstarter_count += ns_lead;
+	if (ctx->nonstarter_count > 30) {
+		ctx->nonstarter_count = ns_trail;
+
+		/* Write U+034F COMBINING GRAPHEME JOINER (CGJ)
+		 */
+		ctx->cp_buffer[ctx->buffer_len] = 0x034F;
+		ctx->cpd_buffer[ctx->buffer_len] =
+			unicode_code_point_get_data(0x034F);
+		ctx->buffer_len++;
+	}
+
+	/*
+	 * Buffer the requested decomposition for COA sorting
+	 */
+
+	i_assert(ctx->buffer_len <= buffer_size);
+	if ((ctx->buffer_len + len) > buffer_size) {
+		/* Decomposition now overflows the buffer. Record and mark it as
+		   pending and come back to it once the buffer is sufficiently
+		   drained. */
+		i_assert(ctx->pending_decomp == 0);
+		ctx->pending_decomp = len;
+		ctx->pending_cp = cp;
+		ctx->pending_cpd = cpd;
+	} else {
+		for (i = 0; i < len; i++) {
+			ctx->cp_buffer[ctx->buffer_len] = decomp[i];
+			ctx->cpd_buffer[ctx->buffer_len] = decomp_cpd[i];
+			ctx->buffer_len++;
+		}
+		i_assert(ctx->buffer_len <= buffer_size);
+	}
+
+	/*
+	 * Apply the Canonical Ordering Algorithm (COA)
+	 */
+
+	bool changed = TRUE;
+	size_t last_qc_y;
+	size_t last_starter;
+
+	while (changed) {
+		changed = FALSE;
+		last_qc_y = 0;
+		last_starter = 0;
+
+		for (i = I_MAX(1, ctx->buffer_output_max);
+		     i < ctx->buffer_len; i++) {
+			const struct unicode_code_point_data
+				*cpd_i = ctx->cpd_buffer[i],
+				*cpd_im1 = ctx->cpd_buffer[i - 1];
+			uint8_t ccc_i = cpd_i->canonical_combining_class;
+			uint8_t ccc_im1 = cpd_im1->canonical_combining_class;
+			bool nqc = ((cpd_i->nf_quick_check & nf_qc_mask) == 0);
+
+			if (ccc_i == 0) {
+				last_starter = i;
+				if (nqc)
+					last_qc_y = i;
+			} else if (ccc_im1 > ccc_i) {
+				unicode_nf_buffer_swap(ctx, i - 1, i);
+				changed = TRUE;
+			}
+		}
+	}
+	ctx->buffer_output_max = I_MIN(last_qc_y, last_starter);
+}
+
+static bool
+unicode_nf_input_cp(struct unicode_nf_context *ctx, uint32_t cp,
+		    const struct unicode_code_point_data *cpd)
+{
+	static const size_t buffer_size = UNICODE_NF_BUFFER_SIZE;
+
+	i_assert(ctx->buffer_len <= buffer_size);
+	if (ctx->buffer_len == buffer_size ||
+	    (ctx->pending_decomp > 0 &&
+	     ctx->buffer_len > (buffer_size - ctx->pending_decomp))) {
+		/* Buffer is (still too) full. */
+		return FALSE;
+	}
+
+	if (ctx->pending_decomp > 0) {
+		/* Earlier, the buffer was too full for the next decomposition
+		   and it was recorded and marked as pending. Now, we have the
+		   opportunity to continue. */
+		unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd);
+		ctx->pending_decomp = 0;
+
+		i_assert(ctx->buffer_len <= buffer_size);
+		if (ctx->buffer_output_max > 0 &&
+		    ctx->buffer_len == buffer_size) {
+			/* Pending decomposition filled the buffer completely.
+			 */
+			return FALSE;
+		}
+	}
+
+	/* Normal input of next code point */
+	unicode_nf_cp(ctx, cp, cpd);
+	return TRUE;
+}
+
+static ssize_t
+unicode_nf_input(struct unicode_transform *trans,
+		 const struct unicode_transform_buffer *buf,
+		 const char **error_r ATTR_UNUSED)
+{
+	struct unicode_nf_context *ctx =
+		container_of(trans, struct unicode_nf_context, transform);
+	size_t n;
+
+	for (n = 0; n < buf->cp_count; n++) {
+		if (!unicode_nf_input_cp(ctx, buf->cp[n],
+					 (buf->cp_data == NULL ?
+					  NULL : buf->cp_data[n])))
+			break;
+	}
+	return n;
+}
+
+static uint32_t
+unicode_nf_compose_pair(uint32_t l, uint32_t r,
+			const struct unicode_code_point_data **l_data)
+{
+	uint32_t comp = unicode_hangul_compose_pair(l, r);
+
+	if (comp > 0x0000)
+		return comp;
+
+	if (*l_data == NULL)
+		*l_data = unicode_code_point_get_data(l);
+	return unicode_code_point_data_find_composition(*l_data, r);
+}
+
+static int
+unicode_nf_flush_more(struct unicode_nf_context *ctx, bool finished,
+		      const char **error_r)
+{
+	struct unicode_transform *trans = &ctx->transform;
+
+	ctx->finished = finished;
+
+	if (ctx->buffer_len == 0)
+		return 1;
+	if (!finished && ctx->buffer_output_max == 0)
+		return 0;
+
+	/*
+	 * Apply the Canonical Composition Algorithm
+	 */
+
+	if (ctx->finished)
+		ctx->buffer_output_max = ctx->buffer_len;
+	i_assert(ctx->buffer_processed <= ctx->buffer_output_max);
+	if (ctx->compose && ctx->buffer_len > 1) {
+		size_t in_pos, out_pos, starter;
+		int last_ccc;
+
+		out_pos = 1;
+		last_ccc = -1;
+		starter = 0;
+		for (in_pos = I_MAX(1, ctx->buffer_processed);
+		     in_pos < ctx->buffer_output_max; in_pos++) {
+			uint32_t cp = ctx->cp_buffer[in_pos];
+			const struct unicode_code_point_data *cpd =
+				ctx->cpd_buffer[in_pos];
+
+			if (cpd == NULL) {
+				ctx->cpd_buffer[in_pos] = cpd =
+					unicode_code_point_get_data(cp);
+			}
+
+			uint8_t ccc = cpd->canonical_combining_class;
+			uint32_t comp = 0x0000;
+			if (last_ccc < (int)ccc) {
+				comp = unicode_nf_compose_pair(
+					ctx->cp_buffer[starter], cp,
+					&ctx->cpd_buffer[starter]);
+			}
+			if (comp > 0x0000) {
+				ctx->cp_buffer[starter] = comp;
+				ctx->cpd_buffer[starter] = NULL;
+			} else if (ccc == 0) {
+				starter = out_pos;
+				last_ccc = -1;
+				ctx->cp_buffer[out_pos] = cp;
+				ctx->cpd_buffer[out_pos] = cpd;
+				out_pos++;
+			} else {
+				last_ccc = ccc;
+				ctx->cp_buffer[out_pos] = cp;
+				ctx->cpd_buffer[out_pos] = cpd;
+				out_pos++;
+			}
+		}
+		if (finished) {
+			ctx->buffer_len = ctx->buffer_output_max = out_pos;
+		} else if (in_pos > out_pos) {
+			unicode_nf_buffer_delete(ctx, out_pos,
+						 (in_pos - out_pos));
+			ctx->buffer_output_max = out_pos;
+		}
+	}
+	ctx->buffer_processed = ctx->buffer_output_max;
+
+	/*
+	 * Forward output
+	 */
+
+	size_t output_len = ctx->buffer_processed;
+	ssize_t sret;
+
+	sret = uniform_transform_forward(trans, ctx->cp_buffer, ctx->cpd_buffer,
+					 output_len, error_r);
+	if (sret < 0)
+		return -1;
+
+	i_assert((size_t)sret <= ctx->buffer_processed);
+	unicode_nf_buffer_delete(ctx, 0, sret);
+	ctx->buffer_processed -= sret;
+	ctx->buffer_output_max -= sret;
+	if ((size_t)sret < output_len)
+		return 0;
+	return 1;
+}
+
+static int
+unicode_nf_flush(struct unicode_transform *trans, bool finished,
+		 const char **error_r)
+{
+	struct unicode_nf_context *ctx =
+		container_of(trans, struct unicode_nf_context, transform);
+	int ret;
+
+	ret = unicode_nf_flush_more(ctx, finished, error_r);
+	if (ret <= 0)
+		return ret;
+
+	if (finished && ctx->pending_decomp > 0) {
+		unicode_nf_cp(ctx, ctx->pending_cp, ctx->pending_cpd);
+		ctx->pending_decomp = 0;
+	}
+
+	return unicode_nf_flush_more(ctx, finished, error_r);
+}
+
+/*
+ * Normalization check
+ */
+
+static ssize_t
+unicode_nf_check_sink_input(struct unicode_transform *trans,
+			    const struct unicode_transform_buffer *buf,
+			    const char **error_r);
+
+static const struct unicode_transform_def unicode_nf_check_sink_def = {
+	.input = unicode_nf_check_sink_input,
+};
+
+void unicode_nf_checker_init(struct unicode_nf_checker *unc_r,
+			     enum unicode_nf_type type)
+{
+	i_zero(unc_r);
+
+	switch (type) {
+	case UNICODE_NFD:
+		unc_r->canonical = TRUE;
+		unc_r->nf_qc_mask = UNICODE_NFD_QUICK_CHECK_MASK;
+		unc_r->nf_qc_yes = UNICODE_NFD_QUICK_CHECK_YES;
+		unc_r->nf_qc_no = UNICODE_NFD_QUICK_CHECK_NO;
+		break;
+	case UNICODE_NFKD:
+		unc_r->nf_qc_mask = UNICODE_NFKD_QUICK_CHECK_MASK;
+		unc_r->nf_qc_yes = UNICODE_NFKD_QUICK_CHECK_YES;
+		unc_r->nf_qc_no = UNICODE_NFKD_QUICK_CHECK_NO;
+		break;
+	case UNICODE_NFC:
+		unc_r->compose = TRUE;
+		unc_r->canonical = TRUE;
+		unc_r->nf_qc_mask = UNICODE_NFC_QUICK_CHECK_MASK;
+		unc_r->nf_qc_yes = UNICODE_NFC_QUICK_CHECK_YES;
+		unc_r->nf_qc_no = UNICODE_NFC_QUICK_CHECK_NO;
+		break;
+	case UNICODE_NFKC:
+		unc_r->compose = TRUE;
+		unc_r->nf_qc_mask = UNICODE_NFKC_QUICK_CHECK_MASK;
+		unc_r->nf_qc_yes = UNICODE_NFKC_QUICK_CHECK_YES;
+		unc_r->nf_qc_no = UNICODE_NFKC_QUICK_CHECK_NO;
+		break;
+	}
+
+	unicode_nf_init(&unc_r->nf, type);
+	unicode_transform_init(&unc_r->sink, &unicode_nf_check_sink_def);
+	unicode_transform_chain(&unc_r->nf.transform, &unc_r->sink);
+}
+
+void unicode_nf_checker_reset(struct unicode_nf_checker *unc)
+{
+	enum unicode_nf_type type =
+		(unc->compose ? (unc->canonical ? UNICODE_NFC : UNICODE_NFKC) :
+				(unc->canonical ? UNICODE_NFD : UNICODE_NFKD));
+
+	unicode_nf_checker_init(unc, type);
+}
+
+static ssize_t
+unicode_nf_check_sink_input(struct unicode_transform *trans,
+			    const struct unicode_transform_buffer *buf,
+			    const char **error_r)
+{
+	struct unicode_nf_checker *unc =
+		container_of(trans, struct unicode_nf_checker, sink);
+	size_t n;
+
+	i_assert(unc->buffer_len > 0);
+	i_assert(buf->cp_count <= unc->buffer_len);
+	for (n = 0; n < buf->cp_count; n++) {
+		if (buf->cp[n] != unc->cp_buffer[n]) {
+			*error_r = "Not normalized";
+			return -1;
+		}
+	}
+	if (buf->cp_count == unc->buffer_len)
+		unc->buffer_len = 0;
+	else {
+		unc->buffer_len -= buf->cp_count;
+		memmove(&unc->cp_buffer[0], &unc->cp_buffer[buf->cp_count],
+			unc->buffer_len);
+	}
+	return buf->cp_count;
+}
+
+int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp,
+			     const struct unicode_code_point_data **_cp_data)
+{
+	const struct unicode_code_point_data *cpd_last = unc->cpd_last;
+
+	if (*_cp_data == NULL)
+		*_cp_data = unicode_code_point_get_data(cp);
+
+	const struct unicode_code_point_data *cp_data = *_cp_data;
+	const char *error;
+	int ret;
+
+	unc->cpd_last = cp_data;
+
+	if (cp_data->general_category == UNICODE_GENERAL_CATEGORY_INVALID)
+		return -1;
+	if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_no)
+		return 0;
+	if (cpd_last != NULL && cp_data->canonical_combining_class != 0 &&
+	    cpd_last->canonical_combining_class >
+		cp_data->canonical_combining_class)
+		return 0;
+	if ((cp_data->nf_quick_check & unc->nf_qc_mask) == unc->nf_qc_yes &&
+	    cp_data->canonical_combining_class == 0) {
+		if (unc->buffer_len > 0) {
+			ret = unicode_transform_flush(&unc->nf.transform,
+						      &error);
+			i_assert(ret != 0);
+			if (ret < 0)
+				return 0;
+			unicode_nf_reset(&unc->nf);
+		}
+		i_assert(unc->buffer_len == 0);
+		unc->cp_buffer[0] = cp;
+		return 1;
+	}
+
+	struct unicode_transform_buffer buf;
+	ssize_t sret;
+
+	if (unc->buffer_len == 0 && cpd_last != NULL) {
+		i_zero(&buf);
+		buf.cp = &unc->cp_buffer[0];
+		buf.cp_data = &cpd_last;
+		buf.cp_count = 1;
+
+		unc->buffer_len++;
+		sret = unicode_transform_input_buf(&unc->nf.transform, &buf,
+						   &error);
+		i_assert(sret != 0);
+		if (sret < 0)
+			return 0;
+	}
+
+	i_assert(unc->buffer_len < UNICODE_NF_BUFFER_SIZE);
+	unc->cp_buffer[unc->buffer_len] = cp;
+	unc->buffer_len++;
+
+	i_zero(&buf);
+	buf.cp = &cp;
+	buf.cp_data = &cp_data;
+	buf.cp_count = 1;
+	sret = unicode_transform_input_buf(&unc->nf.transform, &buf, &error);
+	i_assert(sret != 0);
+	if (sret < 0)
+		return 0;
+	return 1;
+}
+
+int unicode_nf_checker_finish(struct unicode_nf_checker *unc)
+{
+	if (unc->buffer_len == 0)
+		return 1;
+
+	const char *error;
+	int ret;
+
+	ret = unicode_transform_flush(&unc->nf.transform, &error);
+	i_assert(ret != 0);
+	return (ret > 0 ? 1 : 0);
+}
+
 /*
  * RFC 5051 - Simple Unicode Collation Algorithm
  */
diff --git a/src/lib/unicode-transform.h b/src/lib/unicode-transform.h
index e6e18d3177..3e3499f0e5 100644
--- a/src/lib/unicode-transform.h
+++ b/src/lib/unicode-transform.h
@@ -1,6 +1,11 @@
 #ifndef UNICODE_NF_H
 #define UNICODE_NF_H
 
+#define UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN 30
+#define UNICODE_NF_BUFFER_SIZE (UNICODE_NF_STREAM_SAFE_NON_STARTER_LEN + 2)
+
+struct unicode_code_point_data;
+
 /*
  * Transform API
  */
@@ -55,9 +60,21 @@ ssize_t uniform_transform_forward(
 	const struct unicode_code_point_data *const *out_data, size_t out_len,
 	const char **error_r);
 
-ssize_t unicode_transform_input(struct unicode_transform *trans,
-				const uint32_t *in, size_t in_len,
-				const char **error_r);
+ssize_t unicode_transform_input_buf(struct unicode_transform *trans,
+				    const struct unicode_transform_buffer *buf,
+				    const char **error_r);
+static inline ssize_t
+unicode_transform_input(struct unicode_transform *trans,
+			const uint32_t *in, size_t in_len, const char **error_r)
+{
+	struct unicode_transform_buffer buf = {
+		.cp = in,
+		.cp_count = in_len,
+	};
+
+	return unicode_transform_input_buf(trans, &buf, error_r);
+}
+
 int unicode_transform_flush(struct unicode_transform *trans,
 			    const char **error_r);
 
@@ -84,6 +101,101 @@ void unicode_static_array_sink_init(struct unicode_static_array_sink *sink,
 				    uint32_t *array, size_t array_size,
 				    size_t *array_pos);
 
+/*
+ * NFD, NFKD, NFC, NFKC
+ */
+
+/* Unicode Standard Annex #15, Section 1.2:
+
+   Unicode Normalization Forms are formally defined normalizations of Unicode
+   strings which make it possible to determine whether any two Unicode strings
+   are equivalent to each other. Depending on the particular Unicode
+   Normalization Form, that equivalence can either be a canonical equivalence or
+   a compatibility equivalence.
+
+   Essentially, the Unicode Normalization Algorithm puts all combining marks in
+   a specified order, and uses rules for decomposition and composition to
+   transform each string into one of the Unicode Normalization Forms. A binary
+   comparison of the transformed strings will then determine equivalence.
+
+   The four Unicode Normalization Forms are summarized as follows:
+
+     Normalization Form D  (NFD)   - Canonical Decomposition
+     Normalization Form KD (NFKD)  - Compatibility Decomposition
+     Normalization Form C  (NFC)   - Canonical Decomposition, followed by
+                                     Canonical Composition
+     Normalization Form KC (NFKC)  - Compatibility Decomposition, followed by
+                                     Canonical Composition
+
+   There are two forms of normalization that convert to composite characters:
+   Normalization Form C and Normalization Form KC. The difference between these
+   depends on whether the resulting text is to be a canonical equivalent to the
+   original unnormalized text or a compatibility equivalent to the original
+   unnormalized text. (In NFKC and NFKD, a K is used to stand for compatibility
+   to avoid confusion with the C standing for composition.) Both types of
+   normalization can be useful in different circumstances.
+ */
+
+enum unicode_nf_type {
+	UNICODE_NFD,
+	UNICODE_NFKD,
+	UNICODE_NFC,
+	UNICODE_NFKC,
+};
+
+struct unicode_nf_context {
+	struct unicode_transform transform;
+
+	size_t nonstarter_count;
+	uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE];
+	const struct unicode_code_point_data *
+		cpd_buffer[UNICODE_NF_BUFFER_SIZE];
+	size_t buffer_len, buffer_processed, buffer_output_max;
+
+	size_t pending_decomp;
+	uint32_t pending_cp;
+	const struct unicode_code_point_data *pending_cpd;
+
+	uint8_t nf_qc_mask;
+
+	bool compose:1;
+	bool canonical:1;
+	bool finished:1;
+};
+
+void unicode_nf_init(struct unicode_nf_context *ctx_r,
+		     enum unicode_nf_type type);
+void unicode_nf_reset(struct unicode_nf_context *ctx);
+
+/*
+ * Normalization check
+ */
+
+struct unicode_nf_checker {
+	const struct unicode_code_point_data *cpd_last;
+
+	uint8_t nf_qc_mask;
+	uint8_t nf_qc_yes;
+	uint8_t nf_qc_no;
+
+	uint32_t cp_buffer[UNICODE_NF_BUFFER_SIZE];
+	size_t buffer_len;
+	struct unicode_nf_context nf;
+	struct unicode_transform sink;
+
+	bool not_first_cp;
+	bool compose:1;
+	bool canonical:1;
+};
+
+void unicode_nf_checker_init(struct unicode_nf_checker *unc_r,
+			     enum unicode_nf_type type);
+void unicode_nf_checker_reset(struct unicode_nf_checker *unc);
+
+int unicode_nf_checker_input(struct unicode_nf_checker *unc, uint32_t cp,
+			     const struct unicode_code_point_data **cp_data);
+int unicode_nf_checker_finish(struct unicode_nf_checker *unc);
+
 /*
  * RFC 5051 - Simple Unicode Collation Algorithm
  */