]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib: unicode-transform - Add Unicode conformance test for NF* normalizations as unit...
authorStephan Bosch <stephan.bosch@open-xchange.com>
Mon, 7 Apr 2025 22:17:09 +0000 (00:17 +0200)
committerStephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/Makefile.am
src/lib/test-lib.inc
src/lib/test-unicode-nf.c [new file with mode: 0644]

index 2748d732c44cf133ca4ca54b4a1282f57d642aea..254c93e6d80c963684b4877bb72bc30dd1338112 100644 (file)
@@ -17,6 +17,7 @@ UCD_FILES = \
        $(UCD_DIR)/CompositionExclusions.txt \
        $(UCD_DIR)/DerivedCoreProperties.txt \
        $(UCD_DIR)/DerivedNormalizationProps.txt \
+       $(UCD_DIR)/NormalizationTest.txt \
        $(UCD_DIR)/PropertyValueAliases.txt \
        $(UCD_DIR)/UnicodeData.txt
 
@@ -55,6 +56,8 @@ $(UCD_DIR)/DerivedCoreProperties.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
 $(UCD_DIR)/DerivedNormalizationProps.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt
+$(UCD_DIR)/NormalizationTest.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/NormalizationTest.txt
 $(UCD_DIR)/PropertyValueAliases.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt
 $(UCD_DIR)/UnicodeData.txt:
@@ -500,6 +503,7 @@ test_lib_SOURCES = \
        test-time-util.c \
        test-unichar.c \
        test-unicode-data.c \
+       test-unicode-nf.c \
        test-utc-mktime.c \
        test-uri.c \
        test-wildcard-match.c
index d3a1f336697479f813c91383efac8c0d40501be3..1bdc76e367d887b1e302dba07f9ce1da5433b1c5 100644 (file)
@@ -108,6 +108,7 @@ TEST(test_str_table)
 TEST(test_time_util)
 TEST(test_unichar)
 TEST(test_unicode_data)
+TEST(test_unicode_nf)
 TEST(test_uri)
 TEST(test_utc_mktime)
 TEST(test_wildcard_match)
diff --git a/src/lib/test-unicode-nf.c b/src/lib/test-unicode-nf.c
new file mode 100644 (file)
index 0000000..408e57e
--- /dev/null
@@ -0,0 +1,566 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "unichar.h"
+#include "istream.h"
+
+#include <fcntl.h>
+
+#define UCD_NORMALIZATION_TEST_TXT UCD_DIR "/NormalizationTest.txt"
+
+static int test_column_to_utf8(const char *column, const char **out_r)
+{
+       const char *const *cps = t_strsplit(column, " ");
+       string_t *out = t_str_new(256);
+
+       while (*cps != NULL) {
+               uint32_t cp;
+
+               if (str_to_uint32_hex(*cps, &cp) < 0)
+                       return -1;
+               if (!uni_is_valid_ucs4(cp))
+                       return -1;
+               uni_ucs4_to_utf8_c(cp, out);
+               cps++;
+       }
+       *out_r = str_c(out);
+       return 0;
+}
+
+static void
+test_columns(const char *c1, const char *c2, const char *c3, const char *c4,
+            const char *c5, unsigned int line_num)
+{
+       buffer_t *nf_out = t_buffer_create(128);
+       int ret;
+
+       /* NFC
+            c2 ==  toNFC(c1) ==  toNFC(c2) ==  toNFC(c3)
+            c4 ==  toNFC(c4) ==  toNFC(c5)
+        */
+
+       /* c2 == toNFC(c1) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfc(c1, strlen(c1), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c2, str_c(nf_out), line_num);
+
+       /* c2 == toNFC(c2) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfc(c2, strlen(c2), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c2, str_c(nf_out), line_num);
+
+       /* c2 == toNFC(c3) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfc(c3, strlen(c3), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c2, str_c(nf_out), line_num);
+
+       /* c4 == toNFC(c4) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfc(c4, strlen(c4), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+       /* c4 == toNFC(c5) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfc(c5, strlen(c5), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+       /* Check isNFC() */
+       ret = uni_utf8_is_nfc(c2, strlen(c2));
+       test_assert_idx(ret > 0, line_num);
+       ret = uni_utf8_is_nfc(c4, strlen(c4));
+       test_assert_idx(ret > 0, line_num);
+       if (strcmp(c2, c1) != 0) {
+               ret = uni_utf8_is_nfc(c1, strlen(c1));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c2, c3) != 0) {
+               ret = uni_utf8_is_nfc(c3, strlen(c3));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c4, c5) != 0) {
+               ret = uni_utf8_is_nfc(c5, strlen(c5));
+               test_assert_idx(ret == 0, line_num);
+       }
+
+       /* NFD
+            c3 ==  toNFD(c1) ==  toNFD(c2) ==  toNFD(c3)
+            c5 ==  toNFD(c4) ==  toNFD(c5)
+        */
+
+       /* c3 == toNFD(c1) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfd(c1, strlen(c1), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c3, str_c(nf_out), line_num);
+
+       /* c3 == toNFD(c2) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfd(c2, strlen(c2), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c3, str_c(nf_out), line_num);
+
+       /* c3 == toNFD(c3) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfd(c3, strlen(c3), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c3, str_c(nf_out), line_num);
+
+       /* c5 == toNFD(c4) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfd(c4, strlen(c4), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+       /* c5 == toNFD(c5) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfd(c5, strlen(c5), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+       /* Check isNFD() */
+       ret = uni_utf8_is_nfd(c3, strlen(c3));
+       test_assert_idx(ret > 0, line_num);
+       ret = uni_utf8_is_nfd(c5, strlen(c5));
+       test_assert_idx(ret > 0, line_num);
+       if (strcmp(c1, c3) != 0) {
+               ret = uni_utf8_is_nfd(c1, strlen(c1));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c2, c3) != 0) {
+               ret = uni_utf8_is_nfd(c2, strlen(c2));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c4, c5) != 0) {
+               ret = uni_utf8_is_nfd(c4, strlen(c4));
+               test_assert_idx(ret == 0, line_num);
+       }
+
+       /* NFKC
+            c4 == toNFKC(c1) == toNFKC(c2) == toNFKC(c3) == toNFKC(c4)
+               == toNFKC(c5)
+        */
+
+       /* c4 == toNFKC(c1) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkc(c1, strlen(c1), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+       /* c4 == toNFKC(c2) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkc(c2, strlen(c2), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+       /* c4 == toNFKC(c3) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkc(c3, strlen(c3), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+       /* c4 == toNFKC(c4) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkc(c4, strlen(c4), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+       /* c4 == toNFKC(c5) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkc(c5, strlen(c5), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+       /* Check isNFKC() */
+       ret = uni_utf8_is_nfkc(c4, strlen(c4));
+       test_assert_idx(ret > 0, line_num);
+       if (strcmp(c4, c1) != 0) {
+               ret = uni_utf8_is_nfkc(c1, strlen(c1));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c4, c2) != 0) {
+               ret = uni_utf8_is_nfkc(c2, strlen(c2));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c4, c3) != 0) {
+               ret = uni_utf8_is_nfkc(c3, strlen(c3));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c4, c5) != 0) {
+               ret = uni_utf8_is_nfkc(c5, strlen(c5));
+               test_assert_idx(ret == 0, line_num);
+       }
+
+       /* NFKD
+            c5 == toNFKD(c1) == toNFKD(c2) == toNFKD(c3) == toNFKD(c4)
+               == toNFKD(c5)
+        */
+
+       /* c5 == toNFKD(c1) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkd(c1, strlen(c1), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+       /* c5 == toNFKD(c2) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkd(c2, strlen(c2), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+       /* c5 == toNFKD(c3) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkd(c3, strlen(c3), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+       /* c5 == toNFKD(c4) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkd(c4, strlen(c4), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+       /* c5 == toNFKD(c5) */
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkd(c5, strlen(c5), nf_out);
+       test_assert_idx(ret == 0, line_num);
+       test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+       /* Check isNFKD() */
+       ret = uni_utf8_is_nfd(c5, strlen(c5));
+       test_assert_idx(ret > 0, line_num);
+       if (strcmp(c1, c5) != 0) {
+               ret = uni_utf8_is_nfkd(c1, strlen(c1));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c2, c5) != 0) {
+               ret = uni_utf8_is_nfkd(c2, strlen(c2));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c3, c5) != 0) {
+               ret = uni_utf8_is_nfkd(c3, strlen(c3));
+               test_assert_idx(ret == 0, line_num);
+       }
+       if (strcmp(c4, c5) != 0) {
+               ret = uni_utf8_is_nfkd(c4, strlen(c4));
+               test_assert_idx(ret == 0, line_num);
+       }
+}
+
+static void test_line(const char *line, bool part1, unsigned int line_num)
+{
+       static uint32_t cp_last = 0;
+       uint32_t cp = 0x110000;
+
+       /* CONFORMANCE:
+
+          1. The following invariants must be true for all conformant
+             implementations
+
+             NFC
+               c2 ==  toNFC(c1) ==  toNFC(c2) ==  toNFC(c3)
+               c4 ==  toNFC(c4) ==  toNFC(c5)
+
+             NFD
+               c3 ==  toNFD(c1) ==  toNFD(c2) ==  toNFD(c3)
+               c5 ==  toNFD(c4) ==  toNFD(c5)
+
+             NFKC
+               c4 == toNFKC(c1) == toNFKC(c2) == toNFKC(c3) == toNFKC(c4)
+                  == toNFKC(c5)
+
+             NFKD
+               c5 == toNFKD(c1) == toNFKD(c2) == toNFKD(c3) == toNFKD(c4)
+                  == toNFKD(c5)
+        */
+       if (line != NULL) {
+               const char *const *columns = t_strsplit(line, ";");
+               if (str_array_length(columns) < 5) {
+                       test_failed(t_strdup_printf(
+                               "Invalid test at %s:%u",
+                               UCD_NORMALIZATION_TEST_TXT, line_num));
+                       return;
+               }
+
+               const char *c[5];
+               unsigned int i;
+
+               for (i = 0; i < 5; i++) {
+                       if (test_column_to_utf8(columns[i], &c[i]) < 0) {
+                               test_failed(t_strdup_printf(
+                                       "Invalid test at %s:%u: "
+                                       "Bad input in column %u: %s",
+                                       UCD_NORMALIZATION_TEST_TXT,
+                                       line_num, i + 1, columns[i]));
+                               return;
+                       }
+               }
+
+               test_columns(c[0], c[1], c[2], c[3], c[4], line_num);
+
+               if (!part1)
+                       return;
+
+               if (str_to_uint32_hex(columns[0], &cp) < 0) {
+                       test_failed(t_strdup_printf(
+                               "Invalid test at %s:%u: "
+                               "Bad input in column 1 for part1: %s",
+                               UCD_NORMALIZATION_TEST_TXT,
+                               line_num, columns[0]));
+                       return;
+               }
+       }
+
+       /* 2. For every code point X assigned in this version of Unicode that is
+             not specifically listed in Part 1, the following invariants must
+             be true for all conformant
+             implementations:
+
+             X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
+        */
+
+       i_assert(part1);
+       string_t *out = t_str_new(256);
+       buffer_t *nf_out = t_buffer_create(128);
+       uint32_t i;
+       int ret;
+
+       for (i = cp_last; i < cp; i++) {
+               if (!uni_is_valid_ucs4(i))
+                       continue;
+               str_truncate(out, 0);
+               uni_ucs4_to_utf8_c(i, out);
+
+               /* X == toNFC(X) */
+               buffer_set_used_size(nf_out, 0);
+               ret = uni_utf8_write_nfc(str_data(out), str_len(out), nf_out);
+               test_assert_idx(ret == 0, line_num);
+               test_assert_strcmp_idx(str_c(out), str_c(nf_out), line_num);
+
+               /* X == toNFD(X) */
+               buffer_set_used_size(nf_out, 0);
+               ret = uni_utf8_write_nfd(str_data(out), str_len(out), nf_out);
+               test_assert_idx(ret == 0, line_num);
+               test_assert_strcmp_idx(str_c(out), str_c(nf_out), line_num);
+
+               /* X == toNFKC(X) */
+               buffer_set_used_size(nf_out, 0);
+               ret = uni_utf8_write_nfkc(str_data(out), str_len(out), nf_out);
+               test_assert_idx(ret == 0, line_num);
+               test_assert_strcmp_idx(str_c(out), str_c(nf_out), line_num);
+
+               /* X == toNFKD(X) */
+               buffer_set_used_size(nf_out, 0);
+               ret = uni_utf8_write_nfkd(str_data(out), str_len(out), nf_out);
+               test_assert_idx(ret == 0, line_num);
+               test_assert_strcmp_idx(str_c(out), str_c(nf_out), line_num);
+       }
+       cp_last = cp + 1;
+}
+
+static void test_long(void)
+{
+       static const char *nfc_utf32 = "FDFA FDFA FDFA";
+       static const char *nfkd_utf32 =
+               "0635 0644 0649 0020 0627 0644 0644 0647 0020 "
+               "0639 0644 064A 0647 0020 0648 0633 0644 0645 "
+               "0635 0644 0649 0020 0627 0644 0644 0647 0020 "
+               "0639 0644 064A 0647 0020 0648 0633 0644 0645 "
+               "0635 0644 0649 0020 0627 0644 0644 0647 0020 "
+               "0639 0644 064A 0647 0020 0648 0633 0644 0645";
+
+       const char *nfc, *nfkd;
+       buffer_t *nf_out = t_buffer_create(128);
+       int ret;
+
+       ret = test_column_to_utf8(nfc_utf32, &nfc);
+       test_assert(ret == 0);
+       ret = test_column_to_utf8(nfkd_utf32, &nfkd);
+       test_assert(ret == 0);
+
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfc(nfc, strlen(nfc), nf_out);
+       test_assert(ret == 0);
+       test_assert_strcmp(nfc, str_c(nf_out));
+
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfc(nfkd, strlen(nfkd), nf_out);
+       test_assert(ret == 0);
+       test_assert_strcmp(nfkd, str_c(nf_out));
+
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfd(nfc, strlen(nfc), nf_out);
+       test_assert(ret == 0);
+       test_assert_strcmp(nfc, str_c(nf_out));
+
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfd(nfkd, strlen(nfkd), nf_out);
+       test_assert(ret == 0);
+       test_assert_strcmp(nfkd, str_c(nf_out));
+
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkc(nfc, strlen(nfkd), nf_out);
+       test_assert(ret == 0);
+       test_assert_strcmp(nfkd, str_c(nf_out));
+
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkc(nfkd, strlen(nfkd), nf_out);
+       test_assert(ret == 0);
+       test_assert_strcmp(nfkd, str_c(nf_out));
+
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkd(nfc, strlen(nfkd), nf_out);
+       test_assert(ret == 0);
+       test_assert_strcmp(nfkd, str_c(nf_out));
+
+       buffer_set_used_size(nf_out, 0);
+       ret = uni_utf8_write_nfkd(nfkd, strlen(nfkd), nf_out);
+       test_assert(ret == 0);
+       test_assert_strcmp(nfkd, str_c(nf_out));
+}
+
+static void test_stream_safe(void)
+{
+       /* UAX15, Section 13:
+
+          Consider the extreme case of a string containing a digit 2 followed
+          by 10,000 umlauts followed by one dot-below, then a digit 3. As part
+          of normalization, the dot-below at the end must be reordered to
+          immediately after the digit 2, which means that 10,003 characters
+          need to be considered before the result can be output.
+
+          Such extremely long sequences of combining marks are not illegal,
+          even though for all practical purposes they are not meaningful.
+          However, the possibility of encountering such sequences forces a
+          conformant, serializing implementation to provide large buffer
+          capacity or to provide a special exception mechanism just for such
+          degenerate cases. The Stream-Safe Text Format specification addresses
+          this situation.
+        */
+
+       /* Construct test string */
+
+       string_t *in = t_str_new(1024);
+       buffer_t *nf_out = t_buffer_create(1024);
+       unsigned int i = 0;
+
+       /* digit 2 */
+       str_append(in, "2");
+       /* not quite 10,000 umlauts */
+       for  (i = 0; i < 100; i++)
+               str_append(in, "\xCC\x88");
+       /* dot-below */
+       str_append(in, "\xCC\xA3");
+       /* digit 3 */
+       str_append(in, "3");
+
+       /* Apply NFD normalization */
+
+       int ret;
+
+       ret = uni_utf8_write_nfd(str_data(in), str_len(in), nf_out);
+       test_assert(ret == 0);
+
+       /* Check the result */
+
+       const unsigned char *nf_data = nf_out->data;
+       size_t nf_size = nf_out->used;
+
+       test_assert(nf_size > 32);
+
+       static const char safe_block[] =
+               "\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+               "\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+               "\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+               "\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+               "\xCC\x88\xCC\x88";
+       static const char last_block[] =
+               "\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+               "\xCC\x88\xCC\x88\xCC\x88";
+
+       test_assert(nf_data[0] == '2');                    /* digit 2 */
+       test_assert_memcmp(&nf_data[1], safe_block, 60);   /* 30 umlauts */
+       test_assert_memcmp(&nf_data[61], "\xCD\x8F", 2);   /* CGJ */
+       test_assert_memcmp(&nf_data[63], safe_block, 60);  /* 30 umlauts */
+       test_assert_memcmp(&nf_data[123], "\xCD\x8F", 2);  /* CGJ */
+       test_assert_memcmp(&nf_data[125], safe_block, 60); /* 30 umlauts */
+       test_assert_memcmp(&nf_data[185], "\xCD\x8F", 2);  /* CGJ */
+       test_assert_memcmp(&nf_data[187], "\xCC\xA3", 2);  /* dot-below */
+       test_assert_memcmp(&nf_data[189], last_block, 20); /* 10 umlauts */
+       test_assert(nf_data[209] == '3');                  /* digit 3 */
+}
+
+void test_unicode_nf(void)
+{
+       struct istream *input = NULL;
+       int fd;
+
+       /* Test using NormalizationTest.txt from UCD */
+       test_begin(t_strdup_printf("unicode normalization: open %s",
+                                  UCD_NORMALIZATION_TEST_TXT));
+
+       fd = open(UCD_NORMALIZATION_TEST_TXT, O_RDONLY);
+       if (fd < 0)
+               test_failed(t_strdup_printf("Failed to open: %m"));
+       else
+               input = i_stream_create_fd_autoclose(&fd, 1024);
+
+       unsigned int line_num = 0;
+       bool part1 = FALSE;
+
+       while (!test_has_failed()) {
+               char *line = i_stream_read_next_line(input);
+               if (line == NULL)
+                       break;
+               line_num++;
+
+               char *comment = strchr(line, '#');
+
+               if (comment != NULL)
+                       *comment = '\0';
+               if (*line == '\0')
+                       continue;
+
+               if (*line == '@') {
+                       if (part1) {
+                               T_BEGIN {
+                                       test_line(NULL, part1, line_num);
+                               } T_END;
+                       }
+
+                       test_end();
+                       const char *part = t_str_trim(line + 1, " ");;
+                       test_begin(t_strdup_printf(
+                               "unicode normalization: %s",
+                               t_str_lcase(part)));
+                       part1 = (strcmp(part, "Part1") == 0);
+                       continue;
+               }
+
+               if (test_has_failed())
+                       break;
+
+               T_BEGIN {
+                       test_line(line, part1, line_num);
+               } T_END;
+       }
+
+       i_stream_destroy(&input);
+       test_end();
+
+       /* Test long decompositions beyond NormalizationTests.txt */
+       test_begin("unicode normalization: long decompositions");
+       test_long();
+       test_end();
+
+       /* Test Stream Safe algorithm (UAX15-D4) */
+       test_begin("unicode normalization: stream safe");
+       test_stream_safe();
+       test_end();
+}