From: Stephan Bosch
Date: Mon, 7 Apr 2025 22:17:09 +0000 (+0200)
Subject: lib: unicode-transform - Add Unicode conformance test for NF* normalizations as unit...
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=50ff2fe7860067414c9f50f4c5633c866257856e;p=thirdparty%2Fdovecot%2Fcore.git

lib: unicode-transform - Add Unicode conformance test for NF* normalizations as unit test
---

diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index 2748d732c4..254c93e6d8 100644
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -17,6 +17,7 @@ UCD_FILES = \
 	$(UCD_DIR)/CompositionExclusions.txt \
 	$(UCD_DIR)/DerivedCoreProperties.txt \
 	$(UCD_DIR)/DerivedNormalizationProps.txt \
+	$(UCD_DIR)/NormalizationTest.txt \
 	$(UCD_DIR)/PropertyValueAliases.txt \
 	$(UCD_DIR)/UnicodeData.txt
 
@@ -55,6 +56,8 @@ $(UCD_DIR)/DerivedCoreProperties.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
 $(UCD_DIR)/DerivedNormalizationProps.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt
+$(UCD_DIR)/NormalizationTest.txt:
+	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/NormalizationTest.txt
 $(UCD_DIR)/PropertyValueAliases.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt
 $(UCD_DIR)/UnicodeData.txt:
@@ -500,6 +503,7 @@ test_lib_SOURCES = \
 	test-time-util.c \
 	test-unichar.c \
 	test-unicode-data.c \
+	test-unicode-nf.c \
 	test-utc-mktime.c \
 	test-uri.c \
 	test-wildcard-match.c
diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc
index d3a1f33669..1bdc76e367 100644
--- a/src/lib/test-lib.inc
+++ b/src/lib/test-lib.inc
@@ -108,6 +108,7 @@ TEST(test_str_table)
 TEST(test_time_util)
 TEST(test_unichar)
 TEST(test_unicode_data)
+TEST(test_unicode_nf)
 TEST(test_uri)
 TEST(test_utc_mktime)
 TEST(test_wildcard_match)
diff --git a/src/lib/test-unicode-nf.c b/src/lib/test-unicode-nf.c
new file mode 100644
index 0000000000..408e57e487
--- /dev/null
+++ b/src/lib/test-unicode-nf.c
@@ -0,0 +1,588 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "unichar.h"
+#include "istream.h"
+
+#include <fcntl.h>
+
+#define UCD_NORMALIZATION_TEST_TXT UCD_DIR "/NormalizationTest.txt"
+
+/* Convert one NormalizationTest.txt column (space-separated hexadecimal code
+   points) to UTF-8. Returns 0 and sets *out_r on success, or -1 if a token
+   is not valid hexadecimal or not a valid UCS4 code point. */
+static int test_column_to_utf8(const char *column, const char **out_r)
+{
+	const char *const *cps = t_strsplit(column, " ");
+	string_t *out = t_str_new(256);
+
+	while (*cps != NULL) {
+		uint32_t cp;
+
+		if (str_to_uint32_hex(*cps, &cp) < 0)
+			return -1;
+		if (!uni_is_valid_ucs4(cp))
+			return -1;
+		uni_ucs4_to_utf8_c(cp, out);
+		cps++;
+	}
+	*out_r = str_c(out);
+	return 0;
+}
+
+/* Verify the conformance invariants for one test line. c1..c5 are the five
+   columns converted to UTF-8. Every column is run through NFC, NFD, NFKC and
+   NFKD and compared against the expected column, and the is-normalized checks
+   are verified as well. Failures are reported with line_num as the index. */
+static void
+test_columns(const char *c1, const char *c2, const char *c3, const char *c4,
+	     const char *c5, unsigned int line_num)
+{
+	buffer_t *nf_out = t_buffer_create(128);
+	int ret;
+
+	/* NFC
+	     c2 == toNFC(c1) == toNFC(c2) == toNFC(c3)
+	     c4 == toNFC(c4) == toNFC(c5)
+	 */
+
+	/* c2 == toNFC(c1) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfc(c1, strlen(c1), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c2, str_c(nf_out), line_num);
+
+	/* c2 == toNFC(c2) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfc(c2, strlen(c2), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c2, str_c(nf_out), line_num);
+
+	/* c2 == toNFC(c3) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfc(c3, strlen(c3), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c2, str_c(nf_out), line_num);
+
+	/* c4 == toNFC(c4) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfc(c4, strlen(c4), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+	/* c4 == toNFC(c5) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfc(c5, strlen(c5), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+	/* Check isNFC() */
+	ret = uni_utf8_is_nfc(c2, strlen(c2));
+	test_assert_idx(ret > 0, line_num);
+	ret = uni_utf8_is_nfc(c4, strlen(c4));
+	test_assert_idx(ret > 0, line_num);
+	if (strcmp(c2, c1) != 0) {
+		ret = uni_utf8_is_nfc(c1, strlen(c1));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c2, c3) != 0) {
+		ret = uni_utf8_is_nfc(c3, strlen(c3));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c4, c5) != 0) {
+		ret = uni_utf8_is_nfc(c5, strlen(c5));
+		test_assert_idx(ret == 0, line_num);
+	}
+
+	/* NFD
+	     c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
+	     c5 == toNFD(c4) == toNFD(c5)
+	 */
+
+	/* c3 == toNFD(c1) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfd(c1, strlen(c1), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c3, str_c(nf_out), line_num);
+
+	/* c3 == toNFD(c2) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfd(c2, strlen(c2), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c3, str_c(nf_out), line_num);
+
+	/* c3 == toNFD(c3) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfd(c3, strlen(c3), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c3, str_c(nf_out), line_num);
+
+	/* c5 == toNFD(c4) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfd(c4, strlen(c4), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+	/* c5 == toNFD(c5) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfd(c5, strlen(c5), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+	/* Check isNFD() */
+	ret = uni_utf8_is_nfd(c3, strlen(c3));
+	test_assert_idx(ret > 0, line_num);
+	ret = uni_utf8_is_nfd(c5, strlen(c5));
+	test_assert_idx(ret > 0, line_num);
+	if (strcmp(c1, c3) != 0) {
+		ret = uni_utf8_is_nfd(c1, strlen(c1));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c2, c3) != 0) {
+		ret = uni_utf8_is_nfd(c2, strlen(c2));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c4, c5) != 0) {
+		ret = uni_utf8_is_nfd(c4, strlen(c4));
+		test_assert_idx(ret == 0, line_num);
+	}
+
+	/* NFKC
+	     c4 == toNFKC(c1) == toNFKC(c2) == toNFKC(c3) == toNFKC(c4)
+	        == toNFKC(c5)
+	 */
+
+	/* c4 == toNFKC(c1) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkc(c1, strlen(c1), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+	/* c4 == toNFKC(c2) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkc(c2, strlen(c2), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+	/* c4 == toNFKC(c3) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkc(c3, strlen(c3), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+	/* c4 == toNFKC(c4) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkc(c4, strlen(c4), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+	/* c4 == toNFKC(c5) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkc(c5, strlen(c5), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c4, str_c(nf_out), line_num);
+
+	/* Check isNFKC() */
+	ret = uni_utf8_is_nfkc(c4, strlen(c4));
+	test_assert_idx(ret > 0, line_num);
+	if (strcmp(c4, c1) != 0) {
+		ret = uni_utf8_is_nfkc(c1, strlen(c1));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c4, c2) != 0) {
+		ret = uni_utf8_is_nfkc(c2, strlen(c2));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c4, c3) != 0) {
+		ret = uni_utf8_is_nfkc(c3, strlen(c3));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c4, c5) != 0) {
+		ret = uni_utf8_is_nfkc(c5, strlen(c5));
+		test_assert_idx(ret == 0, line_num);
+	}
+
+	/* NFKD
+	     c5 == toNFKD(c1) == toNFKD(c2) == toNFKD(c3) == toNFKD(c4)
+	        == toNFKD(c5)
+	 */
+
+	/* c5 == toNFKD(c1) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkd(c1, strlen(c1), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+	/* c5 == toNFKD(c2) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkd(c2, strlen(c2), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+	/* c5 == toNFKD(c3) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkd(c3, strlen(c3), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+	/* c5 == toNFKD(c4) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkd(c4, strlen(c4), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+	/* c5 == toNFKD(c5) */
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkd(c5, strlen(c5), nf_out);
+	test_assert_idx(ret == 0, line_num);
+	test_assert_strcmp_idx(c5, str_c(nf_out), line_num);
+
+	/* Check isNFKD() */
+	ret = uni_utf8_is_nfkd(c5, strlen(c5));
+	test_assert_idx(ret > 0, line_num);
+	if (strcmp(c1, c5) != 0) {
+		ret = uni_utf8_is_nfkd(c1, strlen(c1));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c2, c5) != 0) {
+		ret = uni_utf8_is_nfkd(c2, strlen(c2));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c3, c5) != 0) {
+		ret = uni_utf8_is_nfkd(c3, strlen(c3));
+		test_assert_idx(ret == 0, line_num);
+	}
+	if (strcmp(c4, c5) != 0) {
+		ret = uni_utf8_is_nfkd(c4, strlen(c4));
+		test_assert_idx(ret == 0, line_num);
+	}
+}
+
+/* Run the tests for one data line of NormalizationTest.txt; a NULL line is
+   passed by the caller when Part 1 ends. For Part 1, every valid code point
+   between the previously listed one and the one on this line (0x110000 for
+   a NULL line) is additionally checked to be unchanged by all four
+   normalizations. The static cp_last holds the start of the next range. */
+static void test_line(const char *line, bool part1, unsigned int line_num)
+{
+	static uint32_t cp_last = 0;
+	uint32_t cp = 0x110000;
+
+	/* CONFORMANCE:
+
+	   1. The following invariants must be true for all conformant
+	      implementations
+
+	      NFC
+	        c2 == toNFC(c1) == toNFC(c2) == toNFC(c3)
+	        c4 == toNFC(c4) == toNFC(c5)
+
+	      NFD
+	        c3 == toNFD(c1) == toNFD(c2) == toNFD(c3)
+	        c5 == toNFD(c4) == toNFD(c5)
+
+	      NFKC
+	        c4 == toNFKC(c1) == toNFKC(c2) == toNFKC(c3) == toNFKC(c4)
+	           == toNFKC(c5)
+
+	      NFKD
+	        c5 == toNFKD(c1) == toNFKD(c2) == toNFKD(c3) == toNFKD(c4)
+	           == toNFKD(c5)
+	 */
+	if (line != NULL) {
+		const char *const *columns = t_strsplit(line, ";");
+		if (str_array_length(columns) < 5) {
+			test_failed(t_strdup_printf(
+				"Invalid test at %s:%u",
+				UCD_NORMALIZATION_TEST_TXT, line_num));
+			return;
+		}
+
+		const char *c[5];
+		unsigned int i;
+
+		for (i = 0; i < 5; i++) {
+			if (test_column_to_utf8(columns[i], &c[i]) < 0) {
+				test_failed(t_strdup_printf(
+					"Invalid test at %s:%u: "
+					"Bad input in column %u: %s",
+					UCD_NORMALIZATION_TEST_TXT,
+					line_num, i + 1, columns[i]));
+				return;
+			}
+		}
+
+		test_columns(c[0], c[1], c[2], c[3], c[4], line_num);
+
+		if (!part1)
+			return;
+
+		if (str_to_uint32_hex(columns[0], &cp) < 0) {
+			test_failed(t_strdup_printf(
+				"Invalid test at %s:%u: "
+				"Bad input in column 1 for part1: %s",
+				UCD_NORMALIZATION_TEST_TXT,
+				line_num, columns[0]));
+			return;
+		}
+	}
+
+	/* 2. For every code point X assigned in this version of Unicode that is
+	      not specifically listed in Part 1, the following invariants must
+	      be true for all conformant
+	      implementations:
+
+	        X == toNFC(X) == toNFD(X) == toNFKC(X) == toNFKD(X)
+	 */
+
+	i_assert(part1);
+	string_t *out = t_str_new(256);
+	buffer_t *nf_out = t_buffer_create(128);
+	uint32_t i;
+	int ret;
+
+	for (i = cp_last; i < cp; i++) {
+		if (!uni_is_valid_ucs4(i))
+			continue;
+		str_truncate(out, 0);
+		uni_ucs4_to_utf8_c(i, out);
+
+		/* X == toNFC(X) */
+		buffer_set_used_size(nf_out, 0);
+		ret = uni_utf8_write_nfc(str_data(out), str_len(out), nf_out);
+		test_assert_idx(ret == 0, line_num);
+		test_assert_strcmp_idx(str_c(out), str_c(nf_out), line_num);
+
+		/* X == toNFD(X) */
+		buffer_set_used_size(nf_out, 0);
+		ret = uni_utf8_write_nfd(str_data(out), str_len(out), nf_out);
+		test_assert_idx(ret == 0, line_num);
+		test_assert_strcmp_idx(str_c(out), str_c(nf_out), line_num);
+
+		/* X == toNFKC(X) */
+		buffer_set_used_size(nf_out, 0);
+		ret = uni_utf8_write_nfkc(str_data(out), str_len(out), nf_out);
+		test_assert_idx(ret == 0, line_num);
+		test_assert_strcmp_idx(str_c(out), str_c(nf_out), line_num);
+
+		/* X == toNFKD(X) */
+		buffer_set_used_size(nf_out, 0);
+		ret = uni_utf8_write_nfkd(str_data(out), str_len(out), nf_out);
+		test_assert_idx(ret == 0, line_num);
+		test_assert_strcmp_idx(str_c(out), str_c(nf_out), line_num);
+	}
+	cp_last = cp + 1;
+}
+
+/* Test a long compatibility decomposition: each U+FDFA expands to 18 code
+   points under NFKC/NFKD, while NFC/NFD must leave both strings unchanged. */
+static void test_long(void)
+{
+	static const char *nfc_utf32 = "FDFA FDFA FDFA";
+	static const char *nfkd_utf32 =
+		"0635 0644 0649 0020 0627 0644 0644 0647 0020 "
+		"0639 0644 064A 0647 0020 0648 0633 0644 0645 "
+		"0635 0644 0649 0020 0627 0644 0644 0647 0020 "
+		"0639 0644 064A 0647 0020 0648 0633 0644 0645 "
+		"0635 0644 0649 0020 0627 0644 0644 0647 0020 "
+		"0639 0644 064A 0647 0020 0648 0633 0644 0645";
+
+	const char *nfc, *nfkd;
+	buffer_t *nf_out = t_buffer_create(128);
+	int ret;
+
+	ret = test_column_to_utf8(nfc_utf32, &nfc);
+	test_assert(ret == 0);
+	ret = test_column_to_utf8(nfkd_utf32, &nfkd);
+	test_assert(ret == 0);
+
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfc(nfc, strlen(nfc), nf_out);
+	test_assert(ret == 0);
+	test_assert_strcmp(nfc, str_c(nf_out));
+
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfc(nfkd, strlen(nfkd), nf_out);
+	test_assert(ret == 0);
+	test_assert_strcmp(nfkd, str_c(nf_out));
+
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfd(nfc, strlen(nfc), nf_out);
+	test_assert(ret == 0);
+	test_assert_strcmp(nfc, str_c(nf_out));
+
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfd(nfkd, strlen(nfkd), nf_out);
+	test_assert(ret == 0);
+	test_assert_strcmp(nfkd, str_c(nf_out));
+
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkc(nfc, strlen(nfc), nf_out);
+	test_assert(ret == 0);
+	test_assert_strcmp(nfkd, str_c(nf_out));
+
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkc(nfkd, strlen(nfkd), nf_out);
+	test_assert(ret == 0);
+	test_assert_strcmp(nfkd, str_c(nf_out));
+
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkd(nfc, strlen(nfc), nf_out);
+	test_assert(ret == 0);
+	test_assert_strcmp(nfkd, str_c(nf_out));
+
+	buffer_set_used_size(nf_out, 0);
+	ret = uni_utf8_write_nfkd(nfkd, strlen(nfkd), nf_out);
+	test_assert(ret == 0);
+	test_assert_strcmp(nfkd, str_c(nf_out));
+}
+
+/* Test the Stream-Safe handling of a long combining sequence: the NFD output
+   is expected to have a CGJ (U+034F) after every 30 combining marks. */
+static void test_stream_safe(void)
+{
+	/* UAX15, Section 13:
+
+	   Consider the extreme case of a string containing a digit 2 followed
+	   by 10,000 umlauts followed by one dot-below, then a digit 3. As part
+	   of normalization, the dot-below at the end must be reordered to
+	   immediately after the digit 2, which means that 10,003 characters
+	   need to be considered before the result can be output.
+
+	   Such extremely long sequences of combining marks are not illegal,
+	   even though for all practical purposes they are not meaningful.
+	   However, the possibility of encountering such sequences forces a
+	   conformant, serializing implementation to provide large buffer
+	   capacity or to provide a special exception mechanism just for such
+	   degenerate cases. The Stream-Safe Text Format specification addresses
+	   this situation.
+	 */
+
+	/* Construct test string */
+
+	string_t *in = t_str_new(1024);
+	buffer_t *nf_out = t_buffer_create(1024);
+	unsigned int i = 0;
+
+	/* digit 2 */
+	str_append(in, "2");
+	/* not quite 10,000 umlauts */
+	for (i = 0; i < 100; i++)
+		str_append(in, "\xCC\x88");
+	/* dot-below */
+	str_append(in, "\xCC\xA3");
+	/* digit 3 */
+	str_append(in, "3");
+
+	/* Apply NFD normalization */
+
+	int ret;
+
+	ret = uni_utf8_write_nfd(str_data(in), str_len(in), nf_out);
+	test_assert(ret == 0);
+
+	/* Check the result */
+
+	const unsigned char *nf_data = nf_out->data;
+	size_t nf_size = nf_out->used;
+
+	/* 1 + 3 * (60 + 2) + 2 + 20 + 1 == 210 bytes are expected. Stop here
+	   on a shorter result rather than reading beyond the buffer below. */
+	test_assert(nf_size == 210);
+	if (nf_size < 210)
+		return;
+
+	static const char safe_block[] =
+		"\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+		"\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+		"\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+		"\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+		"\xCC\x88\xCC\x88";
+	static const char last_block[] =
+		"\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88\xCC\x88"
+		"\xCC\x88\xCC\x88\xCC\x88";
+
+	test_assert(nf_data[0] == '2'); /* digit 2 */
+	test_assert_memcmp(&nf_data[1], safe_block, 60); /* 30 umlauts */
+	test_assert_memcmp(&nf_data[61], "\xCD\x8F", 2); /* CGJ */
+	test_assert_memcmp(&nf_data[63], safe_block, 60); /* 30 umlauts */
+	test_assert_memcmp(&nf_data[123], "\xCD\x8F", 2); /* CGJ */
+	test_assert_memcmp(&nf_data[125], safe_block, 60); /* 30 umlauts */
+	test_assert_memcmp(&nf_data[185], "\xCD\x8F", 2); /* CGJ */
+	test_assert_memcmp(&nf_data[187], "\xCC\xA3", 2); /* dot-below */
+	test_assert_memcmp(&nf_data[189], last_block, 20); /* 10 umlauts */
+	test_assert(nf_data[209] == '3'); /* digit 3 */
+}
+
+/* Run the NormalizationTest.txt conformance tests from the UCD, followed by
+   the long decomposition and Stream-Safe tests. */
+void test_unicode_nf(void)
+{
+	struct istream *input = NULL;
+	int fd;
+
+	/* Test using NormalizationTest.txt from UCD */
+	test_begin(t_strdup_printf("unicode normalization: open %s",
+				   UCD_NORMALIZATION_TEST_TXT));
+
+	fd = open(UCD_NORMALIZATION_TEST_TXT, O_RDONLY);
+	if (fd < 0)
+		test_failed(t_strdup_printf("Failed to open: %m"));
+	else
+		input = i_stream_create_fd_autoclose(&fd, 1024);
+
+	unsigned int line_num = 0;
+	bool part1 = FALSE;
+
+	while (!test_has_failed()) {
+		char *line = i_stream_read_next_line(input);
+		if (line == NULL)
+			break;
+		line_num++;
+
+		char *comment = strchr(line, '#');
+
+		if (comment != NULL)
+			*comment = '\0';
+		if (*line == '\0')
+			continue;
+
+		if (*line == '@') {
+			if (part1) {
+				T_BEGIN {
+					test_line(NULL, part1, line_num);
+				} T_END;
+			}
+
+			test_end();
+			const char *part = t_str_trim(line + 1, " ");
+			test_begin(t_strdup_printf(
+				"unicode normalization: %s",
+				t_str_lcase(part)));
+			part1 = (strcmp(part, "Part1") == 0);
+			continue;
+		}
+
+		if (test_has_failed())
+			break;
+
+		T_BEGIN {
+			test_line(line, part1, line_num);
+		} T_END;
+	}
+
+	i_stream_destroy(&input);
+	test_end();
+
+	/* Test long decompositions beyond NormalizationTests.txt */
+	test_begin("unicode normalization: long decompositions");
+	test_long();
+	test_end();
+
+	/* Test Stream Safe algorithm (UAX15-D4) */
+	test_begin("unicode normalization: stream safe");
+	test_stream_safe();
+	test_end();
+}