From: Stephan Bosch <stephan.bosch@open-xchange.com>
Date: Tue, 22 Apr 2025 00:04:43 +0000 (+0200)
Subject: lib: unicode-data - Add fields needed for Unicode normalization
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=6ab58560d89d57a1b6b53f73362aa6ceb879a128;p=thirdparty%2Fdovecot%2Fcore.git

lib: unicode-data - Add fields needed for Unicode normalization
---

diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index 647e45c673..2748d732c4 100644
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -14,7 +14,9 @@ BUILT_SOURCES = $(srcdir)/unicode-data-tables.c \
 UCD_URL = https://dovecot.org/res
 UCD_DIR = $(srcdir)/ucd
 UCD_FILES = \
+	$(UCD_DIR)/CompositionExclusions.txt \
 	$(UCD_DIR)/DerivedCoreProperties.txt \
+	$(UCD_DIR)/DerivedNormalizationProps.txt \
 	$(UCD_DIR)/PropertyValueAliases.txt \
 	$(UCD_DIR)/UnicodeData.txt
 
@@ -47,8 +49,12 @@ YACC=/bin/false
 # dependency, anything including the header will race the bison process.
 event-filter-parser.h: event-filter-parser.c
 
+$(UCD_DIR)/CompositionExclusions.txt:
+	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CompositionExclusions.txt
 $(UCD_DIR)/DerivedCoreProperties.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
+$(UCD_DIR)/DerivedNormalizationProps.txt:
+	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt
 $(UCD_DIR)/PropertyValueAliases.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt
 $(UCD_DIR)/UnicodeData.txt:
diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c
index 3edef44122..341352a895 100644
--- a/src/lib/test-unicode-data.c
+++ b/src/lib/test-unicode-data.c
@@ -9,8 +9,128 @@
 
 #include <fcntl.h>
 
+#define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
+#define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
 #define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
 
+/* Parse a UCD property file line: "<cp>[..<cp>][; <prop>[; <value>]]". The
+   range is returned in cp_first_r/cp_last_r. prop_r may be NULL; value_r is
+   only used with prop_r and yields NULL when the column is absent. Returns
+   FALSE after recording a test failure for bad data, or if already failed. */
+static bool
+parse_prop_file_line(const char *line, const char *file, unsigned int line_num,
+		     uint32_t *cp_first_r, uint32_t *cp_last_r,
+		     const char **prop_r, const char **value_r)
+{
+	unsigned int expected_columns = (prop_r != NULL ? 2 : 1);
+
+	const char *const *columns = t_strsplit(line, ";");
+	if (str_array_length(columns) < expected_columns) {
+		test_failed(t_strdup_printf(
+			"Invalid data at %s:%u", file, line_num));
+		return FALSE;
+	}
+
+	const char *p = strstr(columns[0], "..");
+	const char *cp_first_hex, *cp_last_hex = NULL;
+
+	if (p == NULL) {
+		cp_first_hex = t_str_trim(columns[0], " \t");
+	} else {
+		cp_first_hex = t_str_trim(t_strdup_until(columns[0], p), " \t");
+		cp_last_hex = t_str_trim(p + 2, " \t");
+	}
+	if (str_to_uint32_hex(cp_first_hex, cp_first_r) < 0) {
+		test_failed(t_strdup_printf(
+				"Invalid data at %s:%u: "
+				"Bad first code point", file, line_num));
+		return FALSE;
+	}
+	if (cp_last_hex == NULL)
+		*cp_last_r = *cp_first_r;
+	else if (str_to_uint32_hex(cp_last_hex, cp_last_r) < 0) {
+		test_failed(t_strdup_printf(
+				"Invalid data at %s:%u: "
+				"Bad last code point", file, line_num));
+		return FALSE;
+	}
+
+	if (prop_r != NULL) {
+		*prop_r = t_str_trim(columns[1], " \t");
+		if (value_r != NULL) {
+			if (columns[2] != NULL)
+				*value_r = t_str_trim(columns[2], " \t");
+			else
+				*value_r = NULL;
+		}
+	}
+	return !test_has_failed();
+}
+
+/* Verify that no code point listed on a CompositionExclusions.txt line has
+   any compositions recorded in the compiled data. */
+static void
+test_composition_exclusions_line(const char *line, unsigned int line_num)
+{
+	uint32_t range_first, range_last;
+
+	if (!parse_prop_file_line(line, UCD_COMPOSITION_EXCLUSIONS_TXT,
+				  line_num, &range_first, &range_last, NULL, NULL))
+		return;
+	for (uint32_t cp = range_first; cp <= range_last && !test_has_failed(); cp++) {
+		const struct unicode_code_point_data *cp_data =
+			unicode_code_point_get_data(cp);
+		test_assert_idx(cp_data->composition_count == 0, cp);
+	}
+}
+
+/* Check the NF*_QC lines of DerivedNormalizationProps.txt: code points with
+   value N or M must have the matching compiled quick-check bits. */
+static void
+test_derived_normalization_props_line(const char *line, unsigned int line_num)
+{
+	uint32_t cp_first, cp_last, cp;
+	const char *prop, *value;
+	uint8_t mask, qc_no, qc_maybe;
+
+	if (!parse_prop_file_line(line, UCD_DERIVED_NORMALIZATION_PROPS_TXT,
+				  line_num, &cp_first, &cp_last, &prop, &value))
+		return;
+
+	/* Select the bits once; lines for other properties are ignored */
+	if (strcmp(prop, "NFD_QC") == 0) {
+		mask = UNICODE_NFD_QUICK_CHECK_MASK;
+		qc_no = UNICODE_NFD_QUICK_CHECK_NO;
+		qc_maybe = UNICODE_NFD_QUICK_CHECK_MAYBE;
+	} else if (strcmp(prop, "NFKD_QC") == 0) {
+		mask = UNICODE_NFKD_QUICK_CHECK_MASK;
+		qc_no = UNICODE_NFKD_QUICK_CHECK_NO;
+		qc_maybe = UNICODE_NFKD_QUICK_CHECK_MAYBE;
+	} else if (strcmp(prop, "NFC_QC") == 0) {
+		mask = UNICODE_NFC_QUICK_CHECK_MASK;
+		qc_no = UNICODE_NFC_QUICK_CHECK_NO;
+		qc_maybe = UNICODE_NFC_QUICK_CHECK_MAYBE;
+	} else if (strcmp(prop, "NFKC_QC") == 0) {
+		mask = UNICODE_NFKC_QUICK_CHECK_MASK;
+		qc_no = UNICODE_NFKC_QUICK_CHECK_NO;
+		qc_maybe = UNICODE_NFKC_QUICK_CHECK_MAYBE;
+	} else {
+		return;
+	}
+
+	for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) {
+		const struct unicode_code_point_data *cp_data =
+			unicode_code_point_get_data(cp);
+		uint8_t qc = (cp_data->nf_quick_check & mask);
+
+		i_assert(value != NULL);
+		if (strcmp(value, "N") == 0)
+			test_assert_idx(qc == qc_no, cp);
+		else if (strcmp(value, "M") == 0)
+			test_assert_idx(qc == qc_maybe, cp);
+	}
+}
+
 static void test_unicode_data_line(const char *line, unsigned int line_num)
 {
 	static uint32_t cp_first = 0;
@@ -69,6 +189,18 @@ static void test_unicode_data_line(const char *line, unsigned int line_num)
 	}
 	test_assert(!unicode_general_category_is_group(general_category));
 
+	/* Parse Canonical_Combining_Class */
+
+	unsigned int ccc = 0;
+	if (*columns[3] != '\0' &&
+	    (str_to_uint(columns[3], &ccc) < 0 || ccc > UINT8_MAX)) {
+		test_failed(t_strdup_printf(
+			    "Invalid data at %s:%u: "
+			    "Bad Canonical_Combining_Class for code point %"PRIu32": %s",
+			    UCD_UNICODE_DATA_TXT, line_num, cp, columns[3]));
+		return;
+	}
+
 	/* Parse Decomposition_* */
 
 	const char *decomp_spec = columns[5];
@@ -135,6 +267,8 @@ static void test_unicode_data_line(const char *line, unsigned int line_num)
 
 		test_assert_idx(
 			cp_data->general_category == general_category, cp);
+		test_assert_idx(
+			cp_data->canonical_combining_class == ccc, cp);
 
 		const uint32_t *cp_decomp;
 		size_t cp_decomp_len, cp_decomp_idx;
@@ -216,6 +350,13 @@ test_ucd_file(const char *filename,
 
 void test_unicode_data(void)
 {
-	/* Check that UCD data files match with what is compiled. */
+	/* Check that the UCD data files match what is compiled. For the
+	   property files only the code points listed in each file are
+	   checked; that other code points lack those properties is not
+	   verified. */
+	test_ucd_file(UCD_COMPOSITION_EXCLUSIONS_TXT,
+		      test_composition_exclusions_line);
+	test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
+		      test_derived_normalization_props_line);
 	test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
 }
diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h
index ffc61bb8ac..63a43c5d0b 100644
--- a/src/lib/unicode-data-static.h
+++ b/src/lib/unicode-data-static.h
@@ -93,17 +93,44 @@ enum unicode_general_category {
 	UNICODE_GENERAL_CATEGORY_CN = UNICODE_GENERAL_CATEGORY_C | 5,
 };
 
+/* UAX #44, Section 5.7.5: Decompositions and Normalization. Two bits per
+   form (0=YES, 1=NO, 2=MAYBE): NFD bits 0-1, NFKD 2-3, NFC 4-5, NFKC 6-7. */
+enum unicode_nf_quick_check {
+	UNICODE_NFKC_QUICK_CHECK_YES   = (0x00 << 6),
+	UNICODE_NFKC_QUICK_CHECK_NO    = (0x01 << 6),
+	UNICODE_NFKC_QUICK_CHECK_MAYBE = (0x02 << 6),
+	UNICODE_NFC_QUICK_CHECK_YES    = (0x00 << 4),
+	UNICODE_NFC_QUICK_CHECK_NO     = (0x01 << 4),
+	UNICODE_NFC_QUICK_CHECK_MAYBE  = (0x02 << 4),
+	UNICODE_NFKD_QUICK_CHECK_YES   = (0x00 << 2),
+	UNICODE_NFKD_QUICK_CHECK_NO    = (0x01 << 2),
+	UNICODE_NFKD_QUICK_CHECK_MAYBE = (0x02 << 2),
+	UNICODE_NFD_QUICK_CHECK_YES    = (0x00 << 0),
+	UNICODE_NFD_QUICK_CHECK_NO     = (0x01 << 0),
+	UNICODE_NFD_QUICK_CHECK_MAYBE  = (0x02 << 0),
+
+	UNICODE_NFKC_QUICK_CHECK_MASK  = (0x03 << 6),
+	UNICODE_NFC_QUICK_CHECK_MASK   = (0x03 << 4),
+	UNICODE_NFKD_QUICK_CHECK_MASK  = (0x03 << 2),
+	UNICODE_NFD_QUICK_CHECK_MASK   = (0x03 << 0),
+};
+
 struct unicode_code_point_data {
 	uint8_t general_category; // Not yet used
+	uint8_t canonical_combining_class; // Column 3 of UnicodeData.txt
+	uint8_t nf_quick_check; // enum unicode_nf_quick_check bits
 
 	uint8_t decomposition_type; // Not yet used
 	uint8_t decomposition_first_length;
 	uint8_t decomposition_full_length;
 	uint8_t decomposition_full_k_length;
 
+	uint8_t composition_count; // Entries starting at composition_offset
+
 	uint16_t decomposition_first_offset;
 	uint16_t decomposition_full_offset;
 	uint16_t decomposition_full_k_offset;
+	uint16_t composition_offset; // Index into unicode_compositions[]
 
 	uint32_t simple_titlecase_mapping;
 };
diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h
index eace84639f..0f71509d7c 100644
--- a/src/lib/unicode-data.h
+++ b/src/lib/unicode-data.h
@@ -55,6 +55,31 @@ unicode_code_point_data_get_full_decomposition(
 	return cp_data->decomposition_full_k_length;
 }
 
+/* Look up the composite for the pair (code point of cp_data, second) by
+   binary search over this code point's slice of unicode_compositions[].
+   Returns 0 when no composition is recorded for the pair. */
+static inline uint32_t
+unicode_code_point_data_find_composition(
+	const struct unicode_code_point_data *cp_data, uint32_t second)
+{
+	const size_t base = cp_data->composition_offset;
+	size_t lo = 0, hi = cp_data->composition_count;
+
+	while (lo < hi) {
+		const size_t mid = lo + (hi - lo) / 2;
+		const uint32_t candidate = unicode_compositions[base + mid];
+
+		if (candidate == second)
+			return unicode_composition_primaries[base + mid];
+		if (candidate < second)
+			lo = mid + 1;
+		else
+			hi = mid;
+	}
+
+	return 0x0000;
+}
+
 static inline size_t
 unicode_code_point_get_full_decomposition(uint32_t cp, bool canonical,
 					  const uint32_t **decomp_r)
diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py
index e7283e91f9..1664691f0c 100755
--- a/src/lib/unicode-ucd-compile.py
+++ b/src/lib/unicode-ucd-compile.py
@@ -34,6 +34,13 @@ ud_decomposition_type_names = []
 ud_decompositions = []
 ud_decomposition_max_length = 0
 
+ud_composition_pairs = {}  # first cp -> [(second cp, composite cp), ...]
+ud_composition_composites = {}  # NOTE(review): unused in visible code; confirm
+ud_composition_exclusions = {}  # cp -> True, from CompositionExclusions.txt
+ud_compositions = []  # second cps, grouped per first cp
+ud_composition_primaries = []  # composite cps, parallel to ud_compositions
+ud_compositions_max_per_starter = 0  # most pairs recorded for one first cp
+
 
 class UCDFileOpen:
     def __init__(self, filename):
@@ -358,6 +365,57 @@ def read_ucd_files():
             # Add range
             CodePointRange(cp_first, cp_last, cpd)
 
+    # CompositionExclusions.txt
+    with UCDFileOpen("CompositionExclusions.txt") as ucd:
+        for line in ucd.fd:
+            data = line.split("#")
+
+            cprng = parse_cp_range(data[0])
+            if cprng is None:
+                continue
+
+            for cp in range(cprng[0], cprng[1] + 1):
+                ud_composition_exclusions[cp] = True
+
+    # DerivedNormalizationProps.txt
+    with UCDFileOpen("DerivedNormalizationProps.txt") as ucd:
+        line_num = 0
+        for line in ucd.fd:
+            line_num = line_num + 1
+            data = line.split("#")
+            line = data[0].strip()
+            if len(line) == 0:
+                continue
+
+            cols = line.split(";")
+            if len(cols) < 3:
+                if len(cols) < 2:
+                    die(f"{ucd}:{line_num}: Missing columns")
+                continue
+
+            cprng = parse_cp_range(cols[0])
+            if cprng is None:
+                continue
+
+            prop = cols[1].strip()
+            value = cols[2].strip()
+            if prop == "NFD_QC":
+                cpd = CodePointData()
+                cpd.nfd_quick_check = value
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "NFKD_QC":
+                cpd = CodePointData()
+                cpd.nfkd_quick_check = value
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "NFC_QC":
+                cpd = CodePointData()
+                cpd.nfc_quick_check = value
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "NFKC_QC":
+                cpd = CodePointData()
+                cpd.nfkc_quick_check = value
+                CodePointRange(cprng[0], cprng[1], cpd)
+
 
 def expand_decompositions():
     global ud_codepoints
@@ -495,6 +553,84 @@ def expand_decompositions():
                 ud_decomposition_max_length = len(dc)
 
 
+def derive_canonical_compositions():
+    global ud_codepoints
+    global ud_decompositions
+    global ud_composition_exclusions
+    global ud_composition_pairs
+    global ud_composition_composites
+    global ud_compositions
+    global ud_composition_primaries
+    global ud_compositions_max_per_starter
+
+    for cpr in ud_codepoints:
+        if cpr.cp_last > cpr.cp_first:
+            # No compositions in ranges expected, ever
+            continue
+        cp = cpr.cp_first
+        cpd = cpr.data
+
+        if not hasattr(cpd, "decomposition_full_offset"):
+            continue
+
+        # Skip singleton decompositions
+        if len(cpd.decomposition_first) < 2:
+            continue
+
+        # Skip non-starter decompositions
+        dc_offset = cpd.decomposition_full_offset
+        dc_len = cpd.decomposition_full_length
+        dc = ud_decompositions[dc_offset:(dc_offset + dc_len)]
+
+        scpr = ud_codepoints_index[dc[0]]
+        scpd = scpr.data
+        if (
+            hasattr(scpd, "canonical_combining_class")
+            and scpd.canonical_combining_class > 0
+        ):
+            continue
+
+        # Skip composition exclusions
+        if cp in ud_composition_exclusions:
+            continue
+
+        dc = cpd.decomposition_first
+
+        # Record all alternative pairs for each starter
+        if not dc[0] in ud_composition_pairs:
+            mp = [(dc[1], cp)]
+            ud_composition_pairs[dc[0]] = mp
+        else:
+            mp = ud_composition_pairs[dc[0]]
+            mp.append((dc[1], cp))
+
+            if len(mp) > ud_compositions_max_per_starter:
+                ud_compositions_max_per_starter = len(mp)
+
+    # Compose lookup tables
+    for cpr in ud_codepoints:
+        if cpr.cp_last > cpr.cp_first:
+            # No compositions in ranges expected, ever
+            continue
+        cp = cpr.cp_first
+        cpd = cpr.data
+
+        if cp not in ud_composition_pairs:
+            continue
+
+        def mp_key_func(a):
+            return a[0]
+
+        mp = ud_composition_pairs[cp]
+        mp.sort(key=mp_key_func)
+
+        cpd.composition_offset = len(ud_compositions)
+        cpd.composition_count = len(mp)
+
+        ud_compositions = ud_compositions + [p[0] for p in mp]
+        ud_composition_primaries = ud_composition_primaries + [p[1] for p in mp]
+
+
 def create_cp_range_index():
     global ud_codepoints
     global ud_codepoints_index
@@ -665,6 +801,10 @@ def write_tables_h():
         print(
             "#define UNICODE_DECOMPOSITION_MAX_LENGTH %s" % ud_decomposition_max_length
         )
+        print(
+            "#define UNICODE_COMPOSITIONS_MAX_PER_STARTER %s"
+            % ud_compositions_max_per_starter
+        )
         print("")
         print("extern const struct unicode_code_point_data unicode_code_points[];")
         print("")
@@ -675,6 +815,9 @@ def write_tables_h():
         print("")
         print("extern const uint32_t unicode_decompositions[];")
         print("")
+        print("extern const uint32_t unicode_compositions[];")
+        print("extern const uint32_t unicode_composition_primaries[];")
+        print("")
         print("#endif")
 
     sys.stdout = orig_stdout
@@ -719,6 +862,57 @@ def write_tables_c():
                 "\t\t.general_category = %s,"
                 % get_general_category_def(cpd.general_category)
             )
+            if (
+                hasattr(cpd, "canonical_combining_class")
+                and cpd.canonical_combining_class > 0
+            ):
+                print(
+                    "\t\t.canonical_combining_class = %u,"
+                    % cpd.canonical_combining_class
+                )
+            if (
+                hasattr(cpd, "nfd_quick_check")
+                or hasattr(cpd, "nfkd_quick_check")
+                or hasattr(cpd, "nfc_quick_check")
+                or hasattr(cpd, "nfkc_quick_check")
+            ):
+                print("\t\t.nf_quick_check = (", end="")
+                if hasattr(cpd, "nfkc_quick_check"):
+                    if cpd.nfkc_quick_check == "N":
+                        print("UNICODE_NFKC_QUICK_CHECK_NO", end="")
+                    elif cpd.nfkc_quick_check == "M":
+                        print("UNICODE_NFKC_QUICK_CHECK_MAYBE", end="")
+                if hasattr(cpd, "nfkc_quick_check") and hasattr(cpd, "nfc_quick_check"):
+                    print(" |")
+                    print("\t\t\t\t   ", end="")
+                if hasattr(cpd, "nfc_quick_check"):
+                    if cpd.nfc_quick_check == "N":
+                        print("UNICODE_NFC_QUICK_CHECK_NO", end="")
+                    elif cpd.nfc_quick_check == "M":
+                        print("UNICODE_NFC_QUICK_CHECK_MAYBE", end="")
+                if (
+                    hasattr(cpd, "nfkc_quick_check") or hasattr(cpd, "nfc_quick_check")
+                ) and hasattr(cpd, "nfkd_quick_check"):
+                    print(" |")
+                    print("\t\t\t\t   ", end="")
+                if hasattr(cpd, "nfkd_quick_check"):
+                    if cpd.nfkd_quick_check == "N":
+                        print("UNICODE_NFKD_QUICK_CHECK_NO", end="")
+                    elif cpd.nfkd_quick_check == "M":
+                        print("UNICODE_NFKD_QUICK_CHECK_MAYBE", end="")
+                if (
+                    hasattr(cpd, "nfkc_quick_check")
+                    or hasattr(cpd, "nfc_quick_check")
+                    or hasattr(cpd, "nfkd_quick_check")
+                ) and hasattr(cpd, "nfd_quick_check"):
+                    print(" |")
+                    print("\t\t\t\t   ", end="")
+                if hasattr(cpd, "nfd_quick_check"):
+                    if cpd.nfd_quick_check == "N":
+                        print("UNICODE_NFD_QUICK_CHECK_NO", end="")
+                    elif cpd.nfd_quick_check == "M":
+                        print("UNICODE_NFD_QUICK_CHECK_MAYBE", end="")
+                print("),")
             if hasattr(cpd, "decomposition_type"):
                 print(
                     "\t\t.decomposition_type = %s,"
@@ -745,6 +939,9 @@ def write_tables_c():
                     "\t\t.decomposition_full_k_offset = %u,"
                     % cpd.decomposition_full_k_offset
                 )
+            if hasattr(cpd, "composition_count"):
+                print("\t\t.composition_count = %u," % cpd.composition_count)
+                print("\t\t.composition_offset = %u," % cpd.composition_offset)
             if hasattr(cpd, "simple_titlecase_mapping"):
                 print(
                     "\t\t.simple_titlecase_mapping = 0x%04X,"
@@ -957,6 +1154,16 @@ def write_tables_c():
         print_list(ud_decompositions)
         print(",")
         print("};")
+        print("")
+        print("const uint32_t unicode_compositions[] = {")
+        print_list(ud_compositions)
+        print(",")
+        print("};")
+        print("")
+        print("const uint32_t unicode_composition_primaries[] = {")
+        print_list(ud_composition_primaries)
+        print(",")
+        print("};")
 
     sys.stdout = orig_stdout
 
@@ -1065,6 +1272,7 @@ def main():
 
     create_cp_range_index()
     expand_decompositions()
+    derive_canonical_compositions()
 
     create_cp_index_tables()