]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib: unicode-data - Add fields needed for Unicode normalization
authorStephan Bosch <stephan.bosch@open-xchange.com>
Tue, 22 Apr 2025 00:04:43 +0000 (02:04 +0200)
committerStephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/Makefile.am
src/lib/test-unicode-data.c
src/lib/unicode-data-static.h
src/lib/unicode-data.h
src/lib/unicode-ucd-compile.py

index 647e45c6737cceac887fe1d8ce2bc3fb5e5ca793..2748d732c44cf133ca4ca54b4a1282f57d642aea 100644 (file)
@@ -14,7 +14,9 @@ BUILT_SOURCES = $(srcdir)/unicode-data-tables.c \
 UCD_URL = https://dovecot.org/res
 UCD_DIR = $(srcdir)/ucd
 UCD_FILES = \
+       $(UCD_DIR)/CompositionExclusions.txt \
        $(UCD_DIR)/DerivedCoreProperties.txt \
+       $(UCD_DIR)/DerivedNormalizationProps.txt \
        $(UCD_DIR)/PropertyValueAliases.txt \
        $(UCD_DIR)/UnicodeData.txt
 
@@ -47,8 +49,12 @@ YACC=/bin/false
 # dependency, anything including the header will race the bison process.
 event-filter-parser.h: event-filter-parser.c
 
+$(UCD_DIR)/CompositionExclusions.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CompositionExclusions.txt
 $(UCD_DIR)/DerivedCoreProperties.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
+$(UCD_DIR)/DerivedNormalizationProps.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt
 $(UCD_DIR)/PropertyValueAliases.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt
 $(UCD_DIR)/UnicodeData.txt:
index 3edef44122236b9ee9d3db086b1e2c233534ae31..341352a8952c7f1ffa0266969c756576d94b46c6 100644 (file)
@@ -9,8 +9,128 @@
 
 #include <fcntl.h>
 
+#define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
+#define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
 #define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
 
+/* Parse a single line from a UCD property file. The line is split on ';' and
+   the first column holds either one hexadecimal code point or an inclusive
+   "first..last" range; surrounding spaces and tabs are trimmed.
+
+   cp_first_r and cp_last_r receive the range (equal for a single code point).
+   When prop_r is not NULL, a second column is required and returned through
+   it. When value_r is not NULL as well, it receives the third column, or NULL
+   if the line has none; value_r is left untouched when prop_r is NULL.
+
+   A malformed line is reported through test_failed() and yields FALSE.
+   Otherwise the result is the negation of test_has_failed(). */
+static bool
+parse_prop_file_line(const char *line, const char *file, unsigned int line_num,
+                    uint32_t *cp_first_r, uint32_t *cp_last_r,
+                    const char **prop_r, const char **value_r)
+{
+       unsigned int expected_columns = 1;
+
+       if (prop_r != NULL)
+               expected_columns++;
+
+       const char *const *columns = t_strsplit(line, ";");
+       if (str_array_length(columns) < expected_columns) {
+               test_failed(t_strdup_printf(
+                       "Invalid data at %s:%u", file, line_num));
+               return FALSE;
+       }
+
+       /* A ".." in the first column separates the ends of a range */
+       const char *p = strstr(columns[0], "..");
+       const char *cp_first_hex, *cp_last_hex;
+
+       cp_last_hex = NULL;
+       if (p == NULL) {
+               cp_first_hex = t_str_trim(columns[0], " \t");
+       } else {
+               cp_first_hex = t_str_trim(t_strdup_until(columns[0], p), " \t");
+               cp_last_hex = t_str_trim(p + 2, " \t");
+       }
+       if (str_to_uint32_hex(cp_first_hex, cp_first_r) < 0) {
+               test_failed(t_strdup_printf(
+                               "Invalid data at %s:%u: "
+                               "Bad first code point", file, line_num));
+               return FALSE;
+       }
+       if (cp_last_hex == NULL)
+               *cp_last_r = *cp_first_r;
+       else if (str_to_uint32_hex(cp_last_hex, cp_last_r) < 0) {
+               /* Name the end of the range that actually failed to parse */
+               test_failed(t_strdup_printf(
+                               "Invalid data at %s:%u: "
+                               "Bad last code point", file, line_num));
+               return FALSE;
+       }
+
+       if (prop_r != NULL) {
+               *prop_r = t_str_trim(columns[1], " \t");
+               if (value_r != NULL) {
+                       /* The column count check above only guarantees two
+                          columns, so the third one may be absent */
+                       if (columns[2] != NULL)
+                               *value_r = t_str_trim(columns[2], " \t");
+                       else
+                               *value_r = NULL;
+               }
+       }
+       return !test_has_failed();
+}
+
+/* Check one CompositionExclusions.txt line: every code point in the parsed
+   range must have a composition_count of zero in the compiled data. */
+static void
+test_composition_exclusions_line(const char *line, unsigned int line_num)
+{
+       uint32_t range_begin, range_end;
+
+       if (!parse_prop_file_line(line, UCD_COMPOSITION_EXCLUSIONS_TXT,
+                                 line_num, &range_begin, &range_end,
+                                 NULL, NULL))
+               return;
+
+       uint32_t cp = range_begin;
+
+       /* Stop walking the range once test_has_failed() reports a failure */
+       while (cp <= range_end && !test_has_failed()) {
+               const struct unicode_code_point_data *cp_data =
+                       unicode_code_point_get_data(cp);
+
+               test_assert_idx(cp_data->composition_count == 0, cp);
+               cp++;
+       }
+}
+
+/* Check one DerivedNormalizationProps.txt line. Only the four *_QC
+   properties are verified: for each code point in the parsed range, a value
+   of "N" or "M" must match the corresponding field of nf_quick_check in the
+   compiled data. Lines for other properties, and other values, are not
+   checked. */
+static void
+test_derived_normalization_props_line(const char *line, unsigned int line_num)
+{
+       /* Verified properties with the nf_quick_check field each one uses */
+       static const struct {
+               const char *name;
+               uint8_t mask, no, maybe;
+       } qc_props[] = {
+               { "NFD_QC", UNICODE_NFD_QUICK_CHECK_MASK,
+                 UNICODE_NFD_QUICK_CHECK_NO,
+                 UNICODE_NFD_QUICK_CHECK_MAYBE },
+               { "NFKD_QC", UNICODE_NFKD_QUICK_CHECK_MASK,
+                 UNICODE_NFKD_QUICK_CHECK_NO,
+                 UNICODE_NFKD_QUICK_CHECK_MAYBE },
+               { "NFC_QC", UNICODE_NFC_QUICK_CHECK_MASK,
+                 UNICODE_NFC_QUICK_CHECK_NO,
+                 UNICODE_NFC_QUICK_CHECK_MAYBE },
+               { "NFKC_QC", UNICODE_NFKC_QUICK_CHECK_MASK,
+                 UNICODE_NFKC_QUICK_CHECK_NO,
+                 UNICODE_NFKC_QUICK_CHECK_MAYBE },
+       };
+       const size_t qc_props_count = sizeof(qc_props) / sizeof(qc_props[0]);
+       uint32_t range_begin, range_end, cp;
+       const char *prop, *value;
+
+       if (!parse_prop_file_line(line, UCD_DERIVED_NORMALIZATION_PROPS_TXT,
+                                 line_num, &range_begin, &range_end,
+                                 &prop, &value))
+               return;
+
+       for (cp = range_begin; cp <= range_end && !test_has_failed(); cp++) {
+               const struct unicode_code_point_data *cp_data =
+                       unicode_code_point_get_data(cp);
+               uint8_t qc, qc_no, qc_maybe;
+               size_t i;
+
+               for (i = 0; i < qc_props_count; i++) {
+                       if (strcmp(prop, qc_props[i].name) == 0)
+                               break;
+               }
+               if (i == qc_props_count) {
+                       /* Not a quick-check property */
+                       continue;
+               }
+
+               qc = (cp_data->nf_quick_check & qc_props[i].mask);
+               qc_no = qc_props[i].no;
+               qc_maybe = qc_props[i].maybe;
+
+               i_assert(value != NULL);
+               if (strcmp(value, "N") == 0)
+                       test_assert_idx(qc == qc_no, cp);
+               else if (strcmp(value, "M") == 0)
+                       test_assert_idx(qc == qc_maybe, cp);
+       }
+}
+
 static void test_unicode_data_line(const char *line, unsigned int line_num)
 {
        static uint32_t cp_first = 0;
@@ -69,6 +189,18 @@ static void test_unicode_data_line(const char *line, unsigned int line_num)
        }
        test_assert(!unicode_general_category_is_group(general_category));
 
+       /* Parse Canonical_Combining_Class */
+
+       unsigned int ccc = 0;
+       if (*columns[3] != '\0' &&
+           (str_to_uint(columns[3], &ccc) < 0 || ccc > UINT8_MAX)) {
+               test_failed(t_strdup_printf(
+                           "Invalid data at %s:%u: "
+                           "Bad Canonical_Combining_Class for code point %"PRIu32": %s",
+                           UCD_UNICODE_DATA_TXT, line_num, cp, columns[3]));
+               return;
+       }
+
        /* Parse Decomposition_* */
 
        const char *decomp_spec = columns[5];
@@ -135,6 +267,8 @@ static void test_unicode_data_line(const char *line, unsigned int line_num)
 
                test_assert_idx(
                        cp_data->general_category == general_category, cp);
+               test_assert_idx(
+                       cp_data->canonical_combining_class == ccc, cp);
 
                const uint32_t *cp_decomp;
                size_t cp_decomp_len, cp_decomp_idx;
@@ -216,6 +350,13 @@ test_ucd_file(const char *filename,
 
 void test_unicode_data(void)
 {
-       /* Check that UCD data files match with what is compiled. */
+       /* Check that UCD data files match with what is compiled. For the
+          property files only the positive assignment of properties to the
+          code points mentioned in the files is tested, and notably not their
+          absence for other code points. */
+       test_ucd_file(UCD_COMPOSITION_EXCLUSIONS_TXT,
+                     test_composition_exclusions_line);
+       test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
+                     test_derived_normalization_props_line);
        test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
 }
index ffc61bb8ac2fcdf8135f1162a0905cdd200f2dff..63a43c5d0b0348124d29a28597590a09d0b38282 100644 (file)
@@ -93,17 +93,44 @@ enum unicode_general_category {
        UNICODE_GENERAL_CATEGORY_CN = UNICODE_GENERAL_CATEGORY_C | 5,
 };
 
+/* UAX #44, Section 5.7.5: Decompositions and Normalization
+
+   Quick-check values for the four normalization forms, packed as four
+   2-bit fields: NFD occupies bits 0-1, NFKD bits 2-3, NFC bits 4-5 and
+   NFKC bits 6-7. Within each field 0 is YES, 1 is NO and 2 is MAYBE. The
+   *_MASK values select a single field.
+ */
+enum unicode_nf_quick_check {
+       UNICODE_NFKC_QUICK_CHECK_YES   = (0x00 << 6),
+       UNICODE_NFKC_QUICK_CHECK_NO    = (0x01 << 6),
+       UNICODE_NFKC_QUICK_CHECK_MAYBE = (0x02 << 6),
+       UNICODE_NFC_QUICK_CHECK_YES    = (0x00 << 4),
+       UNICODE_NFC_QUICK_CHECK_NO     = (0x01 << 4),
+       UNICODE_NFC_QUICK_CHECK_MAYBE  = (0x02 << 4),
+       UNICODE_NFKD_QUICK_CHECK_YES   = (0x00 << 2),
+       UNICODE_NFKD_QUICK_CHECK_NO    = (0x01 << 2),
+       UNICODE_NFKD_QUICK_CHECK_MAYBE = (0x02 << 2),
+       UNICODE_NFD_QUICK_CHECK_YES    = (0x00 << 0),
+       UNICODE_NFD_QUICK_CHECK_NO     = (0x01 << 0),
+       UNICODE_NFD_QUICK_CHECK_MAYBE  = (0x02 << 0),
+
+       UNICODE_NFKC_QUICK_CHECK_MASK  = (0x03 << 6),
+       UNICODE_NFC_QUICK_CHECK_MASK   = (0x03 << 4),
+       UNICODE_NFKD_QUICK_CHECK_MASK  = (0x03 << 2),
+       UNICODE_NFD_QUICK_CHECK_MASK   = (0x03 << 0),
+};
+
 struct unicode_code_point_data {
        uint8_t general_category; // Not yet used
+       uint8_t canonical_combining_class; // Canonical_Combining_Class
+       uint8_t nf_quick_check; // enum unicode_nf_quick_check bit fields
 
        uint8_t decomposition_type; // Not yet used
        uint8_t decomposition_first_length;
        uint8_t decomposition_full_length;
        uint8_t decomposition_full_k_length;
 
+       /* Number of entries belonging to this code point, starting at
+          composition_offset, in both unicode_compositions[] and
+          unicode_composition_primaries[] */
+       uint8_t composition_count;
+
        uint16_t decomposition_first_offset;
        uint16_t decomposition_full_offset;
        uint16_t decomposition_full_k_offset;
+       uint16_t composition_offset;
 
        uint32_t simple_titlecase_mapping;
 };
index eace84639ffd8b9035385a1b2ef0daff2a706391..0f71509d7c9772357969af8845a00f69f08ff7e5 100644 (file)
@@ -55,6 +55,31 @@ unicode_code_point_data_get_full_decomposition(
        return cp_data->decomposition_full_k_length;
 }
 
+/* Find the composition of the code point described by cp_data with the
+   code point `second'. The composition_count entries of
+   unicode_compositions[] starting at composition_offset are binary-searched
+   for `second', which relies on them being in ascending order. Returns the
+   entry at the same position in unicode_composition_primaries[], or 0x0000
+   when `second' is not found. */
+static inline uint32_t
+unicode_code_point_data_find_composition(
+       const struct unicode_code_point_data *cp_data, uint32_t second)
+{
+       const size_t base = cp_data->composition_offset;
+       const uint32_t *seconds = &unicode_compositions[base];
+       size_t lo = 0, hi = cp_data->composition_count;
+
+       /* Search the half-open interval [lo, hi) */
+       while (lo < hi) {
+               size_t mid = lo + (hi - lo) / 2;
+               uint32_t candidate = seconds[mid];
+
+               if (candidate == second)
+                       return unicode_composition_primaries[base + mid];
+               if (candidate < second)
+                       lo = mid + 1;
+               else
+                       hi = mid;
+       }
+
+       return 0x0000;
+}
+
 static inline size_t
 unicode_code_point_get_full_decomposition(uint32_t cp, bool canonical,
                                          const uint32_t **decomp_r)
index e7283e91f9995cf668e9ceeb020d563ec459cd44..1664691f0cee1ded7e1c45df738210ceb2350f79 100755 (executable)
@@ -34,6 +34,13 @@ ud_decomposition_type_names = []
 ud_decompositions = []
 ud_decomposition_max_length = 0
 
+ud_composition_pairs = {}
+ud_composition_composites = {}
+ud_composition_exclusions = {}
+ud_compositions = []
+ud_composition_primaries = []
+ud_compositions_max_per_starter = 0
+
 
 class UCDFileOpen:
     def __init__(self, filename):
@@ -358,6 +365,57 @@ def read_ucd_files():
             # Add range
             CodePointRange(cp_first, cp_last, cpd)
 
+    # CompositionExclusions.txt
+    with UCDFileOpen("CompositionExclusions.txt") as ucd:
+        for line in ucd.fd:
+            data = line.split("#")
+
+            cprng = parse_cp_range(data[0])
+            if cprng is None:
+                continue
+
+            for cp in range(cprng[0], cprng[1] + 1):
+                ud_composition_exclusions[cp] = True
+
+    # DerivedNormalizationProps.txt
+    with UCDFileOpen("DerivedNormalizationProps.txt") as ucd:
+        line_num = 0
+        for line in ucd.fd:
+            line_num = line_num + 1
+            data = line.split("#")
+            line = data[0].strip()
+            if len(line) == 0:
+                continue
+
+            cols = line.split(";")
+            if len(cols) < 3:
+                if len(cols) < 2:
+                    die(f"{ucd}:{line_num}: Missing columns")
+                continue
+
+            cprng = parse_cp_range(cols[0])
+            if cprng is None:
+                continue
+
+            prop = cols[1].strip()
+            value = cols[2].strip()
+            if prop == "NFD_QC":
+                cpd = CodePointData()
+                cpd.nfd_quick_check = value
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "NFKD_QC":
+                cpd = CodePointData()
+                cpd.nfkd_quick_check = value
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "NFC_QC":
+                cpd = CodePointData()
+                cpd.nfc_quick_check = value
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "NFKC_QC":
+                cpd = CodePointData()
+                cpd.nfkc_quick_check = value
+                CodePointRange(cprng[0], cprng[1], cpd)
+
 
 def expand_decompositions():
     global ud_codepoints
@@ -495,6 +553,84 @@ def expand_decompositions():
                 ud_decomposition_max_length = len(dc)
 
 
+def derive_canonical_compositions():
+    # Derive the canonical composition lookup tables from the decomposition
+    # data.
+    #
+    # First pass: each single code point whose first-level decomposition has
+    # at least two elements is recorded as a (second, composite) pair under
+    # the first element of that decomposition, unless its full decomposition
+    # starts with a code point that has a non-zero canonical combining class
+    # or the code point is a composition exclusion.
+    #
+    # Second pass: for each code point that has pairs, the pairs are sorted
+    # by their second code point and appended to ud_compositions (seconds)
+    # and ud_composition_primaries (composites); the position and number of
+    # entries are stored in composition_offset and composition_count.
+    global ud_codepoints
+    global ud_decompositions
+    global ud_composition_exclusions
+    global ud_composition_pairs
+    global ud_composition_composites
+    global ud_compositions
+    global ud_composition_primaries
+    global ud_compositions_max_per_starter
+
+    for cpr in ud_codepoints:
+        if cpr.cp_last > cpr.cp_first:
+            # No compositions in ranges expected, ever
+            continue
+        cp = cpr.cp_first
+        cpd = cpr.data
+
+        if not hasattr(cpd, "decomposition_full_offset"):
+            continue
+
+        # Skip singleton decompositions
+        if len(cpd.decomposition_first) < 2:
+            continue
+
+        # Skip non-starter decompositions
+        dc_offset = cpd.decomposition_full_offset
+        dc_len = cpd.decomposition_full_length
+        dc = ud_decompositions[dc_offset:(dc_offset + dc_len)]
+
+        scpr = ud_codepoints_index[dc[0]]
+        scpd = scpr.data
+        if (
+            hasattr(scpd, "canonical_combining_class")
+            and scpd.canonical_combining_class > 0
+        ):
+            continue
+
+        # Skip composition exclusions
+        if cp in ud_composition_exclusions:
+            continue
+
+        dc = cpd.decomposition_first
+
+        # Record all alternative pairs for each starter
+        mp = ud_composition_pairs.setdefault(dc[0], [])
+        mp.append((dc[1], cp))
+
+        # Update the maximum for every recorded pair, so that a starter's
+        # first pair is counted as well
+        if len(mp) > ud_compositions_max_per_starter:
+            ud_compositions_max_per_starter = len(mp)
+
+    # Compose lookup tables
+    for cpr in ud_codepoints:
+        if cpr.cp_last > cpr.cp_first:
+            # No compositions in ranges expected, ever
+            continue
+        cp = cpr.cp_first
+        cpd = cpr.data
+
+        if cp not in ud_composition_pairs:
+            continue
+
+        # Sort by second code point
+        mp = ud_composition_pairs[cp]
+        mp.sort(key=lambda pair: pair[0])
+
+        cpd.composition_offset = len(ud_compositions)
+        cpd.composition_count = len(mp)
+
+        # Extend in place rather than copying the whole table for each starter
+        ud_compositions.extend(p[0] for p in mp)
+        ud_composition_primaries.extend(p[1] for p in mp)
+
+
 def create_cp_range_index():
     global ud_codepoints
     global ud_codepoints_index
@@ -665,6 +801,10 @@ def write_tables_h():
         print(
             "#define UNICODE_DECOMPOSITION_MAX_LENGTH %s" % ud_decomposition_max_length
         )
+        print(
+            "#define UNICODE_COMPOSITIONS_MAX_PER_STARTER %s"
+            % ud_compositions_max_per_starter
+        )
         print("")
         print("extern const struct unicode_code_point_data unicode_code_points[];")
         print("")
@@ -675,6 +815,9 @@ def write_tables_h():
         print("")
         print("extern const uint32_t unicode_decompositions[];")
         print("")
+        print("extern const uint32_t unicode_compositions[];")
+        print("extern const uint32_t unicode_composition_primaries[];")
+        print("")
         print("#endif")
 
     sys.stdout = orig_stdout
@@ -719,6 +862,57 @@ def write_tables_c():
                 "\t\t.general_category = %s,"
                 % get_general_category_def(cpd.general_category)
             )
+            if (
+                hasattr(cpd, "canonical_combining_class")
+                and cpd.canonical_combining_class > 0
+            ):
+                print(
+                    "\t\t.canonical_combining_class = %u,"
+                    % cpd.canonical_combining_class
+                )
+            if (
+                hasattr(cpd, "nfd_quick_check")
+                or hasattr(cpd, "nfkd_quick_check")
+                or hasattr(cpd, "nfc_quick_check")
+                or hasattr(cpd, "nfkc_quick_check")
+            ):
+                print("\t\t.nf_quick_check = (", end="")
+                if hasattr(cpd, "nfkc_quick_check"):
+                    if cpd.nfkc_quick_check == "N":
+                        print("UNICODE_NFKC_QUICK_CHECK_NO", end="")
+                    elif cpd.nfkc_quick_check == "M":
+                        print("UNICODE_NFKC_QUICK_CHECK_MAYBE", end="")
+                if hasattr(cpd, "nfkc_quick_check") and hasattr(cpd, "nfc_quick_check"):
+                    print(" |")
+                    print("\t\t\t\t   ", end="")
+                if hasattr(cpd, "nfc_quick_check"):
+                    if cpd.nfc_quick_check == "N":
+                        print("UNICODE_NFC_QUICK_CHECK_NO", end="")
+                    elif cpd.nfc_quick_check == "M":
+                        print("UNICODE_NFC_QUICK_CHECK_MAYBE", end="")
+                if (
+                    hasattr(cpd, "nfkc_quick_check") or hasattr(cpd, "nfc_quick_check")
+                ) and hasattr(cpd, "nfkd_quick_check"):
+                    print(" |")
+                    print("\t\t\t\t   ", end="")
+                if hasattr(cpd, "nfkd_quick_check"):
+                    if cpd.nfkd_quick_check == "N":
+                        print("UNICODE_NFKD_QUICK_CHECK_NO", end="")
+                    elif cpd.nfkd_quick_check == "M":
+                        print("UNICODE_NFKD_QUICK_CHECK_MAYBE", end="")
+                if (
+                    hasattr(cpd, "nfkc_quick_check")
+                    or hasattr(cpd, "nfc_quick_check")
+                    or hasattr(cpd, "nfkd_quick_check")
+                ) and hasattr(cpd, "nfd_quick_check"):
+                    print(" |")
+                    print("\t\t\t\t   ", end="")
+                if hasattr(cpd, "nfd_quick_check"):
+                    if cpd.nfd_quick_check == "N":
+                        print("UNICODE_NFD_QUICK_CHECK_NO", end="")
+                    elif cpd.nfd_quick_check == "M":
+                        print("UNICODE_NFD_QUICK_CHECK_MAYBE", end="")
+                print("),")
             if hasattr(cpd, "decomposition_type"):
                 print(
                     "\t\t.decomposition_type = %s,"
@@ -745,6 +939,9 @@ def write_tables_c():
                     "\t\t.decomposition_full_k_offset = %u,"
                     % cpd.decomposition_full_k_offset
                 )
+            if hasattr(cpd, "composition_count"):
+                print("\t\t.composition_count = %u," % cpd.composition_count)
+                print("\t\t.composition_offset = %u," % cpd.composition_offset)
             if hasattr(cpd, "simple_titlecase_mapping"):
                 print(
                     "\t\t.simple_titlecase_mapping = 0x%04X,"
@@ -957,6 +1154,16 @@ def write_tables_c():
         print_list(ud_decompositions)
         print(",")
         print("};")
+        print("")
+        print("const uint32_t unicode_compositions[] = {")
+        print_list(ud_compositions)
+        print(",")
+        print("};")
+        print("")
+        print("const uint32_t unicode_composition_primaries[] = {")
+        print_list(ud_composition_primaries)
+        print(",")
+        print("};")
 
     sys.stdout = orig_stdout
 
@@ -1065,6 +1272,7 @@ def main():
 
     create_cp_range_index()
     expand_decompositions()
+    derive_canonical_compositions()
 
     create_cp_index_tables()