git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib: unicode-data - Add case folding mappings
author Stephan Bosch <stephan.bosch@open-xchange.com>
Tue, 1 Apr 2025 01:07:30 +0000 (03:07 +0200)
committer Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 1 Aug 2025 01:11:19 +0000 (03:11 +0200)
src/lib/Makefile.am
src/lib/test-unicode-data.c
src/lib/unicode-data-static.h
src/lib/unicode-data.h
src/lib/unicode-ucd-compile.py

index 8fe2070fd6f4f796a4110b73bd3c78a82ae7f327..61e48817a9bdae217901439a949b2effaae4d852 100644 (file)
@@ -14,6 +14,7 @@ BUILT_SOURCES = $(srcdir)/unicode-data-tables.c \
 UCD_URL = https://dovecot.org/res
 UCD_DIR = $(srcdir)/ucd
 UCD_FILES = \
+       $(UCD_DIR)/CaseFolding.txt \
        $(UCD_DIR)/CompositionExclusions.txt \
        $(UCD_DIR)/DerivedCoreProperties.txt \
        $(UCD_DIR)/DerivedNormalizationProps.txt \
@@ -53,6 +54,8 @@ YACC=/bin/false
 # dependency, anything including the header will race the bison process.
 event-filter-parser.h: event-filter-parser.c
 
+$(UCD_DIR)/CaseFolding.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CaseFolding.txt
 $(UCD_DIR)/CompositionExclusions.txt:
        $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CompositionExclusions.txt
 $(UCD_DIR)/DerivedCoreProperties.txt:
index 83ab3a7c72dd10a92434d8a9ef5ebf88655c0de3..d1b1f683094ad1162ec7441e943be3ed84013c88 100644 (file)
@@ -9,6 +9,7 @@
 
 #include <fcntl.h>
 
+#define UCD_CASE_FOLDING_TXT "CaseFolding.txt"
 #define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
 #define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
 #define UCD_PROP_LIST_TXT "PropList.txt"
@@ -108,6 +109,58 @@ test_case_mapping(uint32_t cp, const char *const *parsed_mapping,
        }
 }
 
+static void test_case_folding_line(const char *line, unsigned int line_num)
+{
+       const char *const *columns = t_strsplit(line, ";");
+       size_t num_columns = str_array_length(columns);
+
+       /* <code>; <status>; <mapping>; # <name> */
+
+       if (num_columns < 4) {
+               test_failed(t_strdup_printf(
+                       "Invalid data at %s:%u",
+                       UCD_CASE_FOLDING_TXT, line_num));
+               return;
+       }
+
+       if (num_columns > 4 && strlen(t_str_trim(columns[4], " ")) > 0) {
+               /* Skip lines with condition list */
+               return;
+       }
+
+       const char *cp_hex = t_str_trim(columns[0], " ");
+       uint32_t cp;
+
+       if (str_to_uint32_hex(cp_hex, &cp) < 0) {
+               test_failed(t_strdup_printf(
+                               "Invalid data at %s:%u: "
+                               "Bad code point",
+                               UCD_CASE_FOLDING_TXT, line_num));
+               return;
+       }
+
+       /* Parse status and mapping */
+
+       const char *status = t_str_trim(columns[1], " ");
+
+       if (strcmp(status, "C") != 0 && strcmp(status, "F") != 0)
+               return; /* only C and F entries are checked */
+
+       const char *mapping = t_str_trim(columns[2], " ");
+       const char *const *map = t_strsplit(mapping, " ");
+
+       /* Check parsed mapping against the code point data */
+
+       const struct unicode_code_point_data *cp_data =
+               unicode_code_point_get_data(cp);
+       const uint32_t *case_map;
+       size_t case_map_len;
+
+       case_map_len = unicode_code_point_data_get_casefold_mapping(
+               cp_data, &case_map);
+       test_case_mapping(cp, map, case_map, case_map_len);
+}
+
 static void
 test_composition_exclusions_line(const char *line, unsigned int line_num)
 {
@@ -542,6 +595,7 @@ void test_unicode_data(void)
           property files only the positive assignment of properties to the
           code points mentioned in the files is tested, and notably not their
           absence for other code points. */
+       test_ucd_file(UCD_CASE_FOLDING_TXT, test_case_folding_line);
        test_ucd_file(UCD_COMPOSITION_EXCLUSIONS_TXT,
                      test_composition_exclusions_line);
        test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
index a2a66c5c94e6ce678abd343c87d7196b4568ba65..9ff142e8ba14b1d9ccc82161219acdc0f94d530f 100644 (file)
@@ -129,6 +129,7 @@ struct unicode_code_point_data {
 
        uint8_t uppercase_mapping_length;
        uint8_t lowercase_mapping_length;
+       uint8_t casefold_mapping_length;
 
        uint16_t decomposition_first_offset;
        uint16_t decomposition_full_offset;
@@ -137,6 +138,7 @@ struct unicode_code_point_data {
 
        uint16_t uppercase_mapping_offset;
        uint16_t lowercase_mapping_offset;
+       uint16_t casefold_mapping_offset;
 
        uint32_t simple_titlecase_mapping;
 
index 0fea0930cbee97f8b01eab062d32440fb6dc9411..801eadd737e4e29d8769e6a0e6cf0f99f63177ac 100644 (file)
@@ -115,6 +115,18 @@ unicode_code_point_data_get_lowercase_mapping(
        return cp_data->lowercase_mapping_length;
 }
 
+static inline size_t
+unicode_code_point_data_get_casefold_mapping(
+       const struct unicode_code_point_data *cp_data,
+       const uint32_t **map_r)
+{
+       uint32_t offset; /* index into unicode_case_mappings */
+
+       offset = cp_data->casefold_mapping_offset;
+       *map_r = &unicode_case_mappings[offset];
+       return cp_data->casefold_mapping_length; /* code points at *map_r */
+}
+
 uint8_t unicode_general_category_from_string(const char *str);
 
 #endif
index c0f6e415c23397bf04e77601385eb0f3ce8ab8b9..12e0ec76004a9572816e02bc07b4b3d81d2e212d 100755 (executable)
@@ -368,6 +368,44 @@ def read_ucd_files():
             # Add range
             CodePointRange(cp_first, cp_last, cpd)
 
+    # CaseFolding.txt
+    with UCDFileOpen("CaseFolding.txt") as ucd:
+        line_num = 0
+        for line in ucd.fd:
+            line_num = line_num + 1
+            data = line.split("#")
+            line = data[0].strip()
+            if len(line) == 0:
+                continue
+
+            cols = line.split(";")
+            if len(cols) < 3:
+                die(f"{ucd}:{line_num}: Missing columns")
+
+            cp_hex = cols[0].strip()
+            if len(cp_hex) == 0:
+                continue
+            cp = int(cp_hex, 16)
+
+            status = cols[1].strip()
+            mapping = cols[2].strip()
+
+            if status != "C" and status != "F":
+                continue
+
+            codes_hex = mapping.split(" ")
+            if len(codes_hex) > 0:
+                first_code_hex = codes_hex[0].strip()
+                first_code = int(first_code_hex, 16)
+                if len(codes_hex) > 1 or first_code != cp:
+                    codes = []
+                    for code_hex in codes_hex:
+                        codes.append(int(code_hex, 16))
+
+                    cpd = CodePointData()
+                    cpd.case_folding = codes
+                    CodePointRange(cp, cp, cpd)
+
     # CompositionExclusions.txt
     with UCDFileOpen("CompositionExclusions.txt") as ucd:
         for line in ucd.fd:
@@ -665,6 +703,23 @@ def resolve_case_mappings():
         if len(lcase_codes) > ud_case_mapping_max_length:
             ud_case_mapping_max_length = len(lcase_codes)
 
+        # Case_Folding
+        cfold_codes = []
+        if hasattr(cpd, "case_folding"):
+            cfold_codes = cpd.case_folding
+        if len(ucase_codes) > 0 and cfold_codes == ucase_codes:
+            cpd.casefold_mapping_length = cpd.uppercase_mapping_length
+            cpd.casefold_mapping_offset = cpd.uppercase_mapping_offset
+        elif len(lcase_codes) > 0 and cfold_codes == lcase_codes:
+            cpd.casefold_mapping_length = cpd.lowercase_mapping_length
+            cpd.casefold_mapping_offset = cpd.lowercase_mapping_offset
+        elif len(cfold_codes) > 0 and (len(cfold_codes) > 1 or cfold_codes[0] != cp):
+            cpd.casefold_mapping_offset = len(ud_case_mappings)
+            cpd.casefold_mapping_length = len(cfold_codes)
+            ud_case_mappings = ud_case_mappings + cfold_codes
+        if len(cfold_codes) > ud_case_mapping_max_length:
+            ud_case_mapping_max_length = len(cfold_codes)
+
 
 def expand_decompositions():
     global ud_codepoints
@@ -1215,6 +1270,12 @@ def write_tables_c():
                 print(
                     "\t\t.uppercase_mapping_offset = %s," % cpd.uppercase_mapping_offset
                 )
+            if (
+                hasattr(cpd, "casefold_mapping_length")
+                and cpd.casefold_mapping_length > 0
+            ):
+                print("\t\t.casefold_mapping_length = %s," % cpd.casefold_mapping_length)
+                print("\t\t.casefold_mapping_offset = %s," % cpd.casefold_mapping_offset)
             if hasattr(cpd, "simple_titlecase_mapping"):
                 print(
                     "\t\t.simple_titlecase_mapping = 0x%04X,"