From: Stephan Bosch Date: Tue, 1 Apr 2025 01:07:30 +0000 (+0200) Subject: lib: unicode-data - Add case folding mappings X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b0100f3c628a32a2369864617bf6d1ef143b22fb;p=thirdparty%2Fdovecot%2Fcore.git lib: unicode-data - Add case folding mappings --- diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 8fe2070fd6..61e48817a9 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -14,6 +14,7 @@ BUILT_SOURCES = $(srcdir)/unicode-data-tables.c \ UCD_URL = https://dovecot.org/res UCD_DIR = $(srcdir)/ucd UCD_FILES = \ + $(UCD_DIR)/CaseFolding.txt \ $(UCD_DIR)/CompositionExclusions.txt \ $(UCD_DIR)/DerivedCoreProperties.txt \ $(UCD_DIR)/DerivedNormalizationProps.txt \ @@ -53,6 +54,8 @@ YACC=/bin/false # dependency, anything including the header will race the bison process. event-filter-parser.h: event-filter-parser.c +$(UCD_DIR)/CaseFolding.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CaseFolding.txt $(UCD_DIR)/CompositionExclusions.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CompositionExclusions.txt $(UCD_DIR)/DerivedCoreProperties.txt: diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c index 83ab3a7c72..d1b1f68309 100644 --- a/src/lib/test-unicode-data.c +++ b/src/lib/test-unicode-data.c @@ -9,6 +9,7 @@ #include +#define UCD_CASE_FOLDING_TXT "CaseFolding.txt" #define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt" #define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt" #define UCD_PROP_LIST_TXT "PropList.txt" @@ -108,6 +109,58 @@ test_case_mapping(uint32_t cp, const char *const *parsed_mapping, } } +static void test_case_folding_line(const char *line, unsigned int line_num) +{ + const char *const *columns = t_strsplit(line, ";"); + size_t num_columns = str_array_length(columns); + + /* CaseFolding.txt columns: code; status; mapping; # name */ + + if (num_columns < 4) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u", + UCD_CASE_FOLDING_TXT, line_num)); + return; + 
} + + if (num_columns > 4 && strlen(t_str_trim(columns[4], " ")) > 0) { + /* Skip lines that have a non-empty fifth column. NOTE(review): the documented CaseFolding.txt format has only four columns, so this presumably never triggers - confirm. */ + return; + } + + const char *cp_hex = t_str_trim(columns[0], " "); + uint32_t cp; + + if (str_to_uint32_hex(cp_hex, &cp) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad code point", + UCD_CASE_FOLDING_TXT, line_num)); + return; + } + + /* Parse the status and mapping columns; only entries with status C (common) or F (full) are verified, all others are ignored */ + + const char *status = t_str_trim(columns[1], " "); + + if (strcmp(status, "C") != 0 && strcmp(status, "F") != 0) + return; + + const char *mapping = t_str_trim(columns[2], " "); + const char *const *map = t_strsplit(mapping, " "); + + /* Compare the parsed mapping with the casefold mapping stored for this code point */ + + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + const uint32_t *case_map; + size_t case_map_len; + + case_map_len = unicode_code_point_data_get_casefold_mapping( + cp_data, &case_map); + test_case_mapping(cp, map, case_map, case_map_len); +} + static void test_composition_exclusions_line(const char *line, unsigned int line_num) { @@ -542,6 +595,7 @@ void test_unicode_data(void) property files only the positive assignment of properties to the code points mentioned in the files is tested, and notably not their absence for other code points. 
*/ + test_ucd_file(UCD_CASE_FOLDING_TXT, test_case_folding_line); test_ucd_file(UCD_COMPOSITION_EXCLUSIONS_TXT, test_composition_exclusions_line); test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT, diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h index a2a66c5c94..9ff142e8ba 100644 --- a/src/lib/unicode-data-static.h +++ b/src/lib/unicode-data-static.h @@ -129,6 +129,7 @@ struct unicode_code_point_data { uint8_t uppercase_mapping_length; uint8_t lowercase_mapping_length; + uint8_t casefold_mapping_length; uint16_t decomposition_first_offset; uint16_t decomposition_full_offset; @@ -137,6 +138,7 @@ struct unicode_code_point_data { uint16_t uppercase_mapping_offset; uint16_t lowercase_mapping_offset; + uint16_t casefold_mapping_offset; uint32_t simple_titlecase_mapping; diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h index 0fea0930cb..801eadd737 100644 --- a/src/lib/unicode-data.h +++ b/src/lib/unicode-data.h @@ -115,6 +115,18 @@ unicode_code_point_data_get_lowercase_mapping( return cp_data->lowercase_mapping_length; } +static inline size_t +unicode_code_point_data_get_casefold_mapping( + const struct unicode_code_point_data *cp_data, + const uint32_t **map_r) +{ /* Points *map_r into unicode_case_mappings at the casefold_mapping_offset of cp_data and returns casefold_mapping_length. *map_r is assigned unconditionally, also when the returned length is 0. */ + uint32_t offset; + + offset = cp_data->casefold_mapping_offset; + *map_r = &unicode_case_mappings[offset]; + return cp_data->casefold_mapping_length; +} + uint8_t unicode_general_category_from_string(const char *str); #endif diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py index c0f6e415c2..12e0ec7600 100755 --- a/src/lib/unicode-ucd-compile.py +++ b/src/lib/unicode-ucd-compile.py @@ -368,6 +368,44 @@ def read_ucd_files(): # Add range CodePointRange(cp_first, cp_last, cpd) + # CaseFolding.txt + with UCDFileOpen("CaseFolding.txt") as ucd: + line_num = 0 + for line in ucd.fd: + line_num = line_num + 1 + data = line.split("#") + line = data[0].strip() + if len(line) == 0: + continue + + cols = line.split(";") + if len(cols) < 3: + 
die(f"{ucd}:{line_num}: Missing columns") + + cp_hex = cols[0].strip() + if len(cp_hex) == 0: + continue + cp = int(cp_hex, 16) + + status = cols[1].strip() + mapping = cols[2].strip() + + if status != "C" and status != "F": + continue + + codes_hex = mapping.split(" ") + if len(codes_hex) > 0: + first_code_hex = codes_hex[0].strip() + first_code = int(first_code_hex, 16) + if len(codes_hex) > 1 or first_code != cp: + codes = [] + for code_hex in codes_hex: + codes.append(int(code_hex, 16)) + + cpd = CodePointData() + cpd.case_folding = codes + CodePointRange(cp, cp, cpd) + # CompositionExclusions.txt with UCDFileOpen("CompositionExclusions.txt") as ucd: for line in ucd.fd: @@ -665,6 +703,23 @@ def resolve_case_mappings(): if len(lcase_codes) > ud_case_mapping_max_length: ud_case_mapping_max_length = len(lcase_codes) + # Case_Folding + cfold_codes = [] + if hasattr(cpd, "case_folding"): + cfold_codes = cpd.case_folding + if len(ucase_codes) > 0 and cfold_codes == ucase_codes: + cpd.casefold_mapping_length = cpd.uppercase_mapping_length + cpd.casefold_mapping_offset = cpd.uppercase_mapping_offset + elif len(lcase_codes) > 0 and cfold_codes == lcase_codes: + cpd.casefold_mapping_length = cpd.lowercase_mapping_length + cpd.casefold_mapping_offset = cpd.lowercase_mapping_offset + elif len(cfold_codes) > 0 and (len(cfold_codes) > 1 or cfold_codes[0] != cp): + cpd.casefold_mapping_offset = len(ud_case_mappings) + cpd.casefold_mapping_length = len(cfold_codes) + ud_case_mappings = ud_case_mappings + cfold_codes + if len(cfold_codes) > ud_case_mapping_max_length: + ud_case_mapping_max_length = len(cfold_codes) + def expand_decompositions(): global ud_codepoints @@ -1215,6 +1270,12 @@ def write_tables_c(): print( "\t\t.uppercase_mapping_offset = %s," % cpd.uppercase_mapping_offset ) + if ( + hasattr(cpd, "casefold_mapping_length") + and cpd.casefold_mapping_length > 0 + ): + print("\t\t.casefold_mapping_length = %s," % cpd.casefold_mapping_length) + 
print("\t\t.casefold_mapping_offset = %s," % cpd.casefold_mapping_offset) if hasattr(cpd, "simple_titlecase_mapping"): print( "\t\t.simple_titlecase_mapping = 0x%04X,"