From: Stephan Bosch Date: Mon, 31 Mar 2025 21:23:01 +0000 (+0200) Subject: lib: unicode-data - Add special case mappings for upper and lower case X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=3438021cdea78d775da455b488d7bdf94a3d6047;p=thirdparty%2Fdovecot%2Fcore.git lib: unicode-data - Add special case mappings for upper and lower case --- diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 9bb13aee2f..8fe2070fd6 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -20,6 +20,7 @@ UCD_FILES = \ $(UCD_DIR)/NormalizationTest.txt \ $(UCD_DIR)/PropertyValueAliases.txt \ $(UCD_DIR)/PropList.txt \ + $(UCD_DIR)/SpecialCasing.txt \ $(UCD_DIR)/UnicodeData.txt \ $(UCD_DIR)/WordBreakProperty.txt @@ -64,6 +65,8 @@ $(UCD_DIR)/PropertyValueAliases.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt $(UCD_DIR)/PropList.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropList.txt +$(UCD_DIR)/SpecialCasing.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/SpecialCasing.txt $(UCD_DIR)/UnicodeData.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/UnicodeData.txt $(UCD_DIR)/WordBreakProperty.txt: diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c index 790656df4f..83ab3a7c72 100644 --- a/src/lib/test-unicode-data.c +++ b/src/lib/test-unicode-data.c @@ -12,6 +12,7 @@ #define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt" #define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt" #define UCD_PROP_LIST_TXT "PropList.txt" +#define UCD_SPECIAL_CASING_TXT "SpecialCasing.txt" #define UCD_UNICODE_DATA_TXT "UnicodeData.txt" #define UCD_WORD_BREAK_PROPERTY_TXT "WordBreakProperty.txt" @@ -69,6 +70,44 @@ parse_prop_file_line(const char *line, const char *file, unsigned int line_num, return !test_has_failed(); } +static void +test_case_mapping(uint32_t cp, const char *const *parsed_mapping, + const uint32_t *case_map, unsigned int case_map_len) +{ + unsigned int 
case_map_idx; + unsigned int parsed_mapping_len = str_array_length(parsed_mapping); + + if (parsed_mapping_len == 1 && case_map_len == 0) { + /* Maps to itself (compiled as len == 0) */ + uint32_t mcp; + + test_assert_idx(str_to_uint32_hex(*parsed_mapping, &mcp) >= 0, cp); + if (test_has_failed()) + return; + test_assert_idx(mcp == cp, cp); + return; + } + + /* Explicit mapping */ + test_assert(parsed_mapping_len == case_map_len); + if (test_has_failed()) + return; + + case_map_idx = 0; + while (*parsed_mapping != NULL && !test_has_failed()) { + uint32_t mcp; + + test_assert_idx(str_to_uint32_hex(*parsed_mapping, &mcp) >= 0, cp); + if (test_has_failed()) + return; + test_assert_idx(uni_is_valid_ucs4(mcp), cp); + test_assert_idx(mcp == case_map[case_map_idx], cp); + + case_map_idx++; + parsed_mapping++; + } +} + static void test_composition_exclusions_line(const char *line, unsigned int line_num) { @@ -161,6 +200,58 @@ static void test_prop_list_line(const char *line, unsigned int line_num) } } +static void test_special_casing_line(const char *line, unsigned int line_num) +{ + const char *const *columns = t_strsplit(line, ";"); + size_t num_columns = str_array_length(columns); + + /* <code>; <lower>; <title>; <upper>; (<condition_list>;)? 
*/ + + if (num_columns < 4) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u", + UCD_SPECIAL_CASING_TXT, line_num)); + return; + } + + if (num_columns > 4 && strlen(t_str_trim(columns[4], " ")) > 0) { + /* Skip lines with condition list */ + return; + } + + const char *cp_hex = t_str_trim(columns[0], " "); + uint32_t cp; + + if (str_to_uint32_hex(cp_hex, &cp) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad code point", + UCD_SPECIAL_CASING_TXT, line_num)); + return; + } + + /* Parse Lowercase_Mapping and Uppercase_Mapping */ + + const char *lower = t_str_trim(columns[1], " "); + const char *upper = t_str_trim(columns[3], " "); + const char *const *lower_map = t_strsplit(lower, " "); + const char *const *upper_map = t_strsplit(upper, " "); + + /* Check data */ + + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + const uint32_t *case_map; + size_t case_map_len; + + case_map_len = unicode_code_point_data_get_uppercase_mapping( + cp_data, &case_map); + test_case_mapping(cp, upper_map, case_map, case_map_len); + case_map_len = unicode_code_point_data_get_lowercase_mapping( + cp_data, &case_map); + test_case_mapping(cp, lower_map, case_map, case_map_len); +} + static void test_unicode_data_line(const char *line, unsigned int line_num) { static uint32_t cp_first = 0; @@ -329,6 +420,22 @@ static void test_unicode_data_line(const char *line, unsigned int line_num) decomp++; } + if (cp_data->uppercase_mapping_length == 1) { + const uint32_t *map; + size_t map_len = + unicode_code_point_data_get_uppercase_mapping( + cp_data, &map); + test_assert_idx(map_len == 1 && + map[0] == simple_uppercase_mapping, cp); + } + if (cp_data->lowercase_mapping_length == 1) { + const uint32_t *map; + size_t map_len = + unicode_code_point_data_get_lowercase_mapping( + cp_data, &map); + test_assert_idx(map_len == 1 && + map[0] == simple_lowercase_mapping, cp); + } test_assert_idx( cp_data->simple_titlecase_mapping == simple_titlecase_mapping, cp); @@ 
-440,6 +547,7 @@ void test_unicode_data(void) test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT, test_derived_normalization_props_line); test_ucd_file(UCD_PROP_LIST_TXT, test_prop_list_line); + test_ucd_file(UCD_SPECIAL_CASING_TXT, test_special_casing_line); test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line); test_ucd_file(UCD_WORD_BREAK_PROPERTY_TXT, test_word_break_property_line); diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h index 5c5a65e2b0..a2a66c5c94 100644 --- a/src/lib/unicode-data-static.h +++ b/src/lib/unicode-data-static.h @@ -127,11 +127,17 @@ struct unicode_code_point_data { uint8_t composition_count; + uint8_t uppercase_mapping_length; + uint8_t lowercase_mapping_length; + uint16_t decomposition_first_offset; uint16_t decomposition_full_offset; uint16_t decomposition_full_k_offset; uint16_t composition_offset; + uint16_t uppercase_mapping_offset; + uint16_t lowercase_mapping_offset; + uint32_t simple_titlecase_mapping; /* Property bits (UAX #44, Section 5.1) */ diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h index 0f71509d7c..0fea0930cb 100644 --- a/src/lib/unicode-data.h +++ b/src/lib/unicode-data.h @@ -91,6 +91,30 @@ unicode_code_point_get_full_decomposition(uint32_t cp, bool canonical, cp_data, canonical, decomp_r); } +static inline size_t +unicode_code_point_data_get_uppercase_mapping( + const struct unicode_code_point_data *cp_data, + const uint32_t **map_r) +{ + uint32_t offset; + + offset = cp_data->uppercase_mapping_offset; + *map_r = &unicode_case_mappings[offset]; + return cp_data->uppercase_mapping_length; +} + +static inline size_t +unicode_code_point_data_get_lowercase_mapping( + const struct unicode_code_point_data *cp_data, + const uint32_t **map_r) +{ + uint32_t offset; + + offset = cp_data->lowercase_mapping_offset; + *map_r = &unicode_case_mappings[offset]; + return cp_data->lowercase_mapping_length; +} + uint8_t unicode_general_category_from_string(const char *str); #endif diff 
--git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py index 923fd9a191..c0f6e415c2 100755 --- a/src/lib/unicode-ucd-compile.py +++ b/src/lib/unicode-ucd-compile.py @@ -41,6 +41,9 @@ ud_compositions = [] ud_composition_primaries = [] ud_compositions_max_per_starter = 0 +ud_case_mappings = [] +ud_case_mapping_max_length = 0 + class UCDFileOpen: def __init__(self, filename): @@ -460,6 +463,65 @@ def read_ucd_files(): cpd.pb_m_terminal_punctuation = True CodePointRange(cprng[0], cprng[1], cpd) + # SpecialCasing.txt + with UCDFileOpen("SpecialCasing.txt") as ucd: + line_num = 0 + for line in ucd.fd: + line_num = line_num + 1 + data = line.split("#") + line = data[0].strip() + if len(line) == 0: + continue + + # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment> + cols = line.split(";") + if len(cols) < 4: + die(f"{ucd}:{line_num}: Missing columns") + if len(cols) > 4 and len(cols[4].strip()) > 0: + # Skip lines with condition list + continue + + cp_hex = cols[0].strip() + if len(cp_hex) == 0: + continue + cp = int(cp_hex, 16) + + lower = cols[1].strip() + upper = cols[3].strip() + + cpd = None + + # Lowercase_Mapping + codes_hex = lower.split(" ") + if len(codes_hex) > 0: + first_code_hex = codes_hex[0].strip() + first_code = int(first_code_hex, 16) + if len(codes_hex) > 1 or first_code != cp: + codes = [] + for code_hex in codes_hex: + codes.append(int(code_hex, 16)) + + if cpd is None: + cpd = CodePointData() + cpd.lowercase_mapping = codes + + # Uppercase_Mapping + codes_hex = upper.split(" ") + if len(codes_hex) > 0: + first_code_hex = codes_hex[0].strip() + first_code = int(first_code_hex, 16) + if len(codes_hex) > 1 or first_code != cp: + codes = [] + for code_hex in codes_hex: + codes.append(int(code_hex, 16)) + + if cpd is None: + cpd = CodePointData() + cpd.uppercase_mapping = codes + + if cpd is not None: + CodePointRange(cp, cp, cpd) + # WordBreakProperty.txt with UCDFileOpen("WordBreakProperty.txt") as ucd: line_num = 0 @@ 
-549,6 +611,61 @@ def read_ucd_files(): CodePointRange(cprng[0], cprng[1], cpd) +def resolve_case_mappings(): + global ud_codepoints + global ud_case_mappings + global ud_case_mapping_max_length + + for cpr in ud_codepoints: + if cpr.cp_last > cpr.cp_first: + # No case mappings in ranges expected, ever + continue + cp = cpr.cp_first + cpd = cpr.data + + # Uppercase_Mapping + ucase_codes = [] + if hasattr(cpd, "uppercase_mapping"): + ucase_codes = cpd.uppercase_mapping + if len(ucase_codes) > 0 and (len(ucase_codes) > 1 or ucase_codes[0] != cp): + cpd.uppercase_mapping_offset = len(ud_case_mappings) + cpd.uppercase_mapping_length = len(ucase_codes) + ud_case_mappings = ud_case_mappings + ucase_codes + elif ( + hasattr(cpd, "simple_uppercase_mapping") + and cpd.simple_uppercase_mapping != cp + ): + cpd.uppercase_mapping_offset = len(ud_case_mappings) + cpd.uppercase_mapping_length = 1 + ud_case_mappings.append(cpd.simple_uppercase_mapping) + ucase_codes = [cpd.simple_uppercase_mapping] + else: + ucase_codes = [] + if len(ucase_codes) > ud_case_mapping_max_length: + ud_case_mapping_max_length = len(ucase_codes) + + # Lowercase_Mapping + lcase_codes = [] + if hasattr(cpd, "lowercase_mapping"): + lcase_codes = cpd.lowercase_mapping + if len(lcase_codes) > 0 and (len(lcase_codes) > 1 or lcase_codes[0] != cp): + cpd.lowercase_mapping_offset = len(ud_case_mappings) + cpd.lowercase_mapping_length = len(lcase_codes) + ud_case_mappings = ud_case_mappings + lcase_codes + elif ( + hasattr(cpd, "simple_lowercase_mapping") + and cpd.simple_lowercase_mapping != cp + ): + cpd.lowercase_mapping_offset = len(ud_case_mappings) + cpd.lowercase_mapping_length = 1 + ud_case_mappings.append(cpd.simple_lowercase_mapping) + lcase_codes = [cpd.simple_lowercase_mapping] + else: + lcase_codes = [] + if len(lcase_codes) > ud_case_mapping_max_length: + ud_case_mapping_max_length = len(lcase_codes) + + def expand_decompositions(): global ud_codepoints global ud_codepoints_index @@ -918,6 +1035,7 
@@ def write_tables_h(): global output_dir global ud_decomposition_max_length global ud_compositions_max_per_starter + global ud_case_mapping_max_length orig_stdout = sys.stdout @@ -937,6 +1055,7 @@ def write_tables_h(): "#define UNICODE_COMPOSITIONS_MAX_PER_STARTER %s" % ud_compositions_max_per_starter ) + print("#define UNICODE_CASE_MAPPING_MAX_LENGTH %s" % ud_case_mapping_max_length) print("") print("extern const struct unicode_code_point_data unicode_code_points[];") print("") @@ -950,6 +1069,8 @@ def write_tables_h(): print("extern const uint32_t unicode_compositions[];") print("extern const uint32_t unicode_composition_primaries[];") print("") + print("extern const uint32_t unicode_case_mappings[];") + print("") print("#endif") sys.stdout = orig_stdout @@ -1074,6 +1195,26 @@ def write_tables_c(): if hasattr(cpd, "composition_count"): print("\t\t.composition_count = %u," % cpd.composition_count) print("\t\t.composition_offset = %u," % cpd.composition_offset) + if ( + hasattr(cpd, "lowercase_mapping_length") + and cpd.lowercase_mapping_length > 0 + ): + print( + "\t\t.lowercase_mapping_length = %s," % cpd.lowercase_mapping_length + ) + print( + "\t\t.lowercase_mapping_offset = %s," % cpd.lowercase_mapping_offset + ) + if ( + hasattr(cpd, "uppercase_mapping_length") + and cpd.uppercase_mapping_length > 0 + ): + print( + "\t\t.uppercase_mapping_length = %s," % cpd.uppercase_mapping_length + ) + print( + "\t\t.uppercase_mapping_offset = %s," % cpd.uppercase_mapping_offset + ) if hasattr(cpd, "simple_titlecase_mapping"): print( "\t\t.simple_titlecase_mapping = 0x%04X," @@ -1342,6 +1483,11 @@ def write_tables_c(): print_list(ud_composition_primaries) print(",") print("};") + print("") + print("const uint32_t unicode_case_mappings[] = {") + print_list(ud_case_mappings) + print(",") + print("};") sys.stdout = orig_stdout @@ -1449,6 +1595,7 @@ def main(): source_files.sort() create_cp_range_index() + resolve_case_mappings() expand_decompositions() 
derive_canonical_compositions()