From: Stephan Bosch Date: Tue, 22 Apr 2025 00:04:43 +0000 (+0200) Subject: lib: unicode-data - Add fields needed for Unicode normalization X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=6ab58560d89d57a1b6b53f73362aa6ceb879a128;p=thirdparty%2Fdovecot%2Fcore.git lib: unicode-data - Add fields needed for Unicode normalization --- diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 647e45c673..2748d732c4 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -14,7 +14,9 @@ BUILT_SOURCES = $(srcdir)/unicode-data-tables.c \ UCD_URL = https://dovecot.org/res UCD_DIR = $(srcdir)/ucd UCD_FILES = \ + $(UCD_DIR)/CompositionExclusions.txt \ $(UCD_DIR)/DerivedCoreProperties.txt \ + $(UCD_DIR)/DerivedNormalizationProps.txt \ $(UCD_DIR)/PropertyValueAliases.txt \ $(UCD_DIR)/UnicodeData.txt @@ -47,8 +49,12 @@ YACC=/bin/false # dependency, anything including the header will race the bison process. event-filter-parser.h: event-filter-parser.c +$(UCD_DIR)/CompositionExclusions.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/CompositionExclusions.txt $(UCD_DIR)/DerivedCoreProperties.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt +$(UCD_DIR)/DerivedNormalizationProps.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt $(UCD_DIR)/PropertyValueAliases.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt $(UCD_DIR)/UnicodeData.txt: diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c index 3edef44122..341352a895 100644 --- a/src/lib/test-unicode-data.c +++ b/src/lib/test-unicode-data.c @@ -9,8 +9,128 @@ #include +#define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt" +#define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt" #define UCD_UNICODE_DATA_TXT "UnicodeData.txt" +static bool +parse_prop_file_line(const char *line, const char *file, unsigned int line_num, + uint32_t *cp_first_r, uint32_t 
*cp_last_r, + const char **prop_r, const char **value_r) +{ + unsigned int expected_columns = 1; + + if (prop_r != NULL) + expected_columns++; + + const char *const *columns = t_strsplit(line, ";"); + if (str_array_length(columns) < expected_columns) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u", file, line_num)); + return FALSE; + } + + const char *p = strstr(columns[0], ".."); + const char *cp_first_hex, *cp_last_hex; + + cp_last_hex = NULL; + if (p == NULL) { + cp_first_hex = t_str_trim(columns[0], " \t"); + } else { + cp_first_hex = t_str_trim(t_strdup_until(columns[0], p), " \t"); + cp_last_hex = t_str_trim(p + 2, " \t"); + } + if (str_to_uint32_hex(cp_first_hex, cp_first_r) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad first code point", file, line_num)); + return FALSE; + } + if (cp_last_hex == NULL) + *cp_last_r = *cp_first_r; + else if (str_to_uint32_hex(cp_last_hex, cp_last_r) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad last code point", file, line_num)); + return FALSE; + } + + if (prop_r != NULL) { + *prop_r = t_str_trim(columns[1], " \t"); + if (value_r != NULL) { + if (columns[2] != NULL) + *value_r = t_str_trim(columns[2], " \t"); + else + *value_r = NULL; + } + } + return !test_has_failed(); +} + +static void +test_composition_exclusions_line(const char *line, unsigned int line_num) +{ + uint32_t cp_first, cp_last, cp; + + if (!parse_prop_file_line(line, UCD_COMPOSITION_EXCLUSIONS_TXT, + line_num, &cp_first, &cp_last, NULL, NULL)) + return; + + for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) { + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + + test_assert_idx(cp_data->composition_count == 0, cp); + } +} + +static void +test_derived_normalization_props_line(const char *line, unsigned int line_num) +{ + uint32_t cp_first, cp_last, cp; + const char *prop, *value; + + if (!parse_prop_file_line(line, 
UCD_DERIVED_NORMALIZATION_PROPS_TXT, + line_num, &cp_first, &cp_last, &prop, &value)) + return; + + for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) { + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + uint8_t qc, qc_no, qc_maybe; + + if (strcmp(prop, "NFD_QC") == 0) { + qc = (cp_data->nf_quick_check & + UNICODE_NFD_QUICK_CHECK_MASK); + qc_no = UNICODE_NFD_QUICK_CHECK_NO; + qc_maybe = UNICODE_NFD_QUICK_CHECK_MAYBE; + } else if (strcmp(prop, "NFKD_QC") == 0) { + qc = (cp_data->nf_quick_check & + UNICODE_NFKD_QUICK_CHECK_MASK); + qc_no = UNICODE_NFKD_QUICK_CHECK_NO; + qc_maybe = UNICODE_NFKD_QUICK_CHECK_MAYBE; + } else if (strcmp(prop, "NFC_QC") == 0) { + qc = (cp_data->nf_quick_check & + UNICODE_NFC_QUICK_CHECK_MASK); + qc_no = UNICODE_NFC_QUICK_CHECK_NO; + qc_maybe = UNICODE_NFC_QUICK_CHECK_MAYBE; + } else if (strcmp(prop, "NFKC_QC") == 0) { + qc = (cp_data->nf_quick_check & + UNICODE_NFKC_QUICK_CHECK_MASK); + qc_no = UNICODE_NFKC_QUICK_CHECK_NO; + qc_maybe = UNICODE_NFKC_QUICK_CHECK_MAYBE; + } else { + continue; + } + + i_assert(value != NULL); + if (strcmp(value, "N") == 0) + test_assert_idx(qc == qc_no, cp); + else if (strcmp(value, "M") == 0) + test_assert_idx(qc == qc_maybe, cp); + } +} + static void test_unicode_data_line(const char *line, unsigned int line_num) { static uint32_t cp_first = 0; @@ -69,6 +189,18 @@ static void test_unicode_data_line(const char *line, unsigned int line_num) } test_assert(!unicode_general_category_is_group(general_category)); + /* Parse Canonical_Combining_Class */ + + unsigned int ccc = 0; + if (*columns[3] != '\0' && + (str_to_uint(columns[3], &ccc) < 0 || ccc > UINT8_MAX)) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad Canonical_Combining_Class for code point %"PRIu32": %s", + UCD_UNICODE_DATA_TXT, line_num, cp, columns[3])); + return; + } + /* Parse Decomposition_* */ const char *decomp_spec = columns[5]; @@ -135,6 +267,8 @@ static void 
test_unicode_data_line(const char *line, unsigned int line_num) test_assert_idx( cp_data->general_category == general_category, cp); + test_assert_idx( + cp_data->canonical_combining_class == ccc, cp); const uint32_t *cp_decomp; size_t cp_decomp_len, cp_decomp_idx; @@ -216,6 +350,13 @@ test_ucd_file(const char *filename, void test_unicode_data(void) { - /* Check that UCD data files match with what is compiled. */ + /* Check that UCD data files match with what is compiled. For the + property files only the positive assignment of properties to the + code points mentioned in the files is tested, and notably not their + absence for other code points. */ + test_ucd_file(UCD_COMPOSITION_EXCLUSIONS_TXT, + test_composition_exclusions_line); + test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT, + test_derived_normalization_props_line); test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line); } diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h index ffc61bb8ac..63a43c5d0b 100644 --- a/src/lib/unicode-data-static.h +++ b/src/lib/unicode-data-static.h @@ -93,17 +93,44 @@ enum unicode_general_category { UNICODE_GENERAL_CATEGORY_CN = UNICODE_GENERAL_CATEGORY_C | 5, }; +/* UAX #44, Section 5.7.5: Decompositions and Normalization + */ +enum unicode_nf_quick_check { + UNICODE_NFKC_QUICK_CHECK_YES = (0x00 << 6), + UNICODE_NFKC_QUICK_CHECK_NO = (0x01 << 6), + UNICODE_NFKC_QUICK_CHECK_MAYBE = (0x02 << 6), + UNICODE_NFC_QUICK_CHECK_YES = (0x00 << 4), + UNICODE_NFC_QUICK_CHECK_NO = (0x01 << 4), + UNICODE_NFC_QUICK_CHECK_MAYBE = (0x02 << 4), + UNICODE_NFKD_QUICK_CHECK_YES = (0x00 << 2), + UNICODE_NFKD_QUICK_CHECK_NO = (0x01 << 2), + UNICODE_NFKD_QUICK_CHECK_MAYBE = (0x02 << 2), + UNICODE_NFD_QUICK_CHECK_YES = (0x00 << 0), + UNICODE_NFD_QUICK_CHECK_NO = (0x01 << 0), + UNICODE_NFD_QUICK_CHECK_MAYBE = (0x02 << 0), + + UNICODE_NFKC_QUICK_CHECK_MASK = (0x03 << 6), + UNICODE_NFC_QUICK_CHECK_MASK = (0x03 << 4), + UNICODE_NFKD_QUICK_CHECK_MASK = (0x03 << 2), + 
UNICODE_NFD_QUICK_CHECK_MASK = (0x03 << 0), +}; + struct unicode_code_point_data { uint8_t general_category; // Not yet used + uint8_t canonical_combining_class; + uint8_t nf_quick_check; uint8_t decomposition_type; // Not yet used uint8_t decomposition_first_length; uint8_t decomposition_full_length; uint8_t decomposition_full_k_length; + uint8_t composition_count; + uint16_t decomposition_first_offset; uint16_t decomposition_full_offset; uint16_t decomposition_full_k_offset; + uint16_t composition_offset; uint32_t simple_titlecase_mapping; }; diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h index eace84639f..0f71509d7c 100644 --- a/src/lib/unicode-data.h +++ b/src/lib/unicode-data.h @@ -55,6 +55,31 @@ unicode_code_point_data_get_full_decomposition( return cp_data->decomposition_full_k_length; } +static inline uint32_t +unicode_code_point_data_find_composition( + const struct unicode_code_point_data *cp_data, uint32_t second) +{ + const uint32_t *compositions = + &unicode_compositions[cp_data->composition_offset]; + size_t left_idx, right_idx; + + left_idx = 0; right_idx = cp_data->composition_count; + while (left_idx < right_idx) { + unsigned int idx = (left_idx + right_idx) / 2; + + if (second > compositions[idx]) + left_idx = idx + 1; + else if (second < compositions[idx]) + right_idx = idx; + else { + return unicode_composition_primaries[ + cp_data->composition_offset + idx]; + } + } + + return 0x0000; +} + static inline size_t unicode_code_point_get_full_decomposition(uint32_t cp, bool canonical, const uint32_t **decomp_r) diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py index e7283e91f9..1664691f0c 100755 --- a/src/lib/unicode-ucd-compile.py +++ b/src/lib/unicode-ucd-compile.py @@ -34,6 +34,13 @@ ud_decomposition_type_names = [] ud_decompositions = [] ud_decomposition_max_length = 0 +ud_composition_pairs = {} +ud_composition_composites = {} +ud_composition_exclusions = {} +ud_compositions = [] +ud_composition_primaries 
= [] +ud_compositions_max_per_starter = 0 + class UCDFileOpen: def __init__(self, filename): @@ -358,6 +365,57 @@ def read_ucd_files(): # Add range CodePointRange(cp_first, cp_last, cpd) + # CompositionExclusions.txt + with UCDFileOpen("CompositionExclusions.txt") as ucd: + for line in ucd.fd: + data = line.split("#") + + cprng = parse_cp_range(data[0]) + if cprng is None: + continue + + for cp in range(cprng[0], cprng[1] + 1): + ud_composition_exclusions[cp] = True + + # DerivedNormalizationProps.txt + with UCDFileOpen("DerivedNormalizationProps.txt") as ucd: + line_num = 0 + for line in ucd.fd: + line_num = line_num + 1 + data = line.split("#") + line = data[0].strip() + if len(line) == 0: + continue + + cols = line.split(";") + if len(cols) < 3: + if len(cols) < 2: + die(f"{ucd}:{line_num}: Missing columns") + continue + + cprng = parse_cp_range(cols[0]) + if cprng is None: + continue + + prop = cols[1].strip() + value = cols[2].strip() + if prop == "NFD_QC": + cpd = CodePointData() + cpd.nfd_quick_check = value + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "NFKD_QC": + cpd = CodePointData() + cpd.nfkd_quick_check = value + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "NFC_QC": + cpd = CodePointData() + cpd.nfc_quick_check = value + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "NFKC_QC": + cpd = CodePointData() + cpd.nfkc_quick_check = value + CodePointRange(cprng[0], cprng[1], cpd) + def expand_decompositions(): global ud_codepoints @@ -495,6 +553,84 @@ def expand_decompositions(): ud_decomposition_max_length = len(dc) +def derive_canonical_compositions(): + global ud_codepoints + global ud_decompositions + global ud_composition_exclusions + global ud_composition_pairs + global ud_composition_composites + global ud_compositions + global ud_composition_primaries + global ud_compositions_max_per_starter + + for cpr in ud_codepoints: + if cpr.cp_last > cpr.cp_first: + # No compositions in ranges expected, ever + continue + cp = 
cpr.cp_first + cpd = cpr.data + + if not hasattr(cpd, "decomposition_full_offset"): + continue + + # Skip singleton decompositions + if len(cpd.decomposition_first) < 2: + continue + + # Skip non-starter decompositions + dc_offset = cpd.decomposition_full_offset + dc_len = cpd.decomposition_full_length + dc = ud_decompositions[dc_offset:(dc_offset + dc_len)] + + scpr = ud_codepoints_index[dc[0]] + scpd = scpr.data + if ( + hasattr(scpd, "canonical_combining_class") + and scpd.canonical_combining_class > 0 + ): + continue + + # Skip composition exclusions + if cp in ud_composition_exclusions: + continue + + dc = cpd.decomposition_first + + # Record all alternative pairs for each starter + if not dc[0] in ud_composition_pairs: + mp = [(dc[1], cp)] + ud_composition_pairs[dc[0]] = mp + else: + mp = ud_composition_pairs[dc[0]] + mp.append((dc[1], cp)) + + if len(mp) > ud_compositions_max_per_starter: + ud_compositions_max_per_starter = len(mp) + + # Compose lookup tables + for cpr in ud_codepoints: + if cpr.cp_last > cpr.cp_first: + # No compositions in ranges expected, ever + continue + cp = cpr.cp_first + cpd = cpr.data + + if cp not in ud_composition_pairs: + continue + + def mp_key_func(a): + return a[0] + + mp = ud_composition_pairs[cp] + mp.sort(key=mp_key_func) + + cpd.composition_offset = len(ud_compositions) + cpd.composition_count = len(mp) + + ud_compositions = ud_compositions + [p[0] for p in mp] + ud_composition_primaries = ud_composition_primaries + [p[1] for p in mp] + + def create_cp_range_index(): global ud_codepoints global ud_codepoints_index @@ -665,6 +801,10 @@ def write_tables_h(): print( "#define UNICODE_DECOMPOSITION_MAX_LENGTH %s" % ud_decomposition_max_length ) + print( + "#define UNICODE_COMPOSITIONS_MAX_PER_STARTER %s" + % ud_compositions_max_per_starter + ) print("") print("extern const struct unicode_code_point_data unicode_code_points[];") print("") @@ -675,6 +815,9 @@ def write_tables_h(): print("") print("extern const uint32_t 
unicode_decompositions[];") print("") + print("extern const uint32_t unicode_compositions[];") + print("extern const uint32_t unicode_composition_primaries[];") + print("") print("#endif") sys.stdout = orig_stdout @@ -719,6 +862,57 @@ def write_tables_c(): "\t\t.general_category = %s," % get_general_category_def(cpd.general_category) ) + if ( + hasattr(cpd, "canonical_combining_class") + and cpd.canonical_combining_class > 0 + ): + print( + "\t\t.canonical_combining_class = %u," + % cpd.canonical_combining_class + ) + if ( + hasattr(cpd, "nfd_quick_check") + or hasattr(cpd, "nfkd_quick_check") + or hasattr(cpd, "nfc_quick_check") + or hasattr(cpd, "nfkc_quick_check") + ): + print("\t\t.nf_quick_check = (", end="") + if hasattr(cpd, "nfkc_quick_check"): + if cpd.nfkc_quick_check == "N": + print("UNICODE_NFKC_QUICK_CHECK_NO", end="") + elif cpd.nfkc_quick_check == "M": + print("UNICODE_NFKC_QUICK_CHECK_MAYBE", end="") + if hasattr(cpd, "nfkc_quick_check") and hasattr(cpd, "nfc_quick_check"): + print(" |") + print("\t\t\t\t ", end="") + if hasattr(cpd, "nfc_quick_check"): + if cpd.nfc_quick_check == "N": + print("UNICODE_NFC_QUICK_CHECK_NO", end="") + elif cpd.nfc_quick_check == "M": + print("UNICODE_NFC_QUICK_CHECK_MAYBE", end="") + if ( + hasattr(cpd, "nfkc_quick_check") or hasattr(cpd, "nfc_quick_check") + ) and hasattr(cpd, "nfkd_quick_check"): + print(" |") + print("\t\t\t\t ", end="") + if hasattr(cpd, "nfkd_quick_check"): + if cpd.nfkd_quick_check == "N": + print("UNICODE_NFKD_QUICK_CHECK_NO", end="") + elif cpd.nfkd_quick_check == "M": + print("UNICODE_NFKD_QUICK_CHECK_MAYBE", end="") + if ( + hasattr(cpd, "nfkc_quick_check") + or hasattr(cpd, "nfc_quick_check") + or hasattr(cpd, "nfkd_quick_check") + ) and hasattr(cpd, "nfd_quick_check"): + print(" |") + print("\t\t\t\t ", end="") + if hasattr(cpd, "nfd_quick_check"): + if cpd.nfd_quick_check == "N": + print("UNICODE_NFD_QUICK_CHECK_NO", end="") + elif cpd.nfd_quick_check == "M": + 
print("UNICODE_NFD_QUICK_CHECK_MAYBE", end="") + print("),") if hasattr(cpd, "decomposition_type"): print( "\t\t.decomposition_type = %s," @@ -745,6 +939,9 @@ def write_tables_c(): "\t\t.decomposition_full_k_offset = %u," % cpd.decomposition_full_k_offset ) + if hasattr(cpd, "composition_count"): + print("\t\t.composition_count = %u," % cpd.composition_count) + print("\t\t.composition_offset = %u," % cpd.composition_offset) if hasattr(cpd, "simple_titlecase_mapping"): print( "\t\t.simple_titlecase_mapping = 0x%04X," @@ -957,6 +1154,16 @@ def write_tables_c(): print_list(ud_decompositions) print(",") print("};") + print("") + print("const uint32_t unicode_compositions[] = {") + print_list(ud_compositions) + print(",") + print("};") + print("") + print("const uint32_t unicode_composition_primaries[] = {") + print_list(ud_composition_primaries) + print(",") + print("};") sys.stdout = orig_stdout @@ -1065,6 +1272,7 @@ def main(): create_cp_range_index() expand_decompositions() + derive_canonical_compositions() create_cp_index_tables()