From: Stephan Bosch Date: Tue, 22 Apr 2025 00:55:05 +0000 (+0200) Subject: lib: unicode-data - Add data for first and full code point decomposition X-Git-Tag: 2.4.2~627 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=618ce3aee856d2eb450ed3bd209242bcd7506549;p=thirdparty%2Fdovecot%2Fcore.git lib: unicode-data - Add data for first and full code point decomposition --- diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 1d98248ecf..e3927510f9 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -6,6 +6,8 @@ noinst_LTLIBRARIES = liblib.la BUILT_SOURCES = $(srcdir)/unicodemap.c \ $(srcdir)/unicode-data-tables.c \ $(srcdir)/unicode-data-tables.h \ + $(srcdir)/unicode-data-types.c \ + $(srcdir)/unicode-data-types.h \ event-filter-lexer.c \ event-filter-parser.c \ event-filter-parser.h @@ -14,12 +16,15 @@ UCD_URL = https://dovecot.org/res UCD_DIR = $(srcdir)/ucd UCD_FILES = \ $(UCD_DIR)/DerivedCoreProperties.txt \ + $(UCD_DIR)/PropertyValueAliases.txt \ $(UCD_DIR)/UnicodeData.txt EXTRA_DIST = \ unicodemap.c \ unicode-data-tables.c \ unicode-data-tables.h \ + unicode-data-types.c \ + unicode-data-types.h \ unicodemap.pl \ unicode-ucd-compile.py \ $(UCD_FILES) @@ -47,12 +52,15 @@ event-filter-parser.h: event-filter-parser.c $(UCD_DIR)/DerivedCoreProperties.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt +$(UCD_DIR)/PropertyValueAliases.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt $(UCD_DIR)/UnicodeData.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/UnicodeData.txt $(srcdir)/unicodemap.c: $(srcdir)/unicodemap.pl $(UCD_DIR)/UnicodeData.txt $(AM_V_GEN)$(PERL) $(srcdir)/unicodemap.pl < $(UCD_DIR)/UnicodeData.txt > $@ -$(srcdir)/unicode-data-tables.c $(srcdir)/unicode-data-tables.h &: \ +$(srcdir)/unicode-data-tables.c $(srcdir)/unicode-data-tables.h \ + $(srcdir)/unicode-data-types.c $(srcdir)/unicode-data-types.h &: \ $(srcdir)/unicode-ucd-compile.py $(UCD_FILES) $(AM_V_GEN)$(PYTHON) $(srcdir)/unicode-ucd-compile.py $(UCD_DIR) $(srcdir) @@ -213,6 +221,7 @@ liblib_la_SOURCES = \ unlink-directory.c \ unlink-old-files.c \ unichar.c \ + unicode-data-types.c \ unicode-data-tables.c \ unicode-data.c \ uri-util.c \ @@ -376,6 +385,7 @@ headers = \ unlink-old-files.h \ unichar.h \ unicode-data-static.h \ + unicode-data-types.h \ unicode-data-tables.h \ unicode-data.h \ uri-util.h \ diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c index b01b57b658..3edef44122 100644 --- a/src/lib/test-unicode-data.c +++ b/src/lib/test-unicode-data.c @@ -69,6 +69,29 @@ static void test_unicode_data_line(const char *line, unsigned int line_num) } test_assert(!unicode_general_category_is_group(general_category)); + /* Parse Decomposition_* */ + + const char *decomp_spec = columns[5]; + enum unicode_decomposition_type decomp_type = + UNICODE_DECOMPOSITION_TYPE_CANONICAL; + + if (*decomp_spec == '<') { + const char *p = strchr(decomp_spec + 1, '>'); + + if (p == NULL || *(p + 1) != ' ') { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad Decomposition for code point %"PRIu32": %s", + UCD_UNICODE_DATA_TXT, line_num, cp, columns[5])); + return; + } + decomp_type = unicode_decomposition_type_from_string( + t_strdup_until(decomp_spec + 1, p)); + decomp_spec = p + 2; + } + + const char *const *decomp = t_strsplit(decomp_spec, " "); + /* Parse Simple_*case_Mapping */ uint32_t simple_uppercase_mapping = 0; @@ -113,6 +136,35 @@ static void test_unicode_data_line(const char *line, unsigned int line_num) test_assert_idx( cp_data->general_category == general_category, cp); + const uint32_t *cp_decomp; + size_t cp_decomp_len, cp_decomp_idx; + uint8_t cp_decomp_type; + + cp_decomp_len = + unicode_code_point_data_get_first_decomposition( + cp_data, &cp_decomp_type, &cp_decomp); + test_assert(str_array_length(decomp) == cp_decomp_len); + if (test_has_failed()) + break; + + test_assert_idx( + (cp_decomp_type == decomp_type || + cp_decomp_type == UNICODE_DECOMPOSITION_TYPE_COMPAT), + cp); + cp_decomp_idx = 0; + while (*decomp != NULL && !test_has_failed()) { + uint32_t dcp; + + test_assert_idx(str_to_uint32_hex(*decomp, &dcp) >= 0, cp); + if (test_has_failed()) + break; + test_assert_idx(uni_is_valid_ucs4(dcp), cp); + test_assert_idx(dcp == cp_decomp[cp_decomp_idx], cp); + + cp_decomp_idx++; + decomp++; + } + test_assert_idx( cp_data->simple_titlecase_mapping == simple_titlecase_mapping, cp); diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h index 0258548985..ffc61bb8ac 100644 --- a/src/lib/unicode-data-static.h +++ b/src/lib/unicode-data-static.h @@ -96,6 +96,15 @@ enum unicode_general_category { struct unicode_code_point_data { uint8_t general_category; // Not yet used + uint8_t decomposition_type; // Not yet used + uint8_t decomposition_first_length; + uint8_t decomposition_full_length; + uint8_t decomposition_full_k_length; + + uint16_t decomposition_first_offset; + uint16_t decomposition_full_offset; + uint16_t decomposition_full_k_offset; + uint32_t simple_titlecase_mapping; }; diff --git a/src/lib/unicode-data.h b/src/lib/unicode-data.h index 6b156f1b30..eace84639f 100644 --- a/src/lib/unicode-data.h +++ b/src/lib/unicode-data.h @@ -24,6 +24,48 @@ unicode_code_point_get_data(uint32_t cp) return &unicode_code_points[idxcp]; } +static inline size_t +unicode_code_point_data_get_first_decomposition( + const struct unicode_code_point_data *cp_data, + uint8_t *type_r, const uint32_t **decomp_r) +{ + uint32_t offset; + + if (type_r != NULL) + *type_r = cp_data->decomposition_type; + offset = cp_data->decomposition_first_offset; + *decomp_r = &unicode_decompositions[offset]; + return cp_data->decomposition_first_length; +} + +static inline size_t +unicode_code_point_data_get_full_decomposition( + const struct unicode_code_point_data *cp_data, bool canonical, + const uint32_t **decomp_r) +{ + uint32_t offset; + + if (canonical) { + offset = cp_data->decomposition_full_offset; + *decomp_r = &unicode_decompositions[offset]; + return cp_data->decomposition_full_length; + } + offset = cp_data->decomposition_full_k_offset; + *decomp_r = &unicode_decompositions[offset]; + return cp_data->decomposition_full_k_length; +} + +static inline size_t +unicode_code_point_get_full_decomposition(uint32_t cp, bool canonical, + const uint32_t **decomp_r) +{ + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + + return unicode_code_point_data_get_full_decomposition( + cp_data, canonical, decomp_r); +} + uint8_t unicode_general_category_from_string(const char *str); #endif diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py index acd3653983..e7283e91f9 100755 --- a/src/lib/unicode-ucd-compile.py +++ b/src/lib/unicode-ucd-compile.py @@ -30,6 +30,10 @@ ud_codepoints_index16_blocks = 1 ud_codepoints_index24_blocks = 2 ud_codepoints_index32_blocks = 2 +ud_decomposition_type_names = [] +ud_decompositions = [] +ud_decomposition_max_length = 0 + class UCDFileOpen: def __init__(self, filename): @@ -256,6 +260,25 @@ def read_ucd_files(): global ud_decomposition_type_names global ud_composition_exclusions + # PropertyValueAliases.txt + with UCDFileOpen("PropertyValueAliases.txt") as ucd: + line_num = 0 + for line in ucd.fd: + line_num = line_num + 1 + data = line.split("#") + line = data[0].strip() + if len(line) == 0: + continue + + cols = line.split(";") + if len(cols) < 3: + die(f"{ucd}:{line_num}: Missing columns") + + prop = cols[0].strip() + if prop == "dt": + lval = cols[2].strip() + ud_decomposition_type_names.append(lval) + # UnicodeData.txt with UCDFileOpen("UnicodeData.txt") as ucd: cp_range_first = None @@ -336,6 +359,142 @@ def read_ucd_files(): CodePointRange(cp_first, cp_last, cpd) +def expand_decompositions(): + global ud_codepoints + global ud_codepoints_index + global ud_decompositions + global ud_decomposition_max_length + + # Record first decompositions in ud_decompositions table + for cpr in ud_codepoints: + cpd = cpr.data + + if not hasattr(cpd, "decomposition_first") or len(cpd.decomposition_first) == 0: + continue + + dc = cpd.decomposition_first + cpd.decomposition_offset = len(ud_decompositions) + cpd.decomposition_length = len(dc) + ud_decompositions = ud_decompositions + dc + if len(dc) > ud_decomposition_max_length: + ud_decomposition_max_length = len(dc) + + # Expand all decompositions + for cpr in ud_codepoints: + if cpr.cp_last > cpr.cp_first: + # No decompositions in ranges expected, ever + continue + cpd = cpr.data + + if not hasattr(cpd, "decomposition_first") or len(cpd.decomposition_first) == 0: + continue + + dc_type = None + if hasattr(cpd, "decomposition_type"): + dc_type = cpd.decomposition_type + + # Canonical + dc = [] + + finished = False + changed = False + if dc_type is None: + dc = cpd.decomposition_first + else: + finished = True + changed = True + + while not finished: + finished = True + + dc_new = [] + for dcp in dc: + if dcp not in ud_codepoints_index: + dc_new.append(dcp) + continue + + scpr = ud_codepoints_index[dcp] + scpd = scpr.data + + if ( + hasattr(scpd, "decomposition_type") + or not hasattr(scpd, "decomposition_first") + or ( + len(scpd.decomposition_first) == 1 + and scpd.decomposition_first[0] == dcp + ) + ): + dc_new.append(dcp) + continue + + finished = False + changed = True + dc_new = dc_new + scpd.decomposition_first + + if not finished: + dc = dc_new + + if not changed: + if hasattr(cpd, "decomposition_offset"): + cpd.decomposition_full_offset = cpd.decomposition_offset + cpd.decomposition_full_length = cpd.decomposition_length + elif len(dc) == 0: + pass + else: + cpd.decomposition_full_offset = len(ud_decompositions) + cpd.decomposition_full_length = len(dc) + ud_decompositions = ud_decompositions + dc + if len(dc) > ud_decomposition_max_length: + ud_decomposition_max_length = len(dc) + + dc_c = dc + + # Compatibility + dc = cpd.decomposition_first + + finished = False + changed = False + while not finished: + finished = True + + dc_new = [] + for dcp in dc: + if dcp not in ud_codepoints_index: + dc_new.append(dcp) + continue + + scpr = ud_codepoints_index[dcp] + scpd = scpr.data + + if not hasattr(scpd, "decomposition_first") or ( + len(scpd.decomposition_first) == 1 + and scpd.decomposition_first[0] == dcp + ): + dc_new.append(dcp) + continue + + finished = False + changed = True + dc_new = dc_new + scpd.decomposition_first + + if not finished: + dc = dc_new + + if not changed: + if hasattr(cpd, "decomposition_offset"): + cpd.decomposition_full_k_offset = cpd.decomposition_offset + cpd.decomposition_full_k_length = cpd.decomposition_length + elif dc == dc_c: + cpd.decomposition_full_k_offset = cpd.decomposition_full_offset + cpd.decomposition_full_k_length = cpd.decomposition_full_length + else: + cpd.decomposition_full_k_offset = len(ud_decompositions) + cpd.decomposition_full_k_length = len(dc) + ud_decompositions = ud_decompositions + dc + if len(dc) > ud_decomposition_max_length: + ud_decomposition_max_length = len(dc) + + def create_cp_range_index(): global ud_codepoints global ud_codepoints_index @@ -501,7 +660,11 @@ def write_tables_h(): print("#define UNICODE_DATA_TABLES_H") print("") print_top_message() - print('#include "unicode-data-static.h"') + print('#include "unicode-data-types.h"') + print("") + print( + "#define UNICODE_DECOMPOSITION_MAX_LENGTH %s" % ud_decomposition_max_length + ) print("") print("extern const struct unicode_code_point_data unicode_code_points[];") print("") @@ -510,6 +673,8 @@ def write_tables_h(): print("extern const uint16_t unicode_code_points_index24[];") print("extern const uint16_t unicode_code_points_index32[];") print("") + print("extern const uint32_t unicode_decompositions[];") + print("") print("#endif") sys.stdout = orig_stdout @@ -554,6 +719,32 @@ def write_tables_c(): "\t\t.general_category = %s," % get_general_category_def(cpd.general_category) ) + if hasattr(cpd, "decomposition_type"): + print( + "\t\t.decomposition_type = %s," + % decomposition_type_def(cpd.decomposition_type) + ) + if hasattr(cpd, "decomposition_length"): + print("\t\t.decomposition_first_length = %u," % cpd.decomposition_length) + print("\t\t.decomposition_first_offset = %u," % cpd.decomposition_offset) + if hasattr(cpd, "decomposition_full_length"): + print( + "\t\t.decomposition_full_length = %u," + % cpd.decomposition_full_length + ) + print( + "\t\t.decomposition_full_offset = %u," + % cpd.decomposition_full_offset + ) + if hasattr(cpd, "decomposition_full_k_length"): + print( + "\t\t.decomposition_full_k_length = %u," + % cpd.decomposition_full_k_length + ) + print( + "\t\t.decomposition_full_k_offset = %u," + % cpd.decomposition_full_k_offset + ) if hasattr(cpd, "simple_titlecase_mapping"): print( "\t\t.simple_titlecase_mapping = 0x%04X," @@ -761,6 +952,85 @@ def write_tables_c(): print(" ", end="") print(",") print("};") + print("") + print("const uint32_t unicode_decompositions[] = {") + print_list(ud_decompositions) + print(",") + print("};") + + sys.stdout = orig_stdout + + +def write_types_h(): + global output_dir + global ud_decomposition_type_names + + orig_stdout = sys.stdout + + with open(output_dir + "/unicode-data-types.h", mode="w", encoding="utf-8") as fd: + sys.stdout = fd + + print("#ifndef UNICODE_DATA_TYPES_H") + print("#define UNICODE_DATA_TYPES_H") + print("") + print_top_message() + print('#include "unicode-data-static.h"') + print("") + print("/* Decomposition_Type */") + print("enum unicode_decomposition_type {") + print("\t/* Canonical */") + print("\tUNICODE_DECOMPOSITION_TYPE_CANONICAL = 0,") + for dt in ud_decomposition_type_names: + dt_uc = dt.upper() + + if dt_uc == "CANONICAL": + continue + + print("\t/* <%s> */" % dt) + print("\tUNICODE_DECOMPOSITION_TYPE_%s," % dt_uc) + print("};") + print("") + print("/* Decomposition_Type */") + print("enum unicode_decomposition_type") + print("unicode_decomposition_type_from_string(const char *str);") + print("") + print("#endif") + + sys.stdout = orig_stdout + + +def write_types_c(): + global output_dir + global ud_decomposition_type_names + + orig_stdout = sys.stdout + + with open(output_dir + "/unicode-data-types.c", mode="w", encoding="utf-8") as fd: + sys.stdout = fd + + print_top_message() + print('#include "lib.h"') + print('#include "unicode-data-types.h"') + print("") + print("/* Decomposition_Type */") + print("enum unicode_decomposition_type") + print("unicode_decomposition_type_from_string(const char *str)") + print("{") + print("\t/* Canonical */") + print('\tif (strcasecmp(str, "Canonical") == 0)') + print("\t\treturn UNICODE_DECOMPOSITION_TYPE_CANONICAL;") + for dt in ud_decomposition_type_names: + dt_uc = dt.upper() + + if dt_uc == "CANONICAL": + continue + + print("\t/* <%s> */" % dt) + print('\telse if (strcasecmp(str, "%s") == 0)' % dt) + print("\t\treturn UNICODE_DECOMPOSITION_TYPE_%s;" % dt_uc) + print("") + print("\treturn UNICODE_DECOMPOSITION_TYPE_CANONICAL;") + print("}") sys.stdout = orig_stdout @@ -794,11 +1064,14 @@ def main(): source_files.sort() create_cp_range_index() + expand_decompositions() create_cp_index_tables() write_tables_h() write_tables_c() + write_types_h() + write_types_c() if __name__ == "__main__":