From: Stephan Bosch Date: Fri, 21 Mar 2025 17:18:07 +0000 (+0100) Subject: lib: unicode-data - Add relevant word break and boundary bits X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=19c9bc8bd4774d85c2bfcd09de500055e8174434;p=thirdparty%2Fdovecot%2Fcore.git lib: unicode-data - Add relevant word break and boundary bits --- diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 254c93e6d8..9bb13aee2f 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -19,7 +19,9 @@ UCD_FILES = \ $(UCD_DIR)/DerivedNormalizationProps.txt \ $(UCD_DIR)/NormalizationTest.txt \ $(UCD_DIR)/PropertyValueAliases.txt \ - $(UCD_DIR)/UnicodeData.txt + $(UCD_DIR)/PropList.txt \ + $(UCD_DIR)/UnicodeData.txt \ + $(UCD_DIR)/WordBreakProperty.txt EXTRA_DIST = \ unicode-data-tables.c \ @@ -60,8 +62,12 @@ $(UCD_DIR)/NormalizationTest.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/NormalizationTest.txt $(UCD_DIR)/PropertyValueAliases.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropertyValueAliases.txt +$(UCD_DIR)/PropList.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/PropList.txt $(UCD_DIR)/UnicodeData.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/UnicodeData.txt +$(UCD_DIR)/WordBreakProperty.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/WordBreakProperty.txt $(srcdir)/unicode-data-tables.c $(srcdir)/unicode-data-tables.h \ $(srcdir)/unicode-data-types.c $(srcdir)/unicode-data-types.h &: \ diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c index 341352a895..790656df4f 100644 --- a/src/lib/test-unicode-data.c +++ b/src/lib/test-unicode-data.c @@ -11,7 +11,9 @@ #define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt" #define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt" +#define UCD_PROP_LIST_TXT "PropList.txt" #define UCD_UNICODE_DATA_TXT "UnicodeData.txt" +#define UCD_WORD_BREAK_PROPERTY_TXT "WordBreakProperty.txt" static bool parse_prop_file_line(const char *line, const char *file, unsigned int line_num, @@ -131,6 +133,34 @@ test_derived_normalization_props_line(const char *line, unsigned int line_num) } } +static void test_prop_list_line(const char *line, unsigned int line_num) +{ + uint32_t cp_first, cp_last, cp; + const char *prop; + + if (!parse_prop_file_line(line, UCD_PROP_LIST_TXT, line_num, + &cp_first, &cp_last, &prop, NULL)) + return; + + for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) { + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + + if (strcmp(prop, "White_Space") == 0) + test_assert_idx(cp_data->pb_g_white_space, cp); + else if (strcmp(prop, "Pattern_White_Space") == 0) + test_assert_idx(cp_data->pb_i_pattern_white_space, cp); + else if (strcmp(prop, "Quotation_Mark") == 0) + test_assert_idx(cp_data->pb_m_quotation_mark, cp); + else if (strcmp(prop, "Dash") == 0) + test_assert_idx(cp_data->pb_m_dash, cp); + else if (strcmp(prop, "Sentence_Terminal") == 0) + test_assert_idx(cp_data->pb_m_sentence_terminal, cp); + else if (strcmp(prop, "Terminal_Punctuation") == 0) + test_assert_idx(cp_data->pb_m_terminal_punctuation, cp); + } +} + static void test_unicode_data_line(const char *line, unsigned int line_num) { static uint32_t cp_first = 0; @@ -307,6 +337,57 @@ static void test_unicode_data_line(const char *line, unsigned int line_num) cp_first = 0; } +static void +test_word_break_property_line(const char *line, unsigned int line_num) +{ + uint32_t cp_first, cp_last, cp; + const char *prop; + + if (!parse_prop_file_line(line, UCD_WORD_BREAK_PROPERTY_TXT, line_num, + &cp_first, &cp_last, &prop, NULL)) + return; + + for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) { + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + + if (strcmp(prop, "CR") == 0) + test_assert_idx(cp_data->pb_wb_cr, cp); + else if (strcmp(prop, "LF") == 0) + test_assert_idx(cp_data->pb_wb_lf, cp); + else if (strcmp(prop, "Newline") == 0) + test_assert_idx(cp_data->pb_wb_newline, cp); + else if (strcmp(prop, "Extend") == 0) + test_assert_idx(cp_data->pb_wb_extend, cp); + else if (strcmp(prop, "ZWJ") == 0) + test_assert_idx(cp_data->pb_wb_zwj, cp); + else if (strcmp(prop, "Regional_Indicator") == 0) + test_assert_idx(cp_data->pb_wb_regional_indicator, cp); + else if (strcmp(prop, "Format") == 0) + test_assert_idx(cp_data->pb_wb_format, cp); + else if (strcmp(prop, "Katakana") == 0) + test_assert_idx(cp_data->pb_wb_katakana, cp); + else if (strcmp(prop, "Hebrew_Letter") == 0) + test_assert_idx(cp_data->pb_wb_hebrew_letter, cp); + else if (strcmp(prop, "ALetter") == 0) + test_assert_idx(cp_data->pb_wb_aletter, cp); + else if (strcmp(prop, "Single_Quote") == 0) + test_assert_idx(cp_data->pb_wb_single_quote, cp); + else if (strcmp(prop, "Double_Quote") == 0) + test_assert_idx(cp_data->pb_wb_double_quote, cp); + else if (strcmp(prop, "MidNumLet") == 0) + test_assert_idx(cp_data->pb_wb_midnumlet, cp); + else if (strcmp(prop, "MidLetter") == 0) + test_assert_idx(cp_data->pb_wb_midletter, cp); + else if (strcmp(prop, "MidNum") == 0) + test_assert_idx(cp_data->pb_wb_midnum, cp); + else if (strcmp(prop, "Numeric") == 0) + test_assert_idx(cp_data->pb_wb_numeric, cp); + else if (strcmp(prop, "ExtendNumLet") == 0) + test_assert_idx(cp_data->pb_wb_extendnumlet, cp); + } +} + static void test_ucd_file(const char *filename, void (*test_line)(const char *line, unsigned int line_num)) @@ -358,5 +439,8 @@ void test_unicode_data(void) test_composition_exclusions_line); test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT, test_derived_normalization_props_line); + test_ucd_file(UCD_PROP_LIST_TXT, test_prop_list_line); test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line); + test_ucd_file(UCD_WORD_BREAK_PROPERTY_TXT, + test_word_break_property_line); } diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h index 63a43c5d0b..5c5a65e2b0 100644 --- a/src/lib/unicode-data-static.h +++ b/src/lib/unicode-data-static.h @@ -133,6 +133,39 @@ struct unicode_code_point_data { uint16_t composition_offset; uint32_t simple_titlecase_mapping; + + /* Property bits (UAX #44, Section 5.1) */ + + /* General */ + bool pb_g_white_space:1; + + /* Identifiers */ + bool pb_i_pattern_white_space:1; + + /* Miscellaneous */ + bool pb_m_quotation_mark:1; + bool pb_m_dash:1; + bool pb_m_sentence_terminal:1; + bool pb_m_terminal_punctuation:1; + + /* Word_Break (UAX #29, Section 4.1) */ + bool pb_wb_cr:1; + bool pb_wb_lf:1; + bool pb_wb_newline:1; + bool pb_wb_extend:1; + bool pb_wb_zwj:1; // Not currently used + bool pb_wb_regional_indicator:1; + bool pb_wb_format:1; + bool pb_wb_katakana:1; + bool pb_wb_hebrew_letter:1; + bool pb_wb_aletter:1; + bool pb_wb_single_quote:1; + bool pb_wb_double_quote:1; + bool pb_wb_midnumlet:1; + bool pb_wb_midletter:1; + bool pb_wb_midnum:1; + bool pb_wb_numeric:1; + bool pb_wb_extendnumlet:1; }; #endif diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py index 1664691f0c..923fd9a191 100755 --- a/src/lib/unicode-ucd-compile.py +++ b/src/lib/unicode-ucd-compile.py @@ -416,6 +416,138 @@ def read_ucd_files(): cpd.nfkc_quick_check = value CodePointRange(cprng[0], cprng[1], cpd) + # PropList.txt + with UCDFileOpen("PropList.txt") as ucd: + line_num = 0 + for line in ucd.fd: + line_num = line_num + 1 + data = line.split("#") + line = data[0].strip() + if len(line) == 0: + continue + + cols = line.split(";") + if len(cols) < 2: + die(f"{ucd}:{line_num}: Missing columns") + + cprng = parse_cp_range(cols[0]) + if cprng is None: + continue + + prop = cols[1].strip() + if prop == "White_Space": + cpd = CodePointData() + cpd.pb_g_white_space = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Pattern_White_Space": + cpd = CodePointData() + cpd.pb_i_pattern_white_space = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Quotation_Mark": + cpd = CodePointData() + cpd.pb_m_quotation_mark = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Dash": + cpd = CodePointData() + cpd.pb_m_dash = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Sentence_Terminal": + cpd = CodePointData() + cpd.pb_m_sentence_terminal = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Terminal_Punctuation": + cpd = CodePointData() + cpd.pb_m_terminal_punctuation = True + CodePointRange(cprng[0], cprng[1], cpd) + + # WordBreakProperty.txt + with UCDFileOpen("WordBreakProperty.txt") as ucd: + line_num = 0 + for line in ucd.fd: + line_num = line_num + 1 + data = line.split("#") + line = data[0].strip() + if len(line) == 0: + continue + + cols = line.split(";") + if len(cols) < 2: + die(f"{ucd}:{line_num}: Missing columns") + + cprng = parse_cp_range(cols[0]) + if cprng is None: + continue + + prop = cols[1].strip() + if prop == "CR": + cpd = CodePointData() + cpd.pb_wb_cr = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "LF": + cpd = CodePointData() + cpd.pb_wb_lf = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Newline": + cpd = CodePointData() + cpd.pb_wb_newline = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Extend": + cpd = CodePointData() + cpd.pb_wb_extend = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "ZWJ": + cpd = CodePointData() + cpd.pb_wb_zwj = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Regional_Indicator": + cpd = CodePointData() + cpd.pb_wb_regional_indicator = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Format": + cpd = CodePointData() + cpd.pb_wb_format = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Katakana": + cpd = CodePointData() + cpd.pb_wb_katakana = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Hebrew_Letter": + cpd = CodePointData() + cpd.pb_wb_hebrew_letter = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "ALetter": + cpd = CodePointData() + cpd.pb_wb_aletter = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Single_Quote": + cpd = CodePointData() + cpd.pb_wb_single_quote = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Double_Quote": + cpd = CodePointData() + cpd.pb_wb_double_quote = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "MidNumLet": + cpd = CodePointData() + cpd.pb_wb_midnumlet = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "MidLetter": + cpd = CodePointData() + cpd.pb_wb_midletter = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "MidNum": + cpd = CodePointData() + cpd.pb_wb_midnum = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Numeric": + cpd = CodePointData() + cpd.pb_wb_numeric = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "ExtendNumLet": + cpd = CodePointData() + cpd.pb_wb_extendnumlet = True + CodePointRange(cprng[0], cprng[1], cpd) + def expand_decompositions(): global ud_codepoints @@ -947,6 +1079,52 @@ def write_tables_c(): "\t\t.simple_titlecase_mapping = 0x%04X," % cpd.simple_titlecase_mapping ) + if hasattr(cpd, "pb_g_white_space"): + print("\t\t.pb_g_white_space = TRUE,") + if hasattr(cpd, "pb_i_pattern_white_space"): + print("\t\t.pb_i_pattern_white_space = TRUE,") + if hasattr(cpd, "pb_m_quotation_mark"): + print("\t\t.pb_m_quotation_mark = TRUE,") + if hasattr(cpd, "pb_m_dash"): + print("\t\t.pb_m_dash = TRUE,") + if hasattr(cpd, "pb_m_sentence_terminal"): + print("\t\t.pb_m_sentence_terminal = TRUE,") + if hasattr(cpd, "pb_m_terminal_punctuation"): + print("\t\t.pb_m_terminal_punctuation = TRUE,") + if hasattr(cpd, "pb_wb_cr"): + print("\t\t.pb_wb_cr = TRUE,") + if hasattr(cpd, "pb_wb_lf"): + print("\t\t.pb_wb_lf = TRUE,") + if hasattr(cpd, "pb_wb_newline"): + print("\t\t.pb_wb_newline = TRUE,") + if hasattr(cpd, "pb_wb_extend"): + print("\t\t.pb_wb_extend = TRUE,") + if hasattr(cpd, "pb_wb_zwj"): + print("\t\t.pb_wb_zwj = TRUE,") + if hasattr(cpd, "pb_wb_regional_indicator"): + print("\t\t.pb_wb_regional_indicator = TRUE,") + if hasattr(cpd, "pb_wb_format"): + print("\t\t.pb_wb_format = TRUE,") + if hasattr(cpd, "pb_wb_katakana"): + print("\t\t.pb_wb_katakana = TRUE,") + if hasattr(cpd, "pb_wb_hebrew_letter"): + print("\t\t.pb_wb_hebrew_letter = TRUE,") + if hasattr(cpd, "pb_wb_aletter"): + print("\t\t.pb_wb_aletter = TRUE,") + if hasattr(cpd, "pb_wb_single_quote"): + print("\t\t.pb_wb_single_quote = TRUE,") + if hasattr(cpd, "pb_wb_double_quote"): + print("\t\t.pb_wb_double_quote = TRUE,") + if hasattr(cpd, "pb_wb_midnumlet"): + print("\t\t.pb_wb_midnumlet = TRUE,") + if hasattr(cpd, "pb_wb_midletter"): + print("\t\t.pb_wb_midletter = TRUE,") + if hasattr(cpd, "pb_wb_midnum"): + print("\t\t.pb_wb_midnum = TRUE,") + if hasattr(cpd, "pb_wb_numeric"): + print("\t\t.pb_wb_numeric = TRUE,") + if hasattr(cpd, "pb_wb_extendnumlet"): + print("\t\t.pb_wb_extendnumlet = TRUE,") print("\t},") print("};") print("")