From: Stephan Bosch Date: Fri, 11 Apr 2025 03:35:10 +0000 (+0200) Subject: lib: unicode - Implement text segmentation at grapheme cluster boundaries X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3f43b66652952ba1177ad05922f61a71f9523d2f;p=thirdparty%2Fdovecot%2Fcore.git lib: unicode - Implement text segmentation at grapheme cluster boundaries --- diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am index 52331348bc..1e4b850fd5 100644 --- a/src/lib/Makefile.am +++ b/src/lib/Makefile.am @@ -22,6 +22,9 @@ UCD_FILES = \ $(UCD_DIR)/CompositionExclusions.txt \ $(UCD_DIR)/DerivedCoreProperties.txt \ $(UCD_DIR)/DerivedNormalizationProps.txt \ + $(UCD_DIR)/emoji-data.txt \ + $(UCD_DIR)/GraphemeBreakProperty.txt \ + $(UCD_DIR)/GraphemeBreakTest.txt \ $(UCD_DIR)/NormalizationTest.txt \ $(UCD_DIR)/PropertyValueAliases.txt \ $(UCD_DIR)/PropList.txt \ @@ -66,6 +69,12 @@ $(UCD_DIR)/DerivedCoreProperties.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt $(UCD_DIR)/DerivedNormalizationProps.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt +$(UCD_DIR)/emoji-data.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/emoji-data.txt +$(UCD_DIR)/GraphemeBreakProperty.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/GraphemeBreakProperty.txt +$(UCD_DIR)/GraphemeBreakTest.txt: + $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/GraphemeBreakTest.txt $(UCD_DIR)/NormalizationTest.txt: $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/NormalizationTest.txt $(UCD_DIR)/PropertyValueAliases.txt: @@ -241,6 +250,7 @@ liblib_la_SOURCES = \ unlink-directory.c \ unlink-old-files.c \ unichar.c \ + unicode-break.c \ unicode-data-types.c \ unicode-data-tables.c \ unicode-data.c \ @@ -406,6 +416,7 @@ headers = \ unlink-directory.h \ unlink-old-files.h \ unichar.h \ + unicode-break.h \ unicode-data-static.h \ unicode-data-types.h \ unicode-data-tables.h \ @@ -518,6 +529,7 @@ 
test_lib_SOURCES = \ test-str-table.c \ test-time-util.c \ test-unichar.c \ + test-unicode-break.c \ test-unicode-data.c \ test-unicode-nf.c \ test-unicode-casemap.c \ diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc index a27df9af9c..91c6a97124 100644 --- a/src/lib/test-lib.inc +++ b/src/lib/test-lib.inc @@ -106,6 +106,7 @@ TEST(test_str_sanitize) TEST(test_str_table) TEST(test_time_util) TEST(test_unichar) +TEST(test_unicode_break) TEST(test_unicode_data) TEST(test_unicode_nf) TEST(test_unicode_casemap) diff --git a/src/lib/test-unicode-break.c b/src/lib/test-unicode-break.c new file mode 100644 index 0000000000..31eb23c398 --- /dev/null +++ b/src/lib/test-unicode-break.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */ + +#include "test-lib.h" +#include "strnum.h" +#include "str.h" +#include "array.h" +#include "istream.h" +#include "unichar.h" +#include "unicode-break.h" + +#include + +#define UCD_GRAPHEME_BREAK_TEST_TXT "GraphemeBreakTest.txt" + +#define BREAK_MARKER "\xc3\xb7" +#define NO_BREAK_MARKER "\xc3\x97" + +static void +test_gcb_line(const char *file, const char *line, unsigned int line_num) +{ + struct unicode_gc_break ubrk; + const char *const *tokens = t_strsplit(line, " "); + + unicode_gc_break_init(&ubrk); + while (tokens[0] != NULL && tokens[1] != NULL && !test_has_failed()) { + const char *brk = tokens[0]; + const char *cp_hex = tokens[1]; + bool break_m1_test = FALSE; + uint32_t cp; + + if (strcmp(brk, BREAK_MARKER) == 0) + break_m1_test = TRUE; + else if (strcmp(brk, NO_BREAK_MARKER) != 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad break marker", file, line_num)); + return; + } + + if (str_to_uint32_hex(cp_hex, &cp) < 0) { + test_failed(t_strdup_printf( + "Invalid data at %s:%u: " + "Bad code point", file, line_num)); + return; + } + + const struct unicode_code_point_data *cp_data = NULL; + bool break_m1; + + break_m1 = unicode_gc_break_cp(&ubrk, cp, &cp_data); + + 
test_assert_idx(break_m1 == break_m1_test, line_num); + + tokens += 2; + } + + test_assert_strcmp_idx(tokens[0], BREAK_MARKER, line_num); +} +/* Runs test_line() on each line of the given UCD file after stripping its '#' comment and trailing whitespace; blank lines are skipped and reading stops at the first test failure. */ +static void +test_ucd_file(const char *file, + void (*test_line)(const char *file, const char *line, + unsigned int line_num)) +{ + const char *file_path = t_strconcat(UCD_DIR, "/", file, NULL); + + test_begin(t_strdup_printf("unicode_break - %s", file)); + + struct istream *input = i_stream_create_file(file_path, 1024); + unsigned int line_num = 0; + + while (!test_has_failed()) { + char *line = i_stream_read_next_line(input); + if (line == NULL) + break; + line_num++; + + /* cut at the '#' comment, or at the terminating NUL when there is none (so no data byte is lost and empty lines are safe), then drop trailing whitespace */ + char *end = strchr(line, '#'); + if (end == NULL) + end = &line[strlen(line)]; + while (end > line && (end[-1] == '\t' || end[-1] == ' ')) + end--; + *end = '\0'; + if (*line == '\0') + continue; + + T_BEGIN { + test_line(file, line, line_num); + } T_END; + } + + i_stream_destroy(&input); + test_end(); +} + +void test_unicode_break(void) +{ + test_ucd_file(UCD_GRAPHEME_BREAK_TEST_TXT, test_gcb_line); +} diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c index 5ee9eb205b..1a44f44aae 100644 --- a/src/lib/test-unicode-data.c +++ b/src/lib/test-unicode-data.c @@ -12,6 +12,7 @@ #define UCD_CASE_FOLDING_TXT "CaseFolding.txt" #define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt" #define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt" +#define UCD_GRAPHEME_BREAK_PROPERTY_TXT "GraphemeBreakProperty.txt" #define UCD_PROP_LIST_TXT "PropList.txt" #define UCD_SPECIAL_CASING_TXT "SpecialCasing.txt" #define UCD_UNICODE_DATA_TXT "UnicodeData.txt" @@ -225,6 +226,49 @@ test_derived_normalization_props_line(const char *line, unsigned int line_num) } } +static void +test_grapheme_break_property_line(const char *line, unsigned int line_num) +{ + uint32_t cp_first, cp_last, cp; + const char *prop; + + if (!parse_prop_file_line(line, 
UCD_GRAPHEME_BREAK_PROPERTY_TXT, + line_num, &cp_first, &cp_last, &prop, NULL)) + return; + + for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) { + const struct unicode_code_point_data *cp_data = + unicode_code_point_get_data(cp); + + if (strcmp(prop, "CR") == 0) + test_assert_idx(cp_data->pb_b_cr, cp); + else if (strcmp(prop, "LF") == 0) + test_assert_idx(cp_data->pb_b_lf, cp); + else if (strcmp(prop, "Control") == 0) + test_assert_idx(cp_data->pb_gcb_control, cp); + else if (strcmp(prop, "Extend") == 0) + test_assert_idx(cp_data->pb_gcb_extend, cp); + else if (strcmp(prop, "ZWJ") == 0) + test_assert_idx(cp_data->pb_b_zwj, cp); + else if (strcmp(prop, "Regional_Indicator") == 0) + test_assert_idx(cp_data->pb_b_regional_indicator, cp); + else if (strcmp(prop, "Prepend") == 0) + test_assert_idx(cp_data->pb_gcb_prepend, cp); + else if (strcmp(prop, "SpacingMark") == 0) + test_assert_idx(cp_data->pb_gcb_spacingmark, cp); + else if (strcmp(prop, "L") == 0) + test_assert_idx(cp_data->pb_gcb_l, cp); + else if (strcmp(prop, "V") == 0) + test_assert_idx(cp_data->pb_gcb_v, cp); + else if (strcmp(prop, "T") == 0) + test_assert_idx(cp_data->pb_gcb_t, cp); + else if (strcmp(prop, "LV") == 0) + test_assert_idx(cp_data->pb_gcb_lv, cp); + else if (strcmp(prop, "LVT") == 0) + test_assert_idx(cp_data->pb_gcb_lvt, cp); + } +} + static void test_prop_list_line(const char *line, unsigned int line_num) { uint32_t cp_first, cp_last, cp; @@ -600,6 +644,8 @@ void test_unicode_data(void) test_composition_exclusions_line); test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT, test_derived_normalization_props_line); + test_ucd_file(UCD_GRAPHEME_BREAK_PROPERTY_TXT, + test_grapheme_break_property_line); test_ucd_file(UCD_PROP_LIST_TXT, test_prop_list_line); test_ucd_file(UCD_SPECIAL_CASING_TXT, test_special_casing_line); test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line); diff --git a/src/lib/unicode-break.c b/src/lib/unicode-break.c new file mode 100644 index 
0000000000..3fb660f5ad --- /dev/null +++ b/src/lib/unicode-break.c @@ -0,0 +1,257 @@ +/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "unicode-data.h" +#include "unicode-break.h" + +/* This file implements the Unicode Text Segmentation algorithms as specified in + Unicode Standard Annex #29. + */ + +/* + * Grapheme Cluster Boundaries (Unicode Standard Annex #29, Section 3) + */ + +void unicode_gc_break_init(struct unicode_gc_break *gcbrk) +{ + i_zero(gcbrk); +} + +bool unicode_gc_break_cp(struct unicode_gc_break *gcbrk, uint32_t cp, + const struct unicode_code_point_data **_cp_data) +{ + if (*_cp_data == NULL) + *_cp_data = unicode_code_point_get_data(cp); + + const struct unicode_code_point_data *cp_data = *_cp_data; + int bstatus = -1; /* -1 = undecided, 0 = no break, 1 = break; the first rule to decide wins */ + + /* GB1: Break at the start and end of text. + sot + Any + Any + eot + */ + if (!gcbrk->gb1) { + gcbrk->gb1 = TRUE; + bstatus = 1; + } + + /* GB3: Do not break between a CR and LF. + CR x LF + */ + if (gcbrk->gb3) { + if (cp_data->pb_b_lf) { + if (bstatus < 0) + bstatus = 0; + } + if (!cp_data->pb_b_cr) + gcbrk->gb3 = FALSE; + } else if (cp_data->pb_b_cr) { + gcbrk->gb3 = TRUE; + } + + /* GB4, GB5: Break before and after controls. + (Control | CR | LF) + + + (Control | CR | LF) + */ + if (gcbrk->gb4) { + /* GB4: (Control | CR | LF) / */ + if (bstatus < 0) + bstatus = 1; + if (!cp_data->pb_b_cr && !cp_data->pb_b_lf && + !cp_data->pb_gcb_control) + gcbrk->gb4 = FALSE; + } else if (cp_data->pb_b_cr || cp_data->pb_b_lf || + cp_data->pb_gcb_control) { + gcbrk->gb4 = TRUE; + /* GB5: / (Control | CR | LF) */ + if (bstatus < 0) + bstatus = 1; + } + + /* GB6: Do not break Hangul syllable or other conjoining sequences. 
+ L x (L | V | LV | LVT) + */ + if (gcbrk->gb6) { + if (cp_data->pb_gcb_v || cp_data->pb_gcb_lv || + cp_data->pb_gcb_lvt) { + if (bstatus < 0) + bstatus = 0; + gcbrk->gb6 = FALSE; + } else if (cp_data->pb_gcb_l) { + if (bstatus < 0) + bstatus = 0; + } else { + gcbrk->gb6 = FALSE; + } + } else if (cp_data->pb_gcb_l) { + gcbrk->gb6 = TRUE; + } + + /* GB7: Do not break Hangul syllable or other conjoining sequences. + (LV | V) x (V | T) + */ + if (gcbrk->gb7) { + if (cp_data->pb_gcb_t) { + if (bstatus < 0) + bstatus = 0; + gcbrk->gb7 = FALSE; + } else if (cp_data->pb_gcb_v) { + if (bstatus < 0) + bstatus = 0; + } else { + gcbrk->gb7 = cp_data->pb_gcb_lv; /* an LV here ends the old run but arms GB7 for the next code point */ + } + } else if (cp_data->pb_gcb_lv || cp_data->pb_gcb_v) { + gcbrk->gb7 = TRUE; + } + + /* GB8: Do not break Hangul syllable or other conjoining sequences. + (LVT | T) x T + */ + if (gcbrk->gb8) { + if (!cp_data->pb_gcb_t) + gcbrk->gb8 = cp_data->pb_gcb_lvt; /* an LVT here ends the old run but arms GB8 for the next code point */ + else { + if (bstatus < 0) + bstatus = 0; + } + } else if (cp_data->pb_gcb_lvt || cp_data->pb_gcb_t) { + gcbrk->gb8 = TRUE; + } + + /* GB9: Do not break before extending characters or ZWJ. + x (Extend | ZWJ) + */ + if (cp_data->pb_gcb_extend || cp_data->pb_b_zwj) { + if (bstatus < 0) + bstatus = 0; + } + + /* GB9a: Do not break before SpacingMarks. + x SpacingMark + */ + if (cp_data->pb_gcb_spacingmark) { + if (bstatus < 0) + bstatus = 0; + } + + /* GB9b: Do not break after Prepend characters. + Prepend x + */ + if (gcbrk->gb9b) { + if (bstatus < 0) + bstatus = 0; + if (!cp_data->pb_gcb_prepend) + gcbrk->gb9b = FALSE; + } else if (cp_data->pb_gcb_prepend) { + gcbrk->gb9b = TRUE; + } + + /* GB9c: Do not break within Indic conjuncts (InCB): Consonant [Extend Linker]* Linker [Extend Linker]* x Consonant 
+ */ + enum { + GB9C_STATE_NONE = 0, + GB9C_STATE_CONSONANT, + GB9C_STATE_LINKER, + }; + switch (gcbrk->gb9c) { + case GB9C_STATE_NONE: + switch (cp_data->indic_conjunct_break) { + case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT: + gcbrk->gb9c = GB9C_STATE_CONSONANT; + break; + default: + break; + } + break; + case GB9C_STATE_CONSONANT: + switch (cp_data->indic_conjunct_break) { + case UNICODE_INDIC_CONJUNCT_BREAK_LINKER: + gcbrk->gb9c = GB9C_STATE_LINKER; /* fall through */ + case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT: + case UNICODE_INDIC_CONJUNCT_BREAK_EXTEND: + break; + default: + gcbrk->gb9c = GB9C_STATE_NONE; + break; + } + break; + case GB9C_STATE_LINKER: + switch (cp_data->indic_conjunct_break) { + case UNICODE_INDIC_CONJUNCT_BREAK_LINKER: + case UNICODE_INDIC_CONJUNCT_BREAK_EXTEND: + break; + case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT: + if (bstatus < 0) + bstatus = 0; + gcbrk->gb9c = GB9C_STATE_CONSONANT; + break; + default: + gcbrk->gb9c = GB9C_STATE_NONE; + break; + } + break; + default: + i_unreached(); + } + + /* GB11: Do not break within emoji ZWJ sequences. + \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic} + */ + enum { + GB11_STATE_NONE = 0, + GB11_STATE_EP, + GB11_STATE_ZWJ, + }; + switch (gcbrk->gb11) { + case GB11_STATE_NONE: + if (cp_data->pb_e_extended_pictographic) + gcbrk->gb11 = GB11_STATE_EP; + break; + case GB11_STATE_EP: + if (cp_data->pb_e_extended_pictographic) + break; + if (cp_data->pb_gcb_extend) + break; + if (cp_data->pb_b_zwj) { + gcbrk->gb11 = GB11_STATE_ZWJ; + break; + } + gcbrk->gb11 = GB11_STATE_NONE; + break; + case GB11_STATE_ZWJ: + if (cp_data->pb_e_extended_pictographic) { + if (bstatus < 0) + bstatus = 0; + gcbrk->gb11 = GB11_STATE_EP; + break; + } + gcbrk->gb11 = GB11_STATE_NONE; + break; + default: + i_unreached(); + } + + /* GB12, GB13: Do not break within emoji flag sequences. That is, do not + break between regional indicator (RI) symbols if there is + an odd number of RI characters before the break point. 
+ sot (RI RI)* RI x RI + [^RI] (RI RI)* RI x RI + */ + if (gcbrk->gb12) { + if (cp_data->pb_b_regional_indicator) { + if (bstatus < 0) + bstatus = 0; + } + gcbrk->gb12 = FALSE; + } else if (cp_data->pb_b_regional_indicator) { + gcbrk->gb12 = TRUE; + } + + /* GB999: Otherwise, break everywhere. + (Any + Any) + */ + if (bstatus == 0) + return FALSE; + return TRUE; +} diff --git a/src/lib/unicode-break.h b/src/lib/unicode-break.h new file mode 100644 index 0000000000..74c08d7237 --- /dev/null +++ b/src/lib/unicode-break.h @@ -0,0 +1,33 @@ +#ifndef UNICODE_BREAK_H +#define UNICODE_BREAK_H + +struct unicode_code_point_data; + +/* + * Grapheme Cluster Boundaries (Unicode Standard Annex #29, Section 3) + */ + +struct unicode_gc_break { + unsigned int gb9c; /* GB9c (Indic conjunct) state machine position */ + unsigned int gb11; /* GB11 (emoji ZWJ sequence) state machine position */ + bool gb1:1; /* first code point has been seen */ + bool gb3:1; /* previous code point was CR */ + bool gb4:1; /* previous code point was Control, CR or LF */ + bool gb6:1; /* previous code point was L */ + bool gb7:1; /* previous code point was LV or V */ + bool gb8:1; /* previous code point was LVT or T */ + bool gb9b:1; /* previous code point was Prepend */ + bool gb12:1; /* an odd number of consecutive RIs precede */ +}; + +void unicode_gc_break_init(struct unicode_gc_break *gcbrk); + +/* Returns TRUE if a grapheme boundary exists before the codepoint provided in + cp. Any code point data for cp that was looked up earlier can be provided in + the _cp_data pointer, or if it was NULL it can be retrieved there after the + call. 
+ */ +bool unicode_gc_break_cp(struct unicode_gc_break *gcbrk, uint32_t cp, + const struct unicode_code_point_data **_cp_data); + +#endif diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h index 0d74fb8050..9a2575c623 100644 --- a/src/lib/unicode-data-static.h +++ b/src/lib/unicode-data-static.h @@ -115,6 +115,13 @@ enum unicode_nf_quick_check { UNICODE_NFD_QUICK_CHECK_MASK = (0x03 << 0), }; +enum unicode_indic_conjunct_break { + UNICODE_INDIC_CONJUNCT_BREAK_NONE = 0, + UNICODE_INDIC_CONJUNCT_BREAK_LINKER, + UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT, + UNICODE_INDIC_CONJUNCT_BREAK_EXTEND, +}; + struct unicode_code_point_data { uint8_t general_category; // Not yet used uint8_t canonical_combining_class; @@ -142,11 +149,16 @@ struct unicode_code_point_data { uint32_t simple_titlecase_mapping; + uint8_t indic_conjunct_break:3; + /* Property bits (UAX #44, Section 5.1) */ /* General */ bool pb_g_white_space:1; + /* Emoji */ + bool pb_e_extended_pictographic:1; + /* Identifiers */ bool pb_i_pattern_white_space:1; @@ -159,9 +171,20 @@ struct unicode_code_point_data { /* Common Break */ bool pb_b_cr:1; bool pb_b_lf:1; - bool pb_b_zwj:1; // Not currently used + bool pb_b_zwj:1; bool pb_b_regional_indicator:1; + /* Grapheme_Cluster_Break (UAX #29, Section 3.1) */ + bool pb_gcb_control:1; + bool pb_gcb_extend:1; + bool pb_gcb_prepend:1; + bool pb_gcb_spacingmark:1; + bool pb_gcb_l:1; + bool pb_gcb_v:1; + bool pb_gcb_t:1; + bool pb_gcb_lv:1; + bool pb_gcb_lvt:1; + /* Word_Break (UAX #29, Section 4.1) */ bool pb_wb_newline:1; bool pb_wb_extend:1; diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py index 15be589cb3..9ff5ed4bd6 100755 --- a/src/lib/unicode-ucd-compile.py +++ b/src/lib/unicode-ucd-compile.py @@ -452,6 +452,32 @@ def read_ucd_files(): for cp in range(cprng[0], cprng[1] + 1): ud_composition_exclusions[cp] = True + # DerivedCoreProperties.txt + with UCDFileOpen("DerivedCoreProperties.txt") as ucd: + for line in ucd.fd: + 
data = line.split("#") + if len(data) == 0: + continue + + if len(data[0]) == 0: + continue + columns = data[0].split(";") + if len(columns) < 2: + continue + + cprng = parse_cp_range(columns[0]) + if cprng is None: + continue + + prop = columns[1].strip() + if prop != "InCB": + continue + + value = columns[2].strip() + cpd = CodePointData() + cpd.indic_conjunct_break = value + CodePointRange(cprng[0], cprng[1], cpd) + # DerivedNormalizationProps.txt with UCDFileOpen("DerivedNormalizationProps.txt") as ucd: line_num = 0 @@ -491,6 +517,99 @@ def read_ucd_files(): cpd.nfkc_quick_check = value CodePointRange(cprng[0], cprng[1], cpd) + # emoji-data.txt + with UCDFileOpen("emoji-data.txt") as ucd: + for line in ucd.fd: + data = line.split("#") + if len(data) == 0: + continue + + if len(data[0]) == 0: + continue + columns = data[0].split(";") + if len(columns) < 2: + continue + + cprng = parse_cp_range(columns[0]) + if cprng is None: + continue + + prop = columns[1].strip() + if prop != "Extended_Pictographic": + continue + + cpd = CodePointData() + cpd.pb_e_extended_pictographic = True + CodePointRange(cprng[0], cprng[1], cpd) + + # GraphemeBreakProperty.txt + with UCDFileOpen("GraphemeBreakProperty.txt") as ucd: + for line in ucd.fd: + data = line.split("#") + if len(data[0]) == 0: + continue + columns = data[0].split(";") + if len(columns) < 2: + continue + + cprng = parse_cp_range(columns[0]) + if cprng is None: + continue + + prop = columns[1].strip() + if prop == "CR": + cpd = CodePointData() + cpd.pb_b_cr = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "LF": + cpd = CodePointData() + cpd.pb_b_lf = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Control": + cpd = CodePointData() + cpd.pb_gcb_control = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Extend": + cpd = CodePointData() + cpd.pb_gcb_extend = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "ZWJ": + cpd = CodePointData() + cpd.pb_b_zwj = True + 
CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Regional_Indicator": + cpd = CodePointData() + cpd.pb_b_regional_indicator = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "Prepend": + cpd = CodePointData() + cpd.pb_gcb_prepend = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "SpacingMark": + cpd = CodePointData() + cpd.pb_gcb_spacingmark = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "L": + cpd = CodePointData() + cpd.pb_gcb_l = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "V": + cpd = CodePointData() + cpd.pb_gcb_v = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "T": + cpd = CodePointData() + cpd.pb_gcb_t = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "LV": + cpd = CodePointData() + cpd.pb_gcb_lv = True + CodePointRange(cprng[0], cprng[1], cpd) + elif prop == "LVT": + cpd = CodePointData() + cpd.pb_gcb_lvt = True + CodePointRange(cprng[0], cprng[1], cpd) + # PropList.txt with UCDFileOpen("PropList.txt") as ucd: line_num = 0 @@ -1119,6 +1238,12 @@ def decomposition_type_def(dt): return "UNICODE_DECOMPOSITION_TYPE_%s" % dt.upper() +def indic_conjunct_break_def(icb): + icb_uc = icb.upper() + + return "UNICODE_INDIC_CONJUNCT_BREAK_%s" % icb_uc + + def print_list(code_list): last = len(code_list) - 1 n = 0 @@ -1315,8 +1440,15 @@ def write_tables_c_cpd(cpd): "\t\t.simple_titlecase_mapping = 0x%04X," % cpd.simple_titlecase_mapping ) + if hasattr(cpd, "indic_conjunct_break"): + print( + "\t\t.indic_conjunct_break = %s," + % indic_conjunct_break_def(cpd.indic_conjunct_break) + ) if hasattr(cpd, "pb_g_white_space"): print("\t\t.pb_g_white_space = TRUE,") + if hasattr(cpd, "pb_e_extended_pictographic"): + print("\t\t.pb_e_extended_pictographic = TRUE,") if hasattr(cpd, "pb_i_pattern_white_space"): print("\t\t.pb_i_pattern_white_space = TRUE,") if hasattr(cpd, "pb_m_quotation_mark"): @@ -1335,6 +1467,24 @@ def write_tables_c_cpd(cpd): print("\t\t.pb_b_zwj = TRUE,") if 
hasattr(cpd, "pb_b_regional_indicator"): print("\t\t.pb_b_regional_indicator = TRUE,") + if hasattr(cpd, "pb_gcb_control"): + print("\t\t.pb_gcb_control = TRUE,") + if hasattr(cpd, "pb_gcb_extend"): + print("\t\t.pb_gcb_extend = TRUE,") + if hasattr(cpd, "pb_gcb_prepend"): + print("\t\t.pb_gcb_prepend = TRUE,") + if hasattr(cpd, "pb_gcb_spacingmark"): + print("\t\t.pb_gcb_spacingmark = TRUE,") + if hasattr(cpd, "pb_gcb_l"): + print("\t\t.pb_gcb_l = TRUE,") + if hasattr(cpd, "pb_gcb_v"): + print("\t\t.pb_gcb_v = TRUE,") + if hasattr(cpd, "pb_gcb_t"): + print("\t\t.pb_gcb_t = TRUE,") + if hasattr(cpd, "pb_gcb_lv"): + print("\t\t.pb_gcb_lv = TRUE,") + if hasattr(cpd, "pb_gcb_lvt"): + print("\t\t.pb_gcb_lvt = TRUE,") if hasattr(cpd, "pb_wb_newline"): print("\t\t.pb_wb_newline = TRUE,") if hasattr(cpd, "pb_wb_extend"):