$(UCD_DIR)/CompositionExclusions.txt \
$(UCD_DIR)/DerivedCoreProperties.txt \
$(UCD_DIR)/DerivedNormalizationProps.txt \
+ $(UCD_DIR)/emoji-data.txt \
+ $(UCD_DIR)/GraphemeBreakProperty.txt \
+ $(UCD_DIR)/GraphemeBreakTest.txt \
$(UCD_DIR)/NormalizationTest.txt \
$(UCD_DIR)/PropertyValueAliases.txt \
$(UCD_DIR)/PropList.txt \
$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
$(UCD_DIR)/DerivedNormalizationProps.txt:
$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt
+# Fetch emoji-data.txt from $(UCD_URL) unless the file already exists.
+$(UCD_DIR)/emoji-data.txt:
+ $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/emoji-data.txt
+# Fetch GraphemeBreakProperty.txt from $(UCD_URL) unless the file already exists.
+$(UCD_DIR)/GraphemeBreakProperty.txt:
+ $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/GraphemeBreakProperty.txt
+# Fetch GraphemeBreakTest.txt from $(UCD_URL) unless the file already exists.
+$(UCD_DIR)/GraphemeBreakTest.txt:
+ $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/GraphemeBreakTest.txt
$(UCD_DIR)/NormalizationTest.txt:
$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/NormalizationTest.txt
$(UCD_DIR)/PropertyValueAliases.txt:
unlink-directory.c \
unlink-old-files.c \
unichar.c \
+ unicode-break.c \
unicode-data-types.c \
unicode-data-tables.c \
unicode-data.c \
unlink-directory.h \
unlink-old-files.h \
unichar.h \
+ unicode-break.h \
unicode-data-static.h \
unicode-data-types.h \
unicode-data-tables.h \
test-str-table.c \
test-time-util.c \
test-unichar.c \
+ test-unicode-break.c \
test-unicode-data.c \
test-unicode-nf.c \
test-unicode-casemap.c \
TEST(test_str_table)
TEST(test_time_util)
TEST(test_unichar)
+TEST(test_unicode_break)
TEST(test_unicode_data)
TEST(test_unicode_nf)
TEST(test_unicode_casemap)
--- /dev/null
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "array.h"
+#include "istream.h"
+#include "unichar.h"
+#include "unicode-break.h"
+
+#include <fcntl.h>
+
+#define UCD_GRAPHEME_BREAK_TEST_TXT "GraphemeBreakTest.txt"
+
+#define BREAK_MARKER "\xc3\xb7"
+#define NO_BREAK_MARKER "\xc3\x97"
+
+/* Run one test case from GraphemeBreakTest.txt through the grapheme cluster
+   break algorithm.
+
+   The line is a space-separated sequence of "<marker> <hex code point>"
+   pairs followed by one final marker. Each code point is fed to
+   unicode_gc_break_cp() and the returned break status is compared with the
+   marker that precedes it. Malformed input fails the test and stops
+   processing of the line. */
+static void
+test_gcb_line(const char *file, const char *line, unsigned int line_num)
+{
+	const char *const *tokens = t_strsplit(line, " ");
+	struct unicode_gc_break ubrk;
+
+	unicode_gc_break_init(&ubrk);
+	for (; tokens[0] != NULL && tokens[1] != NULL && !test_has_failed();
+	     tokens += 2) {
+		const struct unicode_code_point_data *cp_data = NULL;
+		bool break_m1_test, break_m1;
+		uint32_t cp;
+
+		/* tokens[0] is the marker in front of this code point */
+		break_m1_test = (strcmp(tokens[0], BREAK_MARKER) == 0);
+		if (!break_m1_test &&
+		    strcmp(tokens[0], NO_BREAK_MARKER) != 0) {
+			test_failed(t_strdup_printf(
+				"Invalid data at %s:%u: "
+				"Bad break marker", file, line_num));
+			return;
+		}
+
+		/* tokens[1] is the code point itself, in hexadecimal */
+		if (str_to_uint32_hex(tokens[1], &cp) < 0) {
+			test_failed(t_strdup_printf(
+				"Invalid data at %s:%u: "
+				"Bad code point", file, line_num));
+			return;
+		}
+
+		break_m1 = unicode_gc_break_cp(&ubrk, cp, &cp_data);
+
+		test_assert_idx(break_m1 == break_m1_test, line_num);
+	}
+
+	/* The token left over after the last pair must be a break marker */
+	test_assert_strcmp_idx(tokens[0], BREAK_MARKER, line_num);
+}
+
+/* Read the UCD data file named `file` from UCD_DIR line by line and pass every
+   non-empty line to test_line(), with comments and trailing whitespace
+   removed. Line numbers passed to the callback are 1-based and count every
+   line read, including skipped ones. Reading stops once the stream yields no
+   more lines or the test has failed. Each callback invocation runs in its
+   own data stack frame. */
+static void
+test_ucd_file(const char *file,
+	      void (*test_line)(const char *file, const char *line,
+				unsigned int line_num))
+{
+	const char *file_path = t_strconcat(UCD_DIR, "/", file, NULL);
+
+	test_begin(t_strdup_printf("unicode_break - %s", file));
+
+	struct istream *input = i_stream_create_file(file_path, 1024);
+	unsigned int line_num = 0;
+
+	while (!test_has_failed()) {
+		char *line = i_stream_read_next_line(input);
+		if (line == NULL)
+			break;
+		line_num++;
+
+		/* Cut the line at the start of a comment, or at its
+		   terminating NUL when there is no comment, so that no data
+		   character is lost and an empty line is handled safely.
+		   Then strip trailing whitespace. */
+		char *end = strchr(line, '#');
+		if (end == NULL)
+			end = line + strlen(line);
+		while (end > line && (end[-1] == '\t' || end[-1] == ' '))
+			end--;
+		*end = '\0';
+		if (*line == '\0')
+			continue;
+
+		T_BEGIN {
+			test_line(file, line, line_num);
+		} T_END;
+	}
+
+	i_stream_destroy(&input);
+	test_end();
+}
+
+/* Test entry point: feeds every data line of GraphemeBreakTest.txt to
+   test_gcb_line(). */
+void test_unicode_break(void)
+{
+ test_ucd_file(UCD_GRAPHEME_BREAK_TEST_TXT, test_gcb_line);
+}
#define UCD_CASE_FOLDING_TXT "CaseFolding.txt"
#define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
#define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
+#define UCD_GRAPHEME_BREAK_PROPERTY_TXT "GraphemeBreakProperty.txt"
#define UCD_PROP_LIST_TXT "PropList.txt"
#define UCD_SPECIAL_CASING_TXT "SpecialCasing.txt"
#define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
}
}
+/* Check one data line of GraphemeBreakProperty.txt: for every code point in
+   the range given on the line, the property bit corresponding to the line's
+   property value must be set in the data returned by
+   unicode_code_point_get_data(). Returns early when parse_prop_file_line()
+   returns false. Property values not listed below are not checked. */
+static void
+test_grapheme_break_property_line(const char *line, unsigned int line_num)
+{
+ uint32_t cp_first, cp_last, cp;
+ const char *prop;
+
+ if (!parse_prop_file_line(line, UCD_GRAPHEME_BREAK_PROPERTY_TXT,
+ line_num, &cp_first, &cp_last, &prop, NULL))
+ return;
+
+ /* The code point is used as the assertion index, so a failure identifies
+    the offending code point. */
+ for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) {
+ const struct unicode_code_point_data *cp_data =
+ unicode_code_point_get_data(cp);
+
+ if (strcmp(prop, "CR") == 0)
+ test_assert_idx(cp_data->pb_b_cr, cp);
+ else if (strcmp(prop, "LF") == 0)
+ test_assert_idx(cp_data->pb_b_lf, cp);
+ else if (strcmp(prop, "Control") == 0)
+ test_assert_idx(cp_data->pb_gcb_control, cp);
+ else if (strcmp(prop, "Extend") == 0)
+ test_assert_idx(cp_data->pb_gcb_extend, cp);
+ else if (strcmp(prop, "ZWJ") == 0)
+ test_assert_idx(cp_data->pb_b_zwj, cp);
+ else if (strcmp(prop, "Regional_Indicator") == 0)
+ test_assert_idx(cp_data->pb_b_regional_indicator, cp);
+ else if (strcmp(prop, "Prepend") == 0)
+ test_assert_idx(cp_data->pb_gcb_prepend, cp);
+ else if (strcmp(prop, "SpacingMark") == 0)
+ test_assert_idx(cp_data->pb_gcb_spacingmark, cp);
+ else if (strcmp(prop, "L") == 0)
+ test_assert_idx(cp_data->pb_gcb_l, cp);
+ else if (strcmp(prop, "V") == 0)
+ test_assert_idx(cp_data->pb_gcb_v, cp);
+ else if (strcmp(prop, "T") == 0)
+ test_assert_idx(cp_data->pb_gcb_t, cp);
+ else if (strcmp(prop, "LV") == 0)
+ test_assert_idx(cp_data->pb_gcb_lv, cp);
+ else if (strcmp(prop, "LVT") == 0)
+ test_assert_idx(cp_data->pb_gcb_lvt, cp);
+ }
+}
+
static void test_prop_list_line(const char *line, unsigned int line_num)
{
uint32_t cp_first, cp_last, cp;
test_composition_exclusions_line);
test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
test_derived_normalization_props_line);
+ test_ucd_file(UCD_GRAPHEME_BREAK_PROPERTY_TXT,
+ test_grapheme_break_property_line);
test_ucd_file(UCD_PROP_LIST_TXT, test_prop_list_line);
test_ucd_file(UCD_SPECIAL_CASING_TXT, test_special_casing_line);
test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
--- /dev/null
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unicode-data.h"
+#include "unicode-break.h"
+
+/* This file implements the Unicode Text Segmentation algorithms as specified in
+ Unicode Standard Annex #29.
+ */
+
+/*
+ * Grapheme Cluster Boundaries (Unicode Standard Annex #29, Section 3)
+ */
+
+/* Reset the grapheme cluster break state in gcbrk with i_zero(). With gb1
+   cleared, the next unicode_gc_break_cp() call treats its code point as the
+   start of text. */
+void unicode_gc_break_init(struct unicode_gc_break *gcbrk)
+{
+ i_zero(gcbrk);
+}
+
+/* Feed the next code point of the text to the grapheme cluster boundary
+   state in gcbrk and report whether a boundary lies immediately before it.
+
+   If *_cp_data is NULL, the code point data for cp is looked up and stored
+   there; otherwise the provided data is used as-is.
+
+   Every rule is evaluated for every code point so that each rule's context
+   stays current. bstatus records the verdict of the first rule (in rule
+   order) that matched: 1 = break, 0 = no break, -1 = undecided. Returns
+   FALSE when the verdict is "no break" and TRUE otherwise (GB999). */
+bool unicode_gc_break_cp(struct unicode_gc_break *gcbrk, uint32_t cp,
+			 const struct unicode_code_point_data **_cp_data)
+{
+	if (*_cp_data == NULL)
+		*_cp_data = unicode_code_point_get_data(cp);
+
+	const struct unicode_code_point_data *cp_data = *_cp_data;
+	int bstatus = -1;
+
+	/* GB1: Break at the start and end of text.
+	   sot + Any
+	   Any + eot
+	 */
+	if (!gcbrk->gb1) {
+		gcbrk->gb1 = TRUE;
+		bstatus = 1;
+	}
+
+	/* GB3: Do not break between a CR and LF.
+	   CR x LF
+	 */
+	if (gcbrk->gb3) {
+		if (cp_data->pb_b_lf) {
+			if (bstatus < 0)
+				bstatus = 0;
+		}
+		if (!cp_data->pb_b_cr)
+			gcbrk->gb3 = FALSE;
+	} else if (cp_data->pb_b_cr) {
+		gcbrk->gb3 = TRUE;
+	}
+
+	/* GB4, GB5: Break before and after controls.
+	   (Control | CR | LF) +
+	   + (Control | CR | LF)
+	 */
+	if (gcbrk->gb4) {
+		/* GB4: (Control | CR | LF) / */
+		if (bstatus < 0)
+			bstatus = 1;
+		if (!cp_data->pb_b_cr && !cp_data->pb_b_lf &&
+		    !cp_data->pb_gcb_control)
+			gcbrk->gb4 = FALSE;
+	} else if (cp_data->pb_b_cr || cp_data->pb_b_lf ||
+		   cp_data->pb_gcb_control) {
+		gcbrk->gb4 = TRUE;
+		/* GB5: / (Control | CR | LF) */
+		if (bstatus < 0)
+			bstatus = 1;
+	}
+
+	/* GB6: Do not break Hangul syllable or other conjoining sequences.
+	   L x (L | V | LV | LVT)
+	 */
+	if (gcbrk->gb6) {
+		if (cp_data->pb_gcb_v || cp_data->pb_gcb_lv ||
+		    cp_data->pb_gcb_lvt) {
+			if (bstatus < 0)
+				bstatus = 0;
+			gcbrk->gb6 = FALSE;
+		} else if (cp_data->pb_gcb_l) {
+			if (bstatus < 0)
+				bstatus = 0;
+		} else {
+			gcbrk->gb6 = FALSE;
+		}
+	} else if (cp_data->pb_gcb_l) {
+		gcbrk->gb6 = TRUE;
+	}
+
+	/* GB7: Do not break Hangul syllable or other conjoining sequences.
+	   (LV | V) x (V | T)
+	 */
+	if (gcbrk->gb7 && (cp_data->pb_gcb_v || cp_data->pb_gcb_t)) {
+		if (bstatus < 0)
+			bstatus = 0;
+	}
+	/* The context for the next code point depends only on the current
+	   one. In particular, an LV that follows LV or V must still act as
+	   left-hand side of GB7 for the code point after it. */
+	gcbrk->gb7 = (cp_data->pb_gcb_lv || cp_data->pb_gcb_v);
+
+	/* GB8: Do not break Hangul syllable or other conjoining sequences.
+	   (LVT | T) x T
+	 */
+	if (gcbrk->gb8 && cp_data->pb_gcb_t) {
+		if (bstatus < 0)
+			bstatus = 0;
+	}
+	/* As for GB7: an LVT that follows LVT or T must keep the context
+	   alive for the next code point. */
+	gcbrk->gb8 = (cp_data->pb_gcb_lvt || cp_data->pb_gcb_t);
+
+	/* GB9: Do not break before extending characters or ZWJ.
+	   x (Extend | ZWJ)
+	 */
+	if (cp_data->pb_gcb_extend || cp_data->pb_b_zwj) {
+		if (bstatus < 0)
+			bstatus = 0;
+	}
+
+	/* GB9a: Do not break before SpacingMarks.
+	   x SpacingMark
+	 */
+	if (cp_data->pb_gcb_spacingmark) {
+		if (bstatus < 0)
+			bstatus = 0;
+	}
+
+	/* GB9b: Do not break after Prepend characters.
+	   Prepend x
+	 */
+	if (gcbrk->gb9b) {
+		if (bstatus < 0)
+			bstatus = 0;
+		if (!cp_data->pb_gcb_prepend)
+			gcbrk->gb9b = FALSE;
+	} else if (cp_data->pb_gcb_prepend) {
+		gcbrk->gb9b = TRUE;
+	}
+
+	/* GB9c: Do not break within Indic conjuncts.
+	   The state tracks how much of "Consonant, then any mix of Extend
+	   and Linker containing at least one Linker" has been seen; a
+	   Consonant arriving in the LINKER state is not broken from it.
+	 */
+	enum {
+		GB9C_STATE_NONE = 0,
+		GB9C_STATE_CONSONANT,
+		GB9C_STATE_LINKER,
+	};
+	switch (gcbrk->gb9c) {
+	case GB9C_STATE_NONE:
+		switch (cp_data->indic_conjunct_break) {
+		case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+			gcbrk->gb9c = GB9C_STATE_CONSONANT;
+			break;
+		default:
+			break;
+		}
+		break;
+	case GB9C_STATE_CONSONANT:
+		switch (cp_data->indic_conjunct_break) {
+		case UNICODE_INDIC_CONJUNCT_BREAK_LINKER:
+			gcbrk->gb9c = GB9C_STATE_LINKER;
+			break;
+		case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+		case UNICODE_INDIC_CONJUNCT_BREAK_EXTEND:
+			break;
+		default:
+			gcbrk->gb9c = GB9C_STATE_NONE;
+			break;
+		}
+		break;
+	case GB9C_STATE_LINKER:
+		switch (cp_data->indic_conjunct_break) {
+		case UNICODE_INDIC_CONJUNCT_BREAK_LINKER:
+		case UNICODE_INDIC_CONJUNCT_BREAK_EXTEND:
+			break;
+		case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+			if (bstatus < 0)
+				bstatus = 0;
+			gcbrk->gb9c = GB9C_STATE_CONSONANT;
+			break;
+		default:
+			gcbrk->gb9c = GB9C_STATE_NONE;
+			break;
+		}
+		break;
+	default:
+		i_unreached();
+	}
+
+	/* GB11: Do not break within emoji ZWJ sequences.
+	   \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
+	 */
+	enum {
+		GB11_STATE_NONE = 0,
+		GB11_STATE_EP,
+		GB11_STATE_ZWJ,
+	};
+	switch (gcbrk->gb11) {
+	case GB11_STATE_NONE:
+		if (cp_data->pb_e_extended_pictographic)
+			gcbrk->gb11 = GB11_STATE_EP;
+		break;
+	case GB11_STATE_EP:
+		if (cp_data->pb_e_extended_pictographic)
+			break;
+		if (cp_data->pb_gcb_extend)
+			break;
+		if (cp_data->pb_b_zwj) {
+			gcbrk->gb11 = GB11_STATE_ZWJ;
+			break;
+		}
+		gcbrk->gb11 = GB11_STATE_NONE;
+		break;
+	case GB11_STATE_ZWJ:
+		if (cp_data->pb_e_extended_pictographic) {
+			if (bstatus < 0)
+				bstatus = 0;
+			gcbrk->gb11 = GB11_STATE_EP;
+			break;
+		}
+		gcbrk->gb11 = GB11_STATE_NONE;
+		break;
+	default:
+		i_unreached();
+	}
+
+	/* GB12, GB13: Do not break within emoji flag sequences. That is, do not
+	   break between regional indicator (RI) symbols if there is
+	   an odd number of RI characters before the break point.
+	   sot (RI RI)* RI x RI
+	   [^RI] (RI RI)* RI x RI
+	 */
+	if (gcbrk->gb12) {
+		if (cp_data->pb_b_regional_indicator) {
+			if (bstatus < 0)
+				bstatus = 0;
+		}
+		/* Either the pair is now complete or the run of RIs ended */
+		gcbrk->gb12 = FALSE;
+	} else if (cp_data->pb_b_regional_indicator) {
+		gcbrk->gb12 = TRUE;
+	}
+
+	/* GB999: Otherwise, break everywhere.
+	   (Any + Any)
+	 */
+	if (bstatus == 0)
+		return FALSE;
+	return TRUE;
+}
--- /dev/null
+#ifndef UNICODE_BREAK_H
+#define UNICODE_BREAK_H
+
+struct unicode_code_point_data;
+
+/*
+ * Grapheme Cluster Boundaries (Unicode Standard Annex #29, Section 3)
+ */
+
+/* Context carried between successive unicode_gc_break_cp() calls. Each member
+   holds the left-hand context of the grapheme cluster boundary rule it is
+   named after. */
+struct unicode_gc_break {
+ /* GB9c (Indic conjunct) state; one of the GB9C_STATE_* values defined
+    in unicode_gc_break_cp() */
+ unsigned int gb9c;
+ /* GB11 (emoji ZWJ sequence) state; one of the GB11_STATE_* values
+    defined in unicode_gc_break_cp() */
+ unsigned int gb11;
+ /* GB1: set once the first code point has been processed */
+ bool gb1:1;
+ /* GB3: the previous code point was CR */
+ bool gb3:1;
+ /* GB4: the previous code point was Control, CR or LF */
+ bool gb4:1;
+ /* GB6: the previous code point was L */
+ bool gb6:1;
+ /* GB7: left-hand context for (LV | V) x (V | T) */
+ bool gb7:1;
+ /* GB8: left-hand context for (LVT | T) x T */
+ bool gb8:1;
+ /* GB9b: the previous code point was Prepend */
+ bool gb9b:1;
+ /* GB12/GB13: the previous code point was a Regional_Indicator that
+    has not yet been paired with a following one */
+ bool gb12:1;
+};
+
+void unicode_gc_break_init(struct unicode_gc_break *gcbrk);
+
+/* Returns TRUE if a grapheme boundary exists before the codepoint provided in
+ cp. Any code point data for cp that was looked up earlier can be provided in
+ the _cp_data pointer, or if it was NULL it can be retrieved there after the
+ call.
+ */
+bool unicode_gc_break_cp(struct unicode_gc_break *gcbrk, uint32_t cp,
+ const struct unicode_code_point_data **_cp_data);
+
+#endif
UNICODE_NFD_QUICK_CHECK_MASK = (0x03 << 0),
};
+/* Values of the indic_conjunct_break field of struct unicode_code_point_data,
+   as consumed by rule GB9c in unicode_gc_break_cp(). NONE is zero, so it is
+   the value of any entry that does not set the field explicitly. */
+enum unicode_indic_conjunct_break {
+ UNICODE_INDIC_CONJUNCT_BREAK_NONE = 0,
+ UNICODE_INDIC_CONJUNCT_BREAK_LINKER,
+ UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT,
+ UNICODE_INDIC_CONJUNCT_BREAK_EXTEND,
+};
+
struct unicode_code_point_data {
uint8_t general_category; // Not yet used
uint8_t canonical_combining_class;
uint32_t simple_titlecase_mapping;
+ uint8_t indic_conjunct_break:3;
+
/* Property bits (UAX #44, Section 5.1) */
/* General */
bool pb_g_white_space:1;
+ /* Emoji */
+ bool pb_e_extended_pictographic:1;
+
/* Identifiers */
bool pb_i_pattern_white_space:1;
/* Common Break */
bool pb_b_cr:1;
bool pb_b_lf:1;
- bool pb_b_zwj:1; // Not currently used
+ bool pb_b_zwj:1;
bool pb_b_regional_indicator:1;
+ /* Grapheme_Cluster_Break (UAX #29, Section 3.1) */
+ bool pb_gcb_control:1;
+ bool pb_gcb_extend:1;
+ bool pb_gcb_prepend:1;
+ bool pb_gcb_spacingmark:1;
+ bool pb_gcb_l:1;
+ bool pb_gcb_v:1;
+ bool pb_gcb_t:1;
+ bool pb_gcb_lv:1;
+ bool pb_gcb_lvt:1;
+
/* Word_Break (UAX #29, Section 4.1) */
bool pb_wb_newline:1;
bool pb_wb_extend:1;
for cp in range(cprng[0], cprng[1] + 1):
ud_composition_exclusions[cp] = True
+ # DerivedCoreProperties.txt
+ with UCDFileOpen("DerivedCoreProperties.txt") as ucd:
+ for line in ucd.fd:
+ data = line.split("#")
+ if len(data) == 0:
+ continue
+
+ if len(data[0]) == 0:
+ continue
+ columns = data[0].split(";")
+ if len(columns) < 2:
+ continue
+
+ cprng = parse_cp_range(columns[0])
+ if cprng is None:
+ continue
+
+ prop = columns[1].strip()
+ if prop != "InCB":
+ continue
+
+ value = columns[2].strip()
+ cpd = CodePointData()
+ cpd.indic_conjunct_break = value
+ CodePointRange(cprng[0], cprng[1], cpd)
+
# DerivedNormalizationProps.txt
with UCDFileOpen("DerivedNormalizationProps.txt") as ucd:
line_num = 0
cpd.nfkc_quick_check = value
CodePointRange(cprng[0], cprng[1], cpd)
+ # emoji-data.txt
+ with UCDFileOpen("emoji-data.txt") as ucd:
+ for line in ucd.fd:
+ data = line.split("#")
+ if len(data) == 0:
+ continue
+
+ if len(data[0]) == 0:
+ continue
+ columns = data[0].split(";")
+ if len(columns) < 2:
+ continue
+
+ cprng = parse_cp_range(columns[0])
+ if cprng is None:
+ continue
+
+ prop = columns[1].strip()
+ if prop != "Extended_Pictographic":
+ continue
+
+ cpd = CodePointData()
+ cpd.pb_e_extended_pictographic = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+
+ # GraphemeBreakProperty.txt
+ with UCDFileOpen("GraphemeBreakProperty.txt") as ucd:
+ for line in ucd.fd:
+ data = line.split("#")
+ if len(data[0]) == 0:
+ continue
+ columns = data[0].split(";")
+ if len(columns) < 2:
+ continue
+
+ cprng = parse_cp_range(columns[0])
+ if cprng is None:
+ continue
+
+ prop = columns[1].strip()
+ if prop == "CR":
+ cpd = CodePointData()
+ cpd.pb_b_cr = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "LF":
+ cpd = CodePointData()
+ cpd.pb_b_lf = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "Control":
+ cpd = CodePointData()
+ cpd.pb_gcb_control = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "Extend":
+ cpd = CodePointData()
+ cpd.pb_gcb_extend = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "ZWJ":
+ cpd = CodePointData()
+ cpd.pb_b_zwj = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "Regional_Indicator":
+ cpd = CodePointData()
+ cpd.pb_b_regional_indicator = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "Prepend":
+ cpd = CodePointData()
+ cpd.pb_gcb_prepend = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "SpacingMark":
+ cpd = CodePointData()
+ cpd.pb_gcb_spacingmark = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "L":
+ cpd = CodePointData()
+ cpd.pb_gcb_l = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "V":
+ cpd = CodePointData()
+ cpd.pb_gcb_v = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "T":
+ cpd = CodePointData()
+ cpd.pb_gcb_t = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "LV":
+ cpd = CodePointData()
+ cpd.pb_gcb_lv = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+ elif prop == "LVT":
+ cpd = CodePointData()
+ cpd.pb_gcb_lvt = True
+ CodePointRange(cprng[0], cprng[1], cpd)
+
# PropList.txt
with UCDFileOpen("PropList.txt") as ucd:
line_num = 0
return "UNICODE_DECOMPOSITION_TYPE_%s" % dt.upper()
+def indic_conjunct_break_def(icb):
+ icb_uc = icb.upper()
+
+ return "UNICODE_INDIC_CONJUNCT_BREAK_%s" % icb_uc
+
+
def print_list(code_list):
last = len(code_list) - 1
n = 0
"\t\t.simple_titlecase_mapping = 0x%04X,"
% cpd.simple_titlecase_mapping
)
+ if hasattr(cpd, "indic_conjunct_break"):
+ print(
+ "\t\t.indic_conjunct_break = %s,"
+ % indic_conjunct_break_def(cpd.indic_conjunct_break)
+ )
if hasattr(cpd, "pb_g_white_space"):
print("\t\t.pb_g_white_space = TRUE,")
+ if hasattr(cpd, "pb_e_extended_pictographic"):
+ print("\t\t.pb_e_extended_pictographic = TRUE,")
if hasattr(cpd, "pb_i_pattern_white_space"):
print("\t\t.pb_i_pattern_white_space = TRUE,")
if hasattr(cpd, "pb_m_quotation_mark"):
print("\t\t.pb_b_zwj = TRUE,")
if hasattr(cpd, "pb_b_regional_indicator"):
print("\t\t.pb_b_regional_indicator = TRUE,")
+ if hasattr(cpd, "pb_gcb_control"):
+ print("\t\t.pb_gcb_control = TRUE,")
+ if hasattr(cpd, "pb_gcb_extend"):
+ print("\t\t.pb_gcb_extend = TRUE,")
+ if hasattr(cpd, "pb_gcb_prepend"):
+ print("\t\t.pb_gcb_prepend = TRUE,")
+ if hasattr(cpd, "pb_gcb_spacingmark"):
+ print("\t\t.pb_gcb_spacingmark = TRUE,")
+ if hasattr(cpd, "pb_gcb_l"):
+ print("\t\t.pb_gcb_l = TRUE,")
+ if hasattr(cpd, "pb_gcb_v"):
+ print("\t\t.pb_gcb_v = TRUE,")
+ if hasattr(cpd, "pb_gcb_t"):
+ print("\t\t.pb_gcb_t = TRUE,")
+ if hasattr(cpd, "pb_gcb_lv"):
+ print("\t\t.pb_gcb_lv = TRUE,")
+ if hasattr(cpd, "pb_gcb_lvt"):
+ print("\t\t.pb_gcb_lvt = TRUE,")
if hasattr(cpd, "pb_wb_newline"):
print("\t\t.pb_wb_newline = TRUE,")
if hasattr(cpd, "pb_wb_extend"):