From: Stephan Bosch <stephan.bosch@open-xchange.com>
Date: Fri, 11 Apr 2025 03:35:10 +0000 (+0200)
Subject: lib: unicode - Implement text segmentation at grapheme cluster boundaries
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3f43b66652952ba1177ad05922f61a71f9523d2f;p=thirdparty%2Fdovecot%2Fcore.git

lib: unicode - Implement text segmentation at grapheme cluster boundaries
---

diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am
index 52331348bc..1e4b850fd5 100644
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -22,6 +22,9 @@ UCD_FILES = \
 	$(UCD_DIR)/CompositionExclusions.txt \
 	$(UCD_DIR)/DerivedCoreProperties.txt \
 	$(UCD_DIR)/DerivedNormalizationProps.txt \
+	$(UCD_DIR)/emoji-data.txt \
+	$(UCD_DIR)/GraphemeBreakProperty.txt \
+	$(UCD_DIR)/GraphemeBreakTest.txt \
 	$(UCD_DIR)/NormalizationTest.txt \
 	$(UCD_DIR)/PropertyValueAliases.txt \
 	$(UCD_DIR)/PropList.txt \
@@ -66,6 +69,12 @@ $(UCD_DIR)/DerivedCoreProperties.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
 $(UCD_DIR)/DerivedNormalizationProps.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt
+$(UCD_DIR)/emoji-data.txt:
+	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/emoji-data.txt
+$(UCD_DIR)/GraphemeBreakProperty.txt:
+	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/GraphemeBreakProperty.txt
+$(UCD_DIR)/GraphemeBreakTest.txt:
+	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/GraphemeBreakTest.txt
 $(UCD_DIR)/NormalizationTest.txt:
 	$(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/NormalizationTest.txt
 $(UCD_DIR)/PropertyValueAliases.txt:
@@ -241,6 +250,7 @@ liblib_la_SOURCES = \
 	unlink-directory.c \
 	unlink-old-files.c \
 	unichar.c \
+	unicode-break.c \
 	unicode-data-types.c \
 	unicode-data-tables.c \
 	unicode-data.c \
@@ -406,6 +416,7 @@ headers = \
 	unlink-directory.h \
 	unlink-old-files.h \
 	unichar.h \
+	unicode-break.h \
 	unicode-data-static.h \
 	unicode-data-types.h \
 	unicode-data-tables.h \
@@ -518,6 +529,7 @@ test_lib_SOURCES = \
 	test-str-table.c \
 	test-time-util.c \
 	test-unichar.c \
+	test-unicode-break.c \
 	test-unicode-data.c \
 	test-unicode-nf.c \
 	test-unicode-casemap.c \
diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc
index a27df9af9c..91c6a97124 100644
--- a/src/lib/test-lib.inc
+++ b/src/lib/test-lib.inc
@@ -106,6 +106,7 @@ TEST(test_str_sanitize)
 TEST(test_str_table)
 TEST(test_time_util)
 TEST(test_unichar)
+TEST(test_unicode_break)
 TEST(test_unicode_data)
 TEST(test_unicode_nf)
 TEST(test_unicode_casemap)
diff --git a/src/lib/test-unicode-break.c b/src/lib/test-unicode-break.c
new file mode 100644
index 0000000000..31eb23c398
--- /dev/null
+++ b/src/lib/test-unicode-break.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "array.h"
+#include "istream.h"
+#include "unichar.h"
+#include "unicode-break.h"
+
+#include <fcntl.h>
+
+#define UCD_GRAPHEME_BREAK_TEST_TXT "GraphemeBreakTest.txt"
+
+#define BREAK_MARKER "\xc3\xb7"
+#define NO_BREAK_MARKER "\xc3\x97"
+
+/* Feeds one GraphemeBreakTest.txt data line through the breaker and checks
+   every reported boundary against the marker preceding the code point. */
+static void
+test_gcb_line(const char *file, const char *line, unsigned int line_num)
+{
+	const char *const *tokens = t_strsplit(line, " ");
+	struct unicode_gc_break ubrk;
+
+	unicode_gc_break_init(&ubrk);
+	/* Tokens are consumed as (marker, hex code point) pairs. */
+	for (; tokens[0] != NULL && tokens[1] != NULL && !test_has_failed();
+	     tokens += 2) {
+		const struct unicode_code_point_data *cp_data = NULL;
+		bool break_m1_test, break_m1;
+		uint32_t cp;
+
+		if (strcmp(tokens[0], BREAK_MARKER) == 0) {
+			break_m1_test = TRUE;
+		} else if (strcmp(tokens[0], NO_BREAK_MARKER) == 0) {
+			break_m1_test = FALSE;
+		} else {
+			test_failed(t_strdup_printf(
+				"Invalid data at %s:%u: "
+				"Bad break marker", file, line_num));
+			return;
+		}
+		if (str_to_uint32_hex(tokens[1], &cp) < 0) {
+			test_failed(t_strdup_printf(
+				"Invalid data at %s:%u: "
+				"Bad code point", file, line_num));
+			return;
+		}
+
+		/* cp_data is passed in as NULL on every iteration. */
+		break_m1 = unicode_gc_break_cp(&ubrk, cp, &cp_data);
+		test_assert_idx(break_m1 == break_m1_test, line_num);
+	}
+
+	/* Whatever token is left must be the closing break marker. */
+	test_assert_strcmp_idx(tokens[0], BREAK_MARKER, line_num);
+}
+
+/* Runs test_line on every data line of UCD_DIR/<file>, after cutting off any
+   '#' comment and the whitespace in front of it. Blank lines are skipped. */
+static void
+test_ucd_file(const char *file,
+	      void (*test_line)(const char *file, const char *line,
+				unsigned int line_num))
+{
+	const char *file_path = t_strconcat(UCD_DIR, "/", file, NULL);
+
+	test_begin(t_strdup_printf("unicode_break - %s", file));
+	struct istream *input = i_stream_create_file(file_path, 1024);
+	unsigned int line_num = 0;
+
+	while (!test_has_failed()) {
+		char *line = i_stream_read_next_line(input);
+		if (line == NULL)
+			break;
+		line_num++;
+
+		/* Cut at the comment, or at the terminating NUL if there is
+		   none; end must never stay NULL or land on a data byte. */
+		char *end = strchr(line, '#');
+		if (end == NULL)
+			end = line + strlen(line);
+		while (end > line && (end[-1] == '\t' || end[-1] == ' '))
+			end--;
+		*end = '\0';
+		if (*line == '\0')
+			continue;
+		T_BEGIN {
+			test_line(file, line, line_num);
+		} T_END;
+	}
+	i_stream_destroy(&input);
+	test_end();
+}
+
+void test_unicode_break(void) /* listed as TEST() in test-lib.inc */
+{
+	test_ucd_file(UCD_GRAPHEME_BREAK_TEST_TXT, test_gcb_line);
+}
diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c
index 5ee9eb205b..1a44f44aae 100644
--- a/src/lib/test-unicode-data.c
+++ b/src/lib/test-unicode-data.c
@@ -12,6 +12,7 @@
 #define UCD_CASE_FOLDING_TXT "CaseFolding.txt"
 #define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
 #define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
+#define UCD_GRAPHEME_BREAK_PROPERTY_TXT "GraphemeBreakProperty.txt"
 #define UCD_PROP_LIST_TXT "PropList.txt"
 #define UCD_SPECIAL_CASING_TXT "SpecialCasing.txt"
 #define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
@@ -225,6 +226,49 @@ test_derived_normalization_props_line(const char *line, unsigned int line_num)
 	}
 }
 
+static void
+test_grapheme_break_property_line(const char *line, unsigned int line_num)
+{
+	uint32_t cp_first, cp_last, cp;
+	const char *prop;
+
+	if (!parse_prop_file_line(line, UCD_GRAPHEME_BREAK_PROPERTY_TXT,
+				  line_num, &cp_first, &cp_last, &prop, NULL))
+		return;
+	/* Each code point in the range must have the bit for prop set. */
+	for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) {
+		const struct unicode_code_point_data *cp_data =
+			unicode_code_point_get_data(cp);
+		/* A prop value without a branch below is not checked. */
+		if (strcmp(prop, "CR") == 0)
+			test_assert_idx(cp_data->pb_b_cr, cp);
+		else if (strcmp(prop, "LF") == 0)
+			test_assert_idx(cp_data->pb_b_lf, cp);
+		else if (strcmp(prop, "Control") == 0)
+			test_assert_idx(cp_data->pb_gcb_control, cp);
+		else if (strcmp(prop, "Extend") == 0)
+			test_assert_idx(cp_data->pb_gcb_extend, cp);
+		else if (strcmp(prop, "ZWJ") == 0)
+			test_assert_idx(cp_data->pb_b_zwj, cp);
+		else if (strcmp(prop, "Regional_Indicator") == 0)
+			test_assert_idx(cp_data->pb_b_regional_indicator, cp);
+		else if (strcmp(prop, "Prepend") == 0)
+			test_assert_idx(cp_data->pb_gcb_prepend, cp);
+		else if (strcmp(prop, "SpacingMark") == 0)
+			test_assert_idx(cp_data->pb_gcb_spacingmark, cp);
+		else if (strcmp(prop, "L") == 0)
+			test_assert_idx(cp_data->pb_gcb_l, cp);
+		else if (strcmp(prop, "V") == 0)
+			test_assert_idx(cp_data->pb_gcb_v, cp);
+		else if (strcmp(prop, "T") == 0)
+			test_assert_idx(cp_data->pb_gcb_t, cp);
+		else if (strcmp(prop, "LV") == 0)
+			test_assert_idx(cp_data->pb_gcb_lv, cp);
+		else if (strcmp(prop, "LVT") == 0)
+			test_assert_idx(cp_data->pb_gcb_lvt, cp);
+	}
+}
+
 static void test_prop_list_line(const char *line, unsigned int line_num)
 {
 	uint32_t cp_first, cp_last, cp;
@@ -600,6 +644,8 @@ void test_unicode_data(void)
 		      test_composition_exclusions_line);
 	test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
 		      test_derived_normalization_props_line);
+	test_ucd_file(UCD_GRAPHEME_BREAK_PROPERTY_TXT,
+		      test_grapheme_break_property_line);
 	test_ucd_file(UCD_PROP_LIST_TXT, test_prop_list_line);
 	test_ucd_file(UCD_SPECIAL_CASING_TXT, test_special_casing_line);
 	test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
diff --git a/src/lib/unicode-break.c b/src/lib/unicode-break.c
new file mode 100644
index 0000000000..3fb660f5ad
--- /dev/null
+++ b/src/lib/unicode-break.c
@@ -0,0 +1,257 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unicode-data.h"
+#include "unicode-break.h"
+
+/* This file implements the Unicode Text Segmentation algorithms as specified in
+   Unicode Standard Annex #29.
+ */
+
+/*
+ * Grapheme Cluster Boundaries (Unicode Standard Annex #29, Section 3)
+ */
+
+void unicode_gc_break_init(struct unicode_gc_break *gcbrk)
+{
+	i_zero(gcbrk); /* presumably zero-fills; gb1 clear = start of text */
+}
+
+/* Feeds cp into the rule state kept in gcbrk and returns TRUE if a grapheme
+   cluster boundary precedes it. If *_cp_data is NULL, the data for cp is
+   looked up and stored there. All rule blocks run on every call to keep
+   their state current; the first one that decides (UAX #29 order) wins. */
+bool unicode_gc_break_cp(struct unicode_gc_break *gcbrk, uint32_t cp,
+			 const struct unicode_code_point_data **_cp_data)
+{
+	if (*_cp_data == NULL)
+		*_cp_data = unicode_code_point_get_data(cp);
+
+	const struct unicode_code_point_data *cp_data = *_cp_data;
+	/* -1 = undecided, 0 = no break, 1 = break; once decided, the rule
+	   blocks further down leave it alone. */
+	int bstatus = -1;
+
+	/* GB1: Break at the start and end of text.
+	   sot + Any
+	   Any + eot
+	 */
+	if (!gcbrk->gb1) {
+		gcbrk->gb1 = TRUE;
+		bstatus = 1;
+	}
+
+	/* GB3: Do not break between a CR and LF.
+	   CR x LF
+	 */
+	if (gcbrk->gb3) {
+		if (cp_data->pb_b_lf && bstatus < 0)
+			bstatus = 0;
+		if (!cp_data->pb_b_cr)
+			gcbrk->gb3 = FALSE;
+	} else if (cp_data->pb_b_cr) {
+		gcbrk->gb3 = TRUE;
+	}
+
+	/* GB4, GB5: Break before and after controls.
+	   (Control | CR | LF) +
+	   + (Control | CR | LF)
+	 */
+	if (gcbrk->gb4) {
+		/* GB4: (Control | CR | LF) / */
+		if (bstatus < 0)
+			bstatus = 1;
+		if (!cp_data->pb_b_cr && !cp_data->pb_b_lf &&
+		    !cp_data->pb_gcb_control)
+			gcbrk->gb4 = FALSE;
+	} else if (cp_data->pb_b_cr || cp_data->pb_b_lf ||
+		   cp_data->pb_gcb_control) {
+		gcbrk->gb4 = TRUE;
+		/* GB5: / (Control | CR | LF) */
+		if (bstatus < 0)
+			bstatus = 1;
+	}
+
+	/* GB6: Do not break Hangul syllable or other conjoining sequences.
+	   L x (L | V | LV | LVT)
+	 */
+	if (gcbrk->gb6) {
+		if (cp_data->pb_gcb_v || cp_data->pb_gcb_lv ||
+		    cp_data->pb_gcb_lvt) {
+			if (bstatus < 0)
+				bstatus = 0;
+			gcbrk->gb6 = FALSE;
+		} else if (cp_data->pb_gcb_l) {
+			if (bstatus < 0)
+				bstatus = 0;
+		} else {
+			gcbrk->gb6 = FALSE;
+		}
+	} else if (cp_data->pb_gcb_l) {
+		gcbrk->gb6 = TRUE;
+	}
+
+	/* GB7: Do not break Hangul syllable or other conjoining sequences.
+	   (LV | V) x (V | T)
+	   An LV seen while armed is not joined, but keeps the rule armed. */
+	if (gcbrk->gb7) {
+		if (cp_data->pb_gcb_t) {
+			if (bstatus < 0)
+				bstatus = 0;
+			gcbrk->gb7 = FALSE;
+		} else if (cp_data->pb_gcb_v) {
+			if (bstatus < 0)
+				bstatus = 0;
+		} else {
+			gcbrk->gb7 = cp_data->pb_gcb_lv; /* LV re-arms GB7 */
+		}
+	} else if (cp_data->pb_gcb_lv || cp_data->pb_gcb_v) {
+		gcbrk->gb7 = TRUE;
+	}
+
+	/* GB8: Do not break Hangul syllable or other conjoining sequences.
+	   (LVT | T) x T
+	   An LVT seen while armed is not joined, but keeps the rule armed. */
+	if (gcbrk->gb8) {
+		if (cp_data->pb_gcb_t) {
+			if (bstatus < 0)
+				bstatus = 0;
+		} else {
+			gcbrk->gb8 = cp_data->pb_gcb_lvt; /* LVT re-arms GB8 */
+		}
+	} else if (cp_data->pb_gcb_lvt || cp_data->pb_gcb_t) {
+		gcbrk->gb8 = TRUE;
+	}
+
+	/* GB9: Do not break before extending characters or ZWJ.
+	   x (Extend | ZWJ)
+	 */
+	if ((cp_data->pb_gcb_extend || cp_data->pb_b_zwj) && bstatus < 0)
+		bstatus = 0;
+
+	/* GB9a: Do not break before SpacingMarks.
+	   x SpacingMark
+	 */
+	if (cp_data->pb_gcb_spacingmark && bstatus < 0)
+		bstatus = 0;
+
+	/* GB9b: Do not break after Prepend characters.
+	   Prepend x
+	 */
+	if (gcbrk->gb9b) {
+		if (bstatus < 0)
+			bstatus = 0;
+		if (!cp_data->pb_gcb_prepend)
+			gcbrk->gb9b = FALSE;
+	} else if (cp_data->pb_gcb_prepend) {
+		gcbrk->gb9b = TRUE;
+	}
+
+	/* GB9c: Do not break within Indic conjuncts.
+	   Consonant [Extend Linker]* Linker [Extend Linker]* x Consonant */
+	enum {
+		GB9C_STATE_NONE = 0,
+		GB9C_STATE_CONSONANT,
+		GB9C_STATE_LINKER,
+	};
+	switch (gcbrk->gb9c) {
+	case GB9C_STATE_NONE:
+		switch (cp_data->indic_conjunct_break) {
+		case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+			gcbrk->gb9c = GB9C_STATE_CONSONANT;
+			break;
+		default:
+			break;
+		}
+		break;
+	case GB9C_STATE_CONSONANT:
+		switch (cp_data->indic_conjunct_break) {
+		case UNICODE_INDIC_CONJUNCT_BREAK_LINKER:
+			gcbrk->gb9c = GB9C_STATE_LINKER; /* fall through */
+		case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+		case UNICODE_INDIC_CONJUNCT_BREAK_EXTEND:
+			break;
+		default:
+			gcbrk->gb9c = GB9C_STATE_NONE;
+			break;
+		}
+		break;
+	case GB9C_STATE_LINKER:
+		switch (cp_data->indic_conjunct_break) {
+		case UNICODE_INDIC_CONJUNCT_BREAK_LINKER:
+		case UNICODE_INDIC_CONJUNCT_BREAK_EXTEND:
+			break;
+		case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+			if (bstatus < 0)
+				bstatus = 0;
+			gcbrk->gb9c = GB9C_STATE_CONSONANT;
+			break;
+		default:
+			gcbrk->gb9c = GB9C_STATE_NONE;
+			break;
+		}
+		break;
+	default:
+		i_unreached();
+	}
+
+	/* GB11: Do not break within emoji ZWJ sequences.
+	   \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
+	 */
+	enum {
+		GB11_STATE_NONE = 0,
+		GB11_STATE_EP,
+		GB11_STATE_ZWJ,
+	};
+	switch (gcbrk->gb11) {
+	case GB11_STATE_NONE:
+		if (cp_data->pb_e_extended_pictographic)
+			gcbrk->gb11 = GB11_STATE_EP;
+		break;
+	case GB11_STATE_EP:
+		if (cp_data->pb_e_extended_pictographic)
+			break;
+		if (cp_data->pb_gcb_extend)
+			break;
+		if (cp_data->pb_b_zwj) {
+			gcbrk->gb11 = GB11_STATE_ZWJ;
+			break;
+		}
+		gcbrk->gb11 = GB11_STATE_NONE;
+		break;
+	case GB11_STATE_ZWJ:
+		if (cp_data->pb_e_extended_pictographic) {
+			if (bstatus < 0)
+				bstatus = 0;
+			gcbrk->gb11 = GB11_STATE_EP;
+			break;
+		}
+		gcbrk->gb11 = GB11_STATE_NONE;
+		break;
+	default:
+		i_unreached();
+	}
+
+	/* GB12, GB13: Do not break within emoji flag sequences. That is, do not
+		       break between regional indicator (RI) symbols if there is
+		       an odd number of RI characters before the break point.
+	   sot   (RI RI)* RI x RI
+	   [^RI] (RI RI)* RI x RI
+	 */
+	if (gcbrk->gb12) {
+		if (cp_data->pb_b_regional_indicator) {
+			if (bstatus < 0)
+				bstatus = 0;
+		}
+		gcbrk->gb12 = FALSE;
+	} else if (cp_data->pb_b_regional_indicator) {
+		gcbrk->gb12 = TRUE;
+	}
+
+	/* GB999: Otherwise, break everywhere.
+	   (Any + Any)
+	 */
+	if (bstatus == 0)
+		return FALSE;
+	return TRUE;
+}
diff --git a/src/lib/unicode-break.h b/src/lib/unicode-break.h
new file mode 100644
index 0000000000..74c08d7237
--- /dev/null
+++ b/src/lib/unicode-break.h
@@ -0,0 +1,33 @@
+#ifndef UNICODE_BREAK_H
+#define UNICODE_BREAK_H
+
+struct unicode_code_point_data;
+
+/*
+ * Grapheme Cluster Boundaries (Unicode Standard Annex #29, Section 3)
+ */
+
+struct unicode_gc_break {
+	unsigned int gb9c; /* GB9c (Indic conjunct) matcher state */
+	unsigned int gb11; /* GB11 (emoji ZWJ sequence) matcher state */
+	bool gb1:1; /* set once the first code point was processed */
+	bool gb3:1; /* GB3: previous cp was CR */
+	bool gb4:1; /* GB4: previous cp was Control, CR or LF */
+	bool gb6:1; /* GB6: previous cp was L */
+	bool gb7:1; /* GB7: armed by LV or V */
+	bool gb8:1; /* GB8: armed by LVT or T */
+	bool gb9b:1; /* GB9b: previous cp was Prepend */
+	bool gb12:1; /* GB12/13: previous cp was an unpaired RI */
+};
+
+void unicode_gc_break_init(struct unicode_gc_break *gcbrk);
+
+/* Returns TRUE if a grapheme boundary exists before the codepoint provided in
+   cp. Any code point data for cp that was looked up earlier can be provided in
+   the _cp_data pointer, or if it was NULL it can be retrieved there after the
+   call.
+ */
+bool unicode_gc_break_cp(struct unicode_gc_break *gcbrk, uint32_t cp,
+			 const struct unicode_code_point_data **_cp_data);
+
+#endif
diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h
index 0d74fb8050..9a2575c623 100644
--- a/src/lib/unicode-data-static.h
+++ b/src/lib/unicode-data-static.h
@@ -115,6 +115,13 @@ enum unicode_nf_quick_check {
 	UNICODE_NFD_QUICK_CHECK_MASK   = (0x03 << 0),
 };
 
+enum unicode_indic_conjunct_break {
+	UNICODE_INDIC_CONJUNCT_BREAK_NONE = 0, /* no InCB value set */
+	UNICODE_INDIC_CONJUNCT_BREAK_LINKER, /* InCB=Linker */
+	UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT, /* InCB=Consonant */
+	UNICODE_INDIC_CONJUNCT_BREAK_EXTEND, /* InCB=Extend */
+};
+
 struct unicode_code_point_data {
 	uint8_t general_category; // Not yet used
 	uint8_t canonical_combining_class;
@@ -142,11 +149,16 @@ struct unicode_code_point_data {
 
 	uint32_t simple_titlecase_mapping;
 
+	uint8_t indic_conjunct_break:3;
+
 	/* Property bits (UAX #44, Section 5.1) */
 
 	/* General */
 	bool pb_g_white_space:1;
 
+	/* Emoji */
+	bool pb_e_extended_pictographic:1;
+
 	/* Identifiers */
 	bool pb_i_pattern_white_space:1;
 
@@ -159,9 +171,20 @@ struct unicode_code_point_data {
 	/* Common Break */
 	bool pb_b_cr:1;
 	bool pb_b_lf:1;
-	bool pb_b_zwj:1; // Not currently used
+	bool pb_b_zwj:1;
 	bool pb_b_regional_indicator:1;
 
+	/* Grapheme_Cluster_Break (UAX #29, Section 3.1) */
+	bool pb_gcb_control:1;
+	bool pb_gcb_extend:1;
+	bool pb_gcb_prepend:1;
+	bool pb_gcb_spacingmark:1;
+	bool pb_gcb_l:1;
+	bool pb_gcb_v:1;
+	bool pb_gcb_t:1;
+	bool pb_gcb_lv:1;
+	bool pb_gcb_lvt:1;
+
 	/* Word_Break (UAX #29, Section 4.1) */
 	bool pb_wb_newline:1;
 	bool pb_wb_extend:1;
diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py
index 15be589cb3..9ff5ed4bd6 100755
--- a/src/lib/unicode-ucd-compile.py
+++ b/src/lib/unicode-ucd-compile.py
@@ -452,6 +452,32 @@ def read_ucd_files():
             for cp in range(cprng[0], cprng[1] + 1):
                 ud_composition_exclusions[cp] = True
 
+    # DerivedCoreProperties.txt
+    with UCDFileOpen("DerivedCoreProperties.txt") as ucd:
+        for line in ucd.fd:
+            data = line.split("#")
+            if len(data) == 0:
+                continue
+
+            if len(data[0]) == 0:
+                continue
+            columns = data[0].split(";")
+            if len(columns) < 2:
+                continue
+
+            cprng = parse_cp_range(columns[0])
+            if cprng is None:
+                continue
+
+            prop = columns[1].strip()
+            if prop != "InCB":
+                continue
+
+            value = columns[2].strip()
+            cpd = CodePointData()
+            cpd.indic_conjunct_break = value
+            CodePointRange(cprng[0], cprng[1], cpd)
+
     # DerivedNormalizationProps.txt
     with UCDFileOpen("DerivedNormalizationProps.txt") as ucd:
         line_num = 0
@@ -491,6 +517,99 @@ def read_ucd_files():
                 cpd.nfkc_quick_check = value
                 CodePointRange(cprng[0], cprng[1], cpd)
 
+    # emoji-data.txt
+    with UCDFileOpen("emoji-data.txt") as ucd:
+        for line in ucd.fd:
+            data = line.split("#")
+            if len(data) == 0:
+                continue
+
+            if len(data[0]) == 0:
+                continue
+            columns = data[0].split(";")
+            if len(columns) < 2:
+                continue
+
+            cprng = parse_cp_range(columns[0])
+            if cprng is None:
+                continue
+
+            prop = columns[1].strip()
+            if prop != "Extended_Pictographic":
+                continue
+
+            cpd = CodePointData()
+            cpd.pb_e_extended_pictographic = True
+            CodePointRange(cprng[0], cprng[1], cpd)
+
+    # GraphemeBreakProperty.txt
+    with UCDFileOpen("GraphemeBreakProperty.txt") as ucd:
+        for line in ucd.fd:
+            data = line.split("#")
+            if len(data[0]) == 0:
+                continue
+            columns = data[0].split(";")
+            if len(columns) < 2:
+                continue
+
+            cprng = parse_cp_range(columns[0])
+            if cprng is None:
+                continue
+
+            prop = columns[1].strip()
+            if prop == "CR":
+                cpd = CodePointData()
+                cpd.pb_b_cr = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "LF":
+                cpd = CodePointData()
+                cpd.pb_b_lf = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "Control":
+                cpd = CodePointData()
+                cpd.pb_gcb_control = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "Extend":
+                cpd = CodePointData()
+                cpd.pb_gcb_extend = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "ZWJ":
+                cpd = CodePointData()
+                cpd.pb_b_zwj = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "Regional_Indicator":
+                cpd = CodePointData()
+                cpd.pb_b_regional_indicator = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "Prepend":
+                cpd = CodePointData()
+                cpd.pb_gcb_prepend = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "SpacingMark":
+                cpd = CodePointData()
+                cpd.pb_gcb_spacingmark = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "L":
+                cpd = CodePointData()
+                cpd.pb_gcb_l = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "V":
+                cpd = CodePointData()
+                cpd.pb_gcb_v = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "T":
+                cpd = CodePointData()
+                cpd.pb_gcb_t = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "LV":
+                cpd = CodePointData()
+                cpd.pb_gcb_lv = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "LVT":
+                cpd = CodePointData()
+                cpd.pb_gcb_lvt = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+
     # PropList.txt
     with UCDFileOpen("PropList.txt") as ucd:
         line_num = 0
@@ -1119,6 +1238,12 @@ def decomposition_type_def(dt):
     return "UNICODE_DECOMPOSITION_TYPE_%s" % dt.upper()
 
 
+def indic_conjunct_break_def(icb):
+    icb_uc = icb.upper()
+    # e.g. "Linker" -> "UNICODE_INDIC_CONJUNCT_BREAK_LINKER"
+    return "UNICODE_INDIC_CONJUNCT_BREAK_%s" % icb_uc
+
+
 def print_list(code_list):
     last = len(code_list) - 1
     n = 0
@@ -1315,8 +1440,15 @@ def write_tables_c_cpd(cpd):
             "\t\t.simple_titlecase_mapping = 0x%04X,"
             % cpd.simple_titlecase_mapping
         )
+    if hasattr(cpd, "indic_conjunct_break"):
+        print(
+            "\t\t.indic_conjunct_break = %s,"
+            % indic_conjunct_break_def(cpd.indic_conjunct_break)
+        )
     if hasattr(cpd, "pb_g_white_space"):
         print("\t\t.pb_g_white_space = TRUE,")
+    if hasattr(cpd, "pb_e_extended_pictographic"):
+        print("\t\t.pb_e_extended_pictographic = TRUE,")
     if hasattr(cpd, "pb_i_pattern_white_space"):
         print("\t\t.pb_i_pattern_white_space = TRUE,")
     if hasattr(cpd, "pb_m_quotation_mark"):
@@ -1335,6 +1467,24 @@ def write_tables_c_cpd(cpd):
         print("\t\t.pb_b_zwj = TRUE,")
     if hasattr(cpd, "pb_b_regional_indicator"):
         print("\t\t.pb_b_regional_indicator = TRUE,")
+    if hasattr(cpd, "pb_gcb_control"):
+        print("\t\t.pb_gcb_control = TRUE,")
+    if hasattr(cpd, "pb_gcb_extend"):
+        print("\t\t.pb_gcb_extend = TRUE,")
+    if hasattr(cpd, "pb_gcb_prepend"):
+        print("\t\t.pb_gcb_prepend = TRUE,")
+    if hasattr(cpd, "pb_gcb_spacingmark"):
+        print("\t\t.pb_gcb_spacingmark = TRUE,")
+    if hasattr(cpd, "pb_gcb_l"):
+        print("\t\t.pb_gcb_l = TRUE,")
+    if hasattr(cpd, "pb_gcb_v"):
+        print("\t\t.pb_gcb_v = TRUE,")
+    if hasattr(cpd, "pb_gcb_t"):
+        print("\t\t.pb_gcb_t = TRUE,")
+    if hasattr(cpd, "pb_gcb_lv"):
+        print("\t\t.pb_gcb_lv = TRUE,")
+    if hasattr(cpd, "pb_gcb_lvt"):
+        print("\t\t.pb_gcb_lvt = TRUE,")
     if hasattr(cpd, "pb_wb_newline"):
         print("\t\t.pb_wb_newline = TRUE,")
     if hasattr(cpd, "pb_wb_extend"):