lib: unicode - Implement text segmentation at grapheme cluster boundaries

author Stephan Bosch <stephan.bosch@open-xchange.com>

Fri, 11 Apr 2025 03:35:10 +0000 (05:35 +0200)

committer aki.tuomi <aki.tuomi@open-xchange.com>

Mon, 8 Dec 2025 14:37:04 +0000 (14:37 +0000)
author Stephan Bosch <stephan.bosch@open-xchange.com>
Fri, 11 Apr 2025 03:35:10 +0000 (05:35 +0200)
committer aki.tuomi <aki.tuomi@open-xchange.com>
Mon, 8 Dec 2025 14:37:04 +0000 (14:37 +0000)
diff --git a/src/lib/Makefile.am b/src/lib/Makefile.am

index 52331348bc6a9b80465433c366cfa805626fc499..1e4b850fd55b3b90c29d287530f5fa2fa7d71a01 100644 (file)
--- a/src/lib/Makefile.am
+++ b/src/lib/Makefile.am
@@ -22,6 +22,9 @@ UCD_FILES = \
         $(UCD_DIR)/CompositionExclusions.txt \
         $(UCD_DIR)/DerivedCoreProperties.txt \
         $(UCD_DIR)/DerivedNormalizationProps.txt \
+       $(UCD_DIR)/emoji-data.txt \
+       $(UCD_DIR)/GraphemeBreakProperty.txt \
+       $(UCD_DIR)/GraphemeBreakTest.txt \
         $(UCD_DIR)/NormalizationTest.txt \
         $(UCD_DIR)/PropertyValueAliases.txt \
         $(UCD_DIR)/PropList.txt \
@@ -66,6 +69,12 @@ $(UCD_DIR)/DerivedCoreProperties.txt:
         $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedCoreProperties.txt
  $(UCD_DIR)/DerivedNormalizationProps.txt:
         $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/DerivedNormalizationProps.txt
+$(UCD_DIR)/emoji-data.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/emoji-data.txt
+$(UCD_DIR)/GraphemeBreakProperty.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/GraphemeBreakProperty.txt
+$(UCD_DIR)/GraphemeBreakTest.txt:
+       $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/GraphemeBreakTest.txt
  $(UCD_DIR)/NormalizationTest.txt:
         $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ $(UCD_URL)/NormalizationTest.txt
  $(UCD_DIR)/PropertyValueAliases.txt:
@@ -241,6 +250,7 @@ liblib_la_SOURCES = \
         unlink-directory.c \
         unlink-old-files.c \
         unichar.c \
+       unicode-break.c \
         unicode-data-types.c \
         unicode-data-tables.c \
         unicode-data.c \
@@ -406,6 +416,7 @@ headers = \
         unlink-directory.h \
         unlink-old-files.h \
         unichar.h \
+       unicode-break.h \
         unicode-data-static.h \
         unicode-data-types.h \
         unicode-data-tables.h \
@@ -518,6 +529,7 @@ test_lib_SOURCES = \
         test-str-table.c \
         test-time-util.c \
         test-unichar.c \
+       test-unicode-break.c \
         test-unicode-data.c \
         test-unicode-nf.c \
         test-unicode-casemap.c \
diff --git a/src/lib/test-lib.inc b/src/lib/test-lib.inc

index a27df9af9c0c79414dfa533862b6646062825449..91c6a971247fe6b1a936d877dd4db8ef2be80a4c 100644 (file)
--- a/src/lib/test-lib.inc
+++ b/src/lib/test-lib.inc
@@ -106,6 +106,7 @@ TEST(test_str_sanitize)
  TEST(test_str_table)
  TEST(test_time_util)
  TEST(test_unichar)
+TEST(test_unicode_break)
  TEST(test_unicode_data)
  TEST(test_unicode_nf)
  TEST(test_unicode_casemap)
diff --git a/src/lib/test-unicode-break.c b/src/lib/test-unicode-break.c

new file mode 100644 (file)

index 0000000..31eb23c
--- /dev/null
+++ b/src/lib/test-unicode-break.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "strnum.h"
+#include "str.h"
+#include "array.h"
+#include "istream.h"
+#include "unichar.h"
+#include "unicode-break.h"
+
+#include <fcntl.h>
+
+#define UCD_GRAPHEME_BREAK_TEST_TXT "GraphemeBreakTest.txt"
+
+#define BREAK_MARKER "\xc3\xb7"
+#define NO_BREAK_MARKER "\xc3\x97"
+
+/* Checks a single data line of the grapheme break test file. The line is split
+   on spaces into "<marker> <hex code point>" pairs, followed by one final
+   marker. Each code point is fed to unicode_gc_break_cp() and the returned
+   boundary status is compared against the marker that precedes the code
+   point. file and line_num are only used for reporting.
+ */
+static void
+test_gcb_line(const char *file, const char *line, unsigned int line_num)
+{
+       const char *const *tokens = t_strsplit(line, " ");
+       struct unicode_gc_break ubrk;
+
+       unicode_gc_break_init(&ubrk);
+       for (; tokens[0] != NULL && tokens[1] != NULL && !test_has_failed();
+            tokens += 2) {
+               const struct unicode_code_point_data *cp_data = NULL;
+               bool break_m1_test, break_m1;
+               uint32_t cp;
+
+               /* The marker states whether a boundary is expected before the
+                  code point that follows it. */
+               if (strcmp(tokens[0], BREAK_MARKER) == 0) {
+                       break_m1_test = TRUE;
+               } else if (strcmp(tokens[0], NO_BREAK_MARKER) == 0) {
+                       break_m1_test = FALSE;
+               } else {
+                       test_failed(t_strdup_printf(
+                               "Invalid data at %s:%u: "
+                               "Bad break marker", file, line_num));
+                       return;
+               }
+
+               if (str_to_uint32_hex(tokens[1], &cp) < 0) {
+                       test_failed(t_strdup_printf(
+                               "Invalid data at %s:%u: "
+                               "Bad code point", file, line_num));
+                       return;
+               }
+
+               break_m1 = unicode_gc_break_cp(&ubrk, cp, &cp_data);
+               test_assert_idx(break_m1 == break_m1_test, line_num);
+       }
+
+       /* The token left over after the last pair must be a break marker. */
+       test_assert_strcmp_idx(tokens[0], BREAK_MARKER, line_num);
+}
+
+/* Runs test_line() on every non-empty data line of the given file, which is
+   read from UCD_DIR. Anything from the first '#' onwards is treated as a
+   comment and removed together with the whitespace in front of it. Reading
+   stops at the end of the file or as soon as the test has failed.
+ */
+static void
+test_ucd_file(const char *file,
+             void (*test_line)(const char *file, const char *line,
+                               unsigned int line_num))
+{
+       const char *file_path = t_strconcat(UCD_DIR, "/", file, NULL);
+
+       test_begin(t_strdup_printf("unicode_break - %s", file));
+
+       struct istream *input = i_stream_create_file(file_path, 1024);
+       unsigned int line_num = 0;
+
+       while (!test_has_failed()) {
+               char *line = i_stream_read_next_line(input);
+               if (line == NULL)
+                       break;
+               line_num++;
+
+               /* Remove any comment and trailing whitespace. Without a
+                  comment the cut-off point is the terminating NUL itself, so
+                  that no data character gets chopped off and end is never
+                  left NULL for an empty line. */
+               char *end = strchr(line, '#');
+               if (end == NULL)
+                       end = line + strlen(line);
+               while (end > line && (end[-1] == '\t' || end[-1] == ' '))
+                       end--;
+               *end = '\0';
+               if (*line == '\0')
+                       continue;
+
+               /* Per-line data stack frame for the line handler. */
+               T_BEGIN {
+                       test_line(file, line, line_num);
+               } T_END;
+       }
+
+       i_stream_destroy(&input);
+       test_end();
+}
+
+/* Test entry point: checks the grapheme cluster boundary implementation
+   against the lines of the GraphemeBreakTest.txt data file. */
+void test_unicode_break(void)
+{
+       test_ucd_file(UCD_GRAPHEME_BREAK_TEST_TXT, test_gcb_line);
+}
diff --git a/src/lib/test-unicode-data.c b/src/lib/test-unicode-data.c

index 5ee9eb205b626e022c31ddb6ab1888041f7b6dbd..1a44f44aae7eae30d488c6f55ddaebe0e89cc569 100644 (file)
--- a/src/lib/test-unicode-data.c
+++ b/src/lib/test-unicode-data.c
@@ -12,6 +12,7 @@
  #define UCD_CASE_FOLDING_TXT "CaseFolding.txt"
  #define UCD_COMPOSITION_EXCLUSIONS_TXT "CompositionExclusions.txt"
  #define UCD_DERIVED_NORMALIZATION_PROPS_TXT "DerivedNormalizationProps.txt"
+#define UCD_GRAPHEME_BREAK_PROPERTY_TXT "GraphemeBreakProperty.txt"
  #define UCD_PROP_LIST_TXT "PropList.txt"
  #define UCD_SPECIAL_CASING_TXT "SpecialCasing.txt"
  #define UCD_UNICODE_DATA_TXT "UnicodeData.txt"
@@ -225,6 +226,49 @@ test_derived_normalization_props_line(const char *line, unsigned int line_num)
         }
  }
  
+/* Checks one line of GraphemeBreakProperty.txt: for every code point in the
+   range given on the line, the property bit that corresponds to the listed
+   Grapheme_Cluster_Break value must be set in the compiled code point data.
+   Property values that are not listed below are not checked.
+ */
+static void
+test_grapheme_break_property_line(const char *line, unsigned int line_num)
+{
+       uint32_t cp_first, cp_last, cp;
+       const char *prop;
+
+       /* NOTE(review): presumably returns FALSE for lines that carry no
+          usable data - confirm against parse_prop_file_line(). */
+       if (!parse_prop_file_line(line, UCD_GRAPHEME_BREAK_PROPERTY_TXT,
+                                 line_num, &cp_first, &cp_last, &prop, NULL))
+               return;
+
+       for (cp = cp_first; cp <= cp_last && !test_has_failed(); cp++) {
+               const struct unicode_code_point_data *cp_data =
+                       unicode_code_point_get_data(cp);
+
+               /* CR, LF, ZWJ and Regional_Indicator are stored in the common
+                  break bits (pb_b_*); the other values have dedicated
+                  pb_gcb_* bits. */
+               if (strcmp(prop, "CR") == 0)
+                       test_assert_idx(cp_data->pb_b_cr, cp);
+               else if (strcmp(prop, "LF") == 0)
+                       test_assert_idx(cp_data->pb_b_lf, cp);
+               else if (strcmp(prop, "Control") == 0)
+                       test_assert_idx(cp_data->pb_gcb_control, cp);
+               else if (strcmp(prop, "Extend") == 0)
+                       test_assert_idx(cp_data->pb_gcb_extend, cp);
+               else if (strcmp(prop, "ZWJ") == 0)
+                       test_assert_idx(cp_data->pb_b_zwj, cp);
+               else if (strcmp(prop, "Regional_Indicator") == 0)
+                       test_assert_idx(cp_data->pb_b_regional_indicator, cp);
+               else if (strcmp(prop, "Prepend") == 0)
+                       test_assert_idx(cp_data->pb_gcb_prepend, cp);
+               else if (strcmp(prop, "SpacingMark") == 0)
+                       test_assert_idx(cp_data->pb_gcb_spacingmark, cp);
+               else if (strcmp(prop, "L") == 0)
+                       test_assert_idx(cp_data->pb_gcb_l, cp);
+               else if (strcmp(prop, "V") == 0)
+                       test_assert_idx(cp_data->pb_gcb_v, cp);
+               else if (strcmp(prop, "T") == 0)
+                       test_assert_idx(cp_data->pb_gcb_t, cp);
+               else if (strcmp(prop, "LV") == 0)
+                       test_assert_idx(cp_data->pb_gcb_lv, cp);
+               else if (strcmp(prop, "LVT") == 0)
+                       test_assert_idx(cp_data->pb_gcb_lvt, cp);
+       }
+}
+
  static void test_prop_list_line(const char *line, unsigned int line_num)
  {
         uint32_t cp_first, cp_last, cp;
@@ -600,6 +644,8 @@ void test_unicode_data(void)
                       test_composition_exclusions_line);
         test_ucd_file(UCD_DERIVED_NORMALIZATION_PROPS_TXT,
                       test_derived_normalization_props_line);
+       test_ucd_file(UCD_GRAPHEME_BREAK_PROPERTY_TXT,
+                     test_grapheme_break_property_line);
         test_ucd_file(UCD_PROP_LIST_TXT, test_prop_list_line);
         test_ucd_file(UCD_SPECIAL_CASING_TXT, test_special_casing_line);
         test_ucd_file(UCD_UNICODE_DATA_TXT, test_unicode_data_line);
diff --git a/src/lib/unicode-break.c b/src/lib/unicode-break.c

new file mode 100644 (file)

index 0000000..3fb660f
--- /dev/null
+++ b/src/lib/unicode-break.c
@@ -0,0 +1,257 @@
+/* Copyright (c) 2025 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unicode-data.h"
+#include "unicode-break.h"
+
+/* This file implements the Unicode Text Segmentation algorithms as specified in
+   Unicode Standard Annex #29.
+ */
+
+/*
+ * Grapheme Cluster Boundaries (Unicode Standard Annex #29, Section 3)
+ */
+
+/* Clears the grapheme cluster break state. With gb1 unset, the next code point
+   passed to unicode_gc_break_cp() is treated as the start of text. */
+void unicode_gc_break_init(struct unicode_gc_break *gcbrk)
+{
+       i_zero(gcbrk);
+}
+
+/* Feeds the next code point of the text to the grapheme cluster boundary
+   algorithm and returns TRUE if there is a boundary before it, or FALSE if it
+   belongs to the same cluster as the preceding code point.
+
+   gcbrk holds the state carried over from the preceding code points. If
+   *_cp_data is NULL, the data for cp is looked up and returned through it;
+   otherwise the provided data is used as is. _cp_data itself must not be NULL.
+
+   Every rule block below is evaluated for each code point, because it also
+   updates that rule's own state for the next call. Rule precedence follows
+   from the order of the blocks: a rule only gets to decide while bstatus is
+   still undecided.
+ */
+bool unicode_gc_break_cp(struct unicode_gc_break *gcbrk, uint32_t cp,
+                        const struct unicode_code_point_data **_cp_data)
+{
+       if (*_cp_data == NULL)
+               *_cp_data = unicode_code_point_get_data(cp);
+
+       const struct unicode_code_point_data *cp_data = *_cp_data;
+       /* -1 = undecided, 0 = no break before cp, 1 = break before cp */
+       int bstatus = -1;
+
+       /* GB1: Break at the start and end of text.
+          sot + Any
+          Any + eot
+        */
+       if (!gcbrk->gb1) {
+               gcbrk->gb1 = TRUE;
+               bstatus = 1;
+       }
+
+       /* GB3: Do not break between a CR and LF.
+          CR x LF
+        */
+       if (gcbrk->gb3) {
+               if (cp_data->pb_b_lf) {
+                       if (bstatus < 0)
+                               bstatus = 0;
+               }
+               if (!cp_data->pb_b_cr)
+                       gcbrk->gb3 = FALSE;
+       } else if (cp_data->pb_b_cr) {
+               gcbrk->gb3 = TRUE;
+       }
+
+       /* GB4, GB5: Break before and after controls.
+          (Control | CR | LF) +
+          + (Control | CR | LF)
+        */
+       if (gcbrk->gb4) {
+               /* GB4: (Control | CR | LF) / */
+               if (bstatus < 0)
+                       bstatus = 1;
+               if (!cp_data->pb_b_cr && !cp_data->pb_b_lf &&
+                   !cp_data->pb_gcb_control)
+                       gcbrk->gb4 = FALSE;
+       } else if (cp_data->pb_b_cr || cp_data->pb_b_lf ||
+                  cp_data->pb_gcb_control) {
+               gcbrk->gb4 = TRUE;
+               /* GB5: / (Control | CR | LF) */
+               if (bstatus < 0)
+                       bstatus = 1;
+       }
+
+       /* GB6: Do not break Hangul syllable or other conjoining sequences.
+          L x (L | V | LV | LVT)
+        */
+       if (gcbrk->gb6) {
+               if (cp_data->pb_gcb_v || cp_data->pb_gcb_lv ||
+                   cp_data->pb_gcb_lvt) {
+                       if (bstatus < 0)
+                               bstatus = 0;
+                       gcbrk->gb6 = FALSE;
+               } else if (cp_data->pb_gcb_l) {
+                       if (bstatus < 0)
+                               bstatus = 0;
+               } else {
+                       gcbrk->gb6 = FALSE;
+               }
+       } else if (cp_data->pb_gcb_l) {
+               gcbrk->gb6 = TRUE;
+       }
+
+       /* GB7: Do not break Hangul syllable or other conjoining sequences.
+          (LV | V) x (V | T)
+        */
+       if (gcbrk->gb7) {
+               if (cp_data->pb_gcb_t) {
+                       if (bstatus < 0)
+                               bstatus = 0;
+                       gcbrk->gb7 = FALSE;
+               } else if (cp_data->pb_gcb_v) {
+                       if (bstatus < 0)
+                               bstatus = 0;
+               } else if (!cp_data->pb_gcb_lv) {
+                       /* An LV does not continue the sequence, but it is a
+                          valid left-hand side itself. The state is kept for
+                          it, so that e.g. LV LV x V is still recognized. */
+                       gcbrk->gb7 = FALSE;
+               }
+       } else if (cp_data->pb_gcb_lv || cp_data->pb_gcb_v) {
+               gcbrk->gb7 = TRUE;
+       }
+
+       /* GB8: Do not break Hangul syllable or other conjoining sequences.
+          (LVT | T) x T
+        */
+       if (gcbrk->gb8) {
+               if (cp_data->pb_gcb_t) {
+                       if (bstatus < 0)
+                               bstatus = 0;
+               } else if (!cp_data->pb_gcb_lvt) {
+                       /* As for GB7: an LVT is a valid left-hand side itself,
+                          so the state is kept for it (LVT LVT x T). */
+                       gcbrk->gb8 = FALSE;
+               }
+       } else if (cp_data->pb_gcb_lvt || cp_data->pb_gcb_t) {
+               gcbrk->gb8 = TRUE;
+       }
+
+       /* GB9: Do not break before extending characters or ZWJ.
+          x (Extend | ZWJ)
+        */
+       if (cp_data->pb_gcb_extend || cp_data->pb_b_zwj) {
+               if (bstatus < 0)
+                       bstatus = 0;
+       }
+
+       /* GB9a: Do not break before SpacingMarks.
+          x SpacingMark
+        */
+       if (cp_data->pb_gcb_spacingmark) {
+               if (bstatus < 0)
+                       bstatus = 0;
+       }
+
+       /* GB9b: Do not break after Prepend characters.
+          Prepend x
+        */
+       if (gcbrk->gb9b) {
+               if (bstatus < 0)
+                       bstatus = 0;
+               if (!cp_data->pb_gcb_prepend)
+                       gcbrk->gb9b = FALSE;
+       } else if (cp_data->pb_gcb_prepend) {
+               gcbrk->gb9b = TRUE;
+       }
+
+       /* GB9c: Do not break within Indic conjuncts.
+          A consonant, followed by any mix of InCB Extend and Linker code
+          points that contains at least one Linker, is not separated from the
+          consonant that follows.
+        */
+       enum {
+               GB9C_STATE_NONE = 0,
+               GB9C_STATE_CONSONANT,
+               GB9C_STATE_LINKER,
+       };
+       switch (gcbrk->gb9c) {
+       case GB9C_STATE_NONE:
+               switch (cp_data->indic_conjunct_break) {
+               case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+                       gcbrk->gb9c = GB9C_STATE_CONSONANT;
+                       break;
+               default:
+                       break;
+               }
+               break;
+       case GB9C_STATE_CONSONANT:
+               switch (cp_data->indic_conjunct_break) {
+               case UNICODE_INDIC_CONJUNCT_BREAK_LINKER:
+                       gcbrk->gb9c = GB9C_STATE_LINKER;
+                       /* fall through */
+               case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+               case UNICODE_INDIC_CONJUNCT_BREAK_EXTEND:
+                       break;
+               default:
+                       gcbrk->gb9c = GB9C_STATE_NONE;
+                       break;
+               }
+               break;
+       case GB9C_STATE_LINKER:
+               switch (cp_data->indic_conjunct_break) {
+               case UNICODE_INDIC_CONJUNCT_BREAK_LINKER:
+               case UNICODE_INDIC_CONJUNCT_BREAK_EXTEND:
+                       break;
+               case UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT:
+                       if (bstatus < 0)
+                               bstatus = 0;
+                       /* This consonant can start the next conjunct. */
+                       gcbrk->gb9c = GB9C_STATE_CONSONANT;
+                       break;
+               default:
+                       gcbrk->gb9c = GB9C_STATE_NONE;
+                       break;
+               }
+               break;
+       default:
+               i_unreached();
+       }
+
+       /* GB11: Do not break within emoji ZWJ sequences.
+          \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
+        */
+       enum {
+               GB11_STATE_NONE = 0,
+               GB11_STATE_EP,
+               GB11_STATE_ZWJ,
+       };
+       switch (gcbrk->gb11) {
+       case GB11_STATE_NONE:
+               if (cp_data->pb_e_extended_pictographic)
+                       gcbrk->gb11 = GB11_STATE_EP;
+               break;
+       case GB11_STATE_EP:
+               if (cp_data->pb_e_extended_pictographic)
+                       break;
+               if (cp_data->pb_gcb_extend)
+                       break;
+               if (cp_data->pb_b_zwj) {
+                       gcbrk->gb11 = GB11_STATE_ZWJ;
+                       break;
+               }
+               gcbrk->gb11 = GB11_STATE_NONE;
+               break;
+       case GB11_STATE_ZWJ:
+               if (cp_data->pb_e_extended_pictographic) {
+                       if (bstatus < 0)
+                               bstatus = 0;
+                       gcbrk->gb11 = GB11_STATE_EP;
+                       break;
+               }
+               gcbrk->gb11 = GB11_STATE_NONE;
+               break;
+       default:
+               i_unreached();
+       }
+
+       /* GB12, GB13: Do not break within emoji flag sequences. That is, do not
+                      break between regional indicator (RI) symbols if there is
+                      an odd number of RI characters before the break point.
+          sot   (RI RI)* RI x RI
+          [^RI] (RI RI)* RI x RI
+        */
+       if (gcbrk->gb12) {
+               if (cp_data->pb_b_regional_indicator) {
+                       if (bstatus < 0)
+                               bstatus = 0;
+               }
+               /* Either the pair is complete or the run of RIs has ended. */
+               gcbrk->gb12 = FALSE;
+       } else if (cp_data->pb_b_regional_indicator) {
+               gcbrk->gb12 = TRUE;
+       }
+
+       /* GB999: Otherwise, break everywhere.
+          (Any + Any)
+        */
+       if (bstatus == 0)
+               return FALSE;
+       return TRUE;
+}
diff --git a/src/lib/unicode-break.h b/src/lib/unicode-break.h

new file mode 100644 (file)

index 0000000..74c08d7
--- /dev/null
+++ b/src/lib/unicode-break.h
@@ -0,0 +1,33 @@
+#ifndef UNICODE_BREAK_H
+#define UNICODE_BREAK_H
+
+struct unicode_code_point_data;
+
+/*
+ * Grapheme Cluster Boundaries (Unicode Standard Annex #29, Section 3)
+ */
+
+/* State carried between successive unicode_gc_break_cp() calls. Each member
+   belongs to the UAX #29 rule it is named after and is only accessed by the
+   matching rule block in unicode_gc_break_cp().
+ */
+struct unicode_gc_break {
+       /* GB9c: progress within an Indic conjunct (GB9C_STATE_*) */
+       unsigned int gb9c;
+       /* GB11: progress within an emoji ZWJ sequence (GB11_STATE_*) */
+       unsigned int gb11;
+       /* GB1: set once the first code point has been processed */
+       bool gb1:1;
+       /* GB3: the preceding code point was CR */
+       bool gb3:1;
+       /* GB4: the preceding code point was Control, CR or LF */
+       bool gb4:1;
+       /* GB6: the preceding code point was L */
+       bool gb6:1;
+       /* GB7: tracks a preceding LV or V */
+       bool gb7:1;
+       /* GB8: tracks a preceding LVT or T */
+       bool gb8:1;
+       /* GB9b: the preceding code point was Prepend */
+       bool gb9b:1;
+       /* GB12/GB13: the preceding code point was a regional indicator that
+          has not been paired with another one yet */
+       bool gb12:1;
+};
+
+/* Initializes the state for segmenting a new text. Call this before passing
+   the first code point of the text to unicode_gc_break_cp(). */
+void unicode_gc_break_init(struct unicode_gc_break *gcbrk);
+
+/* Returns TRUE if a grapheme cluster boundary exists before the code point
+   provided in cp. The code points of a text are passed one by one, in order,
+   using the same gcbrk state. _cp_data must point to a valid pointer
+   variable: if code point data for cp was looked up earlier it can be
+   provided there, or if the variable is NULL the data is looked up by this
+   function and can be retrieved there after the call.
+ */
+bool unicode_gc_break_cp(struct unicode_gc_break *gcbrk, uint32_t cp,
+                        const struct unicode_code_point_data **_cp_data);
+
+#endif
diff --git a/src/lib/unicode-data-static.h b/src/lib/unicode-data-static.h

index 0d74fb80504da15d57e565af65c6bebf0ddca292..9a2575c6230323354bcb03131f9e6656570988c1 100644 (file)
--- a/src/lib/unicode-data-static.h
+++ b/src/lib/unicode-data-static.h
@@ -115,6 +115,13 @@ enum unicode_nf_quick_check {
         UNICODE_NFD_QUICK_CHECK_MASK   = (0x03 << 0),
  };
  
+/* Values of the Indic_Conjunct_Break (InCB) property, which is read from
+   DerivedCoreProperties.txt and used for grapheme cluster rule GB9c. The
+   value is stored in the 3-bit indic_conjunct_break field of struct
+   unicode_code_point_data; NONE is zero. */
+enum unicode_indic_conjunct_break {
+       UNICODE_INDIC_CONJUNCT_BREAK_NONE = 0,
+       UNICODE_INDIC_CONJUNCT_BREAK_LINKER,
+       UNICODE_INDIC_CONJUNCT_BREAK_CONSONANT,
+       UNICODE_INDIC_CONJUNCT_BREAK_EXTEND,
+};
+
  struct unicode_code_point_data {
         uint8_t general_category; // Not yet used
         uint8_t canonical_combining_class;
@@ -142,11 +149,16 @@ struct unicode_code_point_data {
  
         uint32_t simple_titlecase_mapping;
  
+       uint8_t indic_conjunct_break:3;
+
         /* Property bits (UAX #44, Section 5.1) */
  
         /* General */
         bool pb_g_white_space:1;
  
+       /* Emoji */
+       bool pb_e_extended_pictographic:1;
+
         /* Identifiers */
         bool pb_i_pattern_white_space:1;
  
@@ -159,9 +171,20 @@ struct unicode_code_point_data {
         /* Common Break */
         bool pb_b_cr:1;
         bool pb_b_lf:1;
-       bool pb_b_zwj:1; // Not currently used
+       bool pb_b_zwj:1;
         bool pb_b_regional_indicator:1;
  
+       /* Grapheme_Cluster_Break (UAX #29, Section 3.1) */
+       bool pb_gcb_control:1;
+       bool pb_gcb_extend:1;
+       bool pb_gcb_prepend:1;
+       bool pb_gcb_spacingmark:1;
+       bool pb_gcb_l:1;
+       bool pb_gcb_v:1;
+       bool pb_gcb_t:1;
+       bool pb_gcb_lv:1;
+       bool pb_gcb_lvt:1;
+
         /* Word_Break (UAX #29, Section 4.1) */
         bool pb_wb_newline:1;
         bool pb_wb_extend:1;
diff --git a/src/lib/unicode-ucd-compile.py b/src/lib/unicode-ucd-compile.py

index 15be589cb3d8be01f39743d780d8f172aeda83bf..9ff5ed4bd6f3300f93226557066f84df5d1ec57a 100755 (executable)
--- a/src/lib/unicode-ucd-compile.py
+++ b/src/lib/unicode-ucd-compile.py
@@ -452,6 +452,32 @@ def read_ucd_files():
              for cp in range(cprng[0], cprng[1] + 1):
                  ud_composition_exclusions[cp] = True
  
+    # DerivedCoreProperties.txt
+    with UCDFileOpen("DerivedCoreProperties.txt") as ucd:
+        for line in ucd.fd:
+            data = line.split("#")
+            if len(data) == 0:
+                continue
+
+            if len(data[0]) == 0:
+                continue
+            columns = data[0].split(";")
+            if len(columns) < 2:
+                continue
+
+            cprng = parse_cp_range(columns[0])
+            if cprng is None:
+                continue
+
+            prop = columns[1].strip()
+            if prop != "InCB":
+                continue
+
+            value = columns[2].strip()
+            cpd = CodePointData()
+            cpd.indic_conjunct_break = value
+            CodePointRange(cprng[0], cprng[1], cpd)
+
      # DerivedNormalizationProps.txt
      with UCDFileOpen("DerivedNormalizationProps.txt") as ucd:
          line_num = 0
@@ -491,6 +517,99 @@ def read_ucd_files():
                  cpd.nfkc_quick_check = value
                  CodePointRange(cprng[0], cprng[1], cpd)
  
+    # emoji-data.txt
+    with UCDFileOpen("emoji-data.txt") as ucd:
+        for line in ucd.fd:
+            data = line.split("#")
+            if len(data) == 0:
+                continue
+
+            if len(data[0]) == 0:
+                continue
+            columns = data[0].split(";")
+            if len(columns) < 2:
+                continue
+
+            cprng = parse_cp_range(columns[0])
+            if cprng is None:
+                continue
+
+            prop = columns[1].strip()
+            if prop != "Extended_Pictographic":
+                continue
+
+            cpd = CodePointData()
+            cpd.pb_e_extended_pictographic = True
+            CodePointRange(cprng[0], cprng[1], cpd)
+
+    # GraphemeBreakProperty.txt
+    with UCDFileOpen("GraphemeBreakProperty.txt") as ucd:
+        for line in ucd.fd:
+            data = line.split("#")
+            if len(data[0]) == 0:
+                continue
+            columns = data[0].split(";")
+            if len(columns) < 2:
+                continue
+
+            cprng = parse_cp_range(columns[0])
+            if cprng is None:
+                continue
+
+            prop = columns[1].strip()
+            if prop == "CR":
+                cpd = CodePointData()
+                cpd.pb_b_cr = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "LF":
+                cpd = CodePointData()
+                cpd.pb_b_lf = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "Control":
+                cpd = CodePointData()
+                cpd.pb_gcb_control = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "Extend":
+                cpd = CodePointData()
+                cpd.pb_gcb_extend = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "ZWJ":
+                cpd = CodePointData()
+                cpd.pb_b_zwj = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "Regional_Indicator":
+                cpd = CodePointData()
+                cpd.pb_b_regional_indicator = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "Prepend":
+                cpd = CodePointData()
+                cpd.pb_gcb_prepend = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "SpacingMark":
+                cpd = CodePointData()
+                cpd.pb_gcb_spacingmark = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "L":
+                cpd = CodePointData()
+                cpd.pb_gcb_l = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "V":
+                cpd = CodePointData()
+                cpd.pb_gcb_v = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "T":
+                cpd = CodePointData()
+                cpd.pb_gcb_t = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "LV":
+                cpd = CodePointData()
+                cpd.pb_gcb_lv = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+            elif prop == "LVT":
+                cpd = CodePointData()
+                cpd.pb_gcb_lvt = True
+                CodePointRange(cprng[0], cprng[1], cpd)
+
      # PropList.txt
      with UCDFileOpen("PropList.txt") as ucd:
          line_num = 0
@@ -1119,6 +1238,12 @@ def decomposition_type_def(dt):
      return "UNICODE_DECOMPOSITION_TYPE_%s" % dt.upper()
  
  
+def indic_conjunct_break_def(icb):
+    """Return the C enum constant name for an InCB property value.
+
+    The value is upper-cased and prefixed with UNICODE_INDIC_CONJUNCT_BREAK_,
+    e.g. "Linker" becomes "UNICODE_INDIC_CONJUNCT_BREAK_LINKER".
+    """
+    icb_uc = icb.upper()
+
+    return "UNICODE_INDIC_CONJUNCT_BREAK_%s" % icb_uc
+
+
  def print_list(code_list):
      last = len(code_list) - 1
      n = 0
@@ -1315,8 +1440,15 @@ def write_tables_c_cpd(cpd):
              "\t\t.simple_titlecase_mapping = 0x%04X,"
              % cpd.simple_titlecase_mapping
          )
+    if hasattr(cpd, "indic_conjunct_break"):
+        print(
+            "\t\t.indic_conjunct_break = %s,"
+            % indic_conjunct_break_def(cpd.indic_conjunct_break)
+        )
      if hasattr(cpd, "pb_g_white_space"):
          print("\t\t.pb_g_white_space = TRUE,")
+    if hasattr(cpd, "pb_e_extended_pictographic"):
+        print("\t\t.pb_e_extended_pictographic = TRUE,")
      if hasattr(cpd, "pb_i_pattern_white_space"):
          print("\t\t.pb_i_pattern_white_space = TRUE,")
      if hasattr(cpd, "pb_m_quotation_mark"):
@@ -1335,6 +1467,24 @@ def write_tables_c_cpd(cpd):
          print("\t\t.pb_b_zwj = TRUE,")
      if hasattr(cpd, "pb_b_regional_indicator"):
          print("\t\t.pb_b_regional_indicator = TRUE,")
+    if hasattr(cpd, "pb_gcb_control"):
+        print("\t\t.pb_gcb_control = TRUE,")
+    if hasattr(cpd, "pb_gcb_extend"):
+        print("\t\t.pb_gcb_extend = TRUE,")
+    if hasattr(cpd, "pb_gcb_prepend"):
+        print("\t\t.pb_gcb_prepend = TRUE,")
+    if hasattr(cpd, "pb_gcb_spacingmark"):
+        print("\t\t.pb_gcb_spacingmark = TRUE,")
+    if hasattr(cpd, "pb_gcb_l"):
+        print("\t\t.pb_gcb_l = TRUE,")
+    if hasattr(cpd, "pb_gcb_v"):
+        print("\t\t.pb_gcb_v = TRUE,")
+    if hasattr(cpd, "pb_gcb_t"):
+        print("\t\t.pb_gcb_t = TRUE,")
+    if hasattr(cpd, "pb_gcb_lv"):
+        print("\t\t.pb_gcb_lv = TRUE,")
+    if hasattr(cpd, "pb_gcb_lvt"):
+        print("\t\t.pb_gcb_lvt = TRUE,")
      if hasattr(cpd, "pb_wb_newline"):
          print("\t\t.pb_wb_newline = TRUE,")
      if hasattr(cpd, "pb_wb_extend"):
author	Stephan Bosch <stephan.bosch@open-xchange.com>
	Fri, 11 Apr 2025 03:35:10 +0000 (05:35 +0200)
committer	aki.tuomi <aki.tuomi@open-xchange.com>
	Mon, 8 Dec 2025 14:37:04 +0000 (14:37 +0000)
src/lib/Makefile.am		patch \| blob \| blame \| history
src/lib/test-lib.inc		patch \| blob \| blame \| history
src/lib/test-unicode-break.c	[new file with mode: 0644]	patch \| blob
src/lib/test-unicode-data.c		patch \| blob \| blame \| history
src/lib/unicode-break.c	[new file with mode: 0644]	patch \| blob
src/lib/unicode-break.h	[new file with mode: 0644]	patch \| blob
src/lib/unicode-data-static.h		patch \| blob \| blame \| history
src/lib/unicode-ucd-compile.py		patch \| blob \| blame \| history