lib: unichar - Add grapheme cluster scanner

author Stephan Bosch <stephan.bosch@open-xchange.com>

Mon, 8 Dec 2025 04:30:38 +0000 (05:30 +0100)

committer aki.tuomi <aki.tuomi@open-xchange.com>

Mon, 8 Dec 2025 14:37:04 +0000 (14:37 +0000)
author Stephan Bosch <stephan.bosch@open-xchange.com>
Mon, 8 Dec 2025 04:30:38 +0000 (05:30 +0100)
committer aki.tuomi <aki.tuomi@open-xchange.com>
Mon, 8 Dec 2025 14:37:04 +0000 (14:37 +0000)
diff --git a/src/lib/test-unichar.c b/src/lib/test-unichar.c

index b0f39743a9263f96b732cdc6dc840086de25aa4d..b84ff59d59e7bd092872881525b2583e0021e999 100644 (file)
--- a/src/lib/test-unichar.c
+++ b/src/lib/test-unichar.c
@@ -168,6 +168,52 @@ static void test_unichar_collation(void)
         test_end();
  }
  
+static void test_unichar_grapheme_clusters(void)
+{
+       const char *in[] = { /* UTF-8 inputs; in[i] is checked against tb[i] */
+               /* Simple ASCII */
+               "frop",
+               /* U+1019 U+1039 U+1018 U+102C U+1037 */
+               "\xE1\x80\x99\xE1\x80\xB9\xE1\x80\x98\xE1\x80\xAC\xE1\x80\xB7"
+       };
+       /* Expected results: the input with a TAB appended after every
+          grapheme cluster the scanner yields, so that the TABs mark the
+          grapheme boundaries */
+       const char *tb[] = {
+               /* Simple ASCII: break points after every byte */
+               "f\tr\to\tp\t",
+               /* U+1019 U+1039 U+1018 U+102C U+1037: expected to yield two
+                  clusters, with the boundary after U+1018 */
+               "\xE1\x80\x99\xE1\x80\xB9\xE1\x80\x98\t\xE1\x80\xAC\xE1\x80\xB7\t",
+       };
+       unsigned int n_in = N_ELEMENTS(in), n_tb = N_ELEMENTS(tb), i;
+
+       i_assert(n_in == n_tb); /* every input needs an expected result */
+
+       test_begin("unichar grapheme clusters");
+
+       string_t *tb_buf = t_str_new(256);
+       for (i = 0; i < n_in; i++) {
+               struct uni_gc_scanner gcsc;
+
+               str_truncate(tb_buf, 0); /* reuse the buffer for each input */
+               uni_gc_scanner_init(&gcsc, in[i], strlen(in[i]));
+
+               while (!uni_gc_scan_at_end(&gcsc)) { /* one cluster per pass */
+                       const unsigned char *gc;
+                       size_t gc_size;
+
+                       gc = uni_gc_scan_get(&gcsc, &gc_size);
+                       if (gc_size == 0) /* defensive; loop condition checks this too */
+                               break;
+
+                       str_append_data(tb_buf, gc, gc_size);
+                       str_append_c(tb_buf, '\t');
+                       uni_gc_scan_shift(&gcsc); /* move on to the next cluster */
+               }
+
+               test_assert_strcmp_idx(str_c(tb_buf), tb[i], i);
+       }
+       test_end();
+}
+
  void test_unichar(void)
  {
         static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1";
@@ -222,4 +268,6 @@ void test_unichar(void)
         test_unichar_uni_utf8_partial_strlen_n();
         test_unichar_valid_unicode();
         test_unichar_surrogates();
+
+       test_unichar_grapheme_clusters();
  }
diff --git a/src/lib/test-unicode-break.c b/src/lib/test-unicode-break.c

index 31eb23c398644c497f6ca3b85e0a232e714c0ba6..cd0428d4de41788691f8f4f8c7343452eef50d52 100644 (file)
--- a/src/lib/test-unicode-break.c
+++ b/src/lib/test-unicode-break.c
@@ -18,10 +18,10 @@
  static void
  test_gcb_line(const char *file, const char *line, unsigned int line_num)
  {
-       struct unicode_gc_break ubrk;
+       struct unicode_gc_break gcbrk;
         const char *const *tokens = t_strsplit(line, " ");
  
-       unicode_gc_break_init(&ubrk);
+       unicode_gc_break_init(&gcbrk);
         while (tokens[0] != NULL && tokens[1] != NULL && !test_has_failed()) {
                 const char *brk = tokens[0];
                 const char *cp_hex = tokens[1];
@@ -47,7 +47,7 @@ test_gcb_line(const char *file, const char *line, unsigned int line_num)
                 const struct unicode_code_point_data *cp_data = NULL;
                 bool break_m1;
  
-               break_m1 = unicode_gc_break_cp(&ubrk, cp, &cp_data);
+               break_m1 = unicode_gc_break_cp(&gcbrk, cp, &cp_data);
  
                 test_assert_idx(break_m1 == break_m1_test, line_num);
  
diff --git a/src/lib/unichar.c b/src/lib/unichar.c

index b582e4769d01097c17a1d11944266d6c4f0dbb75..79239ae1b0321466116eed715363d91745a0979d 100644 (file)
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -602,3 +602,59 @@ size_t uni_utf8_data_truncate(const unsigned char *data, size_t old_size,
                 max_new_size--;
         return max_new_size;
  }
+
+/*
+ * Grapheme clusters
+ */
+
+void uni_gc_scanner_init(struct uni_gc_scanner *gcsc,
+                        const void *input, size_t size)
+{
+       const unsigned char *data = input;
+
+       /* Clear all state first; the scan functions treat a NULL poffset as
+          "no grapheme cluster scanned yet". */
+       i_zero(gcsc);
+       /* The scan window is the octet range [data, data + size). */
+       gcsc->p = data;
+       gcsc->pend = data + size;
+       unicode_gc_break_init(&gcsc->gcbrk);
+}
+
+bool uni_gc_scan_shift(struct uni_gc_scanner *gcsc)
+{
+       bool first = (gcsc->poffset == NULL); /* poffset not set by a shift yet */
+
+       /* Advance the scanner to the next grapheme cluster. Afterwards the
+          octets between poffset and p form the new current cluster, and the
+          return value tells whether that cluster is non-empty.
+
+          Reset offset to last grapheme boundary (after the last grapheme
+          cluster we indicated). */
+       gcsc->poffset = gcsc->p;
+       /* Shift pointer past last code point; starts the next grapheme cluster
+          we shall compose in this call. cp_size is only non-zero here when
+          the previous call returned early at a boundary, leaving the code
+          point at p decoded but not yet consumed; otherwise this is a
+          no-op. */
+       gcsc->p += gcsc->cp_size;
+       gcsc->cp_size = 0;
+       while (gcsc->p < gcsc->pend) {
+               /* Decode next UTF-8 code point */
+               gcsc->cp_size = uni_utf8_get_char_n(
+                       gcsc->p, gcsc->pend - gcsc->p, &gcsc->cp);
+               /* We expect valid and complete UTF-8 input; a non-positive
+                  decode result trips this assert. */
+               i_assert(gcsc->cp_size > 0);
+
+               /* Determine whether there exists a grapheme cluster boundary
+                  before this code point. */
+               const struct unicode_code_point_data *cp_data = NULL;
+               if (unicode_gc_break_cp(&gcsc->gcbrk, gcsc->cp, &cp_data)) {
+                       /* Yes, but ignore the very first grapheme boundary that
+                          occurs at the start of input. */
+                       if (!first) {
+                               /* Grapheme cluster detected, but it does *NOT*
+                                  include the last code point we decoded just
+                                  now. cp_size is left set so that the next
+                                  call consumes this code point first. */
+                               i_assert(gcsc->p > gcsc->poffset);
+                               return TRUE;
+                       }
+                       first = FALSE;
+               }
+
+               /* Shift pointer past last code point; include this in the next
+                  grapheme cluster we shall compose in this call. */
+               gcsc->p += gcsc->cp_size;
+               gcsc->cp_size = 0;
+       }
+       /* Return whether there is any last remaining grapheme cluster. The
+          input is exhausted here, so a FALSE result means the current
+          cluster is empty. */
+       return (gcsc->p > gcsc->poffset);
+}
diff --git a/src/lib/unichar.h b/src/lib/unichar.h

index 68943a4866d18353f4d228b5d65fc0a253cbd10c..52b97e743c43ed649ed13e9a7f350e42a2224558 100644 (file)
--- a/src/lib/unichar.h
+++ b/src/lib/unichar.h
@@ -1,6 +1,8 @@
  #ifndef UNICHAR_H
  #define UNICHAR_H
  
+#include "unicode-break.h"
+
  /* Character used to replace invalid input. */
  #define UNICODE_REPLACEMENT_CHAR 0xfffd
  #define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
@@ -207,4 +209,67 @@ static inline void uni_split_surrogate(unichar_t chr, unichar_t *high_r, unichar
         *high_r = UTF16_SURROGATE_HIGH(chr);
         *low_r = UTF16_SURROGATE_LOW(chr);
  }
+
+/*
+ * Grapheme clusters
+ */
+
+/* The grapheme cluster scanner is used to split a Unicode string into a
+   sequence of grapheme clusters, which are in essence the Unicode characters as
+   perceived by the user. These can be longer than a single code point and by
+   consequence longer than a single octet. The Unicode standard defines what
+   constitutes a grapheme cluster in Annex #29.
+
+   Fields, as used by the scanner functions:
+    - pool:    not referenced by uni_gc_scanner_init() or uni_gc_scan_shift()
+               other than being zeroed (NOTE(review): confirm whether this
+               field is still needed).
+    - gcbrk:   state handed to unicode_gc_break_init() and
+               unicode_gc_break_cp().
+    - poffset: start of the current grapheme cluster; still NULL when no
+               shift has set it yet.
+    - p:       end of the current grapheme cluster, i.e. the position where
+               the next cluster starts.
+    - pend:    end of the input.
+    - cp:      the code point decoded most recently.
+    - cp_size: size in octets of the code point at p when it was already
+               decoded but not yet consumed; 0 otherwise. */
+
+struct uni_gc_scanner {
+       pool_t pool;
+       struct unicode_gc_break gcbrk;
+
+       const unsigned char *poffset, *p, *pend;
+
+       unichar_t cp;
+       int cp_size;
+};
+
+/* Initialize the scanner. */
+void uni_gc_scanner_init(struct uni_gc_scanner *gcsc,
+                        const void *input, size_t size);
+/* Shift scanner position to next grapheme cluster. Returns TRUE when scanner
+   points to a valid grapheme cluster and has not reached the end. */
+bool uni_gc_scan_shift(struct uni_gc_scanner *gcsc) ATTR_NOWARN_UNUSED_RESULT;
+
+
+/* Return a pointer to the grapheme cluster the scanner currently points to.
+   When size_r is not NULL, the size of that cluster in octets is stored in
+   *size_r. If nothing was scanned yet, the first cluster is scanned here. */
+static inline const unsigned char *
+uni_gc_scan_get(struct uni_gc_scanner *gcsc, size_t *size_r)
+{
+       const unsigned char *gc;
+
+       /* No cluster start recorded yet: scan the first cluster on demand */
+       if (gcsc->poffset == NULL)
+               (void)uni_gc_scan_shift(gcsc);
+
+       gc = gcsc->poffset;
+       if (size_r == NULL)
+               return gc;
+       /* The current cluster spans from its start up to the scan position */
+       *size_r = gcsc->p - gc;
+       return gc;
+}
+
+/* Convenience function for checking whether the current grapheme cluster is a
+   particular (single-octet) ASCII character. Returns FALSE when the cluster
+   is not exactly one octet long, or when c lies outside the ASCII range
+   (0x00..0x7F) and hence cannot be a single-octet character. */
+static inline bool
+uni_gc_scan_ascii_equals(struct uni_gc_scanner *gcsc, unsigned int c)
+{
+       size_t gc_size;
+       const unsigned char *gc = uni_gc_scan_get(gcsc, &gc_size);
+
+       if (gc_size != 1)
+               return FALSE;
+
+       /* Compare without narrowing c: casting it to unsigned char would
+          silently truncate it and make e.g. 0x141 match 'A' (0x41). */
+       return (c < 0x80 && *gc == c);
+}
+
+/* Returns TRUE when the scanner has reached the end of input, i.e. when the
+   current grapheme cluster is empty. */
+static inline bool uni_gc_scan_at_end(struct uni_gc_scanner *gcsc)
+{
+       size_t cur_size;
+
+       /* Only the size of the current cluster matters, not its contents */
+       (void)uni_gc_scan_get(gcsc, &cur_size);
+       if (cur_size > 0)
+               return FALSE;
+       return TRUE;
+}
+
  #endif
author	Stephan Bosch <stephan.bosch@open-xchange.com>
	Mon, 8 Dec 2025 04:30:38 +0000 (05:30 +0100)
committer	aki.tuomi <aki.tuomi@open-xchange.com>
	Mon, 8 Dec 2025 14:37:04 +0000 (14:37 +0000)
src/lib/test-unichar.c		patch \| blob \| blame \| history
src/lib/test-unicode-break.c		patch \| blob \| blame \| history
src/lib/unichar.c		patch \| blob \| blame \| history
src/lib/unichar.h		patch \| blob \| blame \| history