test_end();
}
+/* Checks that the grapheme cluster scanner splits each input at the expected
+   positions. Every cluster handed out by the scanner is copied to a scratch
+   string followed by a TAB, and the result is compared against a hand-marked
+   reference string. */
+static void test_unichar_grapheme_clusters(void)
+{
+	/* Raw UTF-8 inputs */
+	const char *inputs[] = {
+		/* Simple ASCII */
+		"frop",
+		/* U+1019 U+1039 U+1018 U+102C U+1037 */
+		"\xE1\x80\x99\xE1\x80\xB9\xE1\x80\x98\xE1\x80\xAC\xE1\x80\xB7",
+	};
+	/* Same inputs with a TAB after each expected grapheme cluster */
+	const char *expected[] = {
+		/* Simple ASCII: break points after every byte */
+		"f\tr\to\tp\t",
+		/* U+1019 U+1039 U+1018 U+102C U+1037 */
+		"\xE1\x80\x99\xE1\x80\xB9\xE1\x80\x98\t\xE1\x80\xAC\xE1\x80\xB7\t",
+	};
+	unsigned int input_count = N_ELEMENTS(inputs);
+	unsigned int expected_count = N_ELEMENTS(expected);
+	unsigned int idx;
+
+	/* Both tables must describe the same test cases */
+	i_assert(input_count == expected_count);
+
+	test_begin("unichar grapheme clusters");
+
+	string_t *marked = t_str_new(256);
+	for (idx = 0; idx < input_count; idx++) {
+		struct uni_gc_scanner scanner;
+
+		str_truncate(marked, 0);
+		uni_gc_scanner_init(&scanner, inputs[idx],
+				    strlen(inputs[idx]));
+
+		for (; !uni_gc_scan_at_end(&scanner);
+		     uni_gc_scan_shift(&scanner)) {
+			size_t cluster_size;
+			const unsigned char *cluster =
+				uni_gc_scan_get(&scanner, &cluster_size);
+
+			/* An empty cluster means there is nothing left */
+			if (cluster_size == 0)
+				break;
+
+			str_append_data(marked, cluster, cluster_size);
+			str_append_c(marked, '\t');
+		}
+
+		test_assert_strcmp_idx(str_c(marked), expected[idx], idx);
+	}
+	test_end();
+}
+
/* Test entry point: calls the individual test_unichar_*() cases one after the
   other.
   NOTE(review): overlong_utf8 is not referenced by any line visible here;
   presumably the code that uses it is outside this excerpt - confirm. */
void test_unichar(void)
{
static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1";
test_unichar_uni_utf8_partial_strlen_n();
test_unichar_valid_unicode();
test_unichar_surrogates();
+
+ test_unichar_grapheme_clusters(); /* grapheme cluster scanner test */
}
static void
test_gcb_line(const char *file, const char *line, unsigned int line_num)
{
- struct unicode_gc_break ubrk;
+ struct unicode_gc_break gcbrk;
const char *const *tokens = t_strsplit(line, " ");
- unicode_gc_break_init(&ubrk);
+ unicode_gc_break_init(&gcbrk);
while (tokens[0] != NULL && tokens[1] != NULL && !test_has_failed()) {
const char *brk = tokens[0];
const char *cp_hex = tokens[1];
const struct unicode_code_point_data *cp_data = NULL;
bool break_m1;
- break_m1 = unicode_gc_break_cp(&ubrk, cp, &cp_data);
+ break_m1 = unicode_gc_break_cp(&gcbrk, cp, &cp_data);
test_assert_idx(break_m1 == break_m1_test, line_num);
max_new_size--;
return max_new_size;
}
+
+/*
+ * Grapheme clusters
+ */
+
+/* Prepare gcsc for scanning the `size' octets starting at `input'.
+   The whole scanner is zeroed first, which leaves poffset == NULL; the scan
+   functions use that to recognize that no cluster has been scanned yet.
+   Only pointers into `input' are stored, no data is copied. */
+void uni_gc_scanner_init(struct uni_gc_scanner *gcsc,
+			 const void *input, size_t size)
+{
+	const unsigned char *data = input;
+
+	i_zero(gcsc);
+	gcsc->p = data;
+	gcsc->pend = data + size;
+	unicode_gc_break_init(&gcsc->gcbrk);
+}
+
+/* Advance the scanner to the next grapheme cluster.
+
+   On return the cluster is the octet range [poffset, p). When the scan
+   stopped at a boundary, the code point that follows the boundary has
+   already been decoded (gcsc->cp, gcsc->cp_size) and passed to the break
+   state, but p has not yet been moved past it; the next call does that
+   first.
+
+   Returns TRUE when a non-empty cluster is available, FALSE when the end of
+   the input was reached without one. Asserts when the input is not valid,
+   complete UTF-8. */
+bool uni_gc_scan_shift(struct uni_gc_scanner *gcsc)
+{
+	/* poffset is still NULL as long as nothing has been scanned */
+	bool at_start = (gcsc->poffset == NULL);
+
+	/* The new cluster begins where the previous one ended. */
+	gcsc->poffset = gcsc->p;
+	/* Consume the code point that was decoded by the previous call; it
+	   is the first code point of the cluster composed in this call. */
+	gcsc->p += gcsc->cp_size;
+	gcsc->cp_size = 0;
+
+	while (gcsc->p < gcsc->pend) {
+		const struct unicode_code_point_data *cp_data = NULL;
+		bool boundary;
+
+		/* Decode the next UTF-8 code point; the input is expected to
+		   be valid and complete. */
+		gcsc->cp_size = uni_utf8_get_char_n(
+			gcsc->p, gcsc->pend - gcsc->p, &gcsc->cp);
+		i_assert(gcsc->cp_size > 0);
+
+		/* Is there a grapheme cluster boundary before this code
+		   point? */
+		boundary = unicode_gc_break_cp(&gcsc->gcbrk, gcsc->cp,
+					       &cp_data);
+		if (boundary && !at_start) {
+			/* The cluster is complete. It does *NOT* include the
+			   code point decoded just now, so p stays put. */
+			i_assert(gcsc->p > gcsc->poffset);
+			return TRUE;
+		}
+		if (boundary) {
+			/* Ignore the very first boundary, which occurs at the
+			   start of input. */
+			at_start = FALSE;
+		}
+
+		/* This code point belongs to the cluster being composed. */
+		gcsc->p += gcsc->cp_size;
+		gcsc->cp_size = 0;
+	}
+
+	/* End of input: report whether a last cluster remains. */
+	return (gcsc->p > gcsc->poffset);
+}
#ifndef UNICHAR_H
#define UNICHAR_H
+#include "unicode-break.h"
+
/* Character used to replace invalid input. */
#define UNICODE_REPLACEMENT_CHAR 0xfffd
#define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
*high_r = UTF16_SURROGATE_HIGH(chr);
*low_r = UTF16_SURROGATE_LOW(chr);
}
+
+/*
+ * Grapheme clusters
+ */
+
+/* The grapheme cluster scanner is used to split a Unicode string into a
+   sequence of grapheme clusters, which are in essence the Unicode characters
+   as perceived by the user. A grapheme cluster can consist of more than one
+   code point, and consequently of more than one octet. What constitutes a
+   grapheme cluster is defined in Unicode Standard Annex #29 (Unicode Text
+   Segmentation). */
+
+/* State of a grapheme cluster scan over a UTF-8 buffer. */
+struct uni_gc_scanner {
+	/* NOTE(review): not referenced by the scanner functions visible
+	   alongside this definition - confirm whether it is needed. */
+	pool_t pool;
+	/* Grapheme break state; receives every decoded code point */
+	struct unicode_gc_break gcbrk;
+
+	/* Start of the current grapheme cluster; NULL until the first scan */
+	const unsigned char *poffset;
+	/* End of the current grapheme cluster (scan position) */
+	const unsigned char *p;
+	/* End of the input */
+	const unsigned char *pend;
+
+	/* Code point decoded at p that has not been consumed yet */
+	unichar_t cp;
+	/* Size of that code point in octets; 0 when none is pending */
+	int cp_size;
+};
+
+/* Initialize the scanner for the `size' octets starting at `input'. The
+   scanner stores pointers into `input' instead of copying it, so the buffer
+   must remain valid for as long as the scanner is used. The input is expected
+   to be valid and complete UTF-8; scanning asserts otherwise. */
+void uni_gc_scanner_init(struct uni_gc_scanner *gcsc,
+ const void *input, size_t size);
+/* Shift scanner position to the next grapheme cluster. On a freshly
+   initialized scanner this selects the first cluster. Returns TRUE when the
+   scanner points to a non-empty grapheme cluster, and FALSE when the end of
+   the input has been reached. */
+bool uni_gc_scan_shift(struct uni_gc_scanner *gcsc) ATTR_NOWARN_UNUSED_RESULT;
+
+
+/* Return a pointer to the first octet of the grapheme cluster the scanner
+   currently points to. When size_r is not NULL, the length of the cluster in
+   octets is stored there; a length of 0 means no cluster is left. If nothing
+   has been scanned yet, the first cluster is scanned implicitly. */
+static inline const unsigned char *
+uni_gc_scan_get(struct uni_gc_scanner *gcsc, size_t *size_r)
+{
+	const unsigned char *cluster;
+
+	/* poffset is NULL until the first shift has happened */
+	if (gcsc->poffset == NULL)
+		uni_gc_scan_shift(gcsc);
+
+	cluster = gcsc->poffset;
+	if (size_r != NULL)
+		*size_r = (size_t)(gcsc->p - cluster);
+	return cluster;
+}
+
+/* Convenience function for checking whether the current grapheme cluster is
+   a particular (single-octet) ASCII character. Returns FALSE when the current
+   cluster is empty, is longer than one octet, or consists of a different
+   octet. A value of c that does not fit in a single octet never matches. */
+static inline bool
+uni_gc_scan_ascii_equals(struct uni_gc_scanner *gcsc, unsigned int c)
+{
+	size_t gc_size;
+	const unsigned char *gc = uni_gc_scan_get(gcsc, &gc_size);
+
+	if (gc_size != 1)
+		return FALSE;
+
+	/* Compare at full width. Narrowing c to unsigned char would discard
+	   its upper bits, making e.g. 0x141 compare equal to 'A'. */
+	return ((unsigned int)*gc == c);
+}
+
+/* Returns TRUE when no grapheme cluster is left, i.e. the scanner has reached
+   the end of the input. If nothing has been scanned yet, this scans the first
+   cluster as a side effect of querying the current cluster. */
+static inline bool uni_gc_scan_at_end(struct uni_gc_scanner *gcsc)
+{
+	size_t cluster_size;
+
+	/* Only the size matters here; the cluster pointer is not needed. */
+	(void)uni_gc_scan_get(gcsc, &cluster_size);
+	if (cluster_size > 0)
+		return FALSE;
+	return TRUE;
+}
+
#endif