+2025-05-18 Bruno Haible <bruno@clisp.org>
+
+ unigbrk/u*-grapheme-prev: Support Indic, Emojis, regional indicators.
+ Reported by Kang-Che Sung <explorer09@gmail.com> in
+ <https://lists.gnu.org/archive/html/bug-libunistring/2025-03/msg00000.html>.
+ * lib/unigbrk/u-grapheme-prev.h: New file, based on
+ lib/unigbrk/u-grapheme-breaks.h.
+ * lib/unigbrk/u8-grapheme-prev.c: Include unictype.h and
+ u-grapheme-prev.h.
+ (u8_grapheme_prev): Remove function.
+ * lib/unigbrk/u16-grapheme-prev.c: Include unictype.h and
+ u-grapheme-prev.h.
+ (u16_grapheme_prev): Remove function.
+ * lib/unigbrk/u32-grapheme-prev.c: Include unictype.h and
+ u-grapheme-prev.h.
+ (u32_grapheme_prev): Remove function.
+ * modules/unigbrk/u8-grapheme-prev (Files): Add
+ lib/unigbrk/u-grapheme-prev.h.
+ (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
+ unigbrk/uc-gbrk-prop, unictype/incb-of,
+ unictype/property-extended-pictographic, bool.
+ (configure.ac): Bump required libunistring version.
+ * modules/unigbrk/u16-grapheme-prev (Files): Add
+ lib/unigbrk/u-grapheme-prev.h.
+ (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
+ unigbrk/uc-gbrk-prop, unictype/incb-of,
+ unictype/property-extended-pictographic, bool.
+ (configure.ac): Bump required libunistring version.
+ * modules/unigbrk/u32-grapheme-prev (Files): Add
+ lib/unigbrk/u-grapheme-prev.h.
+ (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
+ unigbrk/uc-gbrk-prop, unictype/incb-of,
+ unictype/property-extended-pictographic, bool.
+ (configure.ac): Bump required libunistring version.
+ * tests/unigbrk/test-u8-grapheme-prev.c (main): Add more test cases,
+ from tests/unigbrk/test-u8-grapheme-breaks.c.
+ * tests/unigbrk/test-u16-grapheme-prev.c (main): Add more test cases,
+ from tests/unigbrk/test-u16-grapheme-breaks.c.
+ * tests/unigbrk/test-u32-grapheme-prev.c (main): Add more test cases,
+ from tests/unigbrk/test-u32-grapheme-breaks.c.
+
2025-05-18 Bruno Haible <bruno@clisp.org>
unigbrk/u*-grapheme-next: Support Indic, Emojis, regional indicators.
--- /dev/null
+/* Previous grapheme cluster function.
+ Copyright (C) 2010-2025 Free Software Foundation, Inc.
+
+ This file is free software.
+ It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
+ You can redistribute it and/or modify it under either
+ - the terms of the GNU Lesser General Public License as published
+ by the Free Software Foundation, either version 3, or (at your
+ option) any later version, or
+ - the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option)
+ any later version, or
+ - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
+
+ This file is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License and the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License and of the GNU General Public License along with this
+ program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Bruno Haible <bruno@clisp.org>, 2025. */
+
+/* This file implements section 3 "Grapheme Cluster Boundaries"
+ of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>
+ backwards. */
+
+/* Tells whether the string [s_start, s) has, at its end, a run of
+   characters whose Indic_Conjunct_Break values match
+     consonant {extend|linker}* linker {extend|linker}*
+   that is, a consonant followed only by extend and linker characters,
+   among which there is at least one linker.
+   Returns false when no such run is found, including when U_PREV fails
+   before a consonant has been reached.  */
+static bool
+ends_with_incb_consonant_extended_linker_extended (const UNIT *s,
+                                                   const UNIT *s_start)
+{
+  /* Set once a linker has been passed on the way back to the consonant.  */
+  bool linker_found = false;
+  const UNIT *pos = s;
+
+  while (pos > s_start)
+    {
+      ucs4_t ch;
+      const UNIT *before = U_PREV (&ch, pos, s_start);
+
+      if (before == NULL)
+        /* Ill-formed encoding.  */
+        return false;
+
+      int ch_incb = uc_indic_conjunct_break (ch);
+
+      if (ch_incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT)
+        /* Reached the consonant; the run qualifies only with a linker.  */
+        return linker_found;
+
+      if (ch_incb < UC_INDIC_CONJUNCT_BREAK_LINKER)
+        /* Neither linker nor extend.  This relies, like the ordering
+           comparison itself, on linker and extend being the values from
+           UC_INDIC_CONJUNCT_BREAK_LINKER upwards.  */
+        return false;
+
+      if (ch_incb == UC_INDIC_CONJUNCT_BREAK_LINKER)
+        linker_found = true;
+
+      pos = before;
+    }
+
+  /* Start of the string reached without meeting a consonant.  */
+  return false;
+}
+
+/* Tells whether the string [s_start, s) has, at its end, characters of
+   the shape
+     \p{Extended_Pictographic} Extend*
+   that is, an Extended_Pictographic character followed by zero or more
+   characters whose grapheme cluster break property is Extend.
+   Returns false otherwise, including when U_PREV fails first.  */
+static bool
+ends_with_emoji_modifier_sequence (const UNIT *s, const UNIT *s_start)
+{
+  const UNIT *pos = s;
+
+  while (pos > s_start)
+    {
+      ucs4_t ch;
+      const UNIT *before = U_PREV (&ch, pos, s_start);
+
+      if (before == NULL)
+        /* Ill-formed encoding.  */
+        return false;
+
+      /* The pictographic test comes first: it decides the outcome no
+         matter what the break property of the same character is.  */
+      if (uc_is_property_extended_pictographic (ch))
+        return true;
+
+      if (uc_graphemeclusterbreak_property (ch) != GBP_EXTEND)
+        /* Something other than Extend sits in the way.  */
+        return false;
+
+      pos = before;
+    }
+
+  /* Only Extend characters (or nothing at all) down to s_start.  */
+  return false;
+}
+
+/* Counts how many regional indicator (RI) characters stand, without
+   interruption, at the end of the string [s_start, s).
+   Counting stops at the first character that is not an RI, at s_start,
+   or where U_PREV fails.  */
+static size_t
+ends_with_ri_count (const UNIT *s, const UNIT *s_start)
+{
+  size_t n_ri = 0;
+  const UNIT *pos = s;
+
+  while (pos > s_start)
+    {
+      ucs4_t ch;
+      const UNIT *before = U_PREV (&ch, pos, s_start);
+
+      /* ch is examined only if decoding succeeded.  */
+      if (before == NULL
+          || uc_graphemeclusterbreak_property (ch) != GBP_RI)
+        break;
+
+      n_ri++;
+      pos = before;
+    }
+
+  return n_ri;
+}
+
+const UNIT *
+FUNC (const UNIT *s, const UNIT *s_start)
+{
+ if (s == s_start)
+ return NULL;
+
+ /* Traverse the string backwards, from s down to s_start.
+ The empty range [s_start, s) was answered with NULL above.
+ Each iteration decodes the character uc that ends at s and decides
+ whether a grapheme cluster break lies between uc and the character
+ decoded in the previous iteration, which the next_char_* variables
+ describe. At the first such break the loop is left and the current s
+ is returned; otherwise the loop runs until s is no longer above
+ s_start.
+ The remarks "*p = 0" and "*p = 1" below presumably name the value that
+ lib/unigbrk/u-grapheme-breaks.h stores for the same rule; here "*p = 0"
+ marks the branches that keep going (no break) and "*p = 1" marks a
+ break. */
+
+ /* Grapheme Cluster break property of the next character.
+ "Next" is the character that starts at s, that is, the one decoded in
+ the previous loop iteration.
+ -1 at the very end of the string. */
+ int next_char_prop = -1;
+
+ /* Indic_Conjunct_Break property of the next character.
+ -1 at the very end of the string. */
+ int next_char_incb = -1;
+
+ /* Extended_Pictographic property of the next character.
+ false at the very end of the string. */
+ bool next_char_epic = false;
+
+ do
+ {
+ const UNIT *prev_s;
+ ucs4_t uc;
+
+ prev_s = U_PREV (&uc, s, s_start);
+ if (prev_s == NULL)
+ {
+ /* Ill-formed encoding: U_PREV could not decode a character
+ ending at s. This need not be UTF-8; UNIT may just as well be
+ a UTF-16 or UTF-32 unit. Give up and report s_start as the
+ start of the cluster. */
+ return s_start;
+ }
+
+ int prop = uc_graphemeclusterbreak_property (uc);
+ int incb = uc_indic_conjunct_break (uc);
+ bool epic = uc_is_property_extended_pictographic (uc);
+
+ /* Break at the end of the string (GB2).
+ This is the first iteration: there is no next character, so
+ there is nothing to decide and uc becomes part of the cluster. */
+ if (next_char_prop < 0)
+ /* *p = 1 */;
+ else
+ {
+ /* No break between CR and LF (GB3). */
+ if (prop == GBP_CR && next_char_prop == GBP_LF)
+ /* *p = 0 */;
+ /* Break before and after newlines (GB4, GB5).
+ This must come after the CR LF test above, which is the one
+ exception to it. */
+ else if ((prop == GBP_CR
+ || prop == GBP_LF
+ || prop == GBP_CONTROL)
+ || (next_char_prop == GBP_CR
+ || next_char_prop == GBP_LF
+ || next_char_prop == GBP_CONTROL))
+ break /* *p = 1 */;
+ /* No break between Hangul syllable sequences (GB6, GB7, GB8). */
+ else if ((prop == GBP_L
+ && (next_char_prop == GBP_L
+ || next_char_prop == GBP_V
+ || next_char_prop == GBP_LV
+ || next_char_prop == GBP_LVT))
+ || ((prop == GBP_LV
+ || prop == GBP_V)
+ && (next_char_prop == GBP_V
+ || next_char_prop == GBP_T))
+ || ((prop == GBP_LVT
+ || prop == GBP_T)
+ && next_char_prop == GBP_T))
+ /* *p = 0 */;
+ /* No break before extending characters or ZWJ (GB9). */
+ else if (next_char_prop == GBP_EXTEND || next_char_prop == GBP_ZWJ)
+ /* *p = 0 */;
+ /* No break before SpacingMarks (GB9a). */
+ else if (next_char_prop == GBP_SPACINGMARK)
+ /* *p = 0 */;
+ /* No break after Prepend characters (GB9b). */
+ else if (prop == GBP_PREPEND)
+ /* *p = 0 */;
+ /* No break within certain combinations of Indic_Conjunct_Break
+ values: Between
+ consonant {extend|linker}* linker {extend|linker}*
+ and
+ consonant
+ (GB9c).
+ The helper looks at the text that ends at s, which includes
+ uc. */
+ else if (next_char_incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT
+ && ends_with_incb_consonant_extended_linker_extended (s, s_start))
+ /* *p = 0 */;
+ /* No break within emoji modifier sequences or emoji zwj sequences
+ (GB11).
+ uc is the ZWJ here, so the helper looks at the text that ends
+ at prev_s, just before the ZWJ. */
+ else if (next_char_epic
+ && prop == GBP_ZWJ
+ && ends_with_emoji_modifier_sequence (prev_s, s_start))
+ /* *p = 0 */;
+ /* No break between RI if there is an odd number of RI
+ characters before (GB12, GB13).
+ uc is itself an RI here, so an even count of RIs in front of
+ uc (the text ending at prev_s) means an odd number of RIs,
+ uc included, in front of the candidate break position. */
+ else if (next_char_prop == GBP_RI
+ && prop == GBP_RI
+ && (ends_with_ri_count (prev_s, s_start) % 2) == 0)
+ /* *p = 0 */;
+ /* Break everywhere (GB999). */
+ else
+ break /* *p = 1 */;
+ }
+
+ s = prev_s;
+ next_char_prop = prop;
+ next_char_incb = incb;
+ next_char_epic = epic;
+ }
+ while (s > s_start);
+
+ return s;
+}
/* Previous grapheme cluster function.
Copyright (C) 2010-2025 Free Software Foundation, Inc.
- Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
This file is free software.
It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
License and of the GNU General Public License along with this
program. If not, see <https://www.gnu.org/licenses/>. */
+/* Written by Bruno Haible <bruno@clisp.org>, 2025. */
+
/* Don't use the const-improved function macros in this compilation unit. */
#define _LIBUNISTRING_NO_CONST_GENERICS
/* Specification. */
#include "unigbrk.h"
+#include "unictype.h"
#include "unistr.h"
-const uint16_t *
-u16_grapheme_prev (const uint16_t *s, const uint16_t *start)
-{
- ucs4_t next;
-
- if (s == start)
- return NULL;
-
- s = u16_prev (&next, s, start);
- while (s != start)
- {
- const uint16_t *prev_s;
- ucs4_t prev;
-
- prev_s = u16_prev (&prev, s, start);
- if (prev_s == NULL)
- {
- /* Ill-formed UTF-16 encoding. */
- return start;
- }
-
- if (uc_is_grapheme_break (prev, next))
- break;
-
- s = prev_s;
- next = prev;
- }
-
- return s;
-}
+#define FUNC u16_grapheme_prev /* Name of the function that the template defines. */
+#define UNIT uint16_t /* Type of the string units the template works on. */
+#define U_PREV u16_prev /* Backward decoder, called as U_PREV (&uc, s, s_start). */
+#include "u-grapheme-prev.h" /* Provides the definition of FUNC. */
/* Previous grapheme cluster function.
Copyright (C) 2010-2025 Free Software Foundation, Inc.
- Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
This file is free software.
It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
License and of the GNU General Public License along with this
program. If not, see <https://www.gnu.org/licenses/>. */
+/* Written by Bruno Haible <bruno@clisp.org>, 2025. */
+
/* Don't use the const-improved function macros in this compilation unit. */
#define _LIBUNISTRING_NO_CONST_GENERICS
/* Specification. */
#include "unigbrk.h"
+#include "unictype.h"
#include "unistr.h"
-const uint32_t *
-u32_grapheme_prev (const uint32_t *s, const uint32_t *start)
-{
- ucs4_t next;
-
- if (s == start)
- return NULL;
-
- u32_prev (&next, s, start);
- for (s--; s != start; s--)
- {
- ucs4_t prev;
-
- if (u32_prev (&prev, s, start) == NULL)
- {
- /* Ill-formed UTF-32 encoding. */
- return start;
- }
-
- if (uc_is_grapheme_break (prev, next))
- break;
-
- next = prev;
- }
-
- return s;
-}
+#define FUNC u32_grapheme_prev /* Name of the function that the template defines. */
+#define UNIT uint32_t /* Type of the string units the template works on. */
+#define U_PREV u32_prev /* Backward decoder, called as U_PREV (&uc, s, s_start). */
+#include "u-grapheme-prev.h" /* Provides the definition of FUNC. */
/* Previous grapheme cluster function.
Copyright (C) 2010-2025 Free Software Foundation, Inc.
- Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
This file is free software.
It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
License and of the GNU General Public License along with this
program. If not, see <https://www.gnu.org/licenses/>. */
+/* Written by Bruno Haible <bruno@clisp.org>, 2025. */
+
/* Don't use the const-improved function macros in this compilation unit. */
#define _LIBUNISTRING_NO_CONST_GENERICS
/* Specification. */
#include "unigbrk.h"
+#include "unictype.h"
#include "unistr.h"
-const uint8_t *
-u8_grapheme_prev (const uint8_t *s, const uint8_t *start)
-{
- ucs4_t next;
-
- if (s == start)
- return NULL;
-
- s = u8_prev (&next, s, start);
- while (s != start)
- {
- const uint8_t *prev_s;
- ucs4_t prev;
-
- prev_s = u8_prev (&prev, s, start);
- if (prev_s == NULL)
- {
- /* Ill-formed UTF-8 encoding. */
- return start;
- }
-
- if (uc_is_grapheme_break (prev, next))
- break;
-
- s = prev_s;
- next = prev;
- }
-
- return s;
-}
+#define FUNC u8_grapheme_prev /* Name of the function that the template defines. */
+#define UNIT uint8_t /* Type of the string units the template works on. */
+#define U_PREV u8_prev /* Backward decoder, called as U_PREV (&uc, s, s_start). */
+#include "u-grapheme-prev.h" /* Provides the definition of FUNC. */
Files:
lib/unigbrk/u16-grapheme-prev.c
+lib/unigbrk/u-grapheme-prev.h
tests/macros.h
Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
+unictype/incb-of
+unictype/property-extended-pictographic
unistr/u16-prev
+bool
configure.ac:
gl_MODULE_INDICATOR([unigbrk/u16-grapheme-prev])
-gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u16-grapheme-prev])
+gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u16-grapheme-prev])
Makefile.am:
if LIBUNISTRING_COMPILE_UNIGBRK_U16_GRAPHEME_PREV
Files:
lib/unigbrk/u32-grapheme-prev.c
+lib/unigbrk/u-grapheme-prev.h
tests/macros.h
Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
+unictype/incb-of
+unictype/property-extended-pictographic
unistr/u32-prev
+bool
configure.ac:
gl_MODULE_INDICATOR([unigbrk/u32-grapheme-prev])
-gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u32-grapheme-prev])
+gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u32-grapheme-prev])
Makefile.am:
if LIBUNISTRING_COMPILE_UNIGBRK_U32_GRAPHEME_PREV
Files:
lib/unigbrk/u8-grapheme-prev.c
+lib/unigbrk/u-grapheme-prev.h
tests/macros.h
Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
+unictype/incb-of
+unictype/property-extended-pictographic
unistr/u8-prev
+bool
configure.ac:
gl_MODULE_INDICATOR([unigbrk/u8-grapheme-prev])
-gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u8-grapheme-prev])
+gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u8-grapheme-prev])
Makefile.am:
if LIBUNISTRING_COMPILE_UNIGBRK_U8_GRAPHEME_PREV
test_u16_grapheme_prev (1, 'e', ACUTE, 'x', -1);
test_u16_grapheme_prev (2, 'e', ACUTE, 'e', ACUTE, -1);
+ /* CR LF handling. */
+ test_u16_grapheme_prev (2, 'c', '\r', '\n', -1);
+
+ /* Emoji modifier / ZWJ sequence. */
+ test_u16_grapheme_prev (5, 0x2605, 0x0305, 0x0347, 0x200D, 0x2600, -1);
+
+ /* Regional indicators. */
+ test_u16_grapheme_prev (4, 0xD83C, 0xDDE9, 0xD83C, 0xDDEA, 0xD83C, 0xDDEB, 0xD83C, 0xDDF7, -1);
+
/* Surrogate pairs. */
test_u16_grapheme_prev (2, 0xd83d, 0xde10, -1); /* 😐: neutral face. */
test_u16_grapheme_prev (3, 0xd83d, 0xde10, GRAVE, -1);
test_u32_grapheme_prev (1, 'e', ACUTE, 'x', -1);
test_u32_grapheme_prev (2, 'e', ACUTE, 'e', ACUTE, -1);
+ /* CR LF handling. */
+ test_u32_grapheme_prev (2, 'c', '\r', '\n', -1);
+
+ /* Emoji modifier / ZWJ sequence. */
+ test_u32_grapheme_prev (5, 0x2605, 0x0305, 0x0347, 0x200D, 0x2600, -1);
+
+ /* Regional indicators. */
+ test_u32_grapheme_prev (2, 0x1F1E9, 0x1F1EA, 0x1F1EB, 0x1F1F7, -1);
+
/* Outside BMP. */
#define NEUTRAL_FACE 0x1f610 /* 😐: neutral face. */
test_u32_grapheme_prev (1, NEUTRAL_FACE, -1);
test_u8_grapheme_prev ("e"ACUTE"x", 4, 1);
test_u8_grapheme_prev ("e"ACUTE "e"ACUTE, 6, 3);
+ /* CR LF handling. */
+ test_u8_grapheme_prev ("c\r\n", 3, 2);
+
+ /* Emoji modifier / ZWJ sequence. */
+ test_u8_grapheme_prev ("\342\230\205\314\205\315\207\342\200\215\342\230\200",
+ 13, 13);
+
+ /* Regional indicators. */
+ test_u8_grapheme_prev ("\360\237\207\251\360\237\207\252\360\237\207\253\360\237\207\267",
+ 16, 8);
+
return test_exit_status;
}