From: Bruno Haible Date: Mon, 19 May 2025 00:01:32 +0000 (+0200) Subject: unigbrk/u*-grapheme-prev: Support Indic, Emojis, regional indicators. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0a319fd506eeb3fa832e50c9a2ed5aa492401200;p=thirdparty%2Fgnulib.git unigbrk/u*-grapheme-prev: Support Indic, Emojis, regional indicators. Reported by Kang-Che Sung in . * lib/unigbrk/u-grapheme-prev.h: New file, based on lib/unigbrk/u-grapheme-breaks.h. * lib/unigbrk/u8-grapheme-prev.c: Include unictype.h and u-grapheme-prev.h. (u8_grapheme_prev): Remove function. * lib/unigbrk/u16-grapheme-prev.c: Include unictype.h and u-grapheme-prev.h. (u16_grapheme_prev): Remove function. * lib/unigbrk/u32-grapheme-prev.c: Include unictype.h and u-grapheme-prev.h. (u32_grapheme_prev): Remove function. * modules/unigbrk/u8-grapheme-prev (Files): Add lib/unigbrk/u-grapheme-prev.h. (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, unigbrk/uc-gbrk-prop, unictype/incb-of, unictype/property-extended-pictographic, bool. (configure.ac): Bump required libunistring version. * modules/unigbrk/u16-grapheme-prev (Files): Add lib/unigbrk/u-grapheme-prev.h. (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, unigbrk/uc-gbrk-prop, unictype/incb-of, unictype/property-extended-pictographic, bool. (configure.ac): Bump required libunistring version. * modules/unigbrk/u32-grapheme-prev (Files): Add lib/unigbrk/u-grapheme-prev.h. (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, unigbrk/uc-gbrk-prop, unictype/incb-of, unictype/property-extended-pictographic, bool. (configure.ac): Bump required libunistring version. * tests/unigbrk/test-u8-grapheme-prev.c (main): Add more test cases, from tests/unigbrk/test-u8-grapheme-breaks.c. * tests/unigbrk/test-u16-grapheme-prev.c (main): Add more test cases, from tests/unigbrk/test-u16-grapheme-breaks.c. 
* tests/unigbrk/test-u32-grapheme-prev.c (main): Add more test cases, from tests/unigbrk/test-u32-grapheme-breaks.c. --- diff --git a/ChangeLog b/ChangeLog index b53bca0cb3..09c5b008df 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,44 @@ +2025-05-18 Bruno Haible + + unigbrk/u*-grapheme-prev: Support Indic, Emojis, regional indicators. + Reported by Kang-Che Sung in + . + * lib/unigbrk/u-grapheme-prev.h: New file, based on + lib/unigbrk/u-grapheme-breaks.h. + * lib/unigbrk/u8-grapheme-prev.c: Include unictype.h and + u-grapheme-prev.h. + (u8_grapheme_prev): Remove function. + * lib/unigbrk/u16-grapheme-prev.c: Include unictype.h and + u-grapheme-prev.h. + (u16_grapheme_prev): Remove function. + * lib/unigbrk/u32-grapheme-prev.c: Include unictype.h and + u-grapheme-prev.h. + (u32_grapheme_prev): Remove function. + * modules/unigbrk/u8-grapheme-prev (Files): Add + lib/unigbrk/u-grapheme-prev.h. + (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, + unigbrk/uc-gbrk-prop, unictype/incb-of, + unictype/property-extended-pictographic, bool. + (configure.ac): Bump required libunistring version. + * modules/unigbrk/u16-grapheme-prev (Files): Add + lib/unigbrk/u-grapheme-prev.h. + (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, + unigbrk/uc-gbrk-prop, unictype/incb-of, + unictype/property-extended-pictographic, bool. + (configure.ac): Bump required libunistring version. + * modules/unigbrk/u32-grapheme-prev (Files): Add + lib/unigbrk/u-grapheme-prev.h. + (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, + unigbrk/uc-gbrk-prop, unictype/incb-of, + unictype/property-extended-pictographic, bool. + (configure.ac): Bump required libunistring version. + * tests/unigbrk/test-u8-grapheme-prev.c (main): Add more test cases, + from tests/unigbrk/test-u8-grapheme-breaks.c. + * tests/unigbrk/test-u16-grapheme-prev.c (main): Add more test cases, + from tests/unigbrk/test-u16-grapheme-breaks.c. 
+ * tests/unigbrk/test-u32-grapheme-prev.c (main): Add more test cases, + from tests/unigbrk/test-u32-grapheme-breaks.c. + 2025-05-18 Bruno Haible <bruno@clisp.org> unigbrk/u*-grapheme-next: Support Indic, Emojis, regional indicators. diff --git a/lib/unigbrk/u-grapheme-prev.h b/lib/unigbrk/u-grapheme-prev.h new file mode 100644 index 0000000000..0894d5992e --- /dev/null +++ b/lib/unigbrk/u-grapheme-prev.h @@ -0,0 +1,233 @@ +/* Grapheme cluster break function. + Copyright (C) 2010-2025 Free Software Foundation, Inc. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". + You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Bruno Haible <bruno@clisp.org>, 2025. */ + +/* This file implements section 3 "Grapheme Cluster Boundaries" + of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/> + backwards. */ + +/* Returns true if the string [s_start, s) ends with a sequence of + Indic_Conjunct_Break values like: + consonant {extend|linker}* linker {extend|linker}* + */ +static bool +ends_with_incb_consonant_extended_linker_extended (const UNIT *s, + const UNIT *s_start) +{ + /* Look for + consonant {extend|linker}* + with at least one linker. 
*/ + bool seen_linker = false; + + while (s > s_start) + { + const UNIT *prev_s; + ucs4_t uc; + + prev_s = U_PREV (&uc, s, s_start); + if (prev_s == NULL) + /* Ill-formed UTF-8 encoding. */ + break; + + int incb = uc_indic_conjunct_break (uc); + if (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT) + return seen_linker; + if (!(incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)) + break; + seen_linker |= (incb == UC_INDIC_CONJUNCT_BREAK_LINKER); + + s = prev_s; + } + + return false; +} + +/* Returns true if the string [s_start, s) ends with a sequence of + characters like: + \p{Extended_Pictographic} Extend* + */ +static bool +ends_with_emoji_modifier_sequence (const UNIT *s, const UNIT *s_start) +{ + while (s > s_start) + { + const UNIT *prev_s; + ucs4_t uc; + + prev_s = U_PREV (&uc, s, s_start); + if (prev_s == NULL) + /* Ill-formed UTF-8 encoding. */ + break; + + if (uc_is_property_extended_pictographic (uc)) + return true; + + if (uc_graphemeclusterbreak_property (uc) != GBP_EXTEND) + break; + + s = prev_s; + } + + return false; +} + +/* Returns the number of consecutive regional indicator (RI) characters + at the end of the string [s_start, s). */ +static size_t +ends_with_ri_count (const UNIT *s, const UNIT *s_start) +{ + size_t ri_count = 0; + + while (s > s_start) + { + const UNIT *prev_s; + ucs4_t uc; + + prev_s = U_PREV (&uc, s, s_start); + if (prev_s == NULL) + /* Ill-formed UTF-8 encoding. */ + break; + + if (uc_graphemeclusterbreak_property (uc) == GBP_RI) + ri_count++; + else + break; + + s = prev_s; + } + + return ri_count; +} + +const UNIT * +FUNC (const UNIT *s, const UNIT *s_start) +{ + if (s == s_start) + return NULL; + + /* Traverse the string backwards, from s down to s_start. */ + + /* Grapheme Cluster break property of the next character. + -1 at the very end of the string. */ + int next_char_prop = -1; + + /* Indic_Conjunct_Break property of the next character. + -1 at the very end of the string. 
*/ + int next_char_incb = -1; + + /* Extended_Pictographic property of the next character. + false at the very end of the string. */ + bool next_char_epic = false; + + do + { + const UNIT *prev_s; + ucs4_t uc; + + prev_s = U_PREV (&uc, s, s_start); + if (prev_s == NULL) + { + /* Ill-formed UTF-8 encoding. */ + return s_start; + } + + int prop = uc_graphemeclusterbreak_property (uc); + int incb = uc_indic_conjunct_break (uc); + bool epic = uc_is_property_extended_pictographic (uc); + + /* Break at the end of the string (GB2). */ + if (next_char_prop < 0) + /* *p = 1 */; + else + { + /* No break between CR and LF (GB3). */ + if (prop == GBP_CR && next_char_prop == GBP_LF) + /* *p = 0 */; + /* Break before and after newlines (GB4, GB5). */ + else if ((prop == GBP_CR + || prop == GBP_LF + || prop == GBP_CONTROL) + || (next_char_prop == GBP_CR + || next_char_prop == GBP_LF + || next_char_prop == GBP_CONTROL)) + break /* *p = 1 */; + /* No break between Hangul syllable sequences (GB6, GB7, GB8). */ + else if ((prop == GBP_L + && (next_char_prop == GBP_L + || next_char_prop == GBP_V + || next_char_prop == GBP_LV + || next_char_prop == GBP_LVT)) + || ((prop == GBP_LV + || prop == GBP_V) + && (next_char_prop == GBP_V + || next_char_prop == GBP_T)) + || ((prop == GBP_LVT + || prop == GBP_T) + && next_char_prop == GBP_T)) + /* *p = 0 */; + /* No break before extending characters or ZWJ (GB9). */ + else if (next_char_prop == GBP_EXTEND || next_char_prop == GBP_ZWJ) + /* *p = 0 */; + /* No break before SpacingMarks (GB9a). */ + else if (next_char_prop == GBP_SPACINGMARK) + /* *p = 0 */; + /* No break after Prepend characters (GB9b). */ + else if (prop == GBP_PREPEND) + /* *p = 0 */; + /* No break within certain combinations of Indic_Conjunct_Break + values: Between + consonant {extend|linker}* linker {extend|linker}* + and + consonant + (GB9c). 
*/ + else if (next_char_incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT + && ends_with_incb_consonant_extended_linker_extended (s, s_start)) + /* *p = 0 */; + /* No break within emoji modifier sequences or emoji zwj sequences + (GB11). */ + else if (next_char_epic + && prop == GBP_ZWJ + && ends_with_emoji_modifier_sequence (prev_s, s_start)) + /* *p = 0 */; + /* No break between RI if there is an odd number of RI + characters before (GB12, GB13). */ + else if (next_char_prop == GBP_RI + && prop == GBP_RI + && (ends_with_ri_count (prev_s, s_start) % 2) == 0) + /* *p = 0 */; + /* Break everywhere (GB999). */ + else + break /* *p = 1 */; + } + + s = prev_s; + next_char_prop = prop; + next_char_incb = incb; + next_char_epic = epic; + } + while (s > s_start); + + return s; +} diff --git a/lib/unigbrk/u16-grapheme-prev.c b/lib/unigbrk/u16-grapheme-prev.c index 02fe72f261..4c70e11843 100644 --- a/lib/unigbrk/u16-grapheme-prev.c +++ b/lib/unigbrk/u16-grapheme-prev.c @@ -1,6 +1,5 @@ /* Previous grapheme cluster function. Copyright (C) 2010-2025 Free Software Foundation, Inc. - Written by Ben Pfaff , 2010. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,6 +22,8 @@ License and of the GNU General Public License along with this program. If not, see . */ +/* Written by Bruno Haible , 2025. */ + /* Don't use the const-improved function macros in this compilation unit. */ #define _LIBUNISTRING_NO_CONST_GENERICS @@ -31,35 +32,10 @@ /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint16_t * -u16_grapheme_prev (const uint16_t *s, const uint16_t *start) -{ - ucs4_t next; - - if (s == start) - return NULL; - - s = u16_prev (&next, s, start); - while (s != start) - { - const uint16_t *prev_s; - ucs4_t prev; - - prev_s = u16_prev (&prev, s, start); - if (prev_s == NULL) - { - /* Ill-formed UTF-16 encoding. 
*/ - return start; - } - - if (uc_is_grapheme_break (prev, next)) - break; - - s = prev_s; - next = prev; - } - - return s; -} +#define FUNC u16_grapheme_prev +#define UNIT uint16_t +#define U_PREV u16_prev +#include "u-grapheme-prev.h" diff --git a/lib/unigbrk/u32-grapheme-prev.c b/lib/unigbrk/u32-grapheme-prev.c index c76fb9ab52..977a1977c6 100644 --- a/lib/unigbrk/u32-grapheme-prev.c +++ b/lib/unigbrk/u32-grapheme-prev.c @@ -1,6 +1,5 @@ /* Previous grapheme cluster function. Copyright (C) 2010-2025 Free Software Foundation, Inc. - Written by Ben Pfaff , 2010. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,6 +22,8 @@ License and of the GNU General Public License along with this program. If not, see . */ +/* Written by Bruno Haible , 2025. */ + /* Don't use the const-improved function macros in this compilation unit. */ #define _LIBUNISTRING_NO_CONST_GENERICS @@ -31,32 +32,10 @@ /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint32_t * -u32_grapheme_prev (const uint32_t *s, const uint32_t *start) -{ - ucs4_t next; - - if (s == start) - return NULL; - - u32_prev (&next, s, start); - for (s--; s != start; s--) - { - ucs4_t prev; - - if (u32_prev (&prev, s, start) == NULL) - { - /* Ill-formed UTF-32 encoding. */ - return start; - } - - if (uc_is_grapheme_break (prev, next)) - break; - - next = prev; - } - - return s; -} +#define FUNC u32_grapheme_prev +#define UNIT uint32_t +#define U_PREV u32_prev +#include "u-grapheme-prev.h" diff --git a/lib/unigbrk/u8-grapheme-prev.c b/lib/unigbrk/u8-grapheme-prev.c index 79748cf3fb..a2d872f01b 100644 --- a/lib/unigbrk/u8-grapheme-prev.c +++ b/lib/unigbrk/u8-grapheme-prev.c @@ -1,6 +1,5 @@ /* Previous grapheme cluster function. Copyright (C) 2010-2025 Free Software Foundation, Inc. - Written by Ben Pfaff , 2010. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". 
@@ -23,6 +22,8 @@ License and of the GNU General Public License along with this program. If not, see . */ +/* Written by Bruno Haible , 2025. */ + /* Don't use the const-improved function macros in this compilation unit. */ #define _LIBUNISTRING_NO_CONST_GENERICS @@ -31,35 +32,10 @@ /* Specification. */ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint8_t * -u8_grapheme_prev (const uint8_t *s, const uint8_t *start) -{ - ucs4_t next; - - if (s == start) - return NULL; - - s = u8_prev (&next, s, start); - while (s != start) - { - const uint8_t *prev_s; - ucs4_t prev; - - prev_s = u8_prev (&prev, s, start); - if (prev_s == NULL) - { - /* Ill-formed UTF-8 encoding. */ - return start; - } - - if (uc_is_grapheme_break (prev, next)) - break; - - s = prev_s; - next = prev; - } - - return s; -} +#define FUNC u8_grapheme_prev +#define UNIT uint8_t +#define U_PREV u8_prev +#include "u-grapheme-prev.h" diff --git a/modules/unigbrk/u16-grapheme-prev b/modules/unigbrk/u16-grapheme-prev index d9393efabf..1135de7230 100644 --- a/modules/unigbrk/u16-grapheme-prev +++ b/modules/unigbrk/u16-grapheme-prev @@ -3,15 +3,20 @@ Find start of previous grapheme cluster in UTF-16 string. Files: lib/unigbrk/u16-grapheme-prev.c +lib/unigbrk/u-grapheme-prev.h tests/macros.h Depends-on: -unigbrk/uc-is-grapheme-break +unigbrk/base +unigbrk/uc-gbrk-prop +unictype/incb-of +unictype/property-extended-pictographic unistr/u16-prev +bool configure.ac: gl_MODULE_INDICATOR([unigbrk/u16-grapheme-prev]) -gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u16-grapheme-prev]) +gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u16-grapheme-prev]) Makefile.am: if LIBUNISTRING_COMPILE_UNIGBRK_U16_GRAPHEME_PREV diff --git a/modules/unigbrk/u32-grapheme-prev b/modules/unigbrk/u32-grapheme-prev index 4997a508eb..6d3223a813 100644 --- a/modules/unigbrk/u32-grapheme-prev +++ b/modules/unigbrk/u32-grapheme-prev @@ -3,15 +3,20 @@ Find start of previous grapheme cluster in UTF-32 string. 
Files: lib/unigbrk/u32-grapheme-prev.c +lib/unigbrk/u-grapheme-prev.h tests/macros.h Depends-on: -unigbrk/uc-is-grapheme-break +unigbrk/base +unigbrk/uc-gbrk-prop +unictype/incb-of +unictype/property-extended-pictographic unistr/u32-prev +bool configure.ac: gl_MODULE_INDICATOR([unigbrk/u32-grapheme-prev]) -gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u32-grapheme-prev]) +gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u32-grapheme-prev]) Makefile.am: if LIBUNISTRING_COMPILE_UNIGBRK_U32_GRAPHEME_PREV diff --git a/modules/unigbrk/u8-grapheme-prev b/modules/unigbrk/u8-grapheme-prev index 29c9501ab9..1ed0335c0c 100644 --- a/modules/unigbrk/u8-grapheme-prev +++ b/modules/unigbrk/u8-grapheme-prev @@ -3,15 +3,20 @@ Find start of previous grapheme cluster in UTF-8 string. Files: lib/unigbrk/u8-grapheme-prev.c +lib/unigbrk/u-grapheme-prev.h tests/macros.h Depends-on: -unigbrk/uc-is-grapheme-break +unigbrk/base +unigbrk/uc-gbrk-prop +unictype/incb-of +unictype/property-extended-pictographic unistr/u8-prev +bool configure.ac: gl_MODULE_INDICATOR([unigbrk/u8-grapheme-prev]) -gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u8-grapheme-prev]) +gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u8-grapheme-prev]) Makefile.am: if LIBUNISTRING_COMPILE_UNIGBRK_U8_GRAPHEME_PREV diff --git a/tests/unigbrk/test-u16-grapheme-prev.c b/tests/unigbrk/test-u16-grapheme-prev.c index 60d1ec9e63..4baec3b0cc 100644 --- a/tests/unigbrk/test-u16-grapheme-prev.c +++ b/tests/unigbrk/test-u16-grapheme-prev.c @@ -97,6 +97,15 @@ main (void) test_u16_grapheme_prev (1, 'e', ACUTE, 'x', -1); test_u16_grapheme_prev (2, 'e', ACUTE, 'e', ACUTE, -1); + /* CR LF handling. */ + test_u16_grapheme_prev (2, 'c', '\r', '\n', -1); + + /* Emoji modifier / ZWJ sequence. */ + test_u16_grapheme_prev (5, 0x2605, 0x0305, 0x0347, 0x200D, 0x2600, -1); + + /* Regional indicators. */ + test_u16_grapheme_prev (4, 0xD83C, 0xDDE9, 0xD83C, 0xDDEA, 0xD83C, 0xDDEB, 0xD83C, 0xDDF7, -1); + /* Surrogate pairs. 
*/ test_u16_grapheme_prev (2, 0xd83d, 0xde10, -1); /* 😐: neutral face. */ test_u16_grapheme_prev (3, 0xd83d, 0xde10, GRAVE, -1); diff --git a/tests/unigbrk/test-u32-grapheme-prev.c b/tests/unigbrk/test-u32-grapheme-prev.c index 8420fa4968..ae855bf337 100644 --- a/tests/unigbrk/test-u32-grapheme-prev.c +++ b/tests/unigbrk/test-u32-grapheme-prev.c @@ -97,6 +97,15 @@ main (void) test_u32_grapheme_prev (1, 'e', ACUTE, 'x', -1); test_u32_grapheme_prev (2, 'e', ACUTE, 'e', ACUTE, -1); + /* CR LF handling. */ + test_u32_grapheme_prev (2, 'c', '\r', '\n', -1); + + /* Emoji modifier / ZWJ sequence. */ + test_u32_grapheme_prev (5, 0x2605, 0x0305, 0x0347, 0x200D, 0x2600, -1); + + /* Regional indicators. */ + test_u32_grapheme_prev (2, 0x1F1E9, 0x1F1EA, 0x1F1EB, 0x1F1F7, -1); + /* Outside BMP. */ #define NEUTRAL_FACE 0x1f610 /* 😐: neutral face. */ test_u32_grapheme_prev (1, NEUTRAL_FACE, -1); diff --git a/tests/unigbrk/test-u8-grapheme-prev.c b/tests/unigbrk/test-u8-grapheme-prev.c index 0a63d4dc3f..6d6ab46ac0 100644 --- a/tests/unigbrk/test-u8-grapheme-prev.c +++ b/tests/unigbrk/test-u8-grapheme-prev.c @@ -77,5 +77,16 @@ main (void) test_u8_grapheme_prev ("e"ACUTE"x", 4, 1); test_u8_grapheme_prev ("e"ACUTE "e"ACUTE, 6, 3); + /* CR LF handling. */ + test_u8_grapheme_prev ("c\r\n", 3, 2); + + /* Emoji modifier / ZWJ sequence. */ + test_u8_grapheme_prev ("\342\230\205\314\205\315\207\342\200\215\342\230\200", + 13, 13); + + /* Regional indicators. */ + test_u8_grapheme_prev ("\360\237\207\251\360\237\207\252\360\237\207\253\360\237\207\267", + 16, 8); + return test_exit_status; }