From: Bruno Haible Date: Sun, 18 May 2025 23:53:23 +0000 (+0200) Subject: unigbrk/u*-grapheme-next: Support Indic, Emojis, regional indicators. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f783bbc7678359628bdd36a3c53a5af79c1e75a4;p=thirdparty%2Fgnulib.git unigbrk/u*-grapheme-next: Support Indic, Emojis, regional indicators. Reported by Kang-Che Sung in and by Lich in . * lib/unigbrk/u-grapheme-next.h: New file, based on lib/unigbrk/u-grapheme-breaks.h. * lib/unigbrk/u8-grapheme-next.c: Include unictype.h and u-grapheme-next.h. (u8_grapheme_next): Remove function. * lib/unigbrk/u16-grapheme-next.c: Include unictype.h and u-grapheme-next.h. (u16_grapheme_next): Remove function. * lib/unigbrk/u32-grapheme-next.c: Include unictype.h and u-grapheme-next.h. (u32_grapheme_next): Remove function. * modules/unigbrk/u8-grapheme-next (Files): Add lib/unigbrk/u-grapheme-next.h. (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, unigbrk/uc-gbrk-prop, unictype/incb-of, unictype/property-extended-pictographic, bool. (configure.ac): Bump required libunistring version. * modules/unigbrk/u16-grapheme-next (Files): Add lib/unigbrk/u-grapheme-next.h. (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, unigbrk/uc-gbrk-prop, unictype/incb-of, unictype/property-extended-pictographic, bool. (configure.ac): Bump required libunistring version. * modules/unigbrk/u32-grapheme-next (Files): Add lib/unigbrk/u-grapheme-next.h. (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, unigbrk/uc-gbrk-prop, unictype/incb-of, unictype/property-extended-pictographic, bool. (configure.ac): Bump required libunistring version. * tests/unigbrk/test-u8-grapheme-next.c (main): Add more test cases, from tests/unigbrk/test-u8-grapheme-breaks.c. * tests/unigbrk/test-u16-grapheme-next.c (main): Add more test cases, from tests/unigbrk/test-u16-grapheme-breaks.c. 
* tests/unigbrk/test-u32-grapheme-next.c (main): Add more test cases, from tests/unigbrk/test-u32-grapheme-breaks.c. --- diff --git a/ChangeLog b/ChangeLog index f9eecbc588..b53bca0cb3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,46 @@ +2025-05-18 Bruno Haible + + unigbrk/u*-grapheme-next: Support Indic, Emojis, regional indicators. + Reported by Kang-Che Sung in + + and by Lich in + . + * lib/unigbrk/u-grapheme-next.h: New file, based on + lib/unigbrk/u-grapheme-breaks.h. + * lib/unigbrk/u8-grapheme-next.c: Include unictype.h and + u-grapheme-next.h. + (u8_grapheme_next): Remove function. + * lib/unigbrk/u16-grapheme-next.c: Include unictype.h and + u-grapheme-next.h. + (u16_grapheme_next): Remove function. + * lib/unigbrk/u32-grapheme-next.c: Include unictype.h and + u-grapheme-next.h. + (u32_grapheme_next): Remove function. + * modules/unigbrk/u8-grapheme-next (Files): Add + lib/unigbrk/u-grapheme-next.h. + (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, + unigbrk/uc-gbrk-prop, unictype/incb-of, + unictype/property-extended-pictographic, bool. + (configure.ac): Bump required libunistring version. + * modules/unigbrk/u16-grapheme-next (Files): Add + lib/unigbrk/u-grapheme-next.h. + (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, + unigbrk/uc-gbrk-prop, unictype/incb-of, + unictype/property-extended-pictographic, bool. + (configure.ac): Bump required libunistring version. + * modules/unigbrk/u32-grapheme-next (Files): Add + lib/unigbrk/u-grapheme-next.h. + (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base, + unigbrk/uc-gbrk-prop, unictype/incb-of, + unictype/property-extended-pictographic, bool. + (configure.ac): Bump required libunistring version. + * tests/unigbrk/test-u8-grapheme-next.c (main): Add more test cases, + from tests/unigbrk/test-u8-grapheme-breaks.c. + * tests/unigbrk/test-u16-grapheme-next.c (main): Add more test cases, + from tests/unigbrk/test-u16-grapheme-breaks.c. 
+ * tests/unigbrk/test-u32-grapheme-next.c (main): Add more test cases, + from tests/unigbrk/test-u32-grapheme-breaks.c. + 2025-05-18 Bruno Haible unigbrk/u*-grapheme-breaks: Tiny optimization. diff --git a/lib/unigbrk/u-grapheme-next.h b/lib/unigbrk/u-grapheme-next.h new file mode 100644 index 0000000000..9ca07436e9 --- /dev/null +++ b/lib/unigbrk/u-grapheme-next.h @@ -0,0 +1,159 @@ +/* Grapheme cluster break function. + Copyright (C) 2010-2025 Free Software Foundation, Inc. + + This file is free software. + It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". + You can redistribute it and/or modify it under either + - the terms of the GNU Lesser General Public License as published + by the Free Software Foundation, either version 3, or (at your + option) any later version, or + - the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) + any later version, or + - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+". + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License and the GNU General Public License + for more details. + + You should have received a copy of the GNU Lesser General Public + License and of the GNU General Public License along with this + program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Ben Pfaff, Daiki Ueno, Bruno Haible. */ + +/* This file implements section 3 "Grapheme Cluster Boundaries" + of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>. + + FUNC (S, S_END) returns NULL if S == S_END. Otherwise it consumes the + character at S and then every following character before which the + rules below allow no break, and returns the pointer reached when a + break is found or when S is no longer below S_END. + + FUNC, UNIT and U_MBTOUC are macros that the including .c file defines + before including this file. */ + +const UNIT * +FUNC (const UNIT *s, const UNIT *s_end) +{ + if (s == s_end) + return NULL; + + /* Grapheme Cluster break property of the last (most recently consumed) + character. + -1 at the very beginning of the string. 
*/ + int last_char_prop = -1; + + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* */ + bool incb_consonant_extended = false; + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* linker */ + bool incb_consonant_extended_linker = false; + /* True if the last character ends a sequence of Indic_Conjunct_Break + values: consonant {extend|linker}* linker {extend|linker}* */ + bool incb_consonant_extended_linker_extended = false; + + /* True if the last character ends an emoji modifier sequence + \p{Extended_Pictographic} Extend*. */ + bool emoji_modifier_sequence = false; + /* True if the last character was immediately preceded by an + emoji modifier sequence \p{Extended_Pictographic} Extend*. */ + bool emoji_modifier_sequence_before_last_char = false; + + /* Number of consecutive regional indicator (RI) characters seen + immediately before the current point. */ + size_t ri_count = 0; + + do + { + ucs4_t uc; + int count = U_MBTOUC (&uc, s, s_end - s); + int prop = uc_graphemeclusterbreak_property (uc); + int incb = uc_indic_conjunct_break (uc); + + /* Break at the start of the string (GB1). + That break is at S itself, so the first character is always + consumed and the loop goes on. + In the remarks below, "*p = 0" stands for "no break before UC" + and "*p = 1" for "break before UC". Apart from GB1, every + "break before UC" outcome leaves the loop without consuming UC. */ + if (last_char_prop < 0) + /* *p = 1 */; + else + { + /* No break between CR and LF (GB3). */ + if (last_char_prop == GBP_CR && prop == GBP_LF) + /* *p = 0 */; + /* Break before and after newlines (GB4, GB5). */ + else if ((last_char_prop == GBP_CR + || last_char_prop == GBP_LF + || last_char_prop == GBP_CONTROL) + || (prop == GBP_CR + || prop == GBP_LF + || prop == GBP_CONTROL)) + break /* *p = 1 */; + /* No break between Hangul syllable sequences (GB6, GB7, GB8): + L before L, V, LV or LVT; LV or V before V or T; + LVT or T before T. 
*/ + else if ((last_char_prop == GBP_L + && (prop == GBP_L + || prop == GBP_V + || prop == GBP_LV + || prop == GBP_LVT)) + || ((last_char_prop == GBP_LV + || last_char_prop == GBP_V) + && (prop == GBP_V + || prop == GBP_T)) + || ((last_char_prop == GBP_LVT + || last_char_prop == GBP_T) + && prop == GBP_T)) + /* *p = 0 */; + /* No break before extending characters or ZWJ (GB9). */ + else if (prop == GBP_EXTEND || prop == GBP_ZWJ) + /* *p = 0 */; + /* No break before SpacingMarks (GB9a). */ + else if (prop == GBP_SPACINGMARK) + /* *p = 0 */; + /* No break after Prepend characters (GB9b). */ + else if (last_char_prop == GBP_PREPEND) + /* *p = 0 */; + /* No break within certain combinations of Indic_Conjunct_Break + values: Between + consonant {extend|linker}* linker {extend|linker}* + and + consonant + (GB9c). */ + else if (incb_consonant_extended_linker_extended + && incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT) + /* *p = 0 */; + /* No break within emoji modifier sequences or emoji zwj sequences + (GB11). */ + else if (last_char_prop == GBP_ZWJ + && emoji_modifier_sequence_before_last_char + && uc_is_property_extended_pictographic (uc)) + /* *p = 0 */; + /* No break between RI if there is an odd number of RI + characters before (GB12, GB13). */ + else if (prop == GBP_RI && (ri_count % 2) != 0) + /* *p = 0 */; + /* Break everywhere (GB999). 
*/ + else + break /* *p = 1 */; + } + + /* Update the Indic_Conjunct_Break state for UC. The order matters: incb_consonant_extended is assigned last because incb_consonant_extended_linker needs its value from before UC. */ incb_consonant_extended_linker = + incb_consonant_extended && incb == UC_INDIC_CONJUNCT_BREAK_LINKER; + incb_consonant_extended_linker_extended = + (incb_consonant_extended_linker + || (incb_consonant_extended_linker_extended + && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)); + incb_consonant_extended = + (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT + || (incb_consonant_extended + && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER)); + + /* Save the emoji state from before UC, then recompute it for UC. */ emoji_modifier_sequence_before_last_char = emoji_modifier_sequence; + emoji_modifier_sequence = + (emoji_modifier_sequence && prop == GBP_EXTEND) + || uc_is_property_extended_pictographic (uc); + + last_char_prop = prop; + + if (prop == GBP_RI) + ri_count++; + else + ri_count = 0; + + /* UC now belongs to the cluster: step over it by the count that U_MBTOUC returned. */ s += count; + } + while (s < s_end); + + return s; +} diff --git a/lib/unigbrk/u16-grapheme-next.c b/lib/unigbrk/u16-grapheme-next.c index b0e47e17c8..5e7a783d8f 100644 --- a/lib/unigbrk/u16-grapheme-next.c +++ b/lib/unigbrk/u16-grapheme-next.c @@ -1,6 +1,5 @@ /* Next grapheme cluster function. Copyright (C) 2010-2025 Free Software Foundation, Inc. - Written by Ben Pfaff , 2010. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,6 +22,8 @@ License and of the GNU General Public License along with this program. If not, see . */ +/* Written by Bruno Haible , 2025. */ + /* Don't use the const-improved function macros in this compilation unit. */ #define _LIBUNISTRING_NO_CONST_GENERICS @@ -31,27 +32,10 @@ /* Specification. 
*/ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint16_t * -u16_grapheme_next (const uint16_t *s, const uint16_t *end) -{ - ucs4_t prev; - int mblen; - - if (s == end) - return NULL; - - for (s += u16_mbtouc (&prev, s, end - s); s != end; s += mblen) - { - ucs4_t next; - - mblen = u16_mbtouc (&next, s, end - s); - if (uc_is_grapheme_break (prev, next)) - break; - - prev = next; - } - - return s; -} +#define FUNC u16_grapheme_next +#define UNIT uint16_t +#define U_MBTOUC u16_mbtouc +#include "u-grapheme-next.h" diff --git a/lib/unigbrk/u32-grapheme-next.c b/lib/unigbrk/u32-grapheme-next.c index 28fc5052e5..1c9adfa6f3 100644 --- a/lib/unigbrk/u32-grapheme-next.c +++ b/lib/unigbrk/u32-grapheme-next.c @@ -1,6 +1,5 @@ /* Next grapheme cluster function. Copyright (C) 2010-2025 Free Software Foundation, Inc. - Written by Ben Pfaff , 2010. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,6 +22,8 @@ License and of the GNU General Public License along with this program. If not, see . */ +/* Written by Bruno Haible , 2025. */ + /* Don't use the const-improved function macros in this compilation unit. */ #define _LIBUNISTRING_NO_CONST_GENERICS @@ -31,27 +32,10 @@ /* Specification. 
*/ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint32_t * -u32_grapheme_next (const uint32_t *s, const uint32_t *end) -{ - ucs4_t prev; - - if (s == end) - return NULL; - - u32_mbtouc (&prev, s, end - s); - for (s++; s != end; s++) - { - ucs4_t next; - - u32_mbtouc (&next, s, end - s); - if (uc_is_grapheme_break (prev, next)) - break; - - prev = next; - } - - return s; -} +#define FUNC u32_grapheme_next +#define UNIT uint32_t +#define U_MBTOUC u32_mbtouc +#include "u-grapheme-next.h" diff --git a/lib/unigbrk/u8-grapheme-next.c b/lib/unigbrk/u8-grapheme-next.c index b1d2e3dd3e..2ec094da2a 100644 --- a/lib/unigbrk/u8-grapheme-next.c +++ b/lib/unigbrk/u8-grapheme-next.c @@ -1,6 +1,5 @@ /* Next grapheme cluster function. Copyright (C) 2010-2025 Free Software Foundation, Inc. - Written by Ben Pfaff , 2010. This file is free software. It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+". @@ -23,6 +22,8 @@ License and of the GNU General Public License along with this program. If not, see . */ +/* Written by Bruno Haible , 2025. */ + /* Don't use the const-improved function macros in this compilation unit. */ #define _LIBUNISTRING_NO_CONST_GENERICS @@ -31,27 +32,10 @@ /* Specification. 
*/ #include "unigbrk.h" +#include "unictype.h" #include "unistr.h" -const uint8_t * -u8_grapheme_next (const uint8_t *s, const uint8_t *end) -{ - ucs4_t prev; - int mblen; - - if (s == end) - return NULL; - - for (s += u8_mbtouc (&prev, s, end - s); s != end; s += mblen) - { - ucs4_t next; - - mblen = u8_mbtouc (&next, s, end - s); - if (uc_is_grapheme_break (prev, next)) - break; - - prev = next; - } - - return s; -} +#define FUNC u8_grapheme_next +#define UNIT uint8_t +#define U_MBTOUC u8_mbtouc +#include "u-grapheme-next.h" diff --git a/modules/unigbrk/u16-grapheme-next b/modules/unigbrk/u16-grapheme-next index 7443375331..8eb500a77d 100644 --- a/modules/unigbrk/u16-grapheme-next +++ b/modules/unigbrk/u16-grapheme-next @@ -3,15 +3,20 @@ Find start of next grapheme cluster in UTF-16 string. Files: lib/unigbrk/u16-grapheme-next.c +lib/unigbrk/u-grapheme-next.h tests/macros.h Depends-on: -unigbrk/uc-is-grapheme-break +unigbrk/base +unigbrk/uc-gbrk-prop +unictype/incb-of +unictype/property-extended-pictographic unistr/u16-mbtouc +bool configure.ac: gl_MODULE_INDICATOR([unigbrk/u16-grapheme-next]) -gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u16-grapheme-next]) +gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u16-grapheme-next]) Makefile.am: if LIBUNISTRING_COMPILE_UNIGBRK_U16_GRAPHEME_NEXT diff --git a/modules/unigbrk/u32-grapheme-next b/modules/unigbrk/u32-grapheme-next index 28daec526c..5045c390a9 100644 --- a/modules/unigbrk/u32-grapheme-next +++ b/modules/unigbrk/u32-grapheme-next @@ -3,15 +3,20 @@ Find start of next grapheme cluster in UTF-32 string. 
Files: lib/unigbrk/u32-grapheme-next.c +lib/unigbrk/u-grapheme-next.h tests/macros.h Depends-on: -unigbrk/uc-is-grapheme-break +unigbrk/base +unigbrk/uc-gbrk-prop +unictype/incb-of +unictype/property-extended-pictographic unistr/u32-mbtouc +bool configure.ac: gl_MODULE_INDICATOR([unigbrk/u32-grapheme-next]) -gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u32-grapheme-next]) +gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u32-grapheme-next]) Makefile.am: if LIBUNISTRING_COMPILE_UNIGBRK_U32_GRAPHEME_NEXT diff --git a/modules/unigbrk/u8-grapheme-next b/modules/unigbrk/u8-grapheme-next index 50fc06a5c0..c8197cd4e7 100644 --- a/modules/unigbrk/u8-grapheme-next +++ b/modules/unigbrk/u8-grapheme-next @@ -3,15 +3,20 @@ Find start of next grapheme cluster in UTF-8 string. Files: lib/unigbrk/u8-grapheme-next.c +lib/unigbrk/u-grapheme-next.h tests/macros.h Depends-on: -unigbrk/uc-is-grapheme-break +unigbrk/base +unigbrk/uc-gbrk-prop +unictype/incb-of +unictype/property-extended-pictographic unistr/u8-mbtouc +bool configure.ac: gl_MODULE_INDICATOR([unigbrk/u8-grapheme-next]) -gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u8-grapheme-next]) +gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u8-grapheme-next]) Makefile.am: if LIBUNISTRING_COMPILE_UNIGBRK_U8_GRAPHEME_NEXT diff --git a/tests/unigbrk/test-u16-grapheme-next.c b/tests/unigbrk/test-u16-grapheme-next.c index d2647a31a4..555770a96f 100644 --- a/tests/unigbrk/test-u16-grapheme-next.c +++ b/tests/unigbrk/test-u16-grapheme-next.c @@ -95,6 +95,15 @@ main (void) test_u16_grapheme_next (2, 'e', ACUTE, 'x', -1); test_u16_grapheme_next (2, 'e', ACUTE, 'e', ACUTE, -1); + /* CR LF handling. */ + test_u16_grapheme_next (2, '\r', '\n', 'd', -1); + + /* Emoji modifier / ZWJ sequence. */ + test_u16_grapheme_next (5, 0x2605, 0x0305, 0x0347, 0x200D, 0x2600, -1); + + /* Regional indicators. */ + test_u16_grapheme_next (4, 0xD83C, 0xDDE9, 0xD83C, 0xDDEA, 0xD83C, 0xDDEB, 0xD83C, 0xDDF7, -1); + /* Surrogate pairs. 
*/ test_u16_grapheme_next (2, 0xd83d, 0xde10, -1); /* 😐: neutral face. */ test_u16_grapheme_next (3, 0xd83d, 0xde10, GRAVE, -1); diff --git a/tests/unigbrk/test-u32-grapheme-next.c b/tests/unigbrk/test-u32-grapheme-next.c index 58fb1e2eb5..db3a1590f8 100644 --- a/tests/unigbrk/test-u32-grapheme-next.c +++ b/tests/unigbrk/test-u32-grapheme-next.c @@ -95,6 +95,15 @@ main (void) test_u32_grapheme_next (2, 'e', ACUTE, 'x', -1); test_u32_grapheme_next (2, 'e', ACUTE, 'e', ACUTE, -1); + /* CR LF handling. */ + test_u32_grapheme_next (2, '\r', '\n', 'd', -1); + + /* Emoji modifier / ZWJ sequence. */ + test_u32_grapheme_next (5, 0x2605, 0x0305, 0x0347, 0x200D, 0x2600, -1); + + /* Regional indicators. */ + test_u32_grapheme_next (2, 0x1F1E9, 0x1F1EA, 0x1F1EB, 0x1F1F7, -1); + /* Outside BMP. */ #define NEUTRAL_FACE 0x1f610 /* 😐: neutral face. */ test_u32_grapheme_next (1, NEUTRAL_FACE, -1); diff --git a/tests/unigbrk/test-u8-grapheme-next.c b/tests/unigbrk/test-u8-grapheme-next.c index a818504bf6..00521639a3 100644 --- a/tests/unigbrk/test-u8-grapheme-next.c +++ b/tests/unigbrk/test-u8-grapheme-next.c @@ -76,5 +76,16 @@ main (void) test_u8_grapheme_next ("e"ACUTE"x", 4, 3); test_u8_grapheme_next ("e"ACUTE "e"ACUTE, 6, 3); + /* CR LF handling. */ + test_u8_grapheme_next ("\r\nd", 3, 2); + + /* Emoji modifier / ZWJ sequence. */ + test_u8_grapheme_next ("\342\230\205\314\205\315\207\342\200\215\342\230\200", + 13, 13); + + /* Regional indicators. */ + test_u8_grapheme_next ("\360\237\207\251\360\237\207\252\360\237\207\253\360\237\207\267", + 16, 8); + return test_exit_status; }