git.ipfire.org Git - thirdparty/gnulib.git/commitdiff
unigbrk/u*-grapheme-next: Support Indic, Emojis, regional indicators.
author Bruno Haible <bruno@clisp.org>
Sun, 18 May 2025 23:53:23 +0000 (01:53 +0200)
committer Bruno Haible <bruno@clisp.org>
Sun, 18 May 2025 23:57:04 +0000 (01:57 +0200)
Reported by Kang-Che Sung <explorer09@gmail.com> in
<https://lists.gnu.org/archive/html/bug-libunistring/2025-03/msg00000.html>
and by Lich <author@lch361.net> in
<https://lists.gnu.org/archive/html/bug-libunistring/2025-05/msg00000.html>.

* lib/unigbrk/u-grapheme-next.h: New file, based on
lib/unigbrk/u-grapheme-breaks.h.
* lib/unigbrk/u8-grapheme-next.c: Include unictype.h and
u-grapheme-next.h.
(u8_grapheme_next): Remove function.
* lib/unigbrk/u16-grapheme-next.c: Include unictype.h and
u-grapheme-next.h.
(u16_grapheme_next): Remove function.
* lib/unigbrk/u32-grapheme-next.c: Include unictype.h and
u-grapheme-next.h.
(u32_grapheme_next): Remove function.
* modules/unigbrk/u8-grapheme-next (Files): Add
lib/unigbrk/u-grapheme-next.h.
(Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
unigbrk/uc-gbrk-prop, unictype/incb-of,
unictype/property-extended-pictographic, bool.
(configure.ac): Bump required libunistring version.
* modules/unigbrk/u16-grapheme-next (Files): Add
lib/unigbrk/u-grapheme-next.h.
(Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
unigbrk/uc-gbrk-prop, unictype/incb-of,
unictype/property-extended-pictographic, bool.
(configure.ac): Bump required libunistring version.
* modules/unigbrk/u32-grapheme-next (Files): Add
lib/unigbrk/u-grapheme-next.h.
(Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
unigbrk/uc-gbrk-prop, unictype/incb-of,
unictype/property-extended-pictographic, bool.
(configure.ac): Bump required libunistring version.
* tests/unigbrk/test-u8-grapheme-next.c (main): Add more test cases,
from tests/unigbrk/test-u8-grapheme-breaks.c.
* tests/unigbrk/test-u16-grapheme-next.c (main): Add more test cases,
from tests/unigbrk/test-u16-grapheme-breaks.c.
* tests/unigbrk/test-u32-grapheme-next.c (main): Add more test cases,
from tests/unigbrk/test-u32-grapheme-breaks.c.

ChangeLog
lib/unigbrk/u-grapheme-next.h [new file with mode: 0644]
lib/unigbrk/u16-grapheme-next.c
lib/unigbrk/u32-grapheme-next.c
lib/unigbrk/u8-grapheme-next.c
modules/unigbrk/u16-grapheme-next
modules/unigbrk/u32-grapheme-next
modules/unigbrk/u8-grapheme-next
tests/unigbrk/test-u16-grapheme-next.c
tests/unigbrk/test-u32-grapheme-next.c
tests/unigbrk/test-u8-grapheme-next.c

index f9eecbc588f54af3c8c82b55e1d10f1135f25862..b53bca0cb3a769c8b8f41e83fc1f301c78f1e412 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,46 @@
+2025-05-18  Bruno Haible  <bruno@clisp.org>
+
+       unigbrk/u*-grapheme-next: Support Indic, Emojis, regional indicators.
+       Reported by Kang-Che Sung <explorer09@gmail.com> in
+       <https://lists.gnu.org/archive/html/bug-libunistring/2025-03/msg00000.html>
+       and by Lich <author@lch361.net> in
+       <https://lists.gnu.org/archive/html/bug-libunistring/2025-05/msg00000.html>.
+       * lib/unigbrk/u-grapheme-next.h: New file, based on
+       lib/unigbrk/u-grapheme-breaks.h.
+       * lib/unigbrk/u8-grapheme-next.c: Include unictype.h and
+       u-grapheme-next.h.
+       (u8_grapheme_next): Remove function.
+       * lib/unigbrk/u16-grapheme-next.c: Include unictype.h and
+       u-grapheme-next.h.
+       (u16_grapheme_next): Remove function.
+       * lib/unigbrk/u32-grapheme-next.c: Include unictype.h and
+       u-grapheme-next.h.
+       (u32_grapheme_next): Remove function.
+       * modules/unigbrk/u8-grapheme-next (Files): Add
+       lib/unigbrk/u-grapheme-next.h.
+       (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
+       unigbrk/uc-gbrk-prop, unictype/incb-of,
+       unictype/property-extended-pictographic, bool.
+       (configure.ac): Bump required libunistring version.
+       * modules/unigbrk/u16-grapheme-next (Files): Add
+       lib/unigbrk/u-grapheme-next.h.
+       (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
+       unigbrk/uc-gbrk-prop, unictype/incb-of,
+       unictype/property-extended-pictographic, bool.
+       (configure.ac): Bump required libunistring version.
+       * modules/unigbrk/u32-grapheme-next (Files): Add
+       lib/unigbrk/u-grapheme-next.h.
+       (Depends-on): Remove unigbrk/uc-is-grapheme-break. Add unigbrk/base,
+       unigbrk/uc-gbrk-prop, unictype/incb-of,
+       unictype/property-extended-pictographic, bool.
+       (configure.ac): Bump required libunistring version.
+       * tests/unigbrk/test-u8-grapheme-next.c (main): Add more test cases,
+       from tests/unigbrk/test-u8-grapheme-breaks.c.
+       * tests/unigbrk/test-u16-grapheme-next.c (main): Add more test cases,
+       from tests/unigbrk/test-u16-grapheme-breaks.c.
+       * tests/unigbrk/test-u32-grapheme-next.c (main): Add more test cases,
+       from tests/unigbrk/test-u32-grapheme-breaks.c.
+
 2025-05-18  Bruno Haible  <bruno@clisp.org>
 
        unigbrk/u*-grapheme-breaks: Tiny optimization.
diff --git a/lib/unigbrk/u-grapheme-next.h b/lib/unigbrk/u-grapheme-next.h
new file mode 100644 (file)
index 0000000..9ca0743
--- /dev/null
+++ b/lib/unigbrk/u-grapheme-next.h
@@ -0,0 +1,159 @@
+/* Grapheme cluster break function.
+   Copyright (C) 2010-2025 Free Software Foundation, Inc.
+
+   This file is free software.
+   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
+   You can redistribute it and/or modify it under either
+     - the terms of the GNU Lesser General Public License as published
+       by the Free Software Foundation, either version 3, or (at your
+       option) any later version, or
+     - the terms of the GNU General Public License as published by the
+       Free Software Foundation; either version 2, or (at your option)
+       any later version, or
+     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License and the GNU General Public License
+   for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License and of the GNU General Public License along with this
+   program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Ben Pfaff, Daiki Ueno, Bruno Haible.  */
+
+/* This file implements section 3 "Grapheme Cluster Boundaries"
+   of Unicode Standard Annex #29 <https://www.unicode.org/reports/tr29/>.  */
+
+const UNIT *
+FUNC (const UNIT *s, const UNIT *s_end)
+{
+  if (s == s_end)
+    return NULL; /* Empty input: there is no grapheme cluster.  */
+
+  /* Grapheme Cluster break property of the previously consumed character.
+     -1 as long as no character has been consumed yet.  */
+  int last_char_prop = -1;
+
+  /* True if the last character ends a sequence of Indic_Conjunct_Break
+     values:  consonant {extend|linker}*  */
+  bool incb_consonant_extended = false;
+  /* True if the last character ends a sequence of Indic_Conjunct_Break
+     values:  consonant {extend|linker}* linker  */
+  bool incb_consonant_extended_linker = false;
+  /* True if the last character ends a sequence of Indic_Conjunct_Break
+     values:  consonant {extend|linker}* linker {extend|linker}*  */
+  bool incb_consonant_extended_linker_extended = false;
+
+  /* True if the last character ends an emoji modifier sequence
+     \p{Extended_Pictographic} Extend*.  */
+  bool emoji_modifier_sequence = false;
+  /* True if the last character was immediately preceded by an
+     emoji modifier sequence   \p{Extended_Pictographic} Extend*.  */
+  bool emoji_modifier_sequence_before_last_char = false;
+
+  /* Length of the run of consecutive regional indicator (RI) characters
+     that ends with the last character; 0 if the last one is not an RI.  */
+  size_t ri_count = 0;
+
+  do
+    {
+      ucs4_t uc;
+      int count = U_MBTOUC (&uc, s, s_end - s);
+      int prop = uc_graphemeclusterbreak_property (uc);
+      int incb = uc_indic_conjunct_break (uc);
+
+      /* The first character always belongs to the cluster (GB1).  */
+      if (last_char_prop < 0)
+        /* nothing to check */;
+      else
+        {
+          /* No break between CR and LF (GB3).  */
+          if (last_char_prop == GBP_CR && prop == GBP_LF)
+            /* no boundary */;
+          /* Break before and after newlines (GB4, GB5).  */
+          else if ((last_char_prop == GBP_CR
+                    || last_char_prop == GBP_LF
+                    || last_char_prop == GBP_CONTROL)
+                   || (prop == GBP_CR
+                       || prop == GBP_LF
+                       || prop == GBP_CONTROL))
+            break /* boundary before uc */;
+          /* No break between Hangul syllable sequences (GB6, GB7, GB8).  */
+          else if ((last_char_prop == GBP_L
+                    && (prop == GBP_L
+                        || prop == GBP_V
+                        || prop == GBP_LV
+                        || prop == GBP_LVT))
+                   || ((last_char_prop == GBP_LV
+                        || last_char_prop == GBP_V)
+                       && (prop == GBP_V
+                           || prop == GBP_T))
+                   || ((last_char_prop == GBP_LVT
+                        || last_char_prop == GBP_T)
+                       && prop == GBP_T))
+            /* no boundary */;
+          /* No break before extending characters or ZWJ (GB9).  */
+          else if (prop == GBP_EXTEND || prop == GBP_ZWJ)
+            /* no boundary */;
+          /* No break before SpacingMarks (GB9a).  */
+          else if (prop == GBP_SPACINGMARK)
+            /* no boundary */;
+          /* No break after Prepend characters (GB9b).  */
+          else if (last_char_prop == GBP_PREPEND)
+            /* no boundary */;
+          /* No break within certain combinations of Indic_Conjunct_Break
+             values: Between
+               consonant {extend|linker}* linker {extend|linker}*
+             and
+               consonant
+             (GB9c).  */
+          else if (incb_consonant_extended_linker_extended
+                   && incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT)
+            /* no boundary */;
+          /* No break within emoji modifier sequences or emoji zwj sequences
+             (GB11).  */
+          else if (last_char_prop == GBP_ZWJ
+                   && emoji_modifier_sequence_before_last_char
+                   && uc_is_property_extended_pictographic (uc))
+            /* no boundary */;
+          /* No break between RI if there is an odd number of RI
+             characters before (GB12, GB13).  */
+          else if (prop == GBP_RI && (ri_count % 2) != 0)
+            /* no boundary */;
+          /* No rule joins the two characters: break (GB999).  */
+          else
+            break /* boundary before uc */;
+        }
+
+      incb_consonant_extended_linker =
+        incb_consonant_extended && incb == UC_INDIC_CONJUNCT_BREAK_LINKER;
+      incb_consonant_extended_linker_extended =
+        (incb_consonant_extended_linker
+         || (incb_consonant_extended_linker_extended
+             && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER));
+      incb_consonant_extended =
+        (incb == UC_INDIC_CONJUNCT_BREAK_CONSONANT
+         || (incb_consonant_extended
+             && incb >= UC_INDIC_CONJUNCT_BREAK_LINKER));
+
+      emoji_modifier_sequence_before_last_char = emoji_modifier_sequence;
+      emoji_modifier_sequence =
+        (emoji_modifier_sequence && prop == GBP_EXTEND)
+        || uc_is_property_extended_pictographic (uc);
+
+      last_char_prop = prop;
+
+      if (prop == GBP_RI)
+        ri_count++;
+      else
+        ri_count = 0;
+
+      s += count;
+    }
+  while (s < s_end);
+
+  return s; /* End of the first grapheme cluster.  */
+}
index b0e47e17c884a0095298fdf13778e2e89940d6fc..5e7a783d8f35a4a3efe1a1c3e3f4cdd2c865acea 100644 (file)
--- a/lib/unigbrk/u16-grapheme-next.c
+++ b/lib/unigbrk/u16-grapheme-next.c
@@ -1,6 +1,5 @@
 /* Next grapheme cluster function.
    Copyright (C) 2010-2025 Free Software Foundation, Inc.
-   Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
 
    This file is free software.
    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
@@ -23,6 +22,8 @@
    License and of the GNU General Public License along with this
    program.  If not, see <https://www.gnu.org/licenses/>.  */
 
+/* Written by Bruno Haible <bruno@clisp.org>, 2025.  */
+
 /* Don't use the const-improved function macros in this compilation unit.  */
 #define _LIBUNISTRING_NO_CONST_GENERICS
 
 /* Specification.  */
 #include "unigbrk.h"
 
+#include "unictype.h"
 #include "unistr.h"
 
-const uint16_t *
-u16_grapheme_next (const uint16_t *s, const uint16_t *end)
-{
-  ucs4_t prev;
-  int mblen;
-
-  if (s == end)
-    return NULL;
-
-  for (s += u16_mbtouc (&prev, s, end - s); s != end; s += mblen)
-    {
-      ucs4_t next;
-
-      mblen = u16_mbtouc (&next, s, end - s);
-      if (uc_is_grapheme_break (prev, next))
-        break;
-
-      prev = next;
-    }
-
-  return s;
-}
+#define FUNC u16_grapheme_next
+#define UNIT uint16_t
+#define U_MBTOUC u16_mbtouc
+#include "u-grapheme-next.h"
index 28fc5052e52e3899377465a73c7f7bc21413754a..1c9adfa6f3952facf6789d275b9ed3b5a251913a 100644 (file)
--- a/lib/unigbrk/u32-grapheme-next.c
+++ b/lib/unigbrk/u32-grapheme-next.c
@@ -1,6 +1,5 @@
 /* Next grapheme cluster function.
    Copyright (C) 2010-2025 Free Software Foundation, Inc.
-   Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
 
    This file is free software.
    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
@@ -23,6 +22,8 @@
    License and of the GNU General Public License along with this
    program.  If not, see <https://www.gnu.org/licenses/>.  */
 
+/* Written by Bruno Haible <bruno@clisp.org>, 2025.  */
+
 /* Don't use the const-improved function macros in this compilation unit.  */
 #define _LIBUNISTRING_NO_CONST_GENERICS
 
 /* Specification.  */
 #include "unigbrk.h"
 
+#include "unictype.h"
 #include "unistr.h"
 
-const uint32_t *
-u32_grapheme_next (const uint32_t *s, const uint32_t *end)
-{
-  ucs4_t prev;
-
-  if (s == end)
-    return NULL;
-
-  u32_mbtouc (&prev, s, end - s);
-  for (s++; s != end; s++)
-    {
-      ucs4_t next;
-
-      u32_mbtouc (&next, s, end - s);
-      if (uc_is_grapheme_break (prev, next))
-        break;
-
-      prev = next;
-    }
-
-  return s;
-}
+#define FUNC u32_grapheme_next
+#define UNIT uint32_t
+#define U_MBTOUC u32_mbtouc
+#include "u-grapheme-next.h"
index b1d2e3dd3ec984ad7caff732f73b5e31ba0878e7..2ec094da2a3428c0e677eaa9f639aeb746333aae 100644 (file)
--- a/lib/unigbrk/u8-grapheme-next.c
+++ b/lib/unigbrk/u8-grapheme-next.c
@@ -1,6 +1,5 @@
 /* Next grapheme cluster function.
    Copyright (C) 2010-2025 Free Software Foundation, Inc.
-   Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
 
    This file is free software.
    It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
@@ -23,6 +22,8 @@
    License and of the GNU General Public License along with this
    program.  If not, see <https://www.gnu.org/licenses/>.  */
 
+/* Written by Bruno Haible <bruno@clisp.org>, 2025.  */
+
 /* Don't use the const-improved function macros in this compilation unit.  */
 #define _LIBUNISTRING_NO_CONST_GENERICS
 
 /* Specification.  */
 #include "unigbrk.h"
 
+#include "unictype.h"
 #include "unistr.h"
 
-const uint8_t *
-u8_grapheme_next (const uint8_t *s, const uint8_t *end)
-{
-  ucs4_t prev;
-  int mblen;
-
-  if (s == end)
-    return NULL;
-
-  for (s += u8_mbtouc (&prev, s, end - s); s != end; s += mblen)
-    {
-      ucs4_t next;
-
-      mblen = u8_mbtouc (&next, s, end - s);
-      if (uc_is_grapheme_break (prev, next))
-        break;
-
-      prev = next;
-    }
-
-  return s;
-}
+#define FUNC u8_grapheme_next
+#define UNIT uint8_t
+#define U_MBTOUC u8_mbtouc
+#include "u-grapheme-next.h"
index 7443375331e11183923d5c36491b02ea48d1d0bb..8eb500a77df3776bf1b6103e3cdbd85ff76b28e0 100644 (file)
--- a/modules/unigbrk/u16-grapheme-next
+++ b/modules/unigbrk/u16-grapheme-next
@@ -3,15 +3,20 @@ Find start of next grapheme cluster in UTF-16 string.
 
 Files:
 lib/unigbrk/u16-grapheme-next.c
+lib/unigbrk/u-grapheme-next.h
 tests/macros.h
 
 Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
+unictype/incb-of
+unictype/property-extended-pictographic
 unistr/u16-mbtouc
+bool
 
 configure.ac:
 gl_MODULE_INDICATOR([unigbrk/u16-grapheme-next])
-gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u16-grapheme-next])
+gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u16-grapheme-next])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIGBRK_U16_GRAPHEME_NEXT
index 28daec526c0c5aae6d7a08505e544d18ec49a09d..5045c390a975825e28c8ad94a0e60e5669a84154 100644 (file)
--- a/modules/unigbrk/u32-grapheme-next
+++ b/modules/unigbrk/u32-grapheme-next
@@ -3,15 +3,20 @@ Find start of next grapheme cluster in UTF-32 string.
 
 Files:
 lib/unigbrk/u32-grapheme-next.c
+lib/unigbrk/u-grapheme-next.h
 tests/macros.h
 
 Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
+unictype/incb-of
+unictype/property-extended-pictographic
 unistr/u32-mbtouc
+bool
 
 configure.ac:
 gl_MODULE_INDICATOR([unigbrk/u32-grapheme-next])
-gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u32-grapheme-next])
+gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u32-grapheme-next])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIGBRK_U32_GRAPHEME_NEXT
index 50fc06a5c054e8a5e6f2bd8dd6b78c6678db9e8e..c8197cd4e742c2182741b2b73eb57ba1130ed8c6 100644 (file)
--- a/modules/unigbrk/u8-grapheme-next
+++ b/modules/unigbrk/u8-grapheme-next
@@ -3,15 +3,20 @@ Find start of next grapheme cluster in UTF-8 string.
 
 Files:
 lib/unigbrk/u8-grapheme-next.c
+lib/unigbrk/u-grapheme-next.h
 tests/macros.h
 
 Depends-on:
-unigbrk/uc-is-grapheme-break
+unigbrk/base
+unigbrk/uc-gbrk-prop
+unictype/incb-of
+unictype/property-extended-pictographic
 unistr/u8-mbtouc
+bool
 
 configure.ac:
 gl_MODULE_INDICATOR([unigbrk/u8-grapheme-next])
-gl_LIBUNISTRING_MODULE([1.3], [unigbrk/u8-grapheme-next])
+gl_LIBUNISTRING_MODULE([1.4], [unigbrk/u8-grapheme-next])
 
 Makefile.am:
 if LIBUNISTRING_COMPILE_UNIGBRK_U8_GRAPHEME_NEXT
index d2647a31a4750362c5ed938efb4c8c4645a62559..555770a96f906d3bce4441dab738b4f4903cbf5a 100644 (file)
--- a/tests/unigbrk/test-u16-grapheme-next.c
+++ b/tests/unigbrk/test-u16-grapheme-next.c
@@ -95,6 +95,15 @@ main (void)
   test_u16_grapheme_next (2, 'e', ACUTE, 'x', -1);
   test_u16_grapheme_next (2, 'e', ACUTE, 'e', ACUTE, -1);
 
+  /* CR LF handling.  */
+  test_u16_grapheme_next (2, '\r', '\n', 'd', -1);
+
+  /* Emoji modifier / ZWJ sequence. */
+  test_u16_grapheme_next (5, 0x2605, 0x0305, 0x0347, 0x200D, 0x2600, -1);
+
+  /* Regional indicators. */
+  test_u16_grapheme_next (4, 0xD83C, 0xDDE9, 0xD83C, 0xDDEA, 0xD83C, 0xDDEB, 0xD83C, 0xDDF7, -1);
+
   /* Surrogate pairs. */
   test_u16_grapheme_next (2, 0xd83d, 0xde10, -1); /* 😐: neutral face. */
   test_u16_grapheme_next (3, 0xd83d, 0xde10, GRAVE, -1);
index 58fb1e2eb558210fae7061903a0248905194a139..db3a1590f8f1c6eceda5699267463fe8724f0c50 100644 (file)
--- a/tests/unigbrk/test-u32-grapheme-next.c
+++ b/tests/unigbrk/test-u32-grapheme-next.c
@@ -95,6 +95,15 @@ main (void)
   test_u32_grapheme_next (2, 'e', ACUTE, 'x', -1);
   test_u32_grapheme_next (2, 'e', ACUTE, 'e', ACUTE, -1);
 
+  /* CR LF handling.  */
+  test_u32_grapheme_next (2, '\r', '\n', 'd', -1);
+
+  /* Emoji modifier / ZWJ sequence. */
+  test_u32_grapheme_next (5, 0x2605, 0x0305, 0x0347, 0x200D, 0x2600, -1);
+
+  /* Regional indicators. */
+  test_u32_grapheme_next (2, 0x1F1E9, 0x1F1EA, 0x1F1EB, 0x1F1F7, -1);
+
   /* Outside BMP. */
 #define NEUTRAL_FACE 0x1f610    /* 😐: neutral face. */
   test_u32_grapheme_next (1, NEUTRAL_FACE, -1);
index a818504bf6bb5aaccc904beb788895622d25ef08..00521639a314a44e38b0d862e0ffdf0b8a0f44aa 100644 (file)
--- a/tests/unigbrk/test-u8-grapheme-next.c
+++ b/tests/unigbrk/test-u8-grapheme-next.c
@@ -76,5 +76,16 @@ main (void)
   test_u8_grapheme_next ("e"ACUTE"x", 4, 3);
   test_u8_grapheme_next ("e"ACUTE "e"ACUTE, 6, 3);
 
+  /* CR LF handling.  */
+  test_u8_grapheme_next ("\r\nd", 3, 2);
+
+  /* Emoji modifier / ZWJ sequence. */
+  test_u8_grapheme_next ("\342\230\205\314\205\315\207\342\200\215\342\230\200",
+                         13, 13);
+
+  /* Regional indicators. */
+  test_u8_grapheme_next ("\360\237\207\251\360\237\207\252\360\237\207\253\360\237\207\267",
+                         16, 8);
+
   return test_exit_status;
 }