mbuiterf: Implement multi-byte per encoding error (MEE) consistently.

author Bruno Haible <bruno@clisp.org>

Mon, 25 May 2026 23:29:29 +0000 (01:29 +0200)

committer Bruno Haible <bruno@clisp.org>

Mon, 25 May 2026 23:31:23 +0000 (01:31 +0200)
author Bruno Haible <bruno@clisp.org>
Mon, 25 May 2026 23:29:29 +0000 (01:29 +0200)
committer Bruno Haible <bruno@clisp.org>
Mon, 25 May 2026 23:31:23 +0000 (01:31 +0200)
diff --git a/ChangeLog b/ChangeLog

index 7c6b25c4714562284aabe62881257d2b7a913f08..3b9f26165d119e712db523639f24828d0ba73443 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,44 @@
+2026-05-25  Bruno Haible  <bruno@clisp.org>
+
+       mbuiterf: Implement multi-byte per encoding error (MEE) consistently.
+       * lib/mbuiterf.h: Include mbiter-aux.h.
+       (struct mbuif_state): Add field is_utf8.
+       (mbuiterf_next): Invoke mbiter_is_utf8, mbiter_utf8_maximal_subpart.
+       (mbuif_init): Initialize the field is_utf8.
+       * modules/mbuiterf (Depends-on): Add mbiter-aux.
+       * tests/test-mbslen.c (OR): New macro, copied from
+       tests/test-mbsnlen.c.
+       (main): Add more test cases with incomplete characters.
+       * tests/test-mbschr2.sh: Renamed from tests/test-mbschr.sh.
+       * tests/test-mbschr2.c: Renamed from tests/test-mbschr.c.
+       * tests/test-mbschr1.sh: New file, based on
+       tests/test-mbmemcasecmp-3.sh.
+       * tests/test-mbschr1.c: New file.
+       * modules/mbschr-tests (Files): Update accordingly. Add locale-en.m4,
+       locale-fr.m4.
+       (configure.ac): Invoke gt_LOCALE_EN_UTF8, gt_LOCALE_FR_UTF8.
+       (Makefile.am): Arrange to compile test-mbschr1 and test-mbschr2 and to
+       run test-mbschr1.sh, test-mbschr2.sh.
+       * tests/test-mbsrchr2.sh: Renamed from tests/test-mbsrchr.sh.
+       * tests/test-mbsrchr2.c: Renamed from tests/test-mbsrchr.c.
+       * tests/test-mbsrchr1.sh: New file, based on
+       tests/test-mbmemcasecmp-3.sh.
+       * tests/test-mbsrchr1.c: New file.
+       * modules/mbsrchr-tests (Files): Update accordingly. Add locale-en.m4,
+       locale-fr.m4.
+       (configure.ac): Invoke gt_LOCALE_EN_UTF8, gt_LOCALE_FR_UTF8.
+       (Makefile.am): Arrange to compile test-mbsrchr1 and test-mbsrchr2 and to
+       run test-mbsrchr1.sh, test-mbsrchr2.sh.
+       * tests/test-mbscspn.c (OR): New macro, copied from
+       tests/test-mbsnlen.c.
+       (main): Add test cases with incomplete characters.
+       * tests/test-mbspbrk.c (OR): New macro, copied from
+       tests/test-mbsnlen.c.
+       (main): Add test cases with incomplete characters.
+       * tests/test-mbsspn.c (OR): New macro, copied from
+       tests/test-mbsnlen.c.
+       (main): Add test cases with incomplete characters.
+
  2026-05-25  Bruno Haible  <bruno@clisp.org>
  
         mbuiter: Implement multi-byte per encoding error (MEE) consistently.
diff --git a/lib/mbuiterf.h b/lib/mbuiterf.h

index f8cb0f9595426cf10ce01836c51cd4309d6fd4a0..19761a88a496390b33ad88ff7423185d98c507d0 100644 (file)
--- a/lib/mbuiterf.h
+++ b/lib/mbuiterf.h
@@ -94,6 +94,7 @@
  #include <wchar.h>
  
  #include "mbchar.h"
+#include "mbiter-aux.h"
  #include "strnlen1.h"
  
  _GL_INLINE_HEADER_BEGIN
@@ -118,6 +119,7 @@ struct mbuif_state
                             before and after every mbuiterf_next invocation.
                           */
    unsigned int cur_max; /* A cache of MB_CUR_MAX.  */
+  int is_utf8;          /* A cache of mbiter_is_utf8.  */
  };
  
  MBUITERF_INLINE mbchar_t
@@ -145,18 +147,23 @@ mbuiterf_next (struct mbuif_state *ps, const char *iter)
        ps->in_shift = true;
      with_shift:;
        #endif
+      size_t avail_bytes = strnlen1 (iter, ps->cur_max);
        size_t bytes;
        char32_t wc;
-      bytes = mbrtoc32 (&wc, iter, strnlen1 (iter, ps->cur_max), &ps->state);
+      bytes = mbrtoc32 (&wc, iter, avail_bytes, &ps->state);
        if (bytes == (size_t) -1)
          {
            /* An invalid multibyte sequence was encountered.  */
+          size_t ebytes =
+            (mbiter_is_utf8 (&ps->is_utf8)
+             ? mbiter_utf8_maximal_subpart (iter, avail_bytes)
+             : 1);
            /* Allow the next invocation to continue from a sane state.  */
            #if !GNULIB_MBRTOC32_REGULAR
            ps->in_shift = false;
            #endif
            mbszero (&ps->state);
-          return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false };
+          return (mbchar_t) { .ptr = iter, .bytes = ebytes, .wc_valid = false };
          }
        else if (bytes == (size_t) -2)
          {
@@ -197,12 +204,12 @@ typedef struct mbuif_state mbuif_state_t;
  #if !GNULIB_MBRTOC32_REGULAR
  #define mbuif_init(st) \
    ((st).in_shift = false, mbszero (&(st).state), \
-   (st).cur_max = MB_CUR_MAX)
+   (st).cur_max = MB_CUR_MAX, (st).is_utf8 = -1)
  #else
  /* Optimized: no in_shift.  */
  #define mbuif_init(st) \
    (mbszero (&(st).state), \
-   (st).cur_max = MB_CUR_MAX)
+   (st).cur_max = MB_CUR_MAX, (st).is_utf8 = -1)
  #endif
  #if !GNULIB_MBRTOC32_REGULAR
  #define mbuif_avail(st, iter) ((st).in_shift || (*(iter) != '\0'))
diff --git a/modules/mbschr-tests b/modules/mbschr-tests

index ef26e7336332c56e1b0c81347b622e2efe159ffa..fb879f2baa82e9451cf10d55e496322b288a6b52 100644 (file)
--- a/modules/mbschr-tests
+++ b/modules/mbschr-tests
@@ -1,7 +1,11 @@
  Files:
-tests/test-mbschr.sh
-tests/test-mbschr.c
+tests/test-mbschr1.sh
+tests/test-mbschr1.c
+tests/test-mbschr2.sh
+tests/test-mbschr2.c
  tests/macros.h
+m4/locale-en.m4
+m4/locale-fr.m4
  m4/locale-zh.m4
  m4/codeset.m4
  
@@ -9,10 +13,16 @@ Depends-on:
  setlocale
  
  configure.ac:
+gt_LOCALE_EN_UTF8
+gt_LOCALE_FR_UTF8
  gt_LOCALE_ZH_CN
  
  Makefile.am:
-TESTS += test-mbschr.sh
-TESTS_ENVIRONMENT += LOCALE_ZH_CN='@LOCALE_ZH_CN@'
-check_PROGRAMS += test-mbschr
-test_mbschr_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
+TESTS += test-mbschr1.sh test-mbschr2.sh
+TESTS_ENVIRONMENT += \
+  LOCALE_EN_UTF8='@LOCALE_EN_UTF8@' \
+  LOCALE_FR_UTF8='@LOCALE_FR_UTF8@' \
+  LOCALE_ZH_CN='@LOCALE_ZH_CN@'
+check_PROGRAMS += test-mbschr1 test-mbschr2
+test_mbschr1_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
+test_mbschr2_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
diff --git a/modules/mbsrchr-tests b/modules/mbsrchr-tests

index dba14707894b0b6f8c43f73dc032dbe9b9b1fb1a..07243ca86ff9c3992acc8fa7d9cc4a88c1016fc7 100644 (file)
--- a/modules/mbsrchr-tests
+++ b/modules/mbsrchr-tests
@@ -1,7 +1,11 @@
  Files:
-tests/test-mbsrchr.sh
-tests/test-mbsrchr.c
+tests/test-mbsrchr1.sh
+tests/test-mbsrchr1.c
+tests/test-mbsrchr2.sh
+tests/test-mbsrchr2.c
  tests/macros.h
+m4/locale-en.m4
+m4/locale-fr.m4
  m4/locale-zh.m4
  m4/codeset.m4
  
@@ -9,10 +13,16 @@ Depends-on:
  setlocale
  
  configure.ac:
+gt_LOCALE_EN_UTF8
+gt_LOCALE_FR_UTF8
  gt_LOCALE_ZH_CN
  
  Makefile.am:
-TESTS += test-mbsrchr.sh
-TESTS_ENVIRONMENT += LOCALE_ZH_CN='@LOCALE_ZH_CN@'
-check_PROGRAMS += test-mbsrchr
-test_mbsrchr_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
+TESTS += test-mbsrchr1.sh test-mbsrchr2.sh
+TESTS_ENVIRONMENT += \
+  LOCALE_EN_UTF8='@LOCALE_EN_UTF8@' \
+  LOCALE_FR_UTF8='@LOCALE_FR_UTF8@' \
+  LOCALE_ZH_CN='@LOCALE_ZH_CN@'
+check_PROGRAMS += test-mbsrchr1 test-mbsrchr2
+test_mbsrchr1_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
+test_mbsrchr2_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
diff --git a/modules/mbuiterf b/modules/mbuiterf

index e5e22f9d09daad3edc23cd41af969c359238951a..d93cc8fa732e341d290308547bdb06247383dcd1 100644 (file)
--- a/modules/mbuiterf
+++ b/modules/mbuiterf
@@ -13,6 +13,7 @@ mbchar
  mbrtoc32
  mbsinit
  mbszero
+mbiter-aux
  uchar-h
  bool
  strnlen1
diff --git a/tests/test-mbschr1.c b/tests/test-mbschr1.c

new file mode 100644 (file)

index 0000000..1000491
--- /dev/null
+++ b/tests/test-mbschr1.c
@@ -0,0 +1,107 @@
+/* Test of searching a string for a character in a UTF-8 locale.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Bruno Haible <bruno@clisp.org>, 2026.  */
+
+#include <config.h>
+
+#include <string.h>
+
+#include <locale.h>
+
+#include "macros.h"
+
+/* The mcel-based implementation of mbschr behaves differently than the
+   original one.  Namely, for invalid/incomplete byte sequences:
+   Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+   everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+   See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+       <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+   Therefore, here we have different expected results, depending on the
+   implementation.  */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
+int
+main ()
+{
+  /* configure should already have checked that the locale is supported.  */
+  if (setlocale (LC_ALL, "") == NULL)
+    return 1;
+
+  /* Incomplete characters.  See
+     https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+     page 128 table 3-11.  */
+
+  /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020.  */
+  {
+    const char input[] = "\341\200";
+    const char *result = mbschr (input, '\341');
+    ASSERT (result == OR (NULL, input + 0));
+  }
+  {
+    const char input[] = "\341\200";
+    const char *result = mbschr (input, '\200');
+    ASSERT (result == OR (NULL, input + 1));
+  }
+  {
+    const char input[] = "\341\200\341";
+    const char *result = mbschr (input, '\341');
+    ASSERT (result == input + OR(2,0));
+  }
+
+  /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0.  */
+  {
+    const char input[] = "\360\221\222";
+    const char *result = mbschr (input, '\360');
+    ASSERT (result == OR (NULL, input + 0));
+  }
+  {
+    const char input[] = "\360\221\222";
+    const char *result = mbschr (input, '\221');
+    ASSERT (result == OR (NULL, input + 1));
+  }
+  {
+    const char input[] = "\360\221\222";
+    const char *result = mbschr (input, '\222');
+    ASSERT (result == OR (NULL, input + 2));
+  }
+  {
+    const char input[] = "\360\221\222\360";
+    const char *result = mbschr (input, '\360');
+    ASSERT (result == input + OR(3,0));
+  }
+  {
+    const char input[] = "\360\221";
+    const char *result = mbschr (input, '\360');
+    ASSERT (result == OR (NULL, input + 0));
+  }
+  {
+    const char input[] = "\360\221";
+    const char *result = mbschr (input, '\221');
+    ASSERT (result == OR (NULL, input + 1));
+  }
+  {
+    const char input[] = "\360\221\360";
+    const char *result = mbschr (input, '\360');
+    ASSERT (result == input + OR(2,0));
+  }
+
+  return test_exit_status;
+}
diff --git a/tests/test-mbschr1.sh b/tests/test-mbschr1.sh

new file mode 100755 (executable)

index 0000000..48e258d
--- /dev/null
+++ b/tests/test-mbschr1.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+# Test whether a specific UTF-8 locale is installed.
+: "${LOCALE_EN_UTF8=en_US.UTF-8}"
+: "${LOCALE_FR_UTF8=fr_FR.UTF-8}"
+if test "$LOCALE_EN_UTF8" = none && test $LOCALE_FR_UTF8 = none; then
+  if test -f /usr/bin/localedef; then
+    echo "Skipping test: no english or french Unicode locale is installed"
+  else
+    echo "Skipping test: no english or french Unicode locale is supported"
+  fi
+  exit 77
+fi
+
+# It's sufficient to test in one of the two locales.
+if test $LOCALE_FR_UTF8 != none; then
+  testlocale=$LOCALE_FR_UTF8
+else
+  testlocale="$LOCALE_EN_UTF8"
+fi
+
+LC_ALL="$testlocale" \
+${CHECKER} ./test-mbschr1${EXEEXT}
diff --git a/tests/test-mbschr.c b/tests/test-mbschr2.c

similarity index 96%

rename from tests/test-mbschr.c

rename to tests/test-mbschr2.c

index f7678eb41bab61b27c7bbcdd5499ab4c8bddb1ff..5eae208c975b02d92392dcce924b3a9eeb96b2e3 100644 (file)
--- a/tests/test-mbschr.c
+++ b/tests/test-mbschr2.c
@@ -1,4 +1,4 @@
-/* Test of searching a string for a character.
+/* Test of searching a string for a character in a GB18030 locale.
     Copyright (C) 2007-2026 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
diff --git a/tests/test-mbschr.sh b/tests/test-mbschr2.sh

similarity index 90%

rename from tests/test-mbschr.sh

rename to tests/test-mbschr2.sh

index 7e62b3f08a71e1d0895c24e1c5b8567700d5af52..c75973c362f6341ad25afe09aded8043ed30ffac 100755 (executable)
--- a/tests/test-mbschr.sh
+++ b/tests/test-mbschr2.sh
@@ -12,4 +12,4 @@ if test $LOCALE_ZH_CN = none; then
  fi
  
  LC_ALL=$LOCALE_ZH_CN \
-${CHECKER} ./test-mbschr${EXEEXT}
+${CHECKER} ./test-mbschr2${EXEEXT}
diff --git a/tests/test-mbscspn.c b/tests/test-mbscspn.c

index 0fa513748f53e5e7b5d1b9f6b047524d11fb4260..5cc248711d4eee9384443606bd391fd92d3d2099 100644 (file)
--- a/tests/test-mbscspn.c
+++ b/tests/test-mbscspn.c
@@ -24,6 +24,20 @@
  
  #include "macros.h"
  
+/* The mcel-based implementation of mbscspn behaves differently than the
+   original one.  Namely, for invalid/incomplete byte sequences:
+   Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+   everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+   See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+       <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+   Therefore, here we have different expected results, depending on the
+   implementation.  */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
  int
  main ()
  {
@@ -57,5 +71,69 @@ main ()
      ASSERT (mbscspn (input, "\303") == 14); /* invalid multibyte sequence */
    }
  
+  /* Incomplete characters.  See
+     https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+     page 128 table 3-11.  */
+
+  /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020.  */
+  {
+    const char input[] = "\341\200\240x\341\200y";
+    ASSERT (mbscspn (input, "\341\200") == 4);
+  }
+  {
+    const char input[] = "\341\200\240x\341\200";
+    ASSERT (mbscspn (input, "\341\200") == 4);
+  }
+  {
+    const char input[] = "\341\200\240x\341\200";
+    ASSERT (mbscspn (input, "\341") == OR(6,4));
+  }
+  {
+    const char input[] = "\341\200\240x\341y";
+    ASSERT (mbscspn (input, "\341") == 4);
+  }
+  {
+    const char input[] = "\341\200\240x\341";
+    ASSERT (mbscspn (input, "\341") == 4);
+  }
+
+  /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0.  */
+  {
+    const char input[] = "\360\221\222\240x\360\221\222y";
+    ASSERT (mbscspn (input, "\360\221\222") == 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221\222";
+    ASSERT (mbscspn (input, "\360\221\222") == 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221\222";
+    ASSERT (mbscspn (input, "\360\221") == OR(8,5));
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221y";
+    ASSERT (mbscspn (input, "\360\221") == 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221";
+    ASSERT (mbscspn (input, "\360\221") == 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221\222";
+    ASSERT (mbscspn (input, "\360") == OR(8,5));
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221";
+    ASSERT (mbscspn (input, "\360") == OR(7,5));
+  }
+  {
+    const char input[] = "\360\221\222\240x\360y";
+    ASSERT (mbscspn (input, "\360") == 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360";
+    ASSERT (mbscspn (input, "\360") == 5);
+  }
+
    return test_exit_status;
  }
diff --git a/tests/test-mbslen.c b/tests/test-mbslen.c

index b32a74a2966a1f3e0d77cb800418911ca78390b4..9cf867357953d8e3189557e553b2601923023981 100644 (file)
--- a/tests/test-mbslen.c
+++ b/tests/test-mbslen.c
@@ -24,6 +24,20 @@
  
  #include "macros.h"
  
+/* The mcel-based implementation of mbslen behaves differently than the
+   original one.  Namely, for invalid/incomplete byte sequences:
+   Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+   everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+   See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+       <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+   Therefore, here we have different expected results, depending on the
+   implementation.  */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
  int
  main ()
  {
@@ -39,9 +53,17 @@ main ()
    ASSERT (mbslen ("7\342\202\254") == 2); /* "7€" */
    ASSERT (mbslen ("\360\237\220\203") == 1); /* "🐃" */
  
-  ASSERT (mbslen ("\303") == 1); /* invalid multibyte sequence */
-  ASSERT (mbslen ("\342\202") == 2); /* 2x invalid multibyte sequence */
-  ASSERT (mbslen ("\360\237\220") == 3); /* 3x invalid multibyte sequence */
+  /* Incomplete characters.  See
+     https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+     page 128 table 3-11.  */
+  ASSERT (mbslen ("\303") == 1);
+  /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020.  */
+  ASSERT (mbslen ("\341\200") == OR(1,2));
+  ASSERT (mbslen ("\341") == 1);
+  /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0.  */
+  ASSERT (mbslen ("\360\221\222") == OR(1,3));
+  ASSERT (mbslen ("\360\221") == OR(1,2));
+  ASSERT (mbslen ("\360") == 1);
  
    return test_exit_status;
  }
diff --git a/tests/test-mbspbrk.c b/tests/test-mbspbrk.c

index ce396eba182c89fc96a9695ee64ae4255a0d6fe4..a0f86d36521a3699cdeae5ba60199467671149ce 100644 (file)
--- a/tests/test-mbspbrk.c
+++ b/tests/test-mbspbrk.c
@@ -24,6 +24,20 @@
  
  #include "macros.h"
  
+/* The mcel-based implementation of mbspbrk behaves differently than the
+   original one.  Namely, for invalid/incomplete byte sequences:
+   Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+   everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+   See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+       <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+   Therefore, here we have different expected results, depending on the
+   implementation.  */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
  int
  main ()
  {
@@ -51,5 +65,69 @@ main ()
      ASSERT (mbspbrk (input, "\303") == NULL); /* invalid multibyte sequence */
    }
  
+  /* Incomplete characters.  See
+     https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+     page 128 table 3-11.  */
+
+  /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020.  */
+  {
+    const char input[] = "\341\200\240x\341\200y";
+    ASSERT (mbspbrk (input, "\341\200") == input + 4);
+  }
+  {
+    const char input[] = "\341\200\240x\341\200";
+    ASSERT (mbspbrk (input, "\341\200") == input + 4);
+  }
+  {
+    const char input[] = "\341\200\240x\341\200";
+    ASSERT (mbspbrk (input, "\341") == OR (NULL, input + 4));
+  }
+  {
+    const char input[] = "\341\200\240x\341y";
+    ASSERT (mbspbrk (input, "\341") == input + 4);
+  }
+  {
+    const char input[] = "\341\200\240x\341";
+    ASSERT (mbspbrk (input, "\341") == input + 4);
+  }
+
+  /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0.  */
+  {
+    const char input[] = "\360\221\222\240x\360\221\222y";
+    ASSERT (mbspbrk (input, "\360\221\222") == input + 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221\222";
+    ASSERT (mbspbrk (input, "\360\221\222") == input + 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221\222";
+    ASSERT (mbspbrk (input, "\360\221") == OR (NULL, input + 5));
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221y";
+    ASSERT (mbspbrk (input, "\360\221") == input + 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221";
+    ASSERT (mbspbrk (input, "\360\221") == input + 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221\222";
+    ASSERT (mbspbrk (input, "\360") == OR (NULL, input + 5));
+  }
+  {
+    const char input[] = "\360\221\222\240x\360\221";
+    ASSERT (mbspbrk (input, "\360") == OR (NULL, input + 5));
+  }
+  {
+    const char input[] = "\360\221\222\240x\360y";
+    ASSERT (mbspbrk (input, "\360") == input + 5);
+  }
+  {
+    const char input[] = "\360\221\222\240x\360";
+    ASSERT (mbspbrk (input, "\360") == input + 5);
+  }
+
    return test_exit_status;
  }
diff --git a/tests/test-mbsrchr1.c b/tests/test-mbsrchr1.c

new file mode 100644 (file)

index 0000000..91c5e73
--- /dev/null
+++ b/tests/test-mbsrchr1.c
@@ -0,0 +1,107 @@
+/* Test of searching a string for the last occurrence of a character.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Bruno Haible <bruno@clisp.org>, 2026.  */
+
+#include <config.h>
+
+#include <string.h>
+
+#include <locale.h>
+
+#include "macros.h"
+
+/* The mcel-based implementation of mbsrchr behaves differently than the
+   original one.  Namely, for invalid/incomplete byte sequences:
+   Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+   everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+   See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+       <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+   Therefore, here we have different expected results, depending on the
+   implementation.  */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
+int
+main ()
+{
+  /* configure should already have checked that the locale is supported.  */
+  if (setlocale (LC_ALL, "") == NULL)
+    return 1;
+
+  /* Incomplete characters.  See
+     https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+     page 128 table 3-11.  */
+
+  /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020.  */
+  {
+    const char input[] = "\341\200";
+    const char *result = mbsrchr (input, '\341');
+    ASSERT (result == OR (NULL, input + 0));
+  }
+  {
+    const char input[] = "\341\200";
+    const char *result = mbsrchr (input, '\200');
+    ASSERT (result == OR (NULL, input + 1));
+  }
+  {
+    const char input[] = "\341\200\341";
+    const char *result = mbsrchr (input, '\341');
+    ASSERT (result == input + 2);
+  }
+
+  /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0.  */
+  {
+    const char input[] = "\360\221\222";
+    const char *result = mbsrchr (input, '\360');
+    ASSERT (result == OR (NULL, input + 0));
+  }
+  {
+    const char input[] = "\360\221\222";
+    const char *result = mbsrchr (input, '\221');
+    ASSERT (result == OR (NULL, input + 1));
+  }
+  {
+    const char input[] = "\360\221\222";
+    const char *result = mbsrchr (input, '\222');
+    ASSERT (result == OR (NULL, input + 2));
+  }
+  {
+    const char input[] = "\360\221\222\360";
+    const char *result = mbsrchr (input, '\360');
+    ASSERT (result == input + 3);
+  }
+  {
+    const char input[] = "\360\221";
+    const char *result = mbsrchr (input, '\360');
+    ASSERT (result == OR (NULL, input + 0));
+  }
+  {
+    const char input[] = "\360\221";
+    const char *result = mbsrchr (input, '\221');
+    ASSERT (result == OR (NULL, input + 1));
+  }
+  {
+    const char input[] = "\360\221\360";
+    const char *result = mbsrchr (input, '\360');
+    ASSERT (result == input + 2);
+  }
+
+  return test_exit_status;
+}
diff --git a/tests/test-mbsrchr1.sh b/tests/test-mbsrchr1.sh

new file mode 100755 (executable)

index 0000000..ce0d000
--- /dev/null
+++ b/tests/test-mbsrchr1.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+# Test whether a specific UTF-8 locale is installed.
+: "${LOCALE_EN_UTF8=en_US.UTF-8}"
+: "${LOCALE_FR_UTF8=fr_FR.UTF-8}"
+if test "$LOCALE_EN_UTF8" = none && test $LOCALE_FR_UTF8 = none; then
+  if test -f /usr/bin/localedef; then
+    echo "Skipping test: no english or french Unicode locale is installed"
+  else
+    echo "Skipping test: no english or french Unicode locale is supported"
+  fi
+  exit 77
+fi
+
+# It's sufficient to test in one of the two locales.
+if test $LOCALE_FR_UTF8 != none; then
+  testlocale=$LOCALE_FR_UTF8
+else
+  testlocale="$LOCALE_EN_UTF8"
+fi
+
+LC_ALL="$testlocale" \
+${CHECKER} ./test-mbsrchr1${EXEEXT}
diff --git a/tests/test-mbsrchr.c b/tests/test-mbsrchr2.c

similarity index 100%

rename from tests/test-mbsrchr.c

rename to tests/test-mbsrchr2.c
diff --git a/tests/test-mbsrchr.sh b/tests/test-mbsrchr2.sh

similarity index 90%

rename from tests/test-mbsrchr.sh

rename to tests/test-mbsrchr2.sh

index 84c40b7bf871bcc9ece313cfcd1727861d9cc7ce..cce61decc69da1ce45bfc904cc6d75d40e82b7ba 100755 (executable)
--- a/tests/test-mbsrchr.sh
+++ b/tests/test-mbsrchr2.sh
@@ -12,4 +12,4 @@ if test $LOCALE_ZH_CN = none; then
  fi
  
  LC_ALL=$LOCALE_ZH_CN \
-${CHECKER} ./test-mbsrchr${EXEEXT}
+${CHECKER} ./test-mbsrchr2${EXEEXT}
diff --git a/tests/test-mbsspn.c b/tests/test-mbsspn.c

index cce1d08dcef133d51970e319197c8234f5c78a29..d2edeaa89a3e96762a61ec416ba3ba29ac98e001 100644 (file)
--- a/tests/test-mbsspn.c
+++ b/tests/test-mbsspn.c
@@ -24,6 +24,20 @@
  
  #include "macros.h"
  
+/* The mcel-based implementation of mbsspn behaves differently than the
+   original one.  Namely, for invalid/incomplete byte sequences:
+   Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+   everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+   See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+       <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+   Therefore, here we have different expected results, depending on the
+   implementation.  */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
  int
  main ()
  {
@@ -53,5 +67,49 @@ main ()
      ASSERT (mbsspn (input, "\303") == 0); /* invalid multibyte sequence */
    }
  
+  /* Incomplete characters.  See
+     https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+     page 128 table 3-11.  */
+
+  /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020.  */
+  {
+    const char input[] = "\341\200\341\200\240";
+    ASSERT (mbsspn (input, "\341\200") == 2);
+  }
+  {
+    const char input[] = "\341\200\341\200\240";
+    ASSERT (mbsspn (input, "\341") == OR(0,1));
+  }
+  {
+    const char input[] = "\341\341\200\240";
+    ASSERT (mbsspn (input, "\341") == 1);
+  }
+
+  /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0.  */
+  {
+    const char input[] = "\360\221\222\360\221\222\240";
+    ASSERT (mbsspn (input, "\360\221\222") == 3);
+  }
+  {
+    const char input[] = "\360\221\222\360\221\222\240";
+    ASSERT (mbsspn (input, "\360\221") == OR(0,2));
+  }
+  {
+    const char input[] = "\360\221\360\221\222\240";
+    ASSERT (mbsspn (input, "\360\221") == 2);
+  }
+  {
+    const char input[] = "\360\221\222\360\221\222\240";
+    ASSERT (mbsspn (input, "\360") == OR(0,1));
+  }
+  {
+    const char input[] = "\360\221\360\221\222\240";
+    ASSERT (mbsspn (input, "\360") == OR(0,1));
+  }
+  {
+    const char input[] = "\360\360\221\222\240";
+    ASSERT (mbsspn (input, "\360") == 1);
+  }
+
    return test_exit_status;
  }
author	Bruno Haible <bruno@clisp.org>
	Mon, 25 May 2026 23:29:29 +0000 (01:29 +0200)
committer	Bruno Haible <bruno@clisp.org>
	Mon, 25 May 2026 23:31:23 +0000 (01:31 +0200)
ChangeLog		patch \| blob \| blame \| history
lib/mbuiterf.h		patch \| blob \| blame \| history
modules/mbschr-tests		patch \| blob \| blame \| history
modules/mbsrchr-tests		patch \| blob \| blame \| history
modules/mbuiterf		patch \| blob \| blame \| history
tests/test-mbschr1.c	[new file with mode: 0644]	patch \| blob
tests/test-mbschr1.sh	[new file with mode: 0755]	patch \| blob
tests/test-mbschr2.c	[moved from tests/test-mbschr.c with 96% similarity]	patch \| blob \| blame \| history
tests/test-mbschr2.sh	[moved from tests/test-mbschr.sh with 90% similarity]	patch \| blob \| blame \| history
tests/test-mbscspn.c		patch \| blob \| blame \| history
tests/test-mbslen.c		patch \| blob \| blame \| history
tests/test-mbspbrk.c		patch \| blob \| blame \| history
tests/test-mbsrchr1.c	[new file with mode: 0644]	patch \| blob
tests/test-mbsrchr1.sh	[new file with mode: 0755]	patch \| blob
tests/test-mbsrchr2.c	[moved from tests/test-mbsrchr.c with 100% similarity]	patch \| blob \| blame \| history
tests/test-mbsrchr2.sh	[moved from tests/test-mbsrchr.sh with 90% similarity]	patch \| blob \| blame \| history
tests/test-mbsspn.c		patch \| blob \| blame \| history