* lib/mbuiterf.h: Include mbiter-aux.h.
(struct mbuif_state): Add field is_utf8.
(mbuiterf_next): Invoke mbiter_is_utf8, mbiter_utf8_maximal_subpart.
(mbuif_init): Initialize the field is_utf8.
* modules/mbuiterf (Depends-on): Add mbiter-aux.
* tests/test-mbslen.c (OR): New macro, copied from
tests/test-mbsnlen.c.
(main): Add more test cases with incomplete characters.
* tests/test-mbschr2.sh: Renamed from tests/test-mbschr.sh.
* tests/test-mbschr2.c: Renamed from tests/test-mbschr.c.
* tests/test-mbschr1.sh: New file, based on
tests/test-mbmemcasecmp-3.sh.
* tests/test-mbschr1.c: New file.
* modules/mbschr-tests (Files): Update accordingly. Add locale-en.m4,
locale-fr.m4.
(configure.ac): Invoke gt_LOCALE_EN_UTF8, gt_LOCALE_FR_UTF8.
(Makefile.am): Arrange to compile test-mbschr1 and test-mbschr2 and to
run test-mbschr1.sh, test-mbschr2.sh.
* tests/test-mbsrchr2.sh: Renamed from tests/test-mbsrchr.sh.
* tests/test-mbsrchr2.c: Renamed from tests/test-mbsrchr.c.
* tests/test-mbsrchr1.sh: New file, based on
tests/test-mbmemcasecmp-3.sh.
* tests/test-mbsrchr1.c: New file.
* modules/mbsrchr-tests (Files): Update accordingly. Add locale-en.m4,
locale-fr.m4.
(configure.ac): Invoke gt_LOCALE_EN_UTF8, gt_LOCALE_FR_UTF8.
(Makefile.am): Arrange to compile test-mbsrchr1 and test-mbsrchr2 and to
run test-mbsrchr1.sh, test-mbsrchr2.sh.
* tests/test-mbscspn.c (OR): New macro, copied from
tests/test-mbsnlen.c.
(main): Add test cases with incomplete characters.
* tests/test-mbspbrk.c (OR): New macro, copied from
tests/test-mbsnlen.c.
(main): Add test cases with incomplete characters.
* tests/test-mbsspn.c (OR): New macro, copied from
tests/test-mbsnlen.c.
(main): Add test cases with incomplete characters.
+2026-05-25 Bruno Haible <bruno@clisp.org>
+
+ mbuiterf: Implement multi-byte per encoding error (MEE) consistently.
+ * lib/mbuiterf.h: Include mbiter-aux.h.
+ (struct mbuif_state): Add field is_utf8.
+ (mbuiterf_next): Invoke mbiter_is_utf8, mbiter_utf8_maximal_subpart.
+ (mbuif_init): Initialize the field is_utf8.
+ * modules/mbuiterf (Depends-on): Add mbiter-aux.
+ * tests/test-mbslen.c (OR): New macro, copied from
+ tests/test-mbsnlen.c.
+ (main): Add more test cases with incomplete characters.
+ * tests/test-mbschr2.sh: Renamed from tests/test-mbschr.sh.
+ * tests/test-mbschr2.c: Renamed from tests/test-mbschr.c.
+ * tests/test-mbschr1.sh: New file, based on
+ tests/test-mbmemcasecmp-3.sh.
+ * tests/test-mbschr1.c: New file.
+ * modules/mbschr-tests (Files): Update accordingly. Add locale-en.m4,
+ locale-fr.m4.
+ (configure.ac): Invoke gt_LOCALE_EN_UTF8, gt_LOCALE_FR_UTF8.
+ (Makefile.am): Arrange to compile test-mbschr1 and test-mbschr2 and to
+ run test-mbschr1.sh, test-mbschr2.sh.
+ * tests/test-mbsrchr2.sh: Renamed from tests/test-mbsrchr.sh.
+ * tests/test-mbsrchr2.c: Renamed from tests/test-mbsrchr.c.
+ * tests/test-mbsrchr1.sh: New file, based on
+ tests/test-mbmemcasecmp-3.sh.
+ * tests/test-mbsrchr1.c: New file.
+ * modules/mbsrchr-tests (Files): Update accordingly. Add locale-en.m4,
+ locale-fr.m4.
+ (configure.ac): Invoke gt_LOCALE_EN_UTF8, gt_LOCALE_FR_UTF8.
+ (Makefile.am): Arrange to compile test-mbsrchr1 and test-mbsrchr2 and to
+ run test-mbsrchr1.sh, test-mbsrchr2.sh.
+ * tests/test-mbscspn.c (OR): New macro, copied from
+ tests/test-mbsnlen.c.
+ (main): Add test cases with incomplete characters.
+ * tests/test-mbspbrk.c (OR): New macro, copied from
+ tests/test-mbsnlen.c.
+ (main): Add test cases with incomplete characters.
+ * tests/test-mbsspn.c (OR): New macro, copied from
+ tests/test-mbsnlen.c.
+ (main): Add test cases with incomplete characters.
+
2026-05-25 Bruno Haible <bruno@clisp.org>
mbuiter: Implement multi-byte per encoding error (MEE) consistently.
#include <wchar.h>
#include "mbchar.h"
+#include "mbiter-aux.h"
#include "strnlen1.h"
_GL_INLINE_HEADER_BEGIN
before and after every mbuiterf_next invocation.
*/
unsigned int cur_max; /* A cache of MB_CUR_MAX. */
+ int is_utf8; /* A cache of mbiter_is_utf8. */
};
MBUITERF_INLINE mbchar_t
ps->in_shift = true;
with_shift:;
#endif
+ size_t avail_bytes = strnlen1 (iter, ps->cur_max);
size_t bytes;
char32_t wc;
- bytes = mbrtoc32 (&wc, iter, strnlen1 (iter, ps->cur_max), &ps->state);
+ bytes = mbrtoc32 (&wc, iter, avail_bytes, &ps->state);
if (bytes == (size_t) -1)
{
/* An invalid multibyte sequence was encountered. */
+ size_t ebytes =
+ (mbiter_is_utf8 (&ps->is_utf8)
+ ? mbiter_utf8_maximal_subpart (iter, avail_bytes)
+ : 1);
/* Allow the next invocation to continue from a sane state. */
#if !GNULIB_MBRTOC32_REGULAR
ps->in_shift = false;
#endif
mbszero (&ps->state);
- return (mbchar_t) { .ptr = iter, .bytes = 1, .wc_valid = false };
+ return (mbchar_t) { .ptr = iter, .bytes = ebytes, .wc_valid = false };
}
else if (bytes == (size_t) -2)
{
#if !GNULIB_MBRTOC32_REGULAR
#define mbuif_init(st) \
((st).in_shift = false, mbszero (&(st).state), \
- (st).cur_max = MB_CUR_MAX)
+ (st).cur_max = MB_CUR_MAX, (st).is_utf8 = -1)
#else
/* Optimized: no in_shift. */
#define mbuif_init(st) \
(mbszero (&(st).state), \
- (st).cur_max = MB_CUR_MAX)
+ (st).cur_max = MB_CUR_MAX, (st).is_utf8 = -1)
#endif
#if !GNULIB_MBRTOC32_REGULAR
#define mbuif_avail(st, iter) ((st).in_shift || (*(iter) != '\0'))
Files:
-tests/test-mbschr.sh
-tests/test-mbschr.c
+tests/test-mbschr1.sh
+tests/test-mbschr1.c
+tests/test-mbschr2.sh
+tests/test-mbschr2.c
tests/macros.h
+m4/locale-en.m4
+m4/locale-fr.m4
m4/locale-zh.m4
m4/codeset.m4
setlocale
configure.ac:
+gt_LOCALE_EN_UTF8
+gt_LOCALE_FR_UTF8
gt_LOCALE_ZH_CN
Makefile.am:
-TESTS += test-mbschr.sh
-TESTS_ENVIRONMENT += LOCALE_ZH_CN='@LOCALE_ZH_CN@'
-check_PROGRAMS += test-mbschr
-test_mbschr_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
+TESTS += test-mbschr1.sh test-mbschr2.sh
+TESTS_ENVIRONMENT += \
+ LOCALE_EN_UTF8='@LOCALE_EN_UTF8@' \
+ LOCALE_FR_UTF8='@LOCALE_FR_UTF8@' \
+ LOCALE_ZH_CN='@LOCALE_ZH_CN@'
+check_PROGRAMS += test-mbschr1 test-mbschr2
+test_mbschr1_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
+test_mbschr2_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
Files:
-tests/test-mbsrchr.sh
-tests/test-mbsrchr.c
+tests/test-mbsrchr1.sh
+tests/test-mbsrchr1.c
+tests/test-mbsrchr2.sh
+tests/test-mbsrchr2.c
tests/macros.h
+m4/locale-en.m4
+m4/locale-fr.m4
m4/locale-zh.m4
m4/codeset.m4
setlocale
configure.ac:
+gt_LOCALE_EN_UTF8
+gt_LOCALE_FR_UTF8
gt_LOCALE_ZH_CN
Makefile.am:
-TESTS += test-mbsrchr.sh
-TESTS_ENVIRONMENT += LOCALE_ZH_CN='@LOCALE_ZH_CN@'
-check_PROGRAMS += test-mbsrchr
-test_mbsrchr_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
+TESTS += test-mbsrchr1.sh test-mbsrchr2.sh
+TESTS_ENVIRONMENT += \
+ LOCALE_EN_UTF8='@LOCALE_EN_UTF8@' \
+ LOCALE_FR_UTF8='@LOCALE_FR_UTF8@' \
+ LOCALE_ZH_CN='@LOCALE_ZH_CN@'
+check_PROGRAMS += test-mbsrchr1 test-mbsrchr2
+test_mbsrchr1_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
+test_mbsrchr2_LDADD = $(LDADD) $(LIBUNISTRING) $(SETLOCALE_LIB) $(MBRTOWC_LIB) $(LIBC32CONV)
mbrtoc32
mbsinit
mbszero
+mbiter-aux
uchar-h
bool
strnlen1
--- /dev/null
+/* Test of searching a string for a character in a UTF-8 locale.
+ Copyright (C) 2026 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Bruno Haible <bruno@clisp.org>, 2026. */
+
+#include <config.h>
+
+#include <string.h>
+
+#include <locale.h>
+
+#include "macros.h"
+
+/* The mcel-based implementation of mbschr behaves differently than the
+ original one. Namely, for invalid/incomplete byte sequences:
+ Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+ everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+ See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+ <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+ Therefore, here we have different expected results, depending on the
+ implementation. */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
+int
+main ()
+{
+ /* configure should already have checked that the locale is supported. */
+ if (setlocale (LC_ALL, "") == NULL)
+ return 1;
+
+ /* Incomplete characters. See
+ https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+ page 128 table 3-11. */
+
+ /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020. */
+ {
+ const char input[] = "\341\200";
+ const char *result = mbschr (input, '\341');
+ ASSERT (result == OR (NULL, input + 0));
+ }
+ {
+ const char input[] = "\341\200";
+ const char *result = mbschr (input, '\200');
+ ASSERT (result == OR (NULL, input + 1));
+ }
+ {
+ const char input[] = "\341\200\341";
+ const char *result = mbschr (input, '\341');
+ ASSERT (result == input + OR(2,0));
+ }
+
+ /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0. */
+ {
+ const char input[] = "\360\221\222";
+ const char *result = mbschr (input, '\360');
+ ASSERT (result == OR (NULL, input + 0));
+ }
+ {
+ const char input[] = "\360\221\222";
+ const char *result = mbschr (input, '\221');
+ ASSERT (result == OR (NULL, input + 1));
+ }
+ {
+ const char input[] = "\360\221\222";
+ const char *result = mbschr (input, '\222');
+ ASSERT (result == OR (NULL, input + 2));
+ }
+ {
+ const char input[] = "\360\221\222\360";
+ const char *result = mbschr (input, '\360');
+ ASSERT (result == input + OR(3,0));
+ }
+ {
+ const char input[] = "\360\221";
+ const char *result = mbschr (input, '\360');
+ ASSERT (result == OR (NULL, input + 0));
+ }
+ {
+ const char input[] = "\360\221";
+ const char *result = mbschr (input, '\221');
+ ASSERT (result == OR (NULL, input + 1));
+ }
+ {
+ const char input[] = "\360\221\360";
+ const char *result = mbschr (input, '\360');
+ ASSERT (result == input + OR(2,0));
+ }
+
+ return test_exit_status;
+}
--- /dev/null
+#!/bin/sh
+
+# Test whether a specific UTF-8 locale is installed.
+: "${LOCALE_EN_UTF8=en_US.UTF-8}"
+: "${LOCALE_FR_UTF8=fr_FR.UTF-8}"
+if test "$LOCALE_EN_UTF8" = none && test $LOCALE_FR_UTF8 = none; then
+ if test -f /usr/bin/localedef; then
+ echo "Skipping test: no english or french Unicode locale is installed"
+ else
+ echo "Skipping test: no english or french Unicode locale is supported"
+ fi
+ exit 77
+fi
+
+# It's sufficient to test in one of the two locales.
+if test $LOCALE_FR_UTF8 != none; then
+ testlocale=$LOCALE_FR_UTF8
+else
+ testlocale="$LOCALE_EN_UTF8"
+fi
+
+LC_ALL="$testlocale" \
+${CHECKER} ./test-mbschr1${EXEEXT}
-/* Test of searching a string for a character.
+/* Test of searching a string for a character in a GB18030 locale.
Copyright (C) 2007-2026 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
fi
LC_ALL=$LOCALE_ZH_CN \
-${CHECKER} ./test-mbschr${EXEEXT}
+${CHECKER} ./test-mbschr2${EXEEXT}
#include "macros.h"
+/* The mcel-based implementation of mbscspn behaves differently than the
+ original one. Namely, for invalid/incomplete byte sequences:
+ Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+ everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+ See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+ <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+ Therefore, here we have different expected results, depending on the
+ implementation. */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
int
main ()
{
ASSERT (mbscspn (input, "\303") == 14); /* invalid multibyte sequence */
}
+ /* Incomplete characters. See
+ https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+ page 128 table 3-11. */
+
+ /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020. */
+ {
+ const char input[] = "\341\200\240x\341\200y";
+ ASSERT (mbscspn (input, "\341\200") == 4);
+ }
+ {
+ const char input[] = "\341\200\240x\341\200";
+ ASSERT (mbscspn (input, "\341\200") == 4);
+ }
+ {
+ const char input[] = "\341\200\240x\341\200";
+ ASSERT (mbscspn (input, "\341") == OR(6,4));
+ }
+ {
+ const char input[] = "\341\200\240x\341y";
+ ASSERT (mbscspn (input, "\341") == 4);
+ }
+ {
+ const char input[] = "\341\200\240x\341";
+ ASSERT (mbscspn (input, "\341") == 4);
+ }
+
+ /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0. */
+ {
+ const char input[] = "\360\221\222\240x\360\221\222y";
+ ASSERT (mbscspn (input, "\360\221\222") == 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221\222";
+ ASSERT (mbscspn (input, "\360\221\222") == 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221\222";
+ ASSERT (mbscspn (input, "\360\221") == OR(8,5));
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221y";
+ ASSERT (mbscspn (input, "\360\221") == 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221";
+ ASSERT (mbscspn (input, "\360\221") == 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221\222";
+ ASSERT (mbscspn (input, "\360") == OR(8,5));
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221";
+ ASSERT (mbscspn (input, "\360") == OR(7,5));
+ }
+ {
+ const char input[] = "\360\221\222\240x\360y";
+ ASSERT (mbscspn (input, "\360") == 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360";
+ ASSERT (mbscspn (input, "\360") == 5);
+ }
+
return test_exit_status;
}
#include "macros.h"
+/* The mcel-based implementation of mbslen behaves differently than the
+ original one. Namely, for invalid/incomplete byte sequences:
+ Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+ everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+ See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+ <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+ Therefore, here we have different expected results, depending on the
+ implementation. */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
int
main ()
{
ASSERT (mbslen ("7\342\202\254") == 2); /* "7€" */
ASSERT (mbslen ("\360\237\220\203") == 1); /* "🐃" */
- ASSERT (mbslen ("\303") == 1); /* invalid multibyte sequence */
- ASSERT (mbslen ("\342\202") == 2); /* 2x invalid multibyte sequence */
- ASSERT (mbslen ("\360\237\220") == 3); /* 3x invalid multibyte sequence */
+ /* Incomplete characters. See
+ https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+ page 128 table 3-11. */
+ ASSERT (mbslen ("\303") == 1);
+ /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020. */
+ ASSERT (mbslen ("\341\200") == OR(1,2));
+ ASSERT (mbslen ("\341") == 1);
+ /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0. */
+ ASSERT (mbslen ("\360\221\222") == OR(1,3));
+ ASSERT (mbslen ("\360\221") == OR(1,2));
+ ASSERT (mbslen ("\360") == 1);
return test_exit_status;
}
#include "macros.h"
+/* The mcel-based implementation of mbspbrk behaves differently than the
+ original one. Namely, for invalid/incomplete byte sequences:
+ Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+ everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+ See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+ <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+ Therefore, here we have different expected results, depending on the
+ implementation. */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
int
main ()
{
ASSERT (mbspbrk (input, "\303") == NULL); /* invalid multibyte sequence */
}
+ /* Incomplete characters. See
+ https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+ page 128 table 3-11. */
+
+ /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020. */
+ {
+ const char input[] = "\341\200\240x\341\200y";
+ ASSERT (mbspbrk (input, "\341\200") == input + 4);
+ }
+ {
+ const char input[] = "\341\200\240x\341\200";
+ ASSERT (mbspbrk (input, "\341\200") == input + 4);
+ }
+ {
+ const char input[] = "\341\200\240x\341\200";
+ ASSERT (mbspbrk (input, "\341") == OR (NULL, input + 4));
+ }
+ {
+ const char input[] = "\341\200\240x\341y";
+ ASSERT (mbspbrk (input, "\341") == input + 4);
+ }
+ {
+ const char input[] = "\341\200\240x\341";
+ ASSERT (mbspbrk (input, "\341") == input + 4);
+ }
+
+ /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0. */
+ {
+ const char input[] = "\360\221\222\240x\360\221\222y";
+ ASSERT (mbspbrk (input, "\360\221\222") == input + 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221\222";
+ ASSERT (mbspbrk (input, "\360\221\222") == input + 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221\222";
+ ASSERT (mbspbrk (input, "\360\221") == OR (NULL, input + 5));
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221y";
+ ASSERT (mbspbrk (input, "\360\221") == input + 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221";
+ ASSERT (mbspbrk (input, "\360\221") == input + 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221\222";
+ ASSERT (mbspbrk (input, "\360") == OR (NULL, input + 5));
+ }
+ {
+ const char input[] = "\360\221\222\240x\360\221";
+ ASSERT (mbspbrk (input, "\360") == OR (NULL, input + 5));
+ }
+ {
+ const char input[] = "\360\221\222\240x\360y";
+ ASSERT (mbspbrk (input, "\360") == input + 5);
+ }
+ {
+ const char input[] = "\360\221\222\240x\360";
+ ASSERT (mbspbrk (input, "\360") == input + 5);
+ }
+
return test_exit_status;
}
--- /dev/null
+/* Test of searching a string for the last occurrence of a character.
+ Copyright (C) 2026 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Bruno Haible <bruno@clisp.org>, 2026. */
+
+#include <config.h>
+
+#include <string.h>
+
+#include <locale.h>
+
+#include "macros.h"
+
+/* The mcel-based implementation of mbsrchr behaves differently than the
+ original one. Namely, for invalid/incomplete byte sequences:
+ Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+ everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+ See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+ <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+ Therefore, here we have different expected results, depending on the
+ implementation. */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
+int
+main ()
+{
+ /* configure should already have checked that the locale is supported. */
+ if (setlocale (LC_ALL, "") == NULL)
+ return 1;
+
+ /* Incomplete characters. See
+ https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+ page 128 table 3-11. */
+
+ /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020. */
+ {
+ const char input[] = "\341\200";
+ const char *result = mbsrchr (input, '\341');
+ ASSERT (result == OR (NULL, input + 0));
+ }
+ {
+ const char input[] = "\341\200";
+ const char *result = mbsrchr (input, '\200');
+ ASSERT (result == OR (NULL, input + 1));
+ }
+ {
+ const char input[] = "\341\200\341";
+ const char *result = mbsrchr (input, '\341');
+ ASSERT (result == input + 2);
+ }
+
+ /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0. */
+ {
+ const char input[] = "\360\221\222";
+ const char *result = mbsrchr (input, '\360');
+ ASSERT (result == OR (NULL, input + 0));
+ }
+ {
+ const char input[] = "\360\221\222";
+ const char *result = mbsrchr (input, '\221');
+ ASSERT (result == OR (NULL, input + 1));
+ }
+ {
+ const char input[] = "\360\221\222";
+ const char *result = mbsrchr (input, '\222');
+ ASSERT (result == OR (NULL, input + 2));
+ }
+ {
+ const char input[] = "\360\221\222\360";
+ const char *result = mbsrchr (input, '\360');
+ ASSERT (result == input + 3);
+ }
+ {
+ const char input[] = "\360\221";
+ const char *result = mbsrchr (input, '\360');
+ ASSERT (result == OR (NULL, input + 0));
+ }
+ {
+ const char input[] = "\360\221";
+ const char *result = mbsrchr (input, '\221');
+ ASSERT (result == OR (NULL, input + 1));
+ }
+ {
+ const char input[] = "\360\221\360";
+ const char *result = mbsrchr (input, '\360');
+ ASSERT (result == input + 2);
+ }
+
+ return test_exit_status;
+}
--- /dev/null
+#!/bin/sh
+
+# Test whether a specific UTF-8 locale is installed.
+: "${LOCALE_EN_UTF8=en_US.UTF-8}"
+: "${LOCALE_FR_UTF8=fr_FR.UTF-8}"
+if test "$LOCALE_EN_UTF8" = none && test $LOCALE_FR_UTF8 = none; then
+ if test -f /usr/bin/localedef; then
+ echo "Skipping test: no english or french Unicode locale is installed"
+ else
+ echo "Skipping test: no english or french Unicode locale is supported"
+ fi
+ exit 77
+fi
+
+# It's sufficient to test in one of the two locales.
+if test $LOCALE_FR_UTF8 != none; then
+ testlocale=$LOCALE_FR_UTF8
+else
+ testlocale="$LOCALE_EN_UTF8"
+fi
+
+LC_ALL="$testlocale" \
+${CHECKER} ./test-mbsrchr1${EXEEXT}
fi
LC_ALL=$LOCALE_ZH_CN \
-${CHECKER} ./test-mbsrchr${EXEEXT}
+${CHECKER} ./test-mbsrchr2${EXEEXT}
#include "macros.h"
+/* The mcel-based implementation of mbsspn behaves differently than the
+ original one. Namely, for invalid/incomplete byte sequences:
+ Where we ideally should have multi-byte-per-encoding-error (MEE) behaviour
+ everywhere, mcel implements single-byte-per-encoding-error (SEE) behaviour.
+ See <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00131.html>,
+ <https://lists.gnu.org/archive/html/bug-gnulib/2023-07/msg00145.html>.
+ Therefore, here we have different expected results, depending on the
+ implementation. */
+#if GNULIB_MCEL_PREFER
+# define OR(a,b) b
+#else
+# define OR(a,b) a
+#endif
+
int
main ()
{
ASSERT (mbsspn (input, "\303") == 0); /* invalid multibyte sequence */
}
+ /* Incomplete characters. See
+ https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
+ page 128 table 3-11. */
+
+ /* "\341\200\240" = 0xE1 0x80 0xA0 = U+1020. */
+ {
+ const char input[] = "\341\200\341\200\240";
+ ASSERT (mbsspn (input, "\341\200") == 2);
+ }
+ {
+ const char input[] = "\341\200\341\200\240";
+ ASSERT (mbsspn (input, "\341") == OR(0,1));
+ }
+ {
+ const char input[] = "\341\341\200\240";
+ ASSERT (mbsspn (input, "\341") == 1);
+ }
+
+ /* "\360\221\222\240" = 0xF0 0x91 0x92 0xA0 = U+114A0. */
+ {
+ const char input[] = "\360\221\222\360\221\222\240";
+ ASSERT (mbsspn (input, "\360\221\222") == 3);
+ }
+ {
+ const char input[] = "\360\221\222\360\221\222\240";
+ ASSERT (mbsspn (input, "\360\221") == OR(0,2));
+ }
+ {
+ const char input[] = "\360\221\360\221\222\240";
+ ASSERT (mbsspn (input, "\360\221") == 2);
+ }
+ {
+ const char input[] = "\360\221\222\360\221\222\240";
+ ASSERT (mbsspn (input, "\360") == OR(0,1));
+ }
+ {
+ const char input[] = "\360\221\360\221\222\240";
+ ASSERT (mbsspn (input, "\360") == OR(0,1));
+ }
+ {
+ const char input[] = "\360\360\221\222\240";
+ ASSERT (mbsspn (input, "\360") == 1);
+ }
+
return test_exit_status;
}