* m4/mbrtowc.m4 (gl_MBRTOWC_INVALID_UTF8): New macro.
(gl_FUNC_MBRTOWC): Invoke it. Define MBRTOWC_INVALID_UTF8_BUG if mbrtowc
does not recognize some invalid UTF-8 byte sequences.
* lib/mbrtowc.c (is_locale_utf8, is_locale_utf8_cached): Define also if
MBRTOWC_INVALID_UTF8_BUG.
(rpl_mbrtowc): Handle UTF-8 locales specially also on NetBSD.
* tests/test-mbrtowc.c (main): Add more test cases for the UTF-8
encoding.
* tests/test-mbrlen.c (main): Likewise.
* doc/posix-functions/mbrtowc.texi: Mention the NetBSD bug.
* doc/posix-functions/mbrlen.texi: Likewise.
+2026-06-02 Bruno Haible <bruno@clisp.org>
+
+ mbrtowc, mbrlen: Work around a NetBSD bug in UTF-8 locales.
+ * m4/mbrtowc.m4 (gl_MBRTOWC_INVALID_UTF8): New macro.
+ (gl_FUNC_MBRTOWC): Invoke it. Define MBRTOWC_INVALID_UTF8_BUG if mbrtowc
+ does not recognize some invalid UTF-8 byte sequences.
+ * lib/mbrtowc.c (is_locale_utf8, is_locale_utf8_cached): Define also if
+ MBRTOWC_INVALID_UTF8_BUG.
+ (rpl_mbrtowc): Handle UTF-8 locales specially also on NetBSD.
+ * tests/test-mbrtowc.c (main): Add more test cases for the UTF-8
+ encoding.
+ * tests/test-mbrlen.c (main): Likewise.
+ * doc/posix-functions/mbrtowc.texi: Mention the NetBSD bug.
+ * doc/posix-functions/mbrlen.texi: Likewise.
+
2026-05-31 Paul Eggert <eggert@cs.ucla.edu>
quotearg: remove SIZE_MAX, INT_BITS
character, on some platforms:
HP-UX 11.11, Solaris 11 2010-11.
@item
+This function returns @code{(size_t) -2} instead of @code{(size_t) -1}
+for some invalid byte sequences in UTF-8 locales on some platforms:
+NetBSD 10.
+@item
This function may not return 0 when parsing the NUL character on some platforms:
Solaris 9.
@end itemize
character, on some platforms:
HP-UX 11.11, Solaris 11 2010-11, mingw, MSVC 14.
@item
+This function returns @code{(size_t) -2} instead of @code{(size_t) -1}
+for some invalid byte sequences in UTF-8 locales on some platforms:
+NetBSD 10.
+@item
This function may not return 0 when parsing the NUL character on some platforms:
Solaris 9.
@end itemize
# include <locale.h>
# endif
-# if (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__)
+# if MBRTOWC_INVALID_UTF8_BUG || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__)
/* Returns 1 if the current locale is an UTF-8 locale, 0 otherwise. */
static inline int
size_t
rpl_mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
{
-# if MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2)
+# if (MBRTOWC_RETVAL_BUG || MBRTOWC_EMPTY_INPUT_BUG || MBRTOWC_INVALID_UTF8_BUG \
+ || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2))
if (s == NULL)
{
pwc = NULL;
}
# endif
-# if (MBRTOC32_EMPTY_INPUT_BUG || _GL_SMALL_WCHAR_T \
+# if (MBRTOC32_EMPTY_INPUT_BUG || MBRTOWC_INVALID_UTF8_BUG || _GL_SMALL_WCHAR_T \
|| (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__))
if (n == 0)
return (size_t) -2;
# endif
-# if (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__)
+# if MBRTOWC_INVALID_UTF8_BUG || (GNULIB_WCHAR_SINGLE_LOCALE && __GLIBC__ >= 2 && !__UCLIBC__)
/* Optimize the frequent case of an UTF-8 locale.
Since here we are in the !GNULIB_defined_mbstate_t case, i.e. we use
the system's mbstate_t type and have to provide interoperability with
the system's mbsinit() function, this requires knowledge about how the
system's UTF-8 mbrtowc() function stores the state. This knowledge is
- platform-specific. For simplicity, we handle only glibc systems. */
+ platform-specific. For simplicity, we handle only glibc and NetBSD
+ systems. */
if (is_locale_utf8_cached ())
{
static mbstate_t internal_state;
if (ps == NULL)
ps = &internal_state;
+ #if __GLIBC__ >= 2
/* Structure of mbstate_t =
{ int __count; union { wint_t __wch; char __wchb[4]; } __value; }
(see glibc/iconv/gconv_simple.c function utf8_internal_loop):
entire byte sequence.
__value.__wch is the already inferrable bits of the character, of
the form (x << (r*6)) when r bytes are still expected. */
+ #endif
+ #ifdef __NetBSD__
+ /* Structure of mbstate_t =
+ union { int64_t __mbstateL; char __mbstate8[128]; }
+ (see src/lib/libc/citrus/modules/citrus_utf8.c):
+ { void *header; char ch[6]; int chlen; },
+ i.e. ch[0..5] is __mbstate8[sizeof(void*)+0..sizeof(void*)+5],
+ chlen is __mbstate8[sizeof(void*)+8..sizeof(void*)+11]. */
+ #endif
/* Here n > 0. */
- size_t nstate = ps->__count & 7;
+ size_t nstate;
+ #if __GLIBC__ >= 2
+ nstate = ps->__count & 7;
+ #endif
+ #ifdef __NetBSD__
+ nstate = *(int *) &ps->__mbstate8[sizeof (void *) + 8];
+ #endif
char buf[4];
const char *p;
size_t m;
}
else
{
+ #if __GLIBC__ >= 2
size_t t = ps->__count >> 8; /* total expected number of bytes */
if (t > nstate && t <= 4)
{
errno = EINVAL;
return (size_t)(-1);
}
+ #endif
+ #ifdef __NetBSD__
+ buf[0] = ps->__mbstate8[sizeof (void *) + 0];
+ if (nstate >= 2)
+ {
+ buf[1] = ps->__mbstate8[sizeof (void *) + 1];
+ if (nstate >= 3)
+ {
+ buf[2] = ps->__mbstate8[sizeof (void *) + 2];
+ }
+ }
+ #endif
p = buf;
m = nstate;
buf[m++] = s[0];
if (nstate >= (res > 0 ? res : 1))
abort ();
res -= nstate;
+ #if __GLIBC__ >= 2
ps->__count = 0;
+ #endif
+ #ifdef __NetBSD__
+ *(int *) &ps->__mbstate8[sizeof (void *) + 8] = 0;
+ #endif
return res;
incomplete:
/* Here 0 < m < 4. */
{
+ #if __GLIBC__ >= 2
unsigned char c = (unsigned char) p[0];
if (c < 0xE0)
{
| (m > 1 ? ((unsigned char) p[1] & 0x3F) << 12 : 0)
| (m > 2 ? ((unsigned char) p[2] & 0x3F) << 6 : 0);
}
+ #endif
+ #ifdef __NetBSD__
+ *(int *) &ps->__mbstate8[sizeof (void *) + 8] = m;
+ ps->__mbstate8[sizeof (void *) + 0] = p[0];
+ if (m > 1)
+ {
+ ps->__mbstate8[sizeof (void *) + 1] = p[1];
+ if (m > 2)
+ {
+ ps->__mbstate8[sizeof (void *) + 2] = p[2];
+ }
+ }
+ #endif
}
return (size_t)(-2);
# mbrtowc.m4
-# serial 49
+# serial 50
dnl Copyright (C) 2001-2002, 2004-2005, 2008-2026 Free Software Foundation,
dnl Inc.
dnl This file is free software; the Free Software Foundation
gl_MBRTOWC_STORES_INCOMPLETE
gl_MBRTOWC_EMPTY_INPUT
gl_MBRTOWC_C_LOCALE
+ gl_MBRTOWC_INVALID_UTF8
case "$gl_cv_func_mbrtowc_null_arg1" in
*yes) ;;
*) AC_DEFINE([MBRTOWC_NULL_ARG1_BUG], [1],
REPLACE_MBRTOWC=1
;;
esac
+ case "$gl_cv_func_mbrtowc_invalid_UTF8" in
+ *yes) ;;
+ *) AC_DEFINE([MBRTOWC_INVALID_UTF8_BUG], [1],
+ [Define if the mbrtowc function does not recognize some invalid UTF-8 byte sequences.])
+ REPLACE_MBRTOWC=1
+ ;;
+ esac
fi
fi
if test $REPLACE_MBSTATE_T = 1; then
])
])
+dnl Test whether mbrtowc recognizes invalid UTF-8 byte sequences, i.e.
+dnl returns (size_t)(-1) for them.
+dnl Result is gl_cv_func_mbrtowc_invalid_UTF8.
+
+AC_DEFUN([gl_MBRTOWC_INVALID_UTF8],
+[
+ AC_REQUIRE([gt_LOCALE_EN_UTF8])
+ AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles
+ AC_CACHE_CHECK([whether mbrtowc recognizes invalid UTF-8],
+ [gl_cv_func_mbrtowc_invalid_UTF8],
+ [
+ dnl Initial guess, used when cross-compiling or when no suitable locale
+ dnl is present.
+changequote(,)dnl
+ case "$host_os" in
+ # Guess no on NetBSD.
+ netbsd*) gl_cv_func_mbrtowc_invalid_UTF8="guessing no" ;;
+ # Guess yes otherwise.
+ *) gl_cv_func_mbrtowc_invalid_UTF8="guessing yes" ;;
+ esac
+changequote([,])dnl
+ if test "$LOCALE_EN_UTF8" != none; then
+ AC_RUN_IFELSE(
+ [AC_LANG_SOURCE([[
+#include <locale.h>
+#include <string.h>
+#include <wchar.h>
+int main ()
+{
+ if (setlocale (LC_ALL, "$LOCALE_EN_UTF8") != NULL)
+ {
+ int result = 0;
+ /* This test fails on NetBSD 10. */
+ {
+ mbstate_t state;
+ wchar_t wc;
+
+ memset (&state, '\0', sizeof (mbstate_t));
+ if (mbrtowc (&wc, "\340x", 2, &state) != (size_t)(-1))
+ result |= 1;
+ }
+ /* This test fails on NetBSD 10. */
+ {
+ mbstate_t state;
+ wchar_t wc;
+
+ memset (&state, '\0', sizeof (mbstate_t));
+ if (mbrtowc (&wc, "\360x\360", 3, &state) != (size_t)(-1))
+ result |= 2;
+ }
+ return result;
+ }
+ return 0;
+}]])],
+ [gl_cv_func_mbrtowc_invalid_UTF8=yes],
+ [gl_cv_func_mbrtowc_invalid_UTF8=no],
+ [:])
+ fi
+ ])
+])
+
# Prerequisites of lib/mbrtowc.c and lib/lc-charset-dispatch.c.
AC_DEFUN([gl_PREREQ_MBRTOWC], [
AC_REQUIRE([AC_C_INLINE])
ASSERT (ret == 1);
ASSERT (mbsinit (&state));
}
+ /* Test recognition of invalid byte sequences. */
+ {
+ memset (&state, 0, sizeof (mbstate_t));
+ ret = mbrlen ("\340x", 2, &state);
+ ASSERT (ret == (size_t)(-1));
+ }
+ {
+ memset (&state, 0, sizeof (mbstate_t));
+ ret = mbrlen ("\360x\360", 3, &state);
+ ASSERT (ret == (size_t)(-1));
+ }
return test_exit_status;
case '4':
ASSERT (wctob (wc) == EOF);
ASSERT (mbsinit (&state));
}
+ /* Test recognition of invalid byte sequences. */
+ {
+ memset (&state, 0, sizeof (mbstate_t));
+ wc = (wchar_t) {0xBADFACE};
+ ret = mbrtowc (&wc, "\340x", 2, &state);
+ ASSERT (ret == (size_t)(-1));
+ }
+ {
+ memset (&state, 0, sizeof (mbstate_t));
+ wc = (wchar_t) {0xBADFACE};
+ ret = mbrtowc (&wc, "\360x\360", 3, &state);
+ ASSERT (ret == (size_t)(-1));
+ }
return test_exit_status;
case '4':