From 14d35d5bade2b714e2c7eb4538f21345ebe20684 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Fri, 22 Sep 2023 09:45:12 -0700 Subject: [PATCH] maint: prefer char32_t to wchar_t MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This should work better on non-glibc platforms that don’t use Unicode for wchar_t. However, POSIX appears to prohibit this for printf.c so leave that alone. * bootstrap.conf (gnulib_modules): Add btoc32, c32iscntrl, c32isprint, c32isspace, c32width, mbrtoc32. Remove btowc, wcwidth. * src/df.c, src/ls.c, src/wc.c: Include uchar.h instead of wchar.h and wctype.h. * src/df.c (replace_invalid_chars): * src/ls.c (quote_name_buf): * src/wc.c (isnbspace, wc): Use char32_t instead of wchar_t. --- bootstrap.conf | 8 ++++++-- src/df.c | 9 ++++----- src/ls.c | 8 ++++---- src/printf.c | 1 + src/wc.c | 17 ++++++++--------- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/bootstrap.conf b/bootstrap.conf index 0b1ed95f63..fdb059ccea 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -45,10 +45,14 @@ gnulib_modules=" backupfile base32 base64 - btowc + btoc32 buffer-lcm byteswap c-strcase + c32iscntrl + c32isprint + c32isspace + c32width canon-host canonicalize chmodat @@ -161,6 +165,7 @@ gnulib_modules=" malloc-gnu manywarnings mbrlen + mbrtoc32 mbrtowc mbschr mbslen @@ -282,7 +287,6 @@ gnulib_modules=" verify verror version-etc-fsf - wcwidth winsz-ioctl winsz-termios write-any-file diff --git a/src/df.c b/src/df.c index 1f3665a4af..5a41ad38ac 100644 --- a/src/df.c +++ b/src/df.c @@ -23,8 +23,7 @@ #include #include #include -#include <wchar.h> -#include <wctype.h> +#include <uchar.h> #include "system.h" #include "assure.h" @@ -324,13 +323,13 @@ replace_invalid_chars (char *cell) for (char *src = cell; src != srcend; src += n) { - wchar_t wc; + char32_t wc; size_t srcbytes = srcend - src; - n = mbrtowc (&wc, src, srcbytes, &mbstate); + n = mbrtoc32 (&wc, src, srcbytes, &mbstate); bool ok = n <= srcbytes; if (ok) - ok = !iswcntrl (wc); 
+ ok = !c32iscntrl (wc); else n = 1; diff --git a/src/ls.c b/src/ls.c index eb74f16785..769ae85a75 100644 --- a/src/ls.c +++ b/src/ls.c @@ -55,7 +55,7 @@ #include #include #include -#include <wchar.h> +#include <uchar.h> #if HAVE_LANGINFO_CODESET # include @@ -4612,11 +4612,11 @@ quote_name_buf (char **inbuf, size_t bufsize, char *name, mbstate_t mbstate; mbszero (&mbstate); do { - wchar_t wc; + char32_t wc; size_t bytes; int w; - bytes = mbrtowc (&wc, p, plimit - p, &mbstate); + bytes = mbrtoc32 (&wc, p, plimit - p, &mbstate); if (bytes == (size_t) -1) { @@ -4644,7 +4644,7 @@ quote_name_buf (char **inbuf, size_t bufsize, char *name, /* A null wide character was encountered. */ bytes = 1; - w = wcwidth (wc); + w = c32width (wc); if (w >= 0) { /* A printable multibyte character. diff --git a/src/printf.c b/src/printf.c index 16ea1c17f1..f36b455190 100644 --- a/src/printf.c +++ b/src/printf.c @@ -176,6 +176,7 @@ FUNC_NAME (char const *s) \ wchar_t wc; \ size_t slen = strlen (s); \ ssize_t bytes; \ + /* Use mbrtowc not mbrtoc32, as per POSIX. 
*/ \ bytes = mbrtowc (&wc, s, slen, &mbstate); \ if (0 < bytes) \ { \ diff --git a/src/wc.c b/src/wc.c index b0f92c6b73..c0b37b5576 100644 --- a/src/wc.c +++ b/src/wc.c @@ -23,8 +23,7 @@ #include #include #include -#include <wchar.h> -#include <wctype.h> +#include <uchar.h> #include "system.h" #include "assure.h" @@ -218,7 +217,7 @@ iswnbspace (wint_t wc) static int isnbspace (int c) { - return iswnbspace (btowc (c)); + return iswnbspace (btoc32 (c)); } /* FILE is the name of the file (or null for standard input) @@ -483,7 +482,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) bytes_read += prev; do { - wchar_t wide_char; + char32_t wide_char; size_t n; bool wide = true; @@ -501,7 +500,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) #if SUPPORT_OLD_MBRTOWC backup_state = state; #endif - n = mbrtowc (&wide_char, p, bytes_read, &state); + n = mbrtoc32 (&wide_char, p, bytes_read, &state); if (n == (size_t) -2) { #if SUPPORT_OLD_MBRTOWC @@ -553,17 +552,17 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) in_word = false; break; default: - if (wide && iswprint (wide_char)) + if (wide && c32isprint (wide_char)) { - /* wcwidth can be expensive on OSX for example, + /* c32width can be expensive on OSX for example, so avoid if not needed. */ if (print_linelength) { - int width = wcwidth (wide_char); + int width = c32width (wide_char); if (width > 0) linepos += width; } - if (iswspace (wide_char) || iswnbspace (wide_char)) + if (c32isspace (wide_char) || iswnbspace (wide_char)) goto mb_word_separator; in_word = true; } -- 2.47.2