From: Paul Eggert Date: Sat, 23 Sep 2023 05:09:37 +0000 (-0700) Subject: wc: 3× speedup in C locale X-Git-Tag: v9.5~145 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=17a9e79023fd98886f8e7e673965f61ec9569f6c;p=thirdparty%2Fcoreutils.git wc: 3× speedup in C locale The 3× speedup was measured by invoking 'wc $(find * -type f)' on the coreutils sources etc. on an Ubuntu 23.04 x86-64. These changes also speed up wc 20% in UTF-8 locales. * src/wc.c (wc_isprint, wc_isspace): New static vars. (wc): Use them for speed. (main): Initialize them if needed. (isnbspace): Remove; no longer used. --- diff --git a/NEWS b/NEWS index 0d114fb87a..18f80cb4c4 100644 --- a/NEWS +++ b/NEWS @@ -26,6 +26,11 @@ GNU coreutils NEWS -*- outline -*- tail now supports following multiple processes, with repeated --pid options. +** Improvements + + wc is now much faster in single-byte locales and somewhat faster in + multi-byte locales. + * Noteworthy changes in release 9.4 (2023-08-29) [stable] diff --git a/src/wc.c b/src/wc.c index 673c3eb476..6ec9399b5d 100644 --- a/src/wc.c +++ b/src/wc.c @@ -55,6 +55,9 @@ wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out); #endif +static bool wc_isprint[UCHAR_MAX + 1]; +static bool wc_isspace[UCHAR_MAX + 1]; + static bool debug; /* Cumulative number of lines, words, chars and bytes in all files so far. @@ -209,12 +212,6 @@ iswnbspace (wint_t wc) || wc == 0x202F || wc == 0x2060); } -static int -isnbspace (int c) -{ - return iswnbspace (btoc32 (c)); -} - /* FILE is the name of the file (or null for standard input) associated with the specified counters. */ static void @@ -479,18 +476,18 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) { char32_t wide_char; size_t n; - bool single_byte_ascii = !in_shift && 0 <= *p && *p < 0x80; + bool single_byte; - if (single_byte_ascii) + if (!in_shift && 0 <= *p && *p < 0x80) { /* Handle most ASCII characters quickly, without calling mbrtowc(). */ n = 1; wide_char = *p; + single_byte = true; } else { - in_shift = true; #if SUPPORT_OLD_MBRTOWC backup_state = state; #endif @@ -500,6 +497,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) #if SUPPORT_OLD_MBRTOWC state = backup_state; #endif + in_shift = true; break; } if (n == (size_t) -1) @@ -525,13 +523,9 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) in_word = true; continue; } - if (mbsinit (&state)) - in_shift = false; - if (n == 0) - { - wide_char = 0; - n = 1; - } + n += !n; + single_byte = n == !in_shift; + in_shift = !mbsinit (&state); } switch (wide_char) @@ -558,18 +552,15 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) default: /* c32width can be expensive on macOS for example, so avoid if not needed. */ - if (print_linelength) + if (single_byte) + linepos += wc_isprint[wide_char]; + else if (print_linelength) { - if (single_byte_ascii) - linepos += !!isprint (wide_char); - else - { - int width = c32width (wide_char); - if (width > 0) - linepos += width; - } + int width = c32width (wide_char); + if (width > 0) + linepos += width; } - if (single_byte_ascii ? isspace (wide_char) + if (single_byte ? wc_isspace[wide_char] : c32isspace (wide_char) || iswnbspace (wide_char)) goto mb_word_separator; @@ -645,8 +636,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) in_word = false; break; default: - linepos += !!isprint (c); - if (isspace (c) || isnbspace (c)) + linepos += wc_isprint[c]; + if (wc_isspace[c]) goto word_separator; words += !in_word; in_word = true; @@ -849,6 +840,13 @@ main (int argc, char **argv) || print_linelength)) print_lines = print_words = print_bytes = true; + if (print_linelength) + for (int i = 0; i <= UCHAR_MAX; i++) + wc_isprint[i] = !!isprint (i); + if (print_words) + for (int i = 0; i <= UCHAR_MAX; i++) + wc_isspace[i] = isspace (i) || iswnbspace (btoc32 (i)); + bool read_tokens = false; struct argv_iterator *ai; if (files_from)