git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
wc: 3× speedup in C locale
author Paul Eggert <eggert@cs.ucla.edu>
Sat, 23 Sep 2023 05:09:37 +0000 (22:09 -0700)
committer Paul Eggert <eggert@cs.ucla.edu>
Sat, 23 Sep 2023 07:28:27 +0000 (00:28 -0700)
The 3× speedup was measured by invoking 'wc $(find * -type f)'
on the coreutils sources etc. on an Ubuntu 23.04 x86-64.
These changes also speed up wc 20% in UTF-8 locales.
* src/wc.c (wc_isprint, wc_isspace): New static vars.
(wc): Use them for speed.
(main): Initialize them if needed.
(isnbspace): Remove; no longer used.

NEWS
src/wc.c

diff --git a/NEWS b/NEWS
index 0d114fb87a90ac68bc9d472e78dddaae2e6d2e61..18f80cb4c462fd499e638ce4b60e9dbc0e45bf0a 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,11 @@ GNU coreutils NEWS                                    -*- outline -*-
 
   tail now supports following multiple processes, with repeated --pid options.
 
+** Improvements
+
+  wc is now much faster in single-byte locales and somewhat faster in
+  multi-byte locales.
+
 
 * Noteworthy changes in release 9.4 (2023-08-29) [stable]
 
diff --git a/src/wc.c b/src/wc.c
index 673c3eb47615bced94752f6cf70165523794491f..6ec9399b5d4c30856248c053bba47c2eadadff4f 100644 (file)
--- a/src/wc.c
+++ b/src/wc.c
@@ -55,6 +55,9 @@ wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
                uintmax_t *bytes_out);
 #endif
 
+static bool wc_isprint[UCHAR_MAX + 1];
+static bool wc_isspace[UCHAR_MAX + 1];
+
 static bool debug;
 
 /* Cumulative number of lines, words, chars and bytes in all files so far.
@@ -209,12 +212,6 @@ iswnbspace (wint_t wc)
              || wc == 0x202F || wc == 0x2060);
 }
 
-static int
-isnbspace (int c)
-{
-  return iswnbspace (btoc32 (c));
-}
-
 /* FILE is the name of the file (or null for standard input)
    associated with the specified counters.  */
 static void
@@ -479,18 +476,18 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
             {
               char32_t wide_char;
               size_t n;
-              bool single_byte_ascii = !in_shift && 0 <= *p && *p < 0x80;
+              bool single_byte;
 
-              if (single_byte_ascii)
+              if (!in_shift && 0 <= *p && *p < 0x80)
                 {
                   /* Handle most ASCII characters quickly, without calling
                      mbrtowc().  */
                   n = 1;
                   wide_char = *p;
+                  single_byte = true;
                 }
               else
                 {
-                  in_shift = true;
 #if SUPPORT_OLD_MBRTOWC
                   backup_state = state;
 #endif
@@ -500,6 +497,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
 #if SUPPORT_OLD_MBRTOWC
                       state = backup_state;
 #endif
+                      in_shift = true;
                       break;
                     }
                   if (n == (size_t) -1)
@@ -525,13 +523,9 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                       in_word = true;
                       continue;
                     }
-                  if (mbsinit (&state))
-                    in_shift = false;
-                  if (n == 0)
-                    {
-                      wide_char = 0;
-                      n = 1;
-                    }
+                  n += !n;
+                  single_byte = n == !in_shift;
+                  in_shift = !mbsinit (&state);
                 }
 
               switch (wide_char)
@@ -558,18 +552,15 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                 default:
                   /* c32width can be expensive on macOS for example,
                      so avoid if not needed.  */
-                  if (print_linelength)
+                  if (single_byte)
+                    linepos += wc_isprint[wide_char];
+                  else if (print_linelength)
                     {
-                      if (single_byte_ascii)
-                        linepos += !!isprint (wide_char);
-                      else
-                        {
-                          int width = c32width (wide_char);
-                          if (width > 0)
-                            linepos += width;
-                        }
+                      int width = c32width (wide_char);
+                      if (width > 0)
+                        linepos += width;
                     }
-                  if (single_byte_ascii ? isspace (wide_char)
+                  if (single_byte ? wc_isspace[wide_char]
                       : c32isspace (wide_char) || iswnbspace (wide_char))
                     goto mb_word_separator;
 
@@ -645,8 +636,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                   in_word = false;
                   break;
                 default:
-                  linepos += !!isprint (c);
-                  if (isspace (c) || isnbspace (c))
+                  linepos += wc_isprint[c];
+                  if (wc_isspace[c])
                     goto word_separator;
                   words += !in_word;
                   in_word = true;
@@ -849,6 +840,13 @@ main (int argc, char **argv)
          || print_linelength))
     print_lines = print_words = print_bytes = true;
 
+  if (print_linelength)
+    for (int i = 0; i <= UCHAR_MAX; i++)
+      wc_isprint[i] = !!isprint (i);
+  if (print_words)
+    for (int i = 0; i <= UCHAR_MAX; i++)
+      wc_isspace[i] = isspace (i) || iswnbspace (btoc32 (i));
+
   bool read_tokens = false;
   struct argv_iterator *ai;
   if (files_from)