git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
wc: optimize processing of ASCII in multi byte locales
author Pádraig Brady <P@draigBrady.com>
Fri, 18 May 2018 04:41:46 +0000 (21:41 -0700)
committer Pádraig Brady <P@draigBrady.com>
Thu, 21 Jun 2018 04:00:39 +0000 (21:00 -0700)
===== Benchmark setup (on GNU/Linux) ====
$ yes áááááááááááááááááááá | head -n100000 > mbc.txt
$ yes 12345678901234567890 | head -n100000 > num.txt

===== Before ====
$ time src/wc -Lm < mbc.txt
real    0m0.186s
$ time src/wc -m < mbc.txt
real    0m0.186s
$ time src/wc -Lm < num.txt
real    0m0.055s
$ time src/wc -m < num.txt
real    0m0.056s

==== After ====
$ time src/wc -Lm < mbc.txt
real    0m0.196s
$ time src/wc -m < mbc.txt
real    0m0.173s
$ time src/wc -Lm < num.txt
real    0m0.031s
$ time src/wc -m < num.txt
real    0m0.028s

* src/wc.c (wc): Only call wide variant functions like
iswprint() and wcwidth() for non is_basic() characters.
I.e., characters outside the ISO C "basic character set".
This is especially significant on OSX where wcwidth()
is very expensive (about 10x in tests).
* NEWS: Mention the improvement.
Suggested by Eric Fischer.

NEWS
src/wc.c

diff --git a/NEWS b/NEWS
index 101afc08099e406ee546d8b0a1db37654c466f42..2020ab6e3708a26ac72dd55c32f4e690a7d5f333 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -55,6 +55,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   version of XFS.  stat -f --format=%T now reports the file system type,
   and tail -f uses inotify.
 
+  wc avoids redundant processing of ASCII text in multibyte locales,
+  which is especially significant on macOS.
+
 
 * Noteworthy changes in release 8.29 (2017-12-27) [stable]
 
diff --git a/src/wc.c b/src/wc.c
index 0c72042a0b28092ad4660671a7075ba299996dc5..2034c42beebf58c7b6e593316014c7594235dc47 100644 (file)
--- a/src/wc.c
+++ b/src/wc.c
@@ -379,6 +379,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
             {
               wchar_t wide_char;
               size_t n;
+              bool wide = true;
 
               if (!in_shift && is_basic (*p))
                 {
@@ -386,6 +387,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                      mbrtowc().  */
                   n = 1;
                   wide_char = *p;
+                  wide = false;
                 }
               else
                 {
@@ -419,9 +421,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                       n = 1;
                     }
                 }
-              p += n;
-              bytes_read -= n;
-              chars++;
+
               switch (wide_char)
                 {
                 case '\n':
@@ -445,17 +445,33 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                   in_word = false;
                   break;
                 default:
-                  if (iswprint (wide_char))
+                  if (wide && iswprint (wide_char))
                     {
-                      int width = wcwidth (wide_char);
-                      if (width > 0)
-                        linepos += width;
+                      /* wcwidth can be expensive on OSX for example,
+                         so avoid if uneeded.  */
+                      if (print_linelength)
+                        {
+                          int width = wcwidth (wide_char);
+                          if (width > 0)
+                            linepos += width;
+                        }
                       if (iswspace (wide_char))
                         goto mb_word_separator;
                       in_word = true;
                     }
+                  else if (!wide && isprint (to_uchar (*p)))
+                    {
+                      linepos++;
+                      if (isspace (to_uchar (*p)))
+                        goto mb_word_separator;
+                      in_word = true;
+                    }
                   break;
                 }
+
+              p += n;
+              bytes_read -= n;
+              chars++;
             }
           while (bytes_read > 0);