wc: optimize processing of ASCII in multibyte locales

author Pádraig Brady <P@draigBrady.com>

Fri, 18 May 2018 04:41:46 +0000 (21:41 -0700)

committer Pádraig Brady <P@draigBrady.com>

Thu, 21 Jun 2018 04:00:39 +0000 (21:00 -0700)
author Pádraig Brady <P@draigBrady.com>
Fri, 18 May 2018 04:41:46 +0000 (21:41 -0700)
committer Pádraig Brady <P@draigBrady.com>
Thu, 21 Jun 2018 04:00:39 +0000 (21:00 -0700)
diff --git a/NEWS b/NEWS

index 101afc08099e406ee546d8b0a1db37654c466f42..2020ab6e3708a26ac72dd55c32f4e690a7d5f333 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -55,6 +55,9 @@ GNU coreutils NEWS                                    -*- outline -*-
    version of XFS.  stat -f --format=%T now reports the file system type,
    and tail -f uses inotify.
  
+  wc avoids redundant processing of ASCII text in multibyte locales,
+  which is especially significant on macOS.
+
  
  * Noteworthy changes in release 8.29 (2017-12-27) [stable]
  
diff --git a/src/wc.c b/src/wc.c

index 0c72042a0b28092ad4660671a7075ba299996dc5..2034c42beebf58c7b6e593316014c7594235dc47 100644 (file)
--- a/src/wc.c
+++ b/src/wc.c
@@ -379,6 +379,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
              {
                wchar_t wide_char;
                size_t n;
+              bool wide = true;
  
                if (!in_shift && is_basic (*p))
                  {
@@ -386,6 +387,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                       mbrtowc().  */
                    n = 1;
                    wide_char = *p;
+                  wide = false;
                  }
                else
                  {
@@ -419,9 +421,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                        n = 1;
                      }
                  }
-              p += n;
-              bytes_read -= n;
-              chars++;
+
                switch (wide_char)
                  {
                  case '\n':
@@ -445,17 +445,33 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                    in_word = false;
                    break;
                  default:
-                  if (iswprint (wide_char))
+                  if (wide && iswprint (wide_char))
                      {
-                      int width = wcwidth (wide_char);
-                      if (width > 0)
-                        linepos += width;
+                      /* wcwidth can be expensive on OSX for example,
+                         so avoid if unneeded.  */
+                      if (print_linelength)
+                        {
+                          int width = wcwidth (wide_char);
+                          if (width > 0)
+                            linepos += width;
+                        }
                        if (iswspace (wide_char))
                          goto mb_word_separator;
                        in_word = true;
                      }
+                  else if (!wide && isprint (to_uchar (*p)))
+                    {
+                      linepos++;
+                      if (isspace (to_uchar (*p)))
+                        goto mb_word_separator;
+                      in_word = true;
+                    }
                    break;
                  }
+
+              p += n;
+              bytes_read -= n;
+              chars++;
              }
            while (bytes_read > 0);
author	Pádraig Brady <P@draigBrady.com>
	Fri, 18 May 2018 04:41:46 +0000 (21:41 -0700)
committer	Pádraig Brady <P@draigBrady.com>
	Thu, 21 Jun 2018 04:00:39 +0000 (21:00 -0700)
NEWS		patch \| blob \| blame \| history
src/wc.c		patch \| blob \| blame \| history