git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
Speed up "wc -m" and "wc -w" in multibyte case.
author: Bruno Haible <bruno@clisp.org>
Thu, 8 May 2008 21:15:36 +0000 (23:15 +0200)
committer: Jim Meyering <meyering@redhat.com>
Thu, 8 May 2008 21:21:04 +0000 (23:21 +0200)
* src/wc.c: Include mbchar.h.
(wc): New variable in_shift. Use it to avoid calling mbrtowc for most
ASCII characters.  Reported via Jan Engelhardt in
http://bugzilla.novell.com/381873 with discussion here
http://thread.gmane.org/gmane.comp.gnu.coreutils.bugs/13520

src/wc.c

index 61ab485b4975ef2917691795ca1ccf29c7cd88fc..1945504bc81b3cdba1769adce23c1573caed7077 100644 (file)
--- a/src/wc.c
+++ b/src/wc.c
@@ -1,5 +1,5 @@
 /* wc - print the number of lines, words, and bytes in files
-   Copyright (C) 85, 91, 1995-2007 Free Software Foundation, Inc.
+   Copyright (C) 85, 91, 1995-2008 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -28,6 +28,7 @@
 #include "system.h"
 #include "error.h"
 #include "inttostr.h"
+#include "mbchar.h"
 #include "quote.h"
 #include "readtokens0.h"
 #include "safe-read.h"
@@ -274,6 +275,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus)
       bool in_word = false;
       uintmax_t linepos = 0;
       mbstate_t state = { 0, };
+      bool in_shift = false;
 # if SUPPORT_OLD_MBRTOWC
       /* Back-up the state before each multibyte character conversion and
         move the last incomplete character of the buffer to the front
@@ -308,70 +310,81 @@ wc (int fd, char const *file_x, struct fstatus *fstatus)
              wchar_t wide_char;
              size_t n;
 
-# if SUPPORT_OLD_MBRTOWC
-             backup_state = state;
-# endif
-             n = mbrtowc (&wide_char, p, bytes_read, &state);
-             if (n == (size_t) -2)
+             if (!in_shift && is_basic (*p))
                {
-# if SUPPORT_OLD_MBRTOWC
-                 state = backup_state;
-# endif
-                 break;
-               }
-             if (n == (size_t) -1)
-               {
-                 /* Remember that we read a byte, but don't complain
-                    about the error.  Because of the decoding error,
-                    this is a considered to be byte but not a
-                    character (that is, chars is not incremented).  */
-                 p++;
-                 bytes_read--;
+                 /* Handle most ASCII characters quickly, without calling
+                    mbrtowc().  */
+                 n = 1;
+                 wide_char = *p;
                }
              else
                {
+                 in_shift = true;
+# if SUPPORT_OLD_MBRTOWC
+                 backup_state = state;
+# endif
+                 n = mbrtowc (&wide_char, p, bytes_read, &state);
+                 if (n == (size_t) -2)
+                   {
+# if SUPPORT_OLD_MBRTOWC
+                     state = backup_state;
+# endif
+                     break;
+                   }
+                 if (n == (size_t) -1)
+                   {
+                     /* Remember that we read a byte, but don't complain
+                        about the error.  Because of the decoding error,
+                        this is a considered to be byte but not a
+                        character (that is, chars is not incremented).  */
+                     p++;
+                     bytes_read--;
+                     continue;
+                   }
+                 if (mbsinit (&state))
+                   in_shift = false;
                  if (n == 0)
                    {
                      wide_char = 0;
                      n = 1;
                    }
-                 p += n;
-                 bytes_read -= n;
-                 chars++;
-                 switch (wide_char)
+               }
+             p += n;
+             bytes_read -= n;
+             chars++;
+             switch (wide_char)
+               {
+               case '\n':
+                 lines++;
+                 /* Fall through. */
+               case '\r':
+               case '\f':
+                 if (linepos > linelength)
+                   linelength = linepos;
+                 linepos = 0;
+                 goto mb_word_separator;
+               case '\t':
+                 linepos += 8 - (linepos % 8);
+                 goto mb_word_separator;
+               case ' ':
+                 linepos++;
+                 /* Fall through. */
+               case '\v':
+               mb_word_separator:
+                 words += in_word;
+                 in_word = false;
+                 break;
+               default:
+                 if (iswprint (wide_char))
                    {
-                   case '\n':
-                     lines++;
-                     /* Fall through. */
-                   case '\r':
-                   case '\f':
-                     if (linepos > linelength)
-                       linelength = linepos;
-                     linepos = 0;
-                     goto mb_word_separator;
-                   case '\t':
-                     linepos += 8 - (linepos % 8);
-                     goto mb_word_separator;
-                   case ' ':
-                     linepos++;
-                     /* Fall through. */
-                   case '\v':
-                   mb_word_separator:
-                     words += in_word;
-                     in_word = false;
-                     break;
-                   default:
-                     if (iswprint (wide_char))
-                       {
-                         int width = wcwidth (wide_char);
-                         if (width > 0)
-                           linepos += width;
-                         if (iswspace (wide_char))
-                           goto mb_word_separator;
-                         in_word = true;
-                       }
-                     break;
+                     int width = wcwidth (wide_char);
+                     if (width > 0)
+                       linepos += width;
+                     if (iswspace (wide_char))
+                       goto mb_word_separator;
+                     in_word = true;
                    }
+                 break;
                }
            }
          while (bytes_read > 0);