From: Paul Eggert <eggert@cs.ucla.edu>
Date: Fri, 22 Sep 2023 18:13:51 +0000 (-0700)
Subject: wc: fix word count bug
X-Git-Tag: v9.5~147
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=31076e86898ab5b6e7975ca905b9a9b15122ecc6;p=thirdparty%2Fcoreutils.git

wc: fix word count bug

* bootstrap.conf (gnulib_modules): Remove c32isprint.
* src/wc.c (wc): Consider all non-white-space characters
to be word constituents, even if they are not printable.
POSIX requires this, and it is what BSD does.
Partly do this by simplifying the check for a word,
by counting word starts rather than word ends.
* tests/wc/wc.pl: Test for the bug.
---

diff --git a/NEWS b/NEWS
index 81899eacf4..0516b3c9c7 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   numfmt options like --suffix no longer have an arbitrary 127-byte limit.
   [bug introduced with numfmt in coreutils-8.21]
 
+  wc no longer fails to count unprintable characters as parts of words.
+  [bug introduced in textutils-2.1]
+
 ** Changes in behavior
 
   ls --dired now implies long format output without hyperlinks enabled,
diff --git a/bootstrap.conf b/bootstrap.conf
index fdb059ccea..db0c90c670 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -50,7 +50,6 @@ gnulib_modules="
   byteswap
   c-strcase
   c32iscntrl
-  c32isprint
   c32isspace
   c32width
   canon-host
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 7abfbe3c07..ca691c8174 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3792,8 +3792,9 @@ contents of files.
 
 @command{wc} counts the number of bytes, characters, words, and newlines
 in each given @var{file}, or standard input if none are given
-or for a @var{file} of @samp{-}.  A word is a nonzero length
-sequence of printable characters delimited by white space.  Synopsis:
+or for a @var{file} of @samp{-}.  A word is a nonempty sequence of
+non-white-space characters, delimited by white space or by the start or
+end of input.  Synopsis:
 
 @example
 wc [@var{option}]@dots{} [@var{file}]@dots{}
@@ -3853,8 +3854,8 @@ Invalid characters are not counted.
 @itemx --words
 @opindex -w
 @opindex --words
-Print only the word counts.  A word is a nonzero length
-sequence of printable characters separated by white space.
+Print only the word counts.  A word is a nonempty sequence of non-white-space
+characters, delimited by white space or by the start or end of input.
 
 @item -l
 @itemx --lines
diff --git a/src/wc.c b/src/wc.c
index 341ff9c7dc..4db3a770d2 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -167,8 +167,8 @@ Usage: %s [OPTION]... [FILE]...\n\
               program_name, program_name);
       fputs (_("\
 Print newline, word, and byte counts for each FILE, and a total line if\n\
-more than one FILE is specified.  A word is a non-zero-length sequence of\n\
-printable characters delimited by white space.\n\
+more than one FILE is specified.  A word is a nonempty sequence of non white\n\
+space delimited by white space characters or by start or end of input.\n\
 "), stdout);
 
       emit_stdin_note ();
@@ -479,15 +479,14 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
             {
               char32_t wide_char;
               size_t n;
-              bool wide = true;
+              bool single_byte_ascii = !in_shift && 0 <= *p && *p < 0x80;
 
-              if (!in_shift && 0 <= *p && *p < 0x80)
+              if (single_byte_ascii)
                 {
                   /* Handle most ASCII characters quickly, without calling
                      mbrtowc().  */
                   n = 1;
                   wide_char = *p;
-                  wide = false;
                 }
               else
                 {
@@ -543,31 +542,40 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                   FALLTHROUGH;
                 case '\v':
                 mb_word_separator:
-                  words += in_word;
                   in_word = false;
                   break;
                 default:
-                  if (wide && c32isprint (wide_char))
+                  /* c32width can be expensive on macOS for example,
+                     so avoid if not needed.  */
+                  if (print_linelength)
                     {
-                      /* c32width can be expensive on OSX for example,
-                         so avoid if not needed.  */
-                      if (print_linelength)
+                      if (single_byte_ascii)
+                        linepos += !!isprint (wide_char);
+                      else
                         {
                           int width = c32width (wide_char);
                           if (width > 0)
                             linepos += width;
                         }
-                      if (c32isspace (wide_char) || iswnbspace (wide_char))
-                        goto mb_word_separator;
-                      in_word = true;
-                    }
-                  else if (!wide && isprint (to_uchar (*p)))
-                    {
-                      linepos++;
-                      if (isspace (to_uchar (*p)))
-                        goto mb_word_separator;
-                      in_word = true;
                     }
+                  if (single_byte_ascii ? isspace (wide_char)
+                      : c32isspace (wide_char) || iswnbspace (wide_char))
+                    goto mb_word_separator;
+
+                  /* Count words by counting word starts, i.e., each
+                     white space character (or the start of input)
+                     followed by non white space.
+
+                     POSIX says a word is "a non-zero-length string of
+                     characters delimited by white space".  This is certainly
+                     wrong in some sense, as the string can be delimited
+                     by start or end of input, and it is not clear
+                     what it means when the input contains encoding errors.
+                     Although GNU wc ignores encoding errors when determining
+                     word boundaries, this behavior is not documented or
+                     portable and should not be relied upon.  */
+                  words += !in_word;
+                  in_word = true;
                   break;
                 }
 
@@ -593,7 +601,6 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
         }
       if (linepos > linelength)
         linelength = linepos;
-      words += in_word;
     }
   else
     {
@@ -613,7 +620,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
           bytes += bytes_read;
           do
             {
-              switch (*p++)
+              unsigned char c = *p++;
+              switch (c)
                 {
                 case '\n':
                   lines++;
@@ -632,18 +640,14 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                   FALLTHROUGH;
                 case '\v':
                 word_separator:
-                  words += in_word;
                   in_word = false;
                   break;
                 default:
-                  if (isprint (to_uchar (p[-1])))
-                    {
-                      linepos++;
-                      if (isspace (to_uchar (p[-1]))
-                          || isnbspace (to_uchar (p[-1])))
-                        goto word_separator;
-                      in_word = true;
-                    }
+                  linepos += !!isprint (c);
+                  if (isspace (c) || isnbspace (c))
+                    goto word_separator;
+                  words += !in_word;
+                  in_word = true;
                   break;
                 }
             }
@@ -651,7 +655,6 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
         }
       if (linepos > linelength)
         linelength = linepos;
-      words += in_word;
     }
 
   if (count_chars < print_chars)
diff --git a/tests/wc/wc.pl b/tests/wc/wc.pl
index ed2bc43e4b..8a8a334c89 100755
--- a/tests/wc/wc.pl
+++ b/tests/wc/wc.pl
@@ -41,6 +41,7 @@ my @Tests =
      ['c0', '-L',  {IN_PIPE=>"1\n12\n"},     {OUT=>"2\n"}],
      ['c1', '-L',  {IN_PIPE=>"1\n123\n1\n"}, {OUT=>"3\n"}],
      ['c2', '-L',  {IN_PIPE=>"\n123456"},    {OUT=>"6\n"}],
+     ['d1', '-w',  {IN_PIPE=>"\1\n"},        {OUT=>"1\n"}],
     );
 
 my $save_temps = $ENV{DEBUG};