wc: fix word count bug

author Paul Eggert <eggert@cs.ucla.edu>

Fri, 22 Sep 2023 18:13:51 +0000 (11:13 -0700)

committer Paul Eggert <eggert@cs.ucla.edu>

Sat, 23 Sep 2023 07:28:27 +0000 (00:28 -0700)
author Paul Eggert <eggert@cs.ucla.edu>
Fri, 22 Sep 2023 18:13:51 +0000 (11:13 -0700)
committer Paul Eggert <eggert@cs.ucla.edu>
Sat, 23 Sep 2023 07:28:27 +0000 (00:28 -0700)
diff --git a/NEWS b/NEWS

index 81899eacf418ee45fc818a9b1f0dc2da93a94005..0516b3c9c7e2a8820950770ced2cd71c8e606f5a 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,9 @@ GNU coreutils NEWS                                    -*- outline -*-
    numfmt options like --suffix no longer have an arbitrary 127-byte limit.
    [bug introduced with numfmt in coreutils-8.21]
  
+  wc no longer fails to count unprintable characters as parts of words.
+  [bug introduced in textutils-2.1]
+
  ** Changes in behavior
  
    ls --dired now implies long format output without hyperlinks enabled,
diff --git a/bootstrap.conf b/bootstrap.conf

index fdb059ccea98b8d5735da24190adeafecf6c8bf8..db0c90c6700104383d83c851b90ef1358207e4c0 100644 (file)
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -50,7 +50,6 @@ gnulib_modules="
    byteswap
    c-strcase
    c32iscntrl
-  c32isprint
    c32isspace
    c32width
    canon-host
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index 7abfbe3c07aea752b905df7457e813876d2f6f38..ca691c8174a9ed6bf9d08c686d7c23b129aef290 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3792,8 +3792,9 @@ contents of files.
  
  @command{wc} counts the number of bytes, characters, words, and newlines
  in each given @var{file}, or standard input if none are given
-or for a @var{file} of @samp{-}.  A word is a nonzero length
-sequence of printable characters delimited by white space.  Synopsis:
+or for a @var{file} of @samp{-}.  A word is a nonempty sequence of non white
+space delimited by white space characters or by start or end of input.
+Synopsis:
  
  @example
  wc [@var{option}]@dots{} [@var{file}]@dots{}
@@ -3853,8 +3854,8 @@ Invalid characters are not counted.
  @itemx --words
  @opindex -w
  @opindex --words
-Print only the word counts.  A word is a nonzero length
-sequence of printable characters separated by white space.
+Print only the word counts.  A word is a nonempty sequence of non white
+space delimited by white space characters or by start or end of input.
  
  @item -l
  @itemx --lines
diff --git a/src/wc.c b/src/wc.c

index 341ff9c7dce90cddf799fd5acc874f22e80aa3c7..4db3a770d23f7da79bc64eb51392b61695149030 100644 (file)
--- a/src/wc.c
+++ b/src/wc.c
@@ -167,8 +167,8 @@ Usage: %s [OPTION]... [FILE]...\n\
                program_name, program_name);
        fputs (_("\
  Print newline, word, and byte counts for each FILE, and a total line if\n\
-more than one FILE is specified.  A word is a non-zero-length sequence of\n\
-printable characters delimited by white space.\n\
+more than one FILE is specified.  A word is a nonempty sequence of non white\n\
+space delimited by white space characters or by start or end of input.\n\
  "), stdout);
  
        emit_stdin_note ();
@@ -479,15 +479,14 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
              {
                char32_t wide_char;
                size_t n;
-              bool wide = true;
+              bool single_byte_ascii = !in_shift && 0 <= *p && *p < 0x80;
  
-              if (!in_shift && 0 <= *p && *p < 0x80)
+              if (single_byte_ascii)
                  {
                    /* Handle most ASCII characters quickly, without calling
                       mbrtowc().  */
                    n = 1;
                    wide_char = *p;
-                  wide = false;
                  }
                else
                  {
@@ -543,31 +542,40 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                    FALLTHROUGH;
                  case '\v':
                  mb_word_separator:
-                  words += in_word;
                    in_word = false;
                    break;
                  default:
-                  if (wide && c32isprint (wide_char))
+                  /* c32width can be expensive on macOS for example,
+                     so avoid if not needed.  */
+                  if (print_linelength)
                      {
-                      /* c32width can be expensive on OSX for example,
-                         so avoid if not needed.  */
-                      if (print_linelength)
+                      if (single_byte_ascii)
+                        linepos += !!isprint (wide_char);
+                      else
                          {
                            int width = c32width (wide_char);
                            if (width > 0)
                              linepos += width;
                          }
-                      if (c32isspace (wide_char) || iswnbspace (wide_char))
-                        goto mb_word_separator;
-                      in_word = true;
-                    }
-                  else if (!wide && isprint (to_uchar (*p)))
-                    {
-                      linepos++;
-                      if (isspace (to_uchar (*p)))
-                        goto mb_word_separator;
-                      in_word = true;
                      }
+                  if (single_byte_ascii ? isspace (wide_char)
+                      : c32isspace (wide_char) || iswnbspace (wide_char))
+                    goto mb_word_separator;
+
+                  /* Count words by counting word starts, i.e., each
+                     white space character (or the start of input)
+                     followed by non white space.
+
+                     POSIX says a word is "a non-zero-length string of
+                     characters delimited by white space".  This is certainly
+                     wrong in some sense, as the string can be delimited
+                     by start or end of input, and it is not clear
+                     what it means when the input contains encoding errors.
+                     Although GNU wc ignores encoding errors when determining
+                     word boundaries, this behavior is not documented or
+                     portable and should not be relied upon.  */
+                  words += !in_word;
+                  in_word = true;
                    break;
                  }
  
@@ -593,7 +601,6 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
          }
        if (linepos > linelength)
          linelength = linepos;
-      words += in_word;
      }
    else
      {
@@ -613,7 +620,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
            bytes += bytes_read;
            do
              {
-              switch (*p++)
+              unsigned char c = *p++;
+              switch (c)
                  {
                  case '\n':
                    lines++;
@@ -632,18 +640,14 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                    FALLTHROUGH;
                  case '\v':
                  word_separator:
-                  words += in_word;
                    in_word = false;
                    break;
                  default:
-                  if (isprint (to_uchar (p[-1])))
-                    {
-                      linepos++;
-                      if (isspace (to_uchar (p[-1]))
-                          || isnbspace (to_uchar (p[-1])))
-                        goto word_separator;
-                      in_word = true;
-                    }
+                  linepos += !!isprint (c);
+                  if (isspace (c) || isnbspace (c))
+                    goto word_separator;
+                  words += !in_word;
+                  in_word = true;
                    break;
                  }
              }
@@ -651,7 +655,6 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
          }
        if (linepos > linelength)
          linelength = linepos;
-      words += in_word;
      }
  
    if (count_chars < print_chars)
diff --git a/tests/wc/wc.pl b/tests/wc/wc.pl

index ed2bc43e4ba233a30589fda0b823c888a1a840b2..8a8a334c89ac0600faa5474f5a2984e2fcb2e44d 100755 (executable)
--- a/tests/wc/wc.pl
+++ b/tests/wc/wc.pl
@@ -41,6 +41,7 @@ my @Tests =
       ['c0', '-L',  {IN_PIPE=>"1\n12\n"},     {OUT=>"2\n"}],
       ['c1', '-L',  {IN_PIPE=>"1\n123\n1\n"}, {OUT=>"3\n"}],
       ['c2', '-L',  {IN_PIPE=>"\n123456"},    {OUT=>"6\n"}],
+     ['d1', '-w',  {IN_PIPE=>"\1\n"},        {OUT=>"1\n"}],
      );
  
  my $save_temps = $ENV{DEBUG};
author	Paul Eggert <eggert@cs.ucla.edu>
	Fri, 22 Sep 2023 18:13:51 +0000 (11:13 -0700)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Sat, 23 Sep 2023 07:28:27 +0000 (00:28 -0700)
NEWS		patch | blob | blame | history
bootstrap.conf		patch | blob | blame | history
doc/coreutils.texi		patch | blob | blame | history
src/wc.c		patch | blob | blame | history
tests/wc/wc.pl		patch | blob | blame | history