wc: treat encoding errors as non white space

author Paul Eggert <eggert@cs.ucla.edu>

Sat, 23 Sep 2023 03:53:57 +0000 (20:53 -0700)

committer Paul Eggert <eggert@cs.ucla.edu>

Sat, 23 Sep 2023 07:28:27 +0000 (00:28 -0700)
author Paul Eggert <eggert@cs.ucla.edu>
Sat, 23 Sep 2023 03:53:57 +0000 (20:53 -0700)
committer Paul Eggert <eggert@cs.ucla.edu>
Sat, 23 Sep 2023 07:28:27 +0000 (00:28 -0700)
diff --git a/NEWS b/NEWS

index 0516b3c9c7e2a8820950770ced2cd71c8e606f5a..0d114fb87a90ac68bc9d472e78dddaae2e6d2e61 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,9 @@ GNU coreutils NEWS                                    -*- outline -*-
    ls --dired now implies long format output without hyperlinks enabled,
    and will take precedence over previously specified formats or hyperlink mode.
  
+  wc no longer ignores encoding errors when counting words.
+  Instead, it treats them as non white space.
+
  ** New features
  
    tail now supports following multiple processes, with repeated --pid options.
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index ca691c8174a9ed6bf9d08c686d7c23b129aef290..4167660a7c27737e5db6ab7d14174bd49b8fe455 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3848,7 +3848,7 @@ Print only the byte counts.
  @opindex -m
  @opindex --chars
  Print only the character counts, as per the current locale.
-Invalid characters are not counted.
+Encoding errors are not counted.
  
  @item -w
  @itemx --words
@@ -3856,6 +3856,8 @@ Invalid characters are not counted.
  @opindex --words
  Print only the word counts.  A word is a nonempty sequence of non white
  space delimited by white space characters or by start or end of input.
+The current locale determines which characters are white space.
+GNU @command{wc} treats encoding errors as non white space.
  
  @item -l
  @itemx --lines
diff --git a/src/wc.c b/src/wc.c

index 4db3a770d23f7da79bc64eb51392b61695149030..673c3eb47615bced94752f6cf70165523794491f 100644 (file)
--- a/src/wc.c
+++ b/src/wc.c
@@ -512,6 +512,17 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                        bytes_read--;
                        mbszero (&state);
                        in_shift = false;
+
+                      /* Treat encoding errors as non white space.
+                         POSIX says a word is "a non-zero-length string of
+                         characters delimited by white space".  This is
+                         wrong in some sense, as the string can be delimited
+                         by start or end of input, and it is unclear what it
+                         means when the input contains encoding errors.
+                         Since encoding errors are not white space,
+                         treat them that way here.  */
+                      words += !in_word;
+                      in_word = true;
                        continue;
                      }
                    if (mbsinit (&state))
@@ -564,16 +575,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
  
                    /* Count words by counting word starts, i.e., each
                       white space character (or the start of input)
-                     followed by non white space.
-
-                     POSIX says a word is "a non-zero-length string of
-                     characters delimited by white space".  This is certainly
-                     wrong in some sense, as the string can be delimited
-                     by start or end of input, and it is not clear
-                     what it means when the input contains encoding errors.
-                     Although GNU wc ignores encoding errors when determining
-                     word boundaries, this behavior is not documented or
-                     portable and should not be relied upon.  */
+                     followed by non white space.  */
                    words += !in_word;
                    in_word = true;
                    break;
author	Paul Eggert <eggert@cs.ucla.edu>
	Sat, 23 Sep 2023 03:53:57 +0000 (20:53 -0700)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Sat, 23 Sep 2023 07:28:27 +0000 (00:28 -0700)
NEWS		patch | blob | blame | history
doc/coreutils.texi		patch | blob | blame | history
src/wc.c		patch | blob | blame | history