From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sat, 23 Sep 2023 03:53:57 +0000 (-0700)
Subject: wc: treat encoding errors as non white space
X-Git-Tag: v9.5~146
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bee39b93f54c4df105ba8b528c9637d67d16d21f;p=thirdparty%2Fcoreutils.git

wc: treat encoding errors as non white space

* src/wc.c (wc): Treat encoding errors like non white space
characters.
* NEWS: Mention the change in behavior.
* doc/coreutils.texi (wc invocation): Document it.
---

diff --git a/NEWS b/NEWS
index 0516b3c9c7..0d114fb87a 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   ls --dired now implies long format output without hyperlinks enabled,
   and will take precedence over previously specified formats or hyperlink mode.
 
+  wc no longer ignores encoding errors when counting words.
+  Instead, it treats them as non white space.
+
 ** New features
 
   tail now supports following multiple processes, with repeated --pid options.
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index ca691c8174..4167660a7c 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3848,7 +3848,7 @@ Print only the byte counts.
 @opindex -m
 @opindex --chars
 Print only the character counts, as per the current locale.
-Invalid characters are not counted.
+Encoding errors are not counted.
 
 @item -w
 @itemx --words
@@ -3856,6 +3856,8 @@ Invalid characters are not counted.
 @opindex --words
 Print only the word counts.  A word is a nonempty sequence of non white
 space delimited by white space characters or by start or end of input.
+The current locale determines which characters are white space.
+GNU @command{wc} treats encoding errors as non white space.
 
 @item -l
 @itemx --lines
diff --git a/src/wc.c b/src/wc.c
index 4db3a770d2..673c3eb476 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -512,6 +512,17 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
                       bytes_read--;
                       mbszero (&state);
                       in_shift = false;
+
+                      /* Treat encoding errors as non white space.
+                         POSIX says a word is "a non-zero-length string of
+                         characters delimited by white space".  This is
+                         incomplete, as a word can also be delimited by
+                         start or end of input, and it is unclear what it
+                         means when the input contains encoding errors.
+                         Since an encoding error is not white space,
+                         count it as part of a word here.  */
+                      words += !in_word;
+                      in_word = true;
                       continue;
                     }
                   if (mbsinit (&state))
@@ -564,16 +575,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
 
                   /* Count words by counting word starts, i.e., each
                      white space character (or the start of input)
-                     followed by non white space.
-
-                     POSIX says a word is "a non-zero-length string of
-                     characters delimited by white space".  This is certainly
-                     wrong in some sense, as the string can be delimited
-                     by start or end of input, and it is not clear
-                     what it means when the input contains encoding errors.
-                     Although GNU wc ignores encoding errors when determining
-                     word boundaries, this behavior is not documented or
-                     portable and should not be relied upon.  */
+                     followed by non white space.  */
                   words += !in_word;
                   in_word = true;
                   break;