From: Paul Eggert Date: Sat, 23 Sep 2023 03:53:57 +0000 (-0700) Subject: wc: treat encoding errors as non white space X-Git-Tag: v9.5~146 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bee39b93f54c4df105ba8b528c9637d67d16d21f;p=thirdparty%2Fcoreutils.git wc: treat encoding errors as non white space * src/wc.c (wc): Treat encoding errors like non white space characters. --- diff --git a/NEWS b/NEWS index 0516b3c9c7..0d114fb87a 100644 --- a/NEWS +++ b/NEWS @@ -19,6 +19,9 @@ GNU coreutils NEWS -*- outline -*- ls --dired now implies long format output without hyperlinks enabled, and will take precedence over previously specified formats or hyperlink mode. + wc no longer ignores encoding errors when counting words. + Instead, it treats them as non white space. + ** New features tail now supports following multiple processes, with repeated --pid options. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index ca691c8174..4167660a7c 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3848,7 +3848,7 @@ Print only the byte counts. @opindex -m @opindex --chars Print only the character counts, as per the current locale. -Invalid characters are not counted. +Encoding errors are not counted. @item -w @itemx --words @@ -3856,6 +3856,8 @@ Invalid characters are not counted. @opindex --words Print only the word counts. A word is a nonempty sequence of non white space delimited by white space characters or by start or end of input. +The current locale determines which characters are white space. +GNU @command{wc} treats encoding errors as non white space. @item -l @itemx --lines diff --git a/src/wc.c b/src/wc.c index 4db3a770d2..673c3eb476 100644 --- a/src/wc.c +++ b/src/wc.c @@ -512,6 +512,17 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) bytes_read--; mbszero (&state); in_shift = false; + + /* Treat encoding errors as non white space. + POSIX says a word is "a non-zero-length string of + characters delimited by white space". This is + wrong in some sense, as the string can be delimited + by start or end of input, and it is unclear what it + means when the input contains encoding errors. + Since encoding errors are not white space, + treat them that way here. */ + words += !in_word; + in_word = true; continue; } if (mbsinit (&state)) @@ -564,16 +575,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) /* Count words by counting word starts, i.e., each white space character (or the start of input) - followed by non white space. - - POSIX says a word is "a non-zero-length string of - characters delimited by white space". This is certainly - wrong in some sense, as the string can be delimited - by start or end of input, and it is not clear - what it means when the input contains encoding errors. - Although GNU wc ignores encoding errors when determining - word boundaries, this behavior is not documented or - portable and should not be relied upon. */ + followed by non white space. */ words += !in_word; in_word = true; break;