From: Paul Eggert Date: Fri, 22 Sep 2023 18:13:51 +0000 (-0700) Subject: wc: fix word count bug X-Git-Tag: v9.5~147 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=31076e86898ab5b6e7975ca905b9a9b15122ecc6;p=thirdparty%2Fcoreutils.git wc: fix word count bug * bootstrap.conf (gnulib_modules): Remove c32isprint. * src/wc.c (wc): Consider all non-white-space characters to be word constituents, even if they are not printable. POSIX requires this, and it is what BSD does. Partly do this by simplifying the check for a word, by counting word starts rather than word ends. * tests/wc/wc.pl: Test for the bug. --- diff --git a/NEWS b/NEWS index 81899eacf4..0516b3c9c7 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,9 @@ GNU coreutils NEWS -*- outline -*- numfmt options like --suffix no longer have an arbitrary 127-byte limit. [bug introduced with numfmt in coreutils-8.21] + wc no longer fails to count unprintable characters as parts of words. + [bug introduced in textutils-2.1] + ** Changes in behavior ls --dired now implies long format output without hyperlinks enabled, diff --git a/bootstrap.conf b/bootstrap.conf index fdb059ccea..db0c90c670 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -50,7 +50,6 @@ gnulib_modules=" byteswap c-strcase c32iscntrl - c32isprint c32isspace c32width canon-host diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 7abfbe3c07..ca691c8174 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -3792,8 +3792,9 @@ contents of files. @command{wc} counts the number of bytes, characters, words, and newlines in each given @var{file}, or standard input if none are given -or for a @var{file} of @samp{-}. A word is a nonzero length -sequence of printable characters delimited by white space. Synopsis: +or for a @var{file} of @samp{-}. A word is a nonempty sequence of non white +space delimited by white space characters or by start or end of input. +Synopsis: @example wc [@var{option}]@dots{} [@var{file}]@dots{} @@ -3853,8 +3854,8 @@ Invalid characters are not counted. @itemx --words @opindex -w @opindex --words -Print only the word counts. A word is a nonzero length -sequence of printable characters separated by white space. +Print only the word counts. A word is a nonempty sequence of non white +space delimited by white space characters or by start or end of input. @item -l @itemx --lines diff --git a/src/wc.c b/src/wc.c index 341ff9c7dc..4db3a770d2 100644 --- a/src/wc.c +++ b/src/wc.c @@ -167,8 +167,8 @@ Usage: %s [OPTION]... [FILE]...\n\ program_name, program_name); fputs (_("\ Print newline, word, and byte counts for each FILE, and a total line if\n\ -more than one FILE is specified. A word is a non-zero-length sequence of\n\ -printable characters delimited by white space.\n\ +more than one FILE is specified. A word is a nonempty sequence of non white\n\ +space delimited by white space characters or by start or end of input.\n\ "), stdout); emit_stdin_note (); @@ -479,15 +479,14 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) { char32_t wide_char; size_t n; - bool wide = true; + bool single_byte_ascii = !in_shift && 0 <= *p && *p < 0x80; - if (!in_shift && 0 <= *p && *p < 0x80) + if (single_byte_ascii) { /* Handle most ASCII characters quickly, without calling mbrtowc(). */ n = 1; wide_char = *p; - wide = false; } else { @@ -543,31 +542,40 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) FALLTHROUGH; case '\v': mb_word_separator: - words += in_word; in_word = false; break; default: - if (wide && c32isprint (wide_char)) + /* c32width can be expensive on macOS for example, + so avoid if not needed. */ + if (print_linelength) { - /* c32width can be expensive on OSX for example, - so avoid if not needed. */ - if (print_linelength) + if (single_byte_ascii) + linepos += !!isprint (wide_char); + else { int width = c32width (wide_char); if (width > 0) linepos += width; } - if (c32isspace (wide_char) || iswnbspace (wide_char)) - goto mb_word_separator; - in_word = true; - } - else if (!wide && isprint (to_uchar (*p))) - { - linepos++; - if (isspace (to_uchar (*p))) - goto mb_word_separator; - in_word = true; } + if (single_byte_ascii ? isspace (wide_char) + : c32isspace (wide_char) || iswnbspace (wide_char)) + goto mb_word_separator; + + /* Count words by counting word starts, i.e., each + white space character (or the start of input) + followed by non white space. + + POSIX says a word is "a non-zero-length string of + characters delimited by white space". This is certainly + wrong in some sense, as the string can be delimited + by start or end of input, and it is not clear + what it means when the input contains encoding errors. + Although GNU wc ignores encoding errors when determining + word boundaries, this behavior is not documented or + portable and should not be relied upon. */ + words += !in_word; + in_word = true; break; } @@ -593,7 +601,6 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) } if (linepos > linelength) linelength = linepos; - words += in_word; } else { @@ -613,7 +620,8 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) bytes += bytes_read; do { - switch (*p++) + unsigned char c = *p++; + switch (c) { case '\n': lines++; @@ -632,18 +640,14 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) FALLTHROUGH; case '\v': word_separator: - words += in_word; in_word = false; break; default: - if (isprint (to_uchar (p[-1]))) - { - linepos++; - if (isspace (to_uchar (p[-1])) - || isnbspace (to_uchar (p[-1]))) - goto word_separator; - in_word = true; - } + linepos += !!isprint (c); + if (isspace (c) || isnbspace (c)) + goto word_separator; + words += !in_word; + in_word = true; break; } } @@ -651,7 +655,6 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) } if (linepos > linelength) linelength = linepos; - words += in_word; } if (count_chars < print_chars) diff --git a/tests/wc/wc.pl b/tests/wc/wc.pl index ed2bc43e4b..8a8a334c89 100755 --- a/tests/wc/wc.pl +++ b/tests/wc/wc.pl @@ -41,6 +41,7 @@ my @Tests = ['c0', '-L', {IN_PIPE=>"1\n12\n"}, {OUT=>"2\n"}], ['c1', '-L', {IN_PIPE=>"1\n123\n1\n"}, {OUT=>"3\n"}], ['c2', '-L', {IN_PIPE=>"\n123456"}, {OUT=>"6\n"}], + ['d1', '-w', {IN_PIPE=>"\1\n"}, {OUT=>"1\n"}], ); my $save_temps = $ENV{DEBUG};