@command{wc} counts the number of bytes, characters, words, and newlines
in each given @var{file}, or standard input if none are given
-or for a @var{file} of @samp{-}. A word is a nonzero length
-sequence of printable characters delimited by white space. Synopsis:
+or for a @var{file} of @samp{-}. A word is a nonempty sequence of
+non-white-space characters delimited by white space or by the start or end
+of input. Synopsis:
@example
wc [@var{option}]@dots{} [@var{file}]@dots{}
@itemx --words
@opindex -w
@opindex --words
-Print only the word counts. A word is a nonzero length
-sequence of printable characters separated by white space.
+Print only the word counts. A word is a nonempty sequence of non-white-space
+characters delimited by white space or by the start or end of input.
@item -l
@itemx --lines
program_name, program_name);
fputs (_("\
Print newline, word, and byte counts for each FILE, and a total line if\n\
-more than one FILE is specified. A word is a non-zero-length sequence of\n\
-printable characters delimited by white space.\n\
+more than one FILE is specified. A word is a nonempty sequence of non white\n\
+space delimited by white space characters or by start or end of input.\n\
"), stdout);
emit_stdin_note ();
{
char32_t wide_char;
size_t n;
- bool wide = true;
+ bool single_byte_ascii = !in_shift && 0 <= *p && *p < 0x80;
- if (!in_shift && 0 <= *p && *p < 0x80)
+ if (single_byte_ascii)
{
/* Handle most ASCII characters quickly, without calling
mbrtowc(). */
n = 1;
wide_char = *p;
- wide = false;
}
else
{
FALLTHROUGH;
case '\v':
mb_word_separator:
- words += in_word;
in_word = false;
break;
default:
- if (wide && c32isprint (wide_char))
+ /* c32width can be expensive on macOS for example,
+ so avoid if not needed. */
+ if (print_linelength)
{
- /* c32width can be expensive on OSX for example,
- so avoid if not needed. */
- if (print_linelength)
+ if (single_byte_ascii)
+ linepos += !!isprint (wide_char);
+ else
{
int width = c32width (wide_char);
if (width > 0)
linepos += width;
}
- if (c32isspace (wide_char) || iswnbspace (wide_char))
- goto mb_word_separator;
- in_word = true;
- }
- else if (!wide && isprint (to_uchar (*p)))
- {
- linepos++;
- if (isspace (to_uchar (*p)))
- goto mb_word_separator;
- in_word = true;
}
+ if (single_byte_ascii ? isspace (wide_char)
+ : c32isspace (wide_char) || iswnbspace (wide_char))
+ goto mb_word_separator;
+
+ /* Count words by counting word starts, i.e., each
+ white space character (or the start of input)
+ followed by non white space.
+
+ POSIX says a word is "a non-zero-length string of
+ characters delimited by white space". This is certainly
+ wrong in some sense, as the string can be delimited
+ by start or end of input, and it is not clear
+ what it means when the input contains encoding errors.
+ Although GNU wc ignores encoding errors when determining
+ word boundaries, this behavior is not documented or
+ portable and should not be relied upon. */
+ words += !in_word;
+ in_word = true;
break;
}
}
if (linepos > linelength)
linelength = linepos;
- words += in_word;
}
else
{
bytes += bytes_read;
do
{
- switch (*p++)
+ unsigned char c = *p++;
+ switch (c)
{
case '\n':
lines++;
FALLTHROUGH;
case '\v':
word_separator:
- words += in_word;
in_word = false;
break;
default:
- if (isprint (to_uchar (p[-1])))
- {
- linepos++;
- if (isspace (to_uchar (p[-1]))
- || isnbspace (to_uchar (p[-1])))
- goto word_separator;
- in_word = true;
- }
+ linepos += !!isprint (c);
+ if (isspace (c) || isnbspace (c))
+ goto word_separator;
+ words += !in_word;
+ in_word = true;
break;
}
}
}
if (linepos > linelength)
linelength = linepos;
- words += in_word;
}
if (count_chars < print_chars)
['c0', '-L', {IN_PIPE=>"1\n12\n"}, {OUT=>"2\n"}],
['c1', '-L', {IN_PIPE=>"1\n123\n1\n"}, {OUT=>"3\n"}],
['c2', '-L', {IN_PIPE=>"\n123456"}, {OUT=>"6\n"}],
+ ['d1', '-w', {IN_PIPE=>"\1\n"}, {OUT=>"1\n"}],
);
my $save_temps = $ENV{DEBUG};