#include <sys/types.h>
#include <uchar.h>
+#include <assure.h>
+#include <argmatch.h>
+#include <argv-iter.h>
+#include <fadvise.h>
+#include <physmem.h>
+#include <readtokens0.h>
+#include <safe-read.h>
+#include <stat-size.h>
+#include <xbinary-io.h>
+
#include "system.h"
-#include "assure.h"
-#include "argmatch.h"
-#include "argv-iter.h"
-#include "fadvise.h"
-#include "physmem.h"
-#include "readtokens0.h"
-#include "safe-read.h"
-#include "stat-size.h"
-#include "xbinary-io.h"
+#include "wc.h"
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "wc"
/* Size of atomic reads. */
#define BUFFER_SIZE (16 * 1024)
-#ifdef USE_AVX2_WC_LINECOUNT
-/* From wc_avx2.c */
-extern bool
-wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
- uintmax_t *bytes_out);
-#endif
-
static bool wc_isprint[UCHAR_MAX + 1];
static bool wc_isspace[UCHAR_MAX + 1];
putchar ('\n');
}
-static bool
-wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
+/* Read FD and return a summary. */
+static struct wc_lines
+wc_lines (int fd)
{
- size_t bytes_read;
- uintmax_t lines, bytes;
- char buf[BUFFER_SIZE + 1];
- bool long_lines = false;
-
- if (!lines_out || !bytes_out)
- {
- return false;
- }
+#ifdef USE_AVX2_WC_LINECOUNT
+ static signed char use_avx2;
+ if (!use_avx2)
+ use_avx2 = avx2_supported () ? 1 : -1;
+ if (0 < use_avx2)
+ return wc_lines_avx2 (fd);
+#endif
- lines = bytes = 0;
+ uintmax_t lines = 0, bytes = 0;
+ bool long_lines = false;
- while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
+ while (true)
{
-
- if (bytes_read == SAFE_READ_ERROR)
- {
- error (0, errno, "%s", quotef (file));
- return false;
- }
+ char buf[BUFFER_SIZE + 1];
+ size_t bytes_read = safe_read (fd, buf, BUFFER_SIZE);
+ if (! (0 < bytes_read && bytes_read <= BUFFER_SIZE))
+ return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
bytes += bytes_read;
-
- char *p = buf;
char *end = buf + bytes_read;
- uintmax_t plines = lines;
+ idx_t buflines = 0;
if (! long_lines)
{
/* Avoid function call overhead for shorter lines. */
- while (p != end)
- lines += *p++ == '\n';
+ for (char *p = buf; p < end; p++)
+ buflines += *p == '\n';
}
else
{
/* rawmemchr is more efficient with longer lines. */
*end = '\n';
- while ((p = rawmemchr (p, '\n')) < end)
- {
- ++p;
- ++lines;
- }
+ for (char *p = buf; (p = rawmemchr (p, '\n')) < end; p++)
+ buflines++;
}
/* If the average line length in the block is >= 15, then use
FIXME: This line length was determined in 2015, on both
x86_64 and ppc64, but it's worth re-evaluating in future with
newer compilers, CPUs, or memchr() implementations etc. */
- if (lines - plines <= bytes_read / 15)
- long_lines = true;
- else
- long_lines = false;
+ long_lines = 15 * buflines <= bytes_read;
+ lines += buflines;
}
-
- *bytes_out = bytes;
- *lines_out = lines;
-
- return true;
}
/* Count words. FILE_X is the name of the file (or null for standard
static bool
wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
{
- bool ok = true;
+ int err = 0;
char buf[BUFFER_SIZE + 1];
size_t bytes_read;
uintmax_t lines, words, chars, bytes, linelength;
{
if (bytes_read == SAFE_READ_ERROR)
{
- error (0, errno, "%s", quotef (file));
- ok = false;
+ err = errno;
break;
}
bytes += bytes_read;
}
else if (!count_chars && !count_complicated)
{
-#ifdef USE_AVX2_WC_LINECOUNT
- static bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *);
- if (!wc_lines_p)
- wc_lines_p = avx2_supported () ? wc_lines_avx2 : wc_lines;
-#else
- bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *)
- = wc_lines;
-#endif
-
/* Use a separate loop when counting only lines or lines and bytes --
but not chars or words. */
- ok = wc_lines_p (file, fd, &lines, &bytes);
+ struct wc_lines w = wc_lines (fd);
+ err = w.err;
+ lines = w.lines;
+ bytes = w.bytes;
}
else if (MB_CUR_MAX > 1)
{
char const *p;
if (bytes_read == SAFE_READ_ERROR)
{
- error (0, errno, "%s", quotef (file));
- ok = false;
+ err = errno;
break;
}
char const *p = buf;
if (bytes_read == SAFE_READ_ERROR)
{
- error (0, errno, "%s", quotef (file));
- ok = false;
+ err = errno;
break;
}
if (linelength > max_line_length)
max_line_length = linelength;
- return ok;
+ if (err)
+ error (0, err, "%s", quotef (file));
+ return !err;
}
static bool
#include <config.h>
+#include "wc.h"
+
#include "system.h"
#include "safe-read.h"
so there is no single bytes in the optimal case. */
#define BUFSIZE (16320)
-extern bool
-wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
- uintmax_t *bytes_out);
-
-extern bool
-wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
- uintmax_t *bytes_out)
+/* Read FD and return a summary. */
+struct wc_lines
+wc_lines_avx2 (int fd)
{
- __m256i accumulator;
- __m256i accumulator2;
- __m256i zeroes;
- __m256i endlines;
- __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
- __m256i *datap;
uintmax_t lines = 0;
uintmax_t bytes = 0;
- size_t bytes_read = 0;
-
- if (!lines_out || !bytes_out)
- return false;
+ __m256i
+ zeroes = _mm256_setzero_si256 (),
+ endlines = _mm256_set1_epi8 ('\n');
- /* Using two parallel accumulators gave a good performance increase.
- Adding a third gave no additional benefit, at least on an
- Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector
- execution engines it would be a win. */
- accumulator = _mm256_setzero_si256 ();
- accumulator2 = _mm256_setzero_si256 ();
- zeroes = _mm256_setzero_si256 ();
- endlines = _mm256_set1_epi8 ('\n');
-
- while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0)
+ while (true)
{
- __m256i to_match;
- __m256i to_match2;
- __m256i matches;
- __m256i matches2;
-
- if (bytes_read == SAFE_READ_ERROR)
- {
- error (0, errno, "%s", quotef (file));
- return false;
- }
+ /* Using two parallel accumulators gave a good performance increase.
+ Adding a third gave no additional benefit, at least on an
+ Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector
+ execution engines it would be a win. */
+ __m256i
+ accumulator = _mm256_setzero_si256 (),
+ accumulator2 = _mm256_setzero_si256 (),
+ avx_buf[BUFSIZE / sizeof (__m256i)];
+
+ size_t bytes_read = safe_read (fd, avx_buf, sizeof avx_buf);
+ if (! (0 < bytes_read && bytes_read <= sizeof avx_buf))
+ return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
bytes += bytes_read;
-
- datap = avx_buf;
- char *end = ((char *)avx_buf) + bytes_read;
+ __m256i *datap = avx_buf;
while (bytes_read >= 64)
{
- to_match = _mm256_load_si256 (datap);
- to_match2 = _mm256_load_si256 (datap + 1);
+ __m256i
+ to_match = _mm256_load_si256 (datap),
+ to_match2 = _mm256_load_si256 (datap + 1),
+ matches = _mm256_cmpeq_epi8 (to_match, endlines),
+ matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
- matches = _mm256_cmpeq_epi8 (to_match, endlines);
- matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
/* Compare will set each 8 bit integer in the register to 0xFF
on match. When we subtract it the 8 bit accumulators
will underflow, so this is equal to adding 1. */
bytes_read -= 64;
}
- /* Horizontally add all 8 bit integers in the register,
- and then reset it */
+ /* Horizontally add all 8 bit integers in the register. */
accumulator = _mm256_sad_epu8 (accumulator, zeroes);
lines += _mm256_extract_epi16 (accumulator, 0)
+ _mm256_extract_epi16 (accumulator, 4)
+ _mm256_extract_epi16 (accumulator, 8)
+ _mm256_extract_epi16 (accumulator, 12);
- accumulator = _mm256_setzero_si256 ();
accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
lines += _mm256_extract_epi16 (accumulator2, 0)
+ _mm256_extract_epi16 (accumulator2, 4)
+ _mm256_extract_epi16 (accumulator2, 8)
+ _mm256_extract_epi16 (accumulator2, 12);
- accumulator2 = _mm256_setzero_si256 ();
/* Finish up any left over bytes */
- char *p = (char *)datap;
- while (p != end)
- lines += *p++ == '\n';
+ char *end = (char *) datap + bytes_read;
+ for (char *p = (char *) datap; p < end; p++)
+ lines += *p == '\n';
}
-
- *lines_out = lines;
- *bytes_out = bytes;
-
- return true;
}