From 8d41285fe494613a732f8060606d68ea3a8181d0 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sat, 23 Sep 2023 13:38:08 -0700 Subject: [PATCH] wc: improve avx2 API MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit * src/wc.c: Use "#include <...>" for files not in the current dir. Include "wc.h" instead of declaring wc_lines_avx2 by hand. (wc_lines): New API, with no file name (no longer needed) and with a return struct rather than arg pointers. All uses changed. Use avx2_supported directly instead of using a function pointer. Exploit C99-style declarations after statements. Multiply by 15 rather than dividing; it’s faster and more accurate and cannot overflow here. (wc): Simplify based on wc_lines API change. * src/wc.h: New file. * src/wc_avx2.c: Include it, to check API better. (wc_lines_avx2): Use new API. All uses changed. Exploit C99. Make locals more local. --- src/wc.c | 118 ++++++++++++++++++++------------------------------ src/wc.h | 3 ++ src/wc_avx2.c | 87 +++++++++++++------------------------ 3 files changed, 80 insertions(+), 128 deletions(-) create mode 100644 src/wc.h diff --git a/src/wc.c b/src/wc.c index d6d89ff341..67dcb7c3f7 100644 --- a/src/wc.c +++ b/src/wc.c @@ -25,16 +25,18 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "system.h" -#include "assure.h" -#include "argmatch.h" -#include "argv-iter.h" -#include "fadvise.h" -#include "physmem.h" -#include "readtokens0.h" -#include "safe-read.h" -#include "stat-size.h" -#include "xbinary-io.h" +#include "wc.h" /* The official name of this program (e.g., no 'g' prefix). */ #define PROGRAM_NAME "wc" @@ -46,13 +48,6 @@ /* Size of atomic reads. */ #define BUFFER_SIZE (16 * 1024) -#ifdef USE_AVX2_WC_LINECOUNT -/* From wc_avx2.c */ -extern bool -wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, - uintmax_t *bytes_out); -#endif - static bool wc_isprint[UCHAR_MAX + 1]; static bool wc_isspace[UCHAR_MAX + 1]; @@ -253,51 +248,44 @@ write_counts (uintmax_t lines, putchar ('\n'); } -static bool -wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out) +/* Read FD and return a summary. */ +static struct wc_lines +wc_lines (int fd) { - size_t bytes_read; - uintmax_t lines, bytes; - char buf[BUFFER_SIZE + 1]; - bool long_lines = false; - - if (!lines_out || !bytes_out) - { - return false; - } +#ifdef USE_AVX2_WC_LINECOUNT + static signed char use_avx2; + if (!use_avx2) + use_avx2 = avx2_supported () ? 1 : -1; + if (0 < use_avx2) + return wc_lines_avx2 (fd); +#endif - lines = bytes = 0; + uintmax_t lines = 0, bytes = 0; + bool long_lines = false; - while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0) + while (true) { - - if (bytes_read == SAFE_READ_ERROR) - { - error (0, errno, "%s", quotef (file)); - return false; - } + char buf[BUFFER_SIZE + 1]; + size_t bytes_read = safe_read (fd, buf, BUFFER_SIZE); + if (! (0 < bytes_read && bytes_read <= BUFFER_SIZE)) + return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes }; bytes += bytes_read; - - char *p = buf; char *end = buf + bytes_read; - uintmax_t plines = lines; + idx_t buflines = 0; if (! long_lines) { /* Avoid function call overhead for shorter lines. */ - while (p != end) - lines += *p++ == '\n'; + for (char *p = buf; p < end; p++) + buflines += *p == '\n'; } else { /* rawmemchr is more efficient with longer lines. */ *end = '\n'; - while ((p = rawmemchr (p, '\n')) < end) - { - ++p; - ++lines; - } + for (char *p = buf; (p = rawmemchr (p, '\n')) < end; p++) + buflines++; } /* If the average line length in the block is >= 15, then use @@ -306,16 +294,9 @@ wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out) FIXME: This line length was determined in 2015, on both x86_64 and ppc64, but it's worth re-evaluating in future with newer compilers, CPUs, or memchr() implementations etc. */ - if (lines - plines <= bytes_read / 15) - long_lines = true; - else - long_lines = false; + long_lines = 15 * buflines <= bytes_read; + lines += buflines; } - - *bytes_out = bytes; - *lines_out = lines; - - return true; } /* Count words. FILE_X is the name of the file (or null for standard @@ -325,7 +306,7 @@ wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out) static bool wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) { - bool ok = true; + int err = 0; char buf[BUFFER_SIZE + 1]; size_t bytes_read; uintmax_t lines, words, chars, bytes, linelength; @@ -412,8 +393,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) { if (bytes_read == SAFE_READ_ERROR) { - error (0, errno, "%s", quotef (file)); - ok = false; + err = errno; break; } bytes += bytes_read; @@ -422,18 +402,12 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) } else if (!count_chars && !count_complicated) { -#ifdef USE_AVX2_WC_LINECOUNT - static bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *); - if (!wc_lines_p) - wc_lines_p = avx2_supported () ? wc_lines_avx2 : wc_lines; -#else - bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *) - = wc_lines; -#endif - /* Use a separate loop when counting only lines or lines and bytes -- but not chars or words. */ - ok = wc_lines_p (file, fd, &lines, &bytes); + struct wc_lines w = wc_lines (fd); + err = w.err; + lines = w.lines; + bytes = w.bytes; } else if (MB_CUR_MAX > 1) { @@ -449,8 +423,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) char const *p; if (bytes_read == SAFE_READ_ERROR) { - error (0, errno, "%s", quotef (file)); - ok = false; + err = errno; break; } @@ -583,8 +556,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) char const *p = buf; if (bytes_read == SAFE_READ_ERROR) { - error (0, errno, "%s", quotef (file)); - ok = false; + err = errno; break; } @@ -646,7 +618,9 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) if (linelength > max_line_length) max_line_length = linelength; - return ok; + if (err) + error (0, err, "%s", quotef (file)); + return !err; } static bool diff --git a/src/wc.h b/src/wc.h new file mode 100644 index 0000000000..a578e14f16 --- /dev/null +++ b/src/wc.h @@ -0,0 +1,3 @@ +#include +struct wc_lines { int err; uintmax_t lines; uintmax_t bytes; }; +struct wc_lines wc_lines_avx2 (int); diff --git a/src/wc_avx2.c b/src/wc_avx2.c index eff7972b4e..8ea4e99def 100644 --- a/src/wc_avx2.c +++ b/src/wc_avx2.c @@ -16,6 +16,8 @@ #include +#include "wc.h" + #include "system.h" #include "safe-read.h" @@ -26,62 +28,43 @@ so there is no single bytes in the optimal case. */ #define BUFSIZE (16320) -extern bool -wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, - uintmax_t *bytes_out); - -extern bool -wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, - uintmax_t *bytes_out) +/* Read FD and return a summary. */ +struct wc_lines +wc_lines_avx2 (int fd) { - __m256i accumulator; - __m256i accumulator2; - __m256i zeroes; - __m256i endlines; - __m256i avx_buf[BUFSIZE / sizeof (__m256i)]; - __m256i *datap; uintmax_t lines = 0; uintmax_t bytes = 0; - size_t bytes_read = 0; - - if (!lines_out || !bytes_out) - return false; + __m256i + zeroes = _mm256_setzero_si256 (), + endlines = _mm256_set1_epi8 ('\n'); - /* Using two parallel accumulators gave a good performance increase. - Adding a third gave no additional benefit, at least on an - Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector - execution engines it would be a win. */ - accumulator = _mm256_setzero_si256 (); - accumulator2 = _mm256_setzero_si256 (); - zeroes = _mm256_setzero_si256 (); - endlines = _mm256_set1_epi8 ('\n'); - - while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0) + while (true) { - __m256i to_match; - __m256i to_match2; - __m256i matches; - __m256i matches2; - - if (bytes_read == SAFE_READ_ERROR) - { - error (0, errno, "%s", quotef (file)); - return false; - } + /* Using two parallel accumulators gave a good performance increase. + Adding a third gave no additional benefit, at least on an + Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector + execution engines it would be a win. */ + __m256i + accumulator = _mm256_setzero_si256 (), + accumulator2 = _mm256_setzero_si256 (), + avx_buf[BUFSIZE / sizeof (__m256i)]; + + size_t bytes_read = safe_read (fd, avx_buf, sizeof avx_buf); + if (! (0 < bytes_read && bytes_read <= sizeof avx_buf)) + return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes }; bytes += bytes_read; - - datap = avx_buf; - char *end = ((char *)avx_buf) + bytes_read; + __m256i *datap = avx_buf; while (bytes_read >= 64) { - to_match = _mm256_load_si256 (datap); - to_match2 = _mm256_load_si256 (datap + 1); + __m256i + to_match = _mm256_load_si256 (datap), + to_match2 = _mm256_load_si256 (datap + 1), + matches = _mm256_cmpeq_epi8 (to_match, endlines), + matches2 = _mm256_cmpeq_epi8 (to_match2, endlines); - matches = _mm256_cmpeq_epi8 (to_match, endlines); - matches2 = _mm256_cmpeq_epi8 (to_match2, endlines); /* Compare will set each 8 bit integer in the register to 0xFF on match. When we subtract it the 8 bit accumulators will underflow, so this is equal to adding 1. */ @@ -92,30 +75,22 @@ wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, bytes_read -= 64; } - /* Horizontally add all 8 bit integers in the register, - and then reset it */ + /* Horizontally add all 8 bit integers in the register. */ accumulator = _mm256_sad_epu8 (accumulator, zeroes); lines += _mm256_extract_epi16 (accumulator, 0) + _mm256_extract_epi16 (accumulator, 4) + _mm256_extract_epi16 (accumulator, 8) + _mm256_extract_epi16 (accumulator, 12); - accumulator = _mm256_setzero_si256 (); accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes); lines += _mm256_extract_epi16 (accumulator2, 0) + _mm256_extract_epi16 (accumulator2, 4) + _mm256_extract_epi16 (accumulator2, 8) + _mm256_extract_epi16 (accumulator2, 12); - accumulator2 = _mm256_setzero_si256 (); /* Finish up any left over bytes */ - char *p = (char *)datap; - while (p != end) - lines += *p++ == '\n'; + char *end = (char *) datap + bytes_read; + for (char *p = (char *) datap; p < end; p++) + lines += *p == '\n'; } - - *lines_out = lines; - *bytes_out = bytes; - - return true; } -- 2.47.2