Use cpuid to detect CPU support for avx2 instructions.
Performance was seen to improve by 5x for a file with only newlines,
while the performance for a file with no such characters is unchanged.
* configure.ac [USE_AVX2_WC_LINECOUNT]: A new conditional,
set when __get_cpuid_count() and avx2 compiler intrinsics are supported.
* src/wc.c (avx2_supported): A new function using __get_cpuid_count()
to determine if avx2 instructions are supported.
(wc_lines): A new function refactored from wc(),
which implements the standard line counting logic,
and provides the fallback implementation for when avx2 is not supported.
* src/wc_avx2.c: A new module to implement using avx2 intrinsics.
* src/local.mk: Reference the new module. Note we build as a separate
lib so that it can be portably built with separate -mavx2 etc. flags.
test "x$pclmul_intrinsic_exists" = "xyes"])
CFLAGS=$ac_save_CFLAGS
+AC_MSG_CHECKING([if __get_cpuid_count exists])
+AC_COMPILE_IFELSE(
+ [AC_LANG_SOURCE([[
+ #include <cpuid.h>
+
+ int main(void)
+ {
+ unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+ __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+ return 1;
+ }
+ ]])
+ ],[
+ AC_MSG_RESULT([yes])
+ get_cpuid_count_exists=yes
+ ],[
+ AC_MSG_RESULT([no])
+ ])
+
+CFLAGS="-mavx2 $CFLAGS"
+AC_MSG_CHECKING([if avx2 intrinstics exists])
+AC_COMPILE_IFELSE(
+ [AC_LANG_SOURCE([[
+ #include <x86intrin.h>
+
+ int main(void)
+ {
+ __m256i a, b;
+ a = _mm256_sad_epu8(a, b);
+ return 1;
+ }
+ ]])
+ ],[
+ AC_MSG_RESULT([yes])
+ AC_DEFINE([HAVE_AVX2_INTRINSIC], [1], [avx2 intrinsics exists])
+ avx2_intrinsic_exists=yes
+ ],[
+ AC_MSG_RESULT([no])
+ ])
+if test "x$get_cpuid_count_exists" = "xyes" &&
+ test "x$avx2_intrinsic_exists" = "xyes"; then
+ AC_DEFINE([USE_AVX2_WC_LINECOUNT], [1], [Counting lines with AVX2 enabled])
+fi
+AM_CONDITIONAL([USE_AVX2_WC_LINECOUNT],
+ [test "x$get_cpuid_count_exists" = "xyes" &&
+ test "x$avx2_intrinsic_exists" = "xyes"])
+
+CFLAGS=$ac_save_CFLAGS
+
############################################################################
dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script.
src_expand_SOURCES = src/expand.c src/expand-common.c
src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
+src_wc_SOURCES = src/wc.c
+if USE_AVX2_WC_LINECOUNT
+noinst_LIBRARIES += src/libwc_avx2.a
+src_libwc_avx2_a_SOURCES = src/wc_avx2.c
+wc_avx2_ldadd = src/libwc_avx2.a
+src_wc_LDADD += $(wc_avx2_ldadd)
+src_libwc_avx2_a_CFLAGS = -mavx2 $(AM_CFLAGS)
+endif
+
# Ensure we don't link against libcoreutils.a as that lib is
# not compiled with -fPIC which causes issues on 64 bit at least
src_libstdbuf_so_LDADD = $(LIBINTL)
#include "safe-read.h"
#include "stat-size.h"
#include "xbinary-io.h"
+#ifdef USE_AVX2_WC_LINECOUNT
+# include <cpuid.h>
+#endif
#if !defined iswspace && !HAVE_ISWSPACE
# define iswspace(wc) \
/* Size of atomic reads. */
#define BUFFER_SIZE (16 * 1024)
+static bool
+wc_lines (char const *file, int fd, uintmax_t *lines_out,
+ uintmax_t *bytes_out);
+#ifdef USE_AVX2_WC_LINECOUNT
+/* From wc_avx2.c */
+extern bool
+wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
+ uintmax_t *bytes_out);
+#endif
+static bool
+(*wc_lines_p) (char const *file, int fd, uintmax_t *lines_out,
+ uintmax_t *bytes_out) = wc_lines;
+
+
/* Cumulative number of lines, words, chars and bytes in all files so far.
max_line_length is the maximum over all files processed so far. */
static uintmax_t total_lines;
{NULL, 0, NULL, 0}
};
+#ifdef USE_AVX2_WC_LINECOUNT
+static bool
+avx2_supported (void)
+{
+ unsigned int eax = 0;
+ unsigned int ebx = 0;
+ unsigned int ecx = 0;
+ unsigned int edx = 0;
+
+ if (! __get_cpuid (1, &eax, &ebx, &ecx, &edx))
+ return false;
+
+ if (! (ecx & bit_OSXSAVE))
+ return false;
+
+ eax = ebx = ecx = edx = 0;
+
+ if (! __get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
+ return false;
+
+ if (! (ebx & bit_AVX2))
+ return false;
+
+ return true;
+}
+#endif
+
void
usage (int status)
{
putchar ('\n');
}
+static bool
+wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
+{
+ size_t bytes_read;
+ uintmax_t lines, bytes;
+ char buf[BUFFER_SIZE + 1];
+ bool long_lines = false;
+
+ if (!lines_out || !bytes_out)
+ {
+ return false;
+ }
+
+ lines = bytes = 0;
+
+ while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
+ {
+
+ if (bytes_read == SAFE_READ_ERROR)
+ {
+ error (0, errno, "%s", quotef (file));
+ return false;
+ }
+
+ bytes += bytes_read;
+
+ char *p = buf;
+ char *end = buf + bytes_read;
+ uintmax_t plines = lines;
+
+ if (! long_lines)
+ {
+ /* Avoid function call overhead for shorter lines. */
+ while (p != end)
+ lines += *p++ == '\n';
+ }
+ else
+ {
+ /* memchr is more efficient with longer lines. */
+ while ((p = memchr (p, '\n', end - p)))
+ {
+ ++p;
+ ++lines;
+ }
+ }
+
+ /* If the average line length in the block is >= 15, then use
+ memchr for the next block, where system specific optimizations
+ may outweigh function call overhead.
+ FIXME: This line length was determined in 2015, on both
+ x86_64 and ppc64, but it's worth re-evaluating in future with
+ newer compilers, CPUs, or memchr() implementations etc. */
+ if (lines - plines <= bytes_read / 15)
+ long_lines = true;
+ else
+ long_lines = false;
+ }
+
+ *bytes_out = bytes;
+ *lines_out = lines;
+
+ return true;
+}
+
/* Count words. FILE_X is the name of the file (or NULL for standard
input) that is open on descriptor FD. *FSTATUS is its status.
CURRENT_POS is the current file offset if known, negative if unknown.
{
/* Use a separate loop when counting only lines or lines and bytes --
but not chars or words. */
- bool long_lines = false;
- while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
- {
- if (bytes_read == SAFE_READ_ERROR)
- {
- error (0, errno, "%s", quotef (file));
- ok = false;
- break;
- }
-
- bytes += bytes_read;
-
- char *p = buf;
- char *end = p + bytes_read;
- uintmax_t plines = lines;
-
- if (! long_lines)
- {
- /* Avoid function call overhead for shorter lines. */
- while (p != end)
- lines += *p++ == '\n';
- }
- else
- {
- /* memchr is more efficient with longer lines. */
- while ((p = memchr (p, '\n', end - p)))
- {
- ++p;
- ++lines;
- }
- }
-
- /* If the average line length in the block is >= 15, then use
- memchr for the next block, where system specific optimizations
- may outweigh function call overhead.
- FIXME: This line length was determined in 2015, on both
- x86_64 and ppc64, but it's worth re-evaluating in future with
- newer compilers, CPUs, or memchr() implementations etc. */
- if (lines - plines <= bytes_read / 15)
- long_lines = true;
- else
- long_lines = false;
- }
+ ok = wc_lines_p (file, fd, &lines, &bytes);
}
#if MB_LEN_MAX > 1
# define SUPPORT_OLD_MBRTOWC 1
print_linelength = false;
total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
+#ifdef USE_AVX2_WC_LINECOUNT
+ if (avx2_supported ())
+ wc_lines_p = wc_lines_avx2;
+#endif
+
while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
switch (optc)
{
--- /dev/null
+/* wc_avx - Count the number of newlines with avx2 instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "system.h"
+#include "error.h"
+#include "safe-read.h"
+
+#include <x86intrin.h>
+
+/* This must be below 16 KB (16384) or else the accumulators can
+ theoretically overflow, producing wrong result. This is 2*32 bytes below,
+ so there is no single bytes in the optimal case. */
+#define BUFSIZE (16320)
+
+extern bool
+wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
+ uintmax_t *bytes_out);
+
+extern bool
+wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
+ uintmax_t *bytes_out)
+{
+ __m256i accumulator;
+ __m256i accumulator2;
+ __m256i zeroes;
+ __m256i endlines;
+ __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
+ __m256i *datap;
+ uintmax_t lines = 0;
+ uintmax_t bytes = 0;
+ size_t bytes_read = 0;
+
+
+ if (!lines_out || !bytes_out)
+ return false;
+
+ /* Using two parallel accumulators gave a good performance increase.
+ Adding a third gave no additional benefit, at least on an
+ Intel Xeon E3-1231v3. Maybe on a newer CPU with additional vector
+ execution engines it would be a win. */
+ accumulator = _mm256_setzero_si256 ();
+ accumulator2 = _mm256_setzero_si256 ();
+ zeroes = _mm256_setzero_si256 ();
+ endlines = _mm256_set1_epi8 ('\n');
+
+ while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0)
+ {
+ __m256i to_match;
+ __m256i to_match2;
+ __m256i matches;
+ __m256i matches2;
+
+ if (bytes_read == SAFE_READ_ERROR)
+ {
+ error (0, errno, "%s", quotef (file));
+ return false;
+ }
+
+ bytes += bytes_read;
+
+ datap = avx_buf;
+ char *end = ((char *)avx_buf) + bytes_read;
+
+ while (bytes_read >= 64)
+ {
+ to_match = _mm256_load_si256 (datap);
+ to_match2 = _mm256_load_si256 (datap + 1);
+
+ matches = _mm256_cmpeq_epi8 (to_match, endlines);
+ matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
+ /* Compare will set each 8 bit integer in the register to 0xFF
+ on match. When we subtract it the 8 bit accumulators
+ will underflow, so this is equal to adding 1. */
+ accumulator = _mm256_sub_epi8 (accumulator, matches);
+ accumulator2 = _mm256_sub_epi8 (accumulator2, matches2);
+
+ datap += 2;
+ bytes_read -= 64;
+ }
+
+ /* Horizontally add all 8 bit integers in the register,
+ and then reset it */
+ accumulator = _mm256_sad_epu8 (accumulator, zeroes);
+ lines += _mm256_extract_epi16 (accumulator, 0)
+ + _mm256_extract_epi16 (accumulator, 4)
+ + _mm256_extract_epi16 (accumulator, 8)
+ + _mm256_extract_epi16 (accumulator, 12);
+ accumulator = _mm256_setzero_si256 ();
+
+ accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
+ lines += _mm256_extract_epi16 (accumulator2, 0)
+ + _mm256_extract_epi16 (accumulator2, 4)
+ + _mm256_extract_epi16 (accumulator2, 8)
+ + _mm256_extract_epi16 (accumulator2, 12);
+ accumulator2 = _mm256_setzero_si256 ();
+
+ /* Finish up any left over bytes */
+ char *p = (char *)datap;
+ while (p != end)
+ lines += *p++ == '\n';
+ }
+
+ *lines_out = lines;
+ *bytes_out = bytes;
+
+ return true;
+}