* configure.ac: Add detection of AVX512 intrinsics for wc.
* src/local.mk: Build AVX512 wc libraries.
* src/wc.c: Add runtime detection of AVX512 support and call the
appropriate function when it is detected.
* src/wc.h (wc_lines_avx512): Declare function.
* tests/wc/wc-cpu.sh: Add a test that disables AVX512 intrinsics.
* src/wc_avx512.c: New file containing the wc -l implementation using
AVX512. The logic and code are reused from the AVX2 implementation with
slight adaptations: __builtin_popcount is replaced by
__builtin_popcountll, and the combination of _mm256_cmpeq_epi8 and
_mm256_movemask_epi8 by a single call to _mm512_cmpeq_epi8_mask.
* NEWS: Mention the improvement.
Previously it may have output too few lines.
[bug introduced in coreutils-9.8]
+** Improvements
+
+ wc -l now operates 10% faster on hosts that support AVX512 instructions.
+
* Noteworthy changes in release 9.8 (2025-09-22) [stable]
CFLAGS=$ac_save_CFLAGS
+CFLAGS="-mavx512bw -mavx512f $CFLAGS"
+AC_MSG_CHECKING([for avx512 intrinsics])
+AC_CACHE_VAL([utils_cv_avx512_intrinsic_exists],[
+AC_LINK_IFELSE(
+ [AC_LANG_SOURCE([[
+ #include <x86intrin.h>
+
+ int
+ main (void)
+ {
+ __m512i matches = _mm512_setzero_si512 ();
+ long long mask = _mm512_movepi8_mask (matches);
+ int lines = __builtin_popcountll (mask);
+ return (__builtin_cpu_supports ("avx512bw")
+ && __builtin_cpu_supports ("avx512f"));
+ }
+ ]])
+ ],[
+ utils_cv_avx512_intrinsic_exists=yes
+ ],[
+ utils_cv_avx512_intrinsic_exists=no
+ ])])
+AC_MSG_RESULT([$utils_cv_avx512_intrinsic_exists])
+if test $utils_cv_avx512_intrinsic_exists = yes; then
+ AC_DEFINE([USE_AVX512_WC_LINECOUNT], [1],
+ [Counting lines with AVX512 enabled])
+fi
+AM_CONDITIONAL([USE_AVX512_WC_LINECOUNT],
+ [test $utils_cv_avx512_intrinsic_exists = yes])
+
+CFLAGS=$ac_save_CFLAGS
############################################################################
dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script.
src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
src_wc_SOURCES = src/wc.c
+if USE_AVX512_WC_LINECOUNT
+noinst_LIBRARIES += src/libwc_avx512.a
+src_libwc_avx512_a_SOURCES = src/wc_avx512.c
+wc_avx512_ldadd = src/libwc_avx512.a
+src_wc_LDADD += $(wc_avx512_ldadd)
+src_libwc_avx512_a_CFLAGS = -mavx512bw -mavx512f $(AM_CFLAGS)
+endif
if USE_AVX2_WC_LINECOUNT
noinst_LIBRARIES += src/libwc_avx2.a
src_libwc_avx2_a_SOURCES = src/wc_avx2.c
static bool
avx2_supported (void)
{
- bool avx_enabled = cpu_supports ("avx2");
-
+ bool avx2_enabled = cpu_supports ("avx2");
if (debug)
- error (0, 0, (avx_enabled
+ error (0, 0, (avx2_enabled
? _("using avx2 hardware support")
: _("avx2 support not detected")));
- return avx_enabled;
+ return avx2_enabled;
+}
+#endif
+
+#ifdef USE_AVX512_WC_LINECOUNT
+static bool
+avx512_supported (void)
+{ /* Return true if cpu_supports reports both the avx512f and the
+ avx512bw features; false if either one is missing. */
+ bool avx512_enabled = (cpu_supports ("avx512f")
+ && cpu_supports ("avx512bw"));
+ /* When the debug flag is set, emit a diagnostic stating which of
+ the two outcomes was detected. */
+ if (debug)
+ error (0, 0, (avx512_enabled
+ ? _("using avx512 hardware support")
+ : _("avx512 support not detected")));
+
+ return avx512_enabled;
}
#endif
static struct wc_lines
wc_lines (int fd)
{
+#ifdef USE_AVX512_WC_LINECOUNT
+ static signed char use_avx512;
+ if (!use_avx512)
+ use_avx512 = avx512_supported () ? 1 : -1;
+ if (0 < use_avx512)
+ return wc_lines_avx512 (fd);
+#endif
#ifdef USE_AVX2_WC_LINECOUNT
static signed char use_avx2;
if (!use_avx2)
#include <stdint.h>
struct wc_lines { int err; intmax_t lines; intmax_t bytes; };
struct wc_lines wc_lines_avx2 (int);
+struct wc_lines wc_lines_avx512 (int);
--- /dev/null
+/* wc_avx512 - Count the number of newlines with avx512 instructions.
+ Copyright (C) 2021-2025 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "wc.h"
+#include "system.h"
+#include "ioblksize.h"
+
+#include <x86intrin.h>
+
+/* Read FD until end of file or a read error, counting newline bytes
+ with AVX512 byte comparisons. Return a struct wc_lines whose first
+ member is 0 at end of file or the errno value of the failed read,
+ followed by the number of newlines and the number of bytes read
+ up to that point. */
+extern struct wc_lines
+wc_lines_avx512 (int fd)
+{
+ intmax_t lines = 0;
+ intmax_t bytes = 0;
+ /* A vector holding the newline character in each of its bytes,
+ used as the comparison operand for every 64-byte chunk. */
+ __m512i endlines = _mm512_set1_epi8 ('\n');
+ /* Each iteration reads and scans one buffer. The buffer is declared
+ as an array of __m512i rather than char; _mm512_load_si512 needs a
+ 64-byte-aligned address, which this element type provides. */
+ while (true)
+ {
+ __m512i avx_buf[IO_BUFSIZE / sizeof (__m512i)];
+ ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf);
+ if (bytes_read <= 0)
+ return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
+
+ bytes += bytes_read;
+ __m512i *datap = avx_buf;
+ /* Scan 64 bytes (one __m512i) at a time. The comparison yields a
+ 64-bit mask with one bit set per byte equal to newline, so the
+ population count of the mask is the newline count of the chunk.
+ From here on bytes_read is the number of bytes not yet scanned. */
+ while (bytes_read >= 64)
+ {
+ __m512i to_match = _mm512_load_si512 (datap);
+ long long matches = _mm512_cmpeq_epi8_mask (to_match, endlines);
+ lines += __builtin_popcountll (matches);
+ datap += 1;
+ bytes_read -= 64;
+ }
+
+ /* Count newlines one byte at a time in the fewer than 64 bytes
+ that remain after the vector loop. */
+ char *end = (char *) datap + bytes_read;
+ for (char *p = (char *) datap; p < end; p++)
+ lines += *p == '\n';
+ }
+}
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
print_ver_ wc
-GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' \
+GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512F' \
wc -l --debug /dev/null 2>debug || fail=1
grep 'using.*hardware support' debug && fail=1
seq 1000 | head -n "$lines" > lines || framework_failure_
wc_accelerated=$(wc -l < lines) || fail=1
-wc_base=$(GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' wc -l < lines) || fail=1
+wc_accelerated_no_avx512=$(
+ GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512F' \
+ wc -l < lines
+ ) || fail=1
+wc_base=$(
+ GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512F' \
+ wc -l < lines
+ ) || fail=1
test "$wc_accelerated" = "$wc_base" || fail=1
+test "$wc_accelerated_no_avx512" = "$wc_base" || fail=1
Exit $fail