From: Mathieu Bordere Date: Wed, 24 Sep 2025 10:41:06 +0000 (+0200) Subject: wc: add AVX512 function for line counting X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=67e9068c5f5fdae5666279717a4c19bdfe5c21de;p=thirdparty%2Fcoreutils.git wc: add AVX512 function for line counting * configure.ac: Add detection of AVX512 intrinsics for wc. * src/local.mk: Build AVX512 wc libraries. * src/wc.c: Add runtime detection of AVX512 intrinsics and call appropriate function when detected. * src/wc.h (wc_lines_avx512): Declare function. * tests/wc/wc-cpu.sh: Add a test that disables AVX512 intrinsics. * src/wc_avx512.c: New file containing the wc -l implementation using AVX512. The logic and code is reused from the AVX2 implementation with slight adaptations. Replaced __builtin_popcount by __builtin_popcountll and the combination of _mm256_cmpeq_epi8 and _mm256_movemask_epi8 by a single call to _mm512_cmpeq_epi8_mask. * NEWS: Mention the improvement. --- diff --git a/NEWS b/NEWS index dc1d268793..a19e3aed60 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,10 @@ GNU coreutils NEWS -*- outline -*- Previously it may have output too few lines. [bug introduced in coreutils-9.8] +** Improvements + + wc -l now operates 10% faster on hosts that support AVX512 instructions. + * Noteworthy changes in release 9.8 (2025-09-22) [stable] diff --git a/configure.ac b/configure.ac index 274eff42fc..a7432e0a7e 100644 --- a/configure.ac +++ b/configure.ac @@ -742,6 +742,37 @@ AM_CONDITIONAL([USE_AVX2_WC_LINECOUNT], CFLAGS=$ac_save_CFLAGS +CFLAGS="-mavx512bw -mavx512f $CFLAGS" +AC_MSG_CHECKING([for avx512 intrinsics]) +AC_CACHE_VAL([utils_cv_avx512_intrinsic_exists],[ +AC_LINK_IFELSE( + [AC_LANG_SOURCE([[ + #include + + int + main (void) + { + __m512i matches = _mm512_setzero_si512 (); + long long mask = _mm512_movepi8_mask (matches); + int lines = __builtin_popcountll (mask); + return (__builtin_cpu_supports ("avx512bw") + && __builtin_cpu_supports ("avx512f")); + } + ]]) + ],[ + utils_cv_avx512_intrinsic_exists=yes + ],[ + utils_cv_avx512_intrinsic_exists=no + ])]) +AC_MSG_RESULT([$utils_cv_avx512_intrinsic_exists]) +if test $utils_cv_avx512_intrinsic_exists = yes; then + AC_DEFINE([USE_AVX512_WC_LINECOUNT], [1], + [Counting lines with AVX512 enabled]) +fi +AM_CONDITIONAL([USE_AVX512_WC_LINECOUNT], + [test $utils_cv_avx512_intrinsic_exists = yes]) + +CFLAGS=$ac_save_CFLAGS ############################################################################ dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script. diff --git a/src/local.mk b/src/local.mk index f8a4bcffba..a55c9f990a 100644 --- a/src/local.mk +++ b/src/local.mk @@ -484,6 +484,13 @@ src_expand_SOURCES = src/expand.c src/expand-common.c src_unexpand_SOURCES = src/unexpand.c src/expand-common.c src_wc_SOURCES = src/wc.c +if USE_AVX512_WC_LINECOUNT +noinst_LIBRARIES += src/libwc_avx512.a +src_libwc_avx512_a_SOURCES = src/wc_avx512.c +wc_avx512_ldadd = src/libwc_avx512.a +src_wc_LDADD += $(wc_avx512_ldadd) +src_libwc_avx512_a_CFLAGS = -mavx512bw -mavx512f $(AM_CFLAGS) +endif if USE_AVX2_WC_LINECOUNT noinst_LIBRARIES += src/libwc_avx2.a src_libwc_avx2_a_SOURCES = src/wc_avx2.c diff --git a/src/wc.c b/src/wc.c index 777277f23d..2433993937 100644 --- a/src/wc.c +++ b/src/wc.c @@ -134,14 +134,29 @@ static enum total_type total_mode = total_auto; static bool avx2_supported (void) { - bool avx_enabled = cpu_supports ("avx2"); - + bool avx2_enabled = cpu_supports ("avx2"); if (debug) - error (0, 0, (avx_enabled + error (0, 0, (avx2_enabled ? _("using avx2 hardware support") : _("avx2 support not detected"))); - return avx_enabled; + return avx2_enabled; +} +#endif + +#ifdef USE_AVX512_WC_LINECOUNT +static bool +avx512_supported (void) +{ + bool avx512_enabled = (cpu_supports ("avx512f") + && cpu_supports ("avx512bw")); + + if (debug) + error (0, 0, (avx512_enabled + ? _("using avx512 hardware support") + : _("avx512 support not detected"))); + + return avx512_enabled; } #endif @@ -246,6 +261,13 @@ write_counts (uintmax_t lines, static struct wc_lines wc_lines (int fd) { +#ifdef USE_AVX512_WC_LINECOUNT + static signed char use_avx512; + if (!use_avx512) + use_avx512 = avx512_supported () ? 1 : -1; + if (0 < use_avx512) + return wc_lines_avx512 (fd); +#endif #ifdef USE_AVX2_WC_LINECOUNT static signed char use_avx2; if (!use_avx2) diff --git a/src/wc.h b/src/wc.h index a6b4c9e840..f151e92f2c 100644 --- a/src/wc.h +++ b/src/wc.h @@ -1,3 +1,4 @@ #include struct wc_lines { int err; intmax_t lines; intmax_t bytes; }; struct wc_lines wc_lines_avx2 (int); +struct wc_lines wc_lines_avx512 (int); diff --git a/src/wc_avx512.c b/src/wc_avx512.c new file mode 100644 index 0000000000..41faea6461 --- /dev/null +++ b/src/wc_avx512.c @@ -0,0 +1,58 @@ +/* wc_avx512 - Count the number of newlines with avx512 instructions. + Copyright (C) 2021-2025 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "wc.h" +#include "system.h" +#include "ioblksize.h" + +#include + +/* Read FD and return a summary. */ +extern struct wc_lines +wc_lines_avx512 (int fd) +{ + intmax_t lines = 0; + intmax_t bytes = 0; + + __m512i endlines = _mm512_set1_epi8 ('\n'); + + while (true) + { + __m512i avx_buf[IO_BUFSIZE / sizeof (__m512i)]; + ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf); + if (bytes_read <= 0) + return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes }; + + bytes += bytes_read; + __m512i *datap = avx_buf; + + while (bytes_read >= 64) + { + __m512i to_match = _mm512_load_si512 (datap); + long long matches = _mm512_cmpeq_epi8_mask (to_match, endlines); + lines += __builtin_popcountll (matches); + datap += 1; + bytes_read -= 64; + } + + /* Finish up any left over bytes */ + char *end = (char *) datap + bytes_read; + for (char *p = (char *) datap; p < end; p++) + lines += *p == '\n'; + } +} diff --git a/tests/wc/wc-cpu.sh b/tests/wc/wc-cpu.sh index 1118fe14ed..6ad4f5f9c2 100755 --- a/tests/wc/wc-cpu.sh +++ b/tests/wc/wc-cpu.sh @@ -19,7 +19,7 @@ . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src print_ver_ wc -GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' \ +GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512F' \ wc -l --debug /dev/null 2>debug || fail=1 grep 'using.*hardware support' debug && fail=1 @@ -27,8 +27,16 @@ lines=$(shuf -i 0-1000 | head -n1) || framework_failure_ seq 1000 | head -n "$lines" > lines || framework_failure_ wc_accelerated=$(wc -l < lines) || fail=1 -wc_base=$(GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' wc -l < lines) || fail=1 +wc_accelerated_no_avx512=$( + GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512F' \ + wc -l < lines + ) || fail=1 +wc_base=$( + GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512F' \ + wc -l < lines + ) || fail=1 test "$wc_accelerated" = "$wc_base" || fail=1 +test "$wc_accelerated_no_avx512" = "$wc_base" || fail=1 Exit $fail