From d155be4a22cdc5d271a74c2ae8226c4239ab76ed Mon Sep 17 00:00:00 2001 From: Sam Russell Date: Thu, 28 Nov 2024 20:28:21 +0100 Subject: [PATCH] cksum: use ARMv8 SIMD extensions * configure.ac: Add check for ARMv8 VMULL support. * src/cksum.c: Add ARMv8 VMULL detection function. * src/cksum.h: Add ARMv8 VMULL implementation declaration. * src/cksum_vmull.c: ARMv8 VMULL implementation. * src/local.mk: Add build flags for ARMv8 VMULL. * NEWS: Mention the ARMv8 SIMD improvement. --- NEWS | 4 +- configure.ac | 34 +++++++ src/cksum.c | 26 +++++ src/cksum.h | 3 + src/cksum_vmull.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++ src/local.mk | 7 ++ 6 files changed, 307 insertions(+), 2 deletions(-) create mode 100644 src/cksum_vmull.c diff --git a/NEWS b/NEWS index c1e604ffab..ed87546937 100644 --- a/NEWS +++ b/NEWS @@ -70,8 +70,8 @@ GNU coreutils NEWS -*- outline -*- ** Improvements - cksum -a crc, makes use of AVX2 and AVX512 extensions for time reductions - of 40% and 60% respectively. + cksum -a crc, makes use of AVX2, AVX512, and ARMv8 SIMD extensions + for time reductions of up to 40%, 60%, and 80% respectively. 'head -c NUM', 'head -n NUM', 'nl -l NUM', 'nproc --ignore NUM', 'tail -c NUM', 'tail -n NUM', and 'tail --max-unchanged-stats NUM’ diff --git a/configure.ac b/configure.ac index 17fa23b455..f167f226f7 100644 --- a/configure.ac +++ b/configure.ac @@ -618,6 +618,40 @@ if test $utils_cv_brain_16_bit_supported = yes; then AC_DEFINE([BF16_SUPPORTED], [1], [Brain 16 bit float supported]) fi +ac_save_CFLAGS=$CFLAGS +CFLAGS="-march=armv8-a+crypto $CFLAGS" +AC_MSG_CHECKING([if vmull intrinsic exists]) +AC_CACHE_VAL([utils_cv_vmull_intrinsic_exists],[ +AC_LINK_IFELSE( + [AC_LANG_SOURCE([[ + #include + #include + #include + #include + + int + main (void) + { + uint64x2_t a; + poly64_t shift64 = vget_lane_p64(vcreate_p64(0xB8BC6765), 0); + a = vreinterpretq_u64_p128(vmull_p64(shift64, vreinterpretq_p128_u64(a))); + return (getauxval(AT_HWCAP) & HWCAP_PMULL) > 0; + } + ]]) + ],[ + utils_cv_vmull_intrinsic_exists=yes + ],[ + utils_cv_vmull_intrinsic_exists=no + ])]) +AC_MSG_RESULT([$utils_cv_vmull_intrinsic_exists]) +if test $utils_cv_vmull_intrinsic_exists = yes; then + AC_DEFINE([USE_VMULL_CRC32], [1], + [CRC32 calculation by vmull hardware instruction enabled]) +fi +AM_CONDITIONAL([USE_VMULL_CRC32], + [test $utils_cv_vmull_intrinsic_exists = yes]) +CFLAGS=$ac_save_CFLAGS + ac_save_CFLAGS=$CFLAGS CFLAGS="-mavx -mpclmul $CFLAGS" AC_MSG_CHECKING([if pclmul intrinsic exists]) diff --git a/src/cksum.c b/src/cksum.c index 5900d141ec..489af7e52d 100644 --- a/src/cksum.c +++ b/src/cksum.c @@ -40,6 +40,11 @@ #include #include "system.h" +#ifdef USE_VMULL_CRC32 +# include +# include +#endif + #ifdef CRCTAB # define BIT(x) ((uint_fast32_t) 1 << (x)) @@ -201,6 +206,25 @@ avx512_supported (void) return avx512_enabled; } +static bool +vmull_supported (void) +{ + /* vmull for multiplication */ + bool vmull_enabled = false; +# if USE_VMULL_CRC32 + + vmull_enabled = (getauxval (AT_HWCAP) & HWCAP_PMULL) > 0; + + if (cksum_debug) + error (0, 0, "%s", + (vmull_enabled + ? _("using vmull hardware support") + : _("vmull support not detected"))); +# endif + + return vmull_enabled; +} + static bool cksum_slice8 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) { @@ -273,6 +297,8 @@ crc_sum_stream (FILE *stream, void *resstream, uintmax_t *length) cksum_fp = cksum_avx2; else if (pclmul_supported ()) cksum_fp = cksum_pclmul; + else if (vmull_supported ()) + cksum_fp = cksum_vmull; else cksum_fp = cksum_slice8; } diff --git a/src/cksum.h b/src/cksum.h index 6e8a5d0080..c42491a95b 100644 --- a/src/cksum.h +++ b/src/cksum.h @@ -14,6 +14,9 @@ output_crc (char const *file, int binary_file, void const *digest, bool raw, bool tagged, unsigned char delim, bool args, uintmax_t length) _GL_ATTRIBUTE_NONNULL ((3)); +extern bool +cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); + extern bool cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); diff --git a/src/cksum_vmull.c b/src/cksum_vmull.c new file mode 100644 index 0000000000..c6f0675842 --- /dev/null +++ b/src/cksum_vmull.c @@ -0,0 +1,235 @@ +/* cksum -- calculate and print POSIX checksums and sizes of files + Copyright (C) 1992-2024 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include +#include +#include +#include +#include "system.h" + +/* Number of bytes to read at once. */ +#define BUFLEN (1 << 16) + +extern uint_fast32_t const crctab[8][256]; + +extern bool +cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); + +static uint64x2_t +bswap_neon (uint64x2_t in) +{ + uint64x2_t a = + vreinterpretq_u64_u8 (vrev64q_u8 (vreinterpretq_u8_u64 (in))); + a = vcombine_u64 (vget_high_u64 (a), vget_low_u64 (a)); + return a; +} + +/* Calculate CRC32 using VMULL CPU instruction found in ARMv8 CPUs */ + +bool +cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) +{ + uint64x2_t buf[BUFLEN / sizeof (uint64x2_t)]; + uint_fast32_t crc = 0; + uintmax_t length = 0; + size_t bytes_read; + poly64x2_t single_mult_constant; + poly64x2_t four_mult_constant; + + if (!fp || !crc_out || !length_out) + return false; + + /* These constants and general algorithms are taken from the Intel whitepaper + "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + */ + single_mult_constant = + vcombine_p64 (vcreate_p64 (0xE8A45605), vcreate_p64 (0xC5B9CD4C)); + four_mult_constant = + vcombine_p64 (vcreate_p64 (0xE6228B11), vcreate_p64 (0x8833794C)); + + while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) + { + uint64x2_t *datap; + uint64x2_t data; + uint64x2_t data2; + uint64x2_t data3; + uint64x2_t data4; + uint64x2_t data5; + uint64x2_t data6; + uint64x2_t data7; + uint64x2_t data8; + uint64x2_t fold_data; + uint64x2_t xor_crc; + + if (length + bytes_read < length) + { + errno = EOVERFLOW; + return false; + } + length += bytes_read; + + datap = (uint64x2_t *) buf; + + /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */ + if (bytes_read >= 16 * 8) + { + data = vld1q_u64 ((uint64_t *) (datap)); + data = bswap_neon (data); + /* XOR in initial CRC value (for us 0 so no effect), or CRC value + calculated for previous BUFLEN buffer from fread */ + xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (crc << 32)); + crc = 0; + data = veorq_u64 (data, xor_crc); + data3 = vld1q_u64 ((uint64_t *) (datap + 1)); + data3 = bswap_neon (data3); + data5 = vld1q_u64 ((uint64_t *) (datap + 2)); + data5 = bswap_neon (data5); + data7 = vld1q_u64 ((uint64_t *) (datap + 3)); + data7 = bswap_neon (data7); + + + while (bytes_read >= 16 * 8) + { + datap += 4; + + /* Do multiplication here for four consecutive 16 byte blocks */ + data2 = + vreinterpretq_u64_p128 (vmull_p64 + (vgetq_lane_p64 + (vreinterpretq_p64_u64 (data), 0), + vgetq_lane_p64 (four_mult_constant, + 0))); + data = + vreinterpretq_u64_p128 (vmull_high_p64 + (vreinterpretq_p64_u64 (data), + four_mult_constant)); + data4 = + vreinterpretq_u64_p128 (vmull_p64 + (vgetq_lane_p64 + (vreinterpretq_p64_u64 (data3), 0), + vgetq_lane_p64 (four_mult_constant, + 0))); + data3 = + vreinterpretq_u64_p128 (vmull_high_p64 + (vreinterpretq_p64_u64 (data3), + four_mult_constant)); + data6 = + vreinterpretq_u64_p128 (vmull_p64 + (vgetq_lane_p64 + (vreinterpretq_p64_u64 (data5), 0), + vgetq_lane_p64 (four_mult_constant, + 0))); + data5 = + vreinterpretq_u64_p128 (vmull_high_p64 + (vreinterpretq_p64_u64 (data5), + four_mult_constant)); + data8 = + vreinterpretq_u64_p128 (vmull_p64 + (vgetq_lane_p64 + (vreinterpretq_p64_u64 (data7), 0), + vgetq_lane_p64 (four_mult_constant, + 0))); + data7 = + vreinterpretq_u64_p128 (vmull_high_p64 + (vreinterpretq_p64_u64 (data7), + four_mult_constant)); + + /* Now multiplication results for the four blocks is xor:ed with + next four 16 byte blocks from the buffer. This effectively + "consumes" the first four blocks from the buffer. + Keep xor result in variables for multiplication in next + round of loop. */ + data = veorq_u64 (data, data2); + data2 = vld1q_u64 ((uint64_t *) (datap)); + data2 = bswap_neon (data2); + data = veorq_u64 (data, data2); + + data3 = veorq_u64 (data3, data4); + data4 = vld1q_u64 ((uint64_t *) (datap + 1)); + data4 = bswap_neon (data4); + data3 = veorq_u64 (data3, data4); + + data5 = veorq_u64 (data5, data6); + data6 = vld1q_u64 ((uint64_t *) (datap + 2)); + data6 = bswap_neon (data6); + data5 = veorq_u64 (data5, data6); + + data7 = veorq_u64 (data7, data8); + data8 = vld1q_u64 ((uint64_t *) (datap + 3)); + data8 = bswap_neon (data8); + data7 = veorq_u64 (data7, data8); + + bytes_read -= (16 * 4); + } + /* At end of loop we write out results from variables back into + the buffer, for use in single fold loop */ + data = bswap_neon (data); + vst1q_u64 ((uint64_t *) (datap), data); + data3 = bswap_neon (data3); + vst1q_u64 ((uint64_t *) (datap + 1), data3); + data5 = bswap_neon (data5); + vst1q_u64 ((uint64_t *) (datap + 2), data5); + data7 = bswap_neon (data7); + vst1q_u64 ((uint64_t *) (datap + 3), data7); + } + + /* Fold two 16-byte blocks into one 16-byte block */ + if (bytes_read >= 32) + { + data = vld1q_u64 ((uint64_t *) (datap)); + data = bswap_neon (data); + xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (crc << 32)); + crc = 0; + data = veorq_u64 (data, xor_crc); + while (bytes_read >= 32) + { + datap++; + + data2 = + vreinterpretq_u64_p128 (vmull_p64 + (vgetq_lane_p64 + (vreinterpretq_p64_u64 (data), 0), + vgetq_lane_p64 (single_mult_constant, + 0))); + data = + vreinterpretq_u64_p128 (vmull_high_p64 + (vreinterpretq_p64_u64 (data), + single_mult_constant)); + fold_data = vld1q_u64 ((uint64_t *) (datap)); + fold_data = bswap_neon (fold_data); + data = veorq_u64 (data, data2); + data = veorq_u64 (data, fold_data); + bytes_read -= 16; + } + data = bswap_neon (data); + vst1q_u64 ((uint64_t *) (datap), data); + } + + /* And finish up last 0-31 bytes in a byte by byte fashion */ + unsigned char *cp = (unsigned char *) datap; + while (bytes_read--) + crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF]; + if (feof (fp)) + break; + } + + *crc_out = crc; + *length_out = length; + + return !ferror (fp); +} diff --git a/src/local.mk b/src/local.mk index 90c98cc507..0d505052cb 100644 --- a/src/local.mk +++ b/src/local.mk @@ -461,6 +461,13 @@ cksum_pclmul_ldadd = src/libcksum_pclmul.a src_cksum_LDADD += $(cksum_pclmul_ldadd) src_libcksum_pclmul_a_CFLAGS = -mavx -mpclmul $(AM_CFLAGS) endif +if USE_VMULL_CRC32 +noinst_LIBRARIES += src/libcksum_vmull.a +src_libcksum_vmull_a_SOURCES = src/cksum_vmull.c src/cksum.h +cksum_vmull_ldadd = src/libcksum_vmull.a +src_cksum_LDADD += $(cksum_vmull_ldadd) +src_libcksum_vmull_a_CFLAGS = -march=armv8-a+crypto $(AM_CFLAGS) +endif src_base64_SOURCES = src/basenc.c src_base64_CPPFLAGS = -DBASE_TYPE=64 $(AM_CPPFLAGS) -- 2.47.3