* configure.ac: Add checks for avx2 and avx512 support.
* src/cksum_avx2.c: AVX2 implementation.
* src/cksum_avx512.c: AVX512 implementation.
* src/local.mk: Add build flags for avx2 and avx512.
* src/cksum.c: Add avx2 and avx512 detection functions.
* src/cksum.h: Add avx2 and avx512 implementation declarations.
* NEWS: Mention the AVX2 and AVX512 improvement.
** Improvements
+'cksum -a crc' makes use of AVX2 and AVX512 extensions for time reductions
+ of 40% and 60% respectively.
+
'head -c NUM', 'head -n NUM', 'nl -l NUM', 'nproc --ignore NUM',
'tail -c NUM', 'tail -n NUM', and 'tail --max-unchanged-stats NUM'
no longer fail merely because NUM stands for 2**64 or more.
[test $utils_cv_pclmul_intrinsic_exists = yes])
CFLAGS=$ac_save_CFLAGS
+ac_save_CFLAGS=$CFLAGS
+CFLAGS=" -mavx2 -mvpclmulqdq $CFLAGS"
+AC_MSG_CHECKING([if avx2 pclmul intrinsic exists])
+AC_CACHE_VAL([utils_cv_avx2_pclmul_intrinsic_exists],[
+AC_LINK_IFELSE(
+ [AC_LANG_SOURCE([[
+ #include <x86intrin.h>
+
+ int
+ main (void)
+ {
+ __m256i a, b;
+ a = _mm256_clmulepi64_epi128 (a, b, 0x00);
+ a = _mm256_shuffle_epi8 (a, b);
+ return __builtin_cpu_supports ("avx2") &&
+ __builtin_cpu_supports ("vpclmulqdq");
+ }
+ ]])
+ ],[
+ utils_cv_avx2_pclmul_intrinsic_exists=yes
+ ],[
+ utils_cv_avx2_pclmul_intrinsic_exists=no
+ ])])
+AC_MSG_RESULT([$utils_cv_avx2_pclmul_intrinsic_exists])
+if test $utils_cv_avx2_pclmul_intrinsic_exists = yes; then
+ AC_DEFINE([USE_AVX2_CRC32], [1],
+ [CRC32 calculation by avx2 hardware instructions enabled])
+fi
+AM_CONDITIONAL([USE_AVX2_CRC32],
+ [test $utils_cv_avx2_pclmul_intrinsic_exists = yes])
+CFLAGS=$ac_save_CFLAGS
+
+ac_save_CFLAGS=$CFLAGS
+CFLAGS=" -mavx512bw -mavx512f -mvpclmulqdq $CFLAGS"
+AC_MSG_CHECKING([if avx512 pclmul intrinsic exists])
+AC_CACHE_VAL([utils_cv_avx512_pclmul_intrinsic_exists],[
+AC_LINK_IFELSE(
+ [AC_LANG_SOURCE([[
+ #include <x86intrin.h>
+
+ int
+ main (void)
+ {
+ __m512i a, b;
+ a = _mm512_clmulepi64_epi128 (a, b, 0x00);
+ a = _mm512_shuffle_epi8 (a, b);
+ return __builtin_cpu_supports ("avx512bw") &&
+ __builtin_cpu_supports ("avx512f");
+ }
+ ]])
+ ],[
+ utils_cv_avx512_pclmul_intrinsic_exists=yes
+ ],[
+ utils_cv_avx512_pclmul_intrinsic_exists=no
+ ])])
+AC_MSG_RESULT([$utils_cv_avx512_pclmul_intrinsic_exists])
+if test $utils_cv_avx512_pclmul_intrinsic_exists = yes; then
+ AC_DEFINE([USE_AVX512_CRC32], [1],
+ [CRC32 calculation by avx512 hardware instructions enabled])
+fi
+AM_CONDITIONAL([USE_AVX512_CRC32],
+ [test $utils_cv_avx512_pclmul_intrinsic_exists = yes])
+CFLAGS=$ac_save_CFLAGS
+
CFLAGS="-mavx2 $CFLAGS"
AC_MSG_CHECKING([for avx2 intrinsics])
AC_CACHE_VAL([utils_cv_avx2_intrinsic_exists],[
/* Number of bytes to read at once. */
# define BUFLEN (1 << 16)
-# if USE_PCLMUL_CRC32
static bool
pclmul_supported (void)
{
- bool pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul")
- && 0 < __builtin_cpu_supports ("avx"));
+ bool pclmul_enabled = false;
+# if USE_PCLMUL_CRC32
+ pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul")
+ && 0 < __builtin_cpu_supports ("avx"));
if (cksum_debug)
error (0, 0, "%s",
(pclmul_enabled
? _("using pclmul hardware support")
: _("pclmul support not detected")));
+# endif
return pclmul_enabled;
}
-# endif /* USE_PCLMUL_CRC32 */
+
+/* Return true if the CPU can run the AVX2 CRC32 implementation,
+   i.e. it reports both the "avx2" and "vpclmulqdq" features.
+   When cksum_debug is set, report the detection result.
+   Always returns false when built without USE_AVX2_CRC32.  */
+static bool
+avx2_supported (void)
+{
+  /* AVX512 processors will not set vpclmulqdq unless they support
+     the avx512 version, but it implies that the avx2 version
+     is supported.  */
+  bool avx2_enabled = false;
+# if USE_AVX2_CRC32
+  avx2_enabled = (0 < __builtin_cpu_supports ("vpclmulqdq")
+                  && 0 < __builtin_cpu_supports ("avx2"));
+
+  if (cksum_debug)
+    error (0, 0, "%s",
+           (avx2_enabled
+            ? _("using avx2 hardware support")
+            : _("avx2 support not detected")));
+# endif
+
+  return avx2_enabled;
+}
+
+/* Return true if the CPU can run the AVX512 CRC32 implementation,
+   i.e. it reports the "vpclmulqdq", "avx512bw", and "avx512f" features.
+   When cksum_debug is set, report the detection result.
+   Always returns false when built without USE_AVX512_CRC32.  */
+static bool
+avx512_supported (void)
+{
+  /* vpclmulqdq for multiplication,
+     mavx512f for most of the avx512 functions we're using,
+     mavx512bw for byte swapping.  */
+  bool avx512_enabled = false;
+# if USE_AVX512_CRC32
+  avx512_enabled = (0 < __builtin_cpu_supports ("vpclmulqdq")
+                    && 0 < __builtin_cpu_supports ("avx512bw")
+                    && 0 < __builtin_cpu_supports ("avx512f"));
+
+  if (cksum_debug)
+    error (0, 0, "%s",
+           (avx512_enabled
+            ? _("using avx512 hardware support")
+            : _("avx512 support not detected")));
+# endif
+
+  return avx512_enabled;
+}
static bool
cksum_slice8 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
uintmax_t total_bytes = 0;
uint_fast32_t crc = 0;
-# if USE_PCLMUL_CRC32
static bool (*cksum_fp) (FILE *, uint_fast32_t *, uintmax_t *);
if (! cksum_fp)
- cksum_fp = pclmul_supported () ? cksum_pclmul : cksum_slice8;
-# else
- bool (*cksum_fp) (FILE *, uint_fast32_t *, uintmax_t *) = cksum_slice8;
-# endif
+ {
+ if (avx512_supported ())
+ cksum_fp = cksum_avx512;
+ else if (avx2_supported ())
+ cksum_fp = cksum_avx2;
+ else if (pclmul_supported ())
+ cksum_fp = cksum_pclmul;
+ else
+ cksum_fp = cksum_slice8;
+ }
if (! cksum_fp (stream, &crc, &total_bytes))
return -1;
extern bool
cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+extern bool
+cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+
+extern bool
+cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+
extern uint_fast32_t const crctab[8][256];
#endif
--- /dev/null
+/* cksum -- calculate and print POSIX checksums and sizes of files
+ Copyright (C) 1992-2024 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <x86intrin.h>
+#include "system.h"
+
+/* Number of bytes to read at once. */
+#define BUFLEN (1 << 16)
+
+extern uint_fast32_t const crctab[8][256];
+
+extern bool
+cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+
+/* Compute the POSIX CRC of the stream FP using 256-bit carry-less
+   multiplication (VPCLMULQDQ on AVX2 registers), reading BUFLEN bytes
+   at a time.  On success, store the CRC in *CRC_OUT, the number of
+   bytes read in *LENGTH_OUT, and return true.  Return false on read
+   error, or with errno set to EOVERFLOW if the total length would
+   overflow uintmax_t.  */
+bool
+cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
+{
+  __m256i buf[BUFLEN / sizeof (__m256i)];
+  uint_fast32_t crc = 0;
+  uintmax_t length = 0;
+  size_t bytes_read;
+  __m256i single_mult_constant;
+  __m256i four_mult_constant;
+  __m256i shuffle_constant;
+
+  if (!fp || !crc_out || !length_out)
+    return false;
+
+  /* These constants and general algorithms are taken from the Intel whitepaper
+     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+   */
+  single_mult_constant = _mm256_set_epi64x (0x569700E5, 0x75BE46B7,
+                                            0x569700E5, 0x75BE46B7);
+  four_mult_constant = _mm256_set_epi64x (0x10BD4D7C, 0x567FDDEB,
+                                          0x10BD4D7C, 0x567FDDEB);
+
+  /* Constant to byteswap a full AVX2 register */
+  shuffle_constant = _mm256_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15,
+                                      0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15);
+  while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
+    {
+      __m256i data;
+      __m256i data2;
+      __m256i data3;
+      __m256i data4;
+      __m256i data5;
+      __m256i data6;
+      __m256i data7;
+      __m256i data8;
+      __m256i fold_data;
+      __m256i xor_crc;
+
+      __m256i *datap;
+
+      /* Guard against wrap-around of the running byte count.  */
+      if (length + bytes_read < length)
+        {
+          errno = EOVERFLOW;
+          return false;
+        }
+      length += bytes_read;
+
+      datap = (__m256i *)buf;
+
+      /* Fold in parallel 16x 16-byte blocks into 8x 16-byte blocks */
+      if (bytes_read >= 16 * 8 * 2)
+        {
+          data = _mm256_loadu_si256 (datap);
+          data = _mm256_shuffle_epi8 (data, shuffle_constant);
+          /* XOR in initial CRC value (for us 0 so no effect), or CRC value
+             calculated for previous BUFLEN buffer from fread */
+          xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0);
+          crc = 0;
+          data = _mm256_xor_si256 (data, xor_crc);
+          data3 = _mm256_loadu_si256 (datap + 1);
+          data3 = _mm256_shuffle_epi8 (data3, shuffle_constant);
+          data5 = _mm256_loadu_si256 (datap + 2);
+          data5 = _mm256_shuffle_epi8 (data5, shuffle_constant);
+          data7 = _mm256_loadu_si256 (datap + 3);
+          data7 = _mm256_shuffle_epi8 (data7, shuffle_constant);
+
+          while (bytes_read >= 16 * 8 * 2)
+            {
+              datap += 4;
+
+              /* Do multiplication here for 8x consecutive 16 byte blocks */
+              data2 = _mm256_clmulepi64_epi128 (data, four_mult_constant,
+                                                0x00);
+              data = _mm256_clmulepi64_epi128 (data, four_mult_constant,
+                                               0x11);
+              data4 = _mm256_clmulepi64_epi128 (data3, four_mult_constant,
+                                                0x00);
+              data3 = _mm256_clmulepi64_epi128 (data3, four_mult_constant,
+                                                0x11);
+              data6 = _mm256_clmulepi64_epi128 (data5, four_mult_constant,
+                                                0x00);
+              data5 = _mm256_clmulepi64_epi128 (data5, four_mult_constant,
+                                                0x11);
+              data8 = _mm256_clmulepi64_epi128 (data7, four_mult_constant,
+                                                0x00);
+              data7 = _mm256_clmulepi64_epi128 (data7, four_mult_constant,
+                                                0x11);
+
+              /* Now multiplication results for the 8x blocks is xor:ed with
+                 next 8x 16 byte blocks from the buffer. This effectively
+                 "consumes" the first 8x blocks from the buffer.
+                 Keep xor result in variables for multiplication in next
+                 round of loop. */
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 = _mm256_shuffle_epi8 (data2, shuffle_constant);
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 = _mm256_shuffle_epi8 (data4, shuffle_constant);
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 = _mm256_shuffle_epi8 (data6, shuffle_constant);
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 = _mm256_shuffle_epi8 (data8, shuffle_constant);
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (16 * 4 * 2);
+            }
+          /* At end of loop we write out results from variables back into
+             the buffer, for use in single fold loop */
+          data = _mm256_shuffle_epi8 (data, shuffle_constant);
+          _mm256_storeu_si256 (datap, data);
+          data3 = _mm256_shuffle_epi8 (data3, shuffle_constant);
+          _mm256_storeu_si256 (datap + 1, data3);
+          data5 = _mm256_shuffle_epi8 (data5, shuffle_constant);
+          _mm256_storeu_si256 (datap + 2, data5);
+          data7 = _mm256_shuffle_epi8 (data7, shuffle_constant);
+          _mm256_storeu_si256 (datap + 3, data7);
+        }
+
+      /* Fold two 32-byte blocks into one 32-byte block */
+      if (bytes_read >= 64)
+        {
+          data = _mm256_loadu_si256 (datap);
+          data = _mm256_shuffle_epi8 (data, shuffle_constant);
+          xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0);
+          crc = 0;
+          data = _mm256_xor_si256 (data, xor_crc);
+          while (bytes_read >= 64)
+            {
+              datap++;
+
+              data2 = _mm256_clmulepi64_epi128 (data, single_mult_constant,
+                                                0x00);
+              data = _mm256_clmulepi64_epi128 (data, single_mult_constant,
+                                               0x11);
+              fold_data = _mm256_loadu_si256 (datap);
+              fold_data = _mm256_shuffle_epi8 (fold_data, shuffle_constant);
+              data = _mm256_xor_si256 (data, data2);
+              data = _mm256_xor_si256 (data, fold_data);
+              bytes_read -= 32;
+            }
+          data = _mm256_shuffle_epi8 (data, shuffle_constant);
+          _mm256_storeu_si256 (datap, data);
+        }
+
+      /* And finish up last 0-63 bytes in a byte by byte fashion */
+      unsigned char *cp = (unsigned char *)datap;
+      while (bytes_read--)
+        crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
+      if (feof (fp))
+        break;
+    }
+
+  *crc_out = crc;
+  *length_out = length;
+
+  return !ferror (fp);
+}
--- /dev/null
+/* cksum -- calculate and print POSIX checksums and sizes of files
+ Copyright (C) 1992-2024 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <x86intrin.h>
+#include "system.h"
+
+/* Number of bytes to read at once. */
+#define BUFLEN (1 << 16)
+
+extern uint_fast32_t const crctab[8][256];
+
+extern bool
+cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+
+/* Compute the POSIX CRC of the stream FP using 512-bit carry-less
+   multiplication (VPCLMULQDQ on AVX512 registers), reading BUFLEN bytes
+   at a time.  On success, store the CRC in *CRC_OUT, the number of
+   bytes read in *LENGTH_OUT, and return true.  Return false on read
+   error, or with errno set to EOVERFLOW if the total length would
+   overflow uintmax_t.  */
+bool
+cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
+{
+  __m512i buf[BUFLEN / sizeof (__m512i)];
+  uint_fast32_t crc = 0;
+  uintmax_t length = 0;
+  size_t bytes_read;
+  __m512i single_mult_constant;
+  __m512i four_mult_constant;
+  __m512i shuffle_constant;
+
+  if (!fp || !crc_out || !length_out)
+    return false;
+
+  /* These constants and general algorithms are taken from the Intel whitepaper
+     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+   */
+  single_mult_constant = _mm512_set_epi64 (0x8833794C, 0xE6228B11,
+                                           0x8833794C, 0xE6228B11,
+                                           0x8833794C, 0xE6228B11,
+                                           0x8833794C, 0xE6228B11);
+  four_mult_constant = _mm512_set_epi64 (0xCBCF3BCB, 0x88FE2237,
+                                         0xCBCF3BCB, 0x88FE2237,
+                                         0xCBCF3BCB, 0x88FE2237,
+                                         0xCBCF3BCB, 0x88FE2237);
+
+  /* Constant to byteswap a full AVX512 register */
+  shuffle_constant = _mm512_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15,
+                                      0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15,
+                                      0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15,
+                                      0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15);
+  while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
+    {
+      __m512i data;
+      __m512i data2;
+      __m512i data3;
+      __m512i data4;
+      __m512i data5;
+      __m512i data6;
+      __m512i data7;
+      __m512i data8;
+      __m512i fold_data;
+      __m512i xor_crc;
+
+      __m512i *datap;
+
+      /* Guard against wrap-around of the running byte count.  */
+      if (length + bytes_read < length)
+        {
+          errno = EOVERFLOW;
+          return false;
+        }
+      length += bytes_read;
+
+      datap = (__m512i *)buf;
+
+      /* Fold in parallel 32x 16-byte blocks into 16x 16-byte blocks */
+      if (bytes_read >= 16 * 8 * 4)
+        {
+          data = _mm512_loadu_si512 (datap);
+          data = _mm512_shuffle_epi8 (data, shuffle_constant);
+          /* XOR in initial CRC value (for us 0 so no effect), or CRC value
+             calculated for previous BUFLEN buffer from fread */
+          xor_crc = _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, crc, 0, 0, 0);
+          crc = 0;
+          data = _mm512_xor_si512 (data, xor_crc);
+          data3 = _mm512_loadu_si512 (datap + 1);
+          data3 = _mm512_shuffle_epi8 (data3, shuffle_constant);
+          data5 = _mm512_loadu_si512 (datap + 2);
+          data5 = _mm512_shuffle_epi8 (data5, shuffle_constant);
+          data7 = _mm512_loadu_si512 (datap + 3);
+          data7 = _mm512_shuffle_epi8 (data7, shuffle_constant);
+
+          while (bytes_read >= 16 * 8 * 4)
+            {
+              datap += 4;
+
+              /* Do multiplication here for 16x consecutive 16 byte blocks */
+              data2 = _mm512_clmulepi64_epi128 (data, four_mult_constant,
+                                                0x00);
+              data = _mm512_clmulepi64_epi128 (data, four_mult_constant,
+                                               0x11);
+              data4 = _mm512_clmulepi64_epi128 (data3, four_mult_constant,
+                                                0x00);
+              data3 = _mm512_clmulepi64_epi128 (data3, four_mult_constant,
+                                                0x11);
+              data6 = _mm512_clmulepi64_epi128 (data5, four_mult_constant,
+                                                0x00);
+              data5 = _mm512_clmulepi64_epi128 (data5, four_mult_constant,
+                                                0x11);
+              data8 = _mm512_clmulepi64_epi128 (data7, four_mult_constant,
+                                                0x00);
+              data7 = _mm512_clmulepi64_epi128 (data7, four_mult_constant,
+                                                0x11);
+
+              /* Now multiplication results for the 16x blocks is xor:ed with
+                 next 16x 16 byte blocks from the buffer. This effectively
+                 "consumes" the first 16x blocks from the buffer.
+                 Keep xor result in variables for multiplication in next
+                 round of loop. */
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 = _mm512_shuffle_epi8 (data2, shuffle_constant);
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 = _mm512_shuffle_epi8 (data4, shuffle_constant);
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 = _mm512_shuffle_epi8 (data6, shuffle_constant);
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 = _mm512_shuffle_epi8 (data8, shuffle_constant);
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (16 * 4 * 4);
+            }
+          /* At end of loop we write out results from variables back into
+             the buffer, for use in single fold loop */
+          data = _mm512_shuffle_epi8 (data, shuffle_constant);
+          _mm512_storeu_si512 (datap, data);
+          data3 = _mm512_shuffle_epi8 (data3, shuffle_constant);
+          _mm512_storeu_si512 (datap + 1, data3);
+          data5 = _mm512_shuffle_epi8 (data5, shuffle_constant);
+          _mm512_storeu_si512 (datap + 2, data5);
+          data7 = _mm512_shuffle_epi8 (data7, shuffle_constant);
+          _mm512_storeu_si512 (datap + 3, data7);
+        }
+
+      /* Fold two 64-byte blocks into one 64-byte block */
+      if (bytes_read >= 128)
+        {
+          data = _mm512_loadu_si512 (datap);
+          data = _mm512_shuffle_epi8 (data, shuffle_constant);
+          xor_crc = _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, crc, 0, 0, 0);
+          crc = 0;
+          data = _mm512_xor_si512 (data, xor_crc);
+          while (bytes_read >= 128)
+            {
+              datap++;
+
+              data2 = _mm512_clmulepi64_epi128 (data, single_mult_constant,
+                                                0x00);
+              data = _mm512_clmulepi64_epi128 (data, single_mult_constant,
+                                               0x11);
+              fold_data = _mm512_loadu_si512 (datap);
+              fold_data = _mm512_shuffle_epi8 (fold_data, shuffle_constant);
+              data = _mm512_xor_si512 (data, data2);
+              data = _mm512_xor_si512 (data, fold_data);
+              bytes_read -= 64;
+            }
+          data = _mm512_shuffle_epi8 (data, shuffle_constant);
+          _mm512_storeu_si512 (datap, data);
+        }
+
+      /* And finish up last 0-127 bytes in a byte by byte fashion */
+      unsigned char *cp = (unsigned char *)datap;
+      while (bytes_read--)
+        crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
+      if (feof (fp))
+        break;
+    }
+
+  *crc_out = crc;
+  *length_out = length;
+
+  return !ferror (fp);
+}
src_cksum_SOURCES = $(src_b2sum_SOURCES) src/sum.c src/sum.h \
src/cksum.c src/cksum.h src/crctab.c
src_cksum_CPPFLAGS = -DHASH_ALGO_CKSUM=1 -DHAVE_CONFIG_H $(AM_CPPFLAGS)
+
+if USE_AVX512_CRC32
+noinst_LIBRARIES += src/libcksum_avx512.a
+src_libcksum_avx512_a_SOURCES = src/cksum_avx512.c src/cksum.h
+cksum_avx512_ldadd = src/libcksum_avx512.a
+src_cksum_LDADD += $(cksum_avx512_ldadd)
+src_libcksum_avx512_a_CFLAGS = -mavx512bw -mavx512f -mvpclmulqdq $(AM_CFLAGS)
+endif
+if USE_AVX2_CRC32
+noinst_LIBRARIES += src/libcksum_avx2.a
+src_libcksum_avx2_a_SOURCES = src/cksum_avx2.c src/cksum.h
+cksum_avx2_ldadd = src/libcksum_avx2.a
+src_cksum_LDADD += $(cksum_avx2_ldadd)
+src_libcksum_avx2_a_CFLAGS = -mpclmul -mavx -mavx2 -mvpclmulqdq $(AM_CFLAGS)
+endif
if USE_PCLMUL_CRC32
noinst_LIBRARIES += src/libcksum_pclmul.a
src_libcksum_pclmul_a_SOURCES = src/cksum_pclmul.c src/cksum.h