From: Sam Russell <sam.h.russell@gmail.com>
Date: Mon, 25 Nov 2024 23:25:00 +0000 (+0100)
Subject: cksum: use AVX2 and AVX512 for speedup
X-Git-Tag: v9.6~53
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=df71ac83436e6876f098495ea58f0df9ee7c31cd;p=thirdparty%2Fcoreutils.git

cksum: use AVX2 and AVX512 for speedup

* configure.ac: Add checks for avx2 and avx512 support.
* src/cksum_avx2.c: AVX2 implementation.
* src/cksum_avx512.c: AVX512 implementation.
* src/local.mk: Add build flags for avx2 and avx512.
* src/cksum.c: Add avx2 and avx512 detection functions.
* src/cksum.h: Add avx2 and avx512 implementation declarations.
* NEWS: Mention the AVX2 and AVX512 improvement.
---

diff --git a/NEWS b/NEWS
index 1852f311f9..5bd9199e43 100644
--- a/NEWS
+++ b/NEWS
@@ -66,6 +66,9 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 ** Improvements
 
+  cksum -a crc, makes use of AVX2 and AVX512 extensions for time reductions
+  of 40% and 60% respectively.
+
   'head -c NUM', 'head -n NUM', 'nl -l NUM', 'nproc --ignore NUM',
   'tail -c NUM', 'tail -n NUM', and 'tail --max-unchanged-stats NUM'
   no longer fail merely because NUM stands for 2**64 or more.
diff --git a/configure.ac b/configure.ac
index f2557ce0d1..17fa23b455 100644
--- a/configure.ac
+++ b/configure.ac
@@ -649,6 +649,70 @@ AM_CONDITIONAL([USE_PCLMUL_CRC32],
                [test $utils_cv_pclmul_intrinsic_exists = yes])
 CFLAGS=$ac_save_CFLAGS
 
+ac_save_CFLAGS=$CFLAGS
+CFLAGS=" -mavx2 -mvpclmulqdq $CFLAGS"
+AC_MSG_CHECKING([if avx2 pclmul intrinsic exists])
+AC_CACHE_VAL([utils_cv_avx2_pclmul_intrinsic_exists],[
+AC_LINK_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <x86intrin.h>
+
+    int
+    main (void)
+    {
+      __m256i a, b;
+      a = _mm256_clmulepi64_epi128 (a, b, 0x00);
+      a = _mm256_shuffle_epi8 (a, b);
+      return __builtin_cpu_supports ("avx2") &&
+             __builtin_cpu_supports ("vpclmulqdq");
+    }
+  ]])
+  ],[
+  utils_cv_avx2_pclmul_intrinsic_exists=yes
+  ],[
+  utils_cv_avx2_pclmul_intrinsic_exists=no
+  ])])
+AC_MSG_RESULT([$utils_cv_avx2_pclmul_intrinsic_exists])
+if test $utils_cv_avx2_pclmul_intrinsic_exists = yes; then
+  AC_DEFINE([USE_AVX2_CRC32], [1],
+            [CRC32 calculation by avx2 hardware instructions enabled])
+fi
+AM_CONDITIONAL([USE_AVX2_CRC32],
+               [test $utils_cv_avx2_pclmul_intrinsic_exists = yes])
+CFLAGS=$ac_save_CFLAGS
+
+ac_save_CFLAGS=$CFLAGS
+CFLAGS=" -mavx512bw -mavx512f -mvpclmulqdq $CFLAGS"
+AC_MSG_CHECKING([if avx512 pclmul intrinsic exists])
+AC_CACHE_VAL([utils_cv_avx512_pclmul_intrinsic_exists],[
+AC_LINK_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <x86intrin.h>
+
+    int
+    main (void)
+    {
+      __m512i a, b;
+      a = _mm512_clmulepi64_epi128 (a, b, 0x00);
+      a = _mm512_shuffle_epi8 (a, b);
+      return __builtin_cpu_supports ("avx512bw") &&
+             __builtin_cpu_supports ("avx512f");
+    }
+  ]])
+  ],[
+  utils_cv_avx512_pclmul_intrinsic_exists=yes
+  ],[
+  utils_cv_avx512_pclmul_intrinsic_exists=no
+  ])])
+AC_MSG_RESULT([$utils_cv_avx512_pclmul_intrinsic_exists])
+if test $utils_cv_avx512_pclmul_intrinsic_exists = yes; then
+  AC_DEFINE([USE_AVX512_CRC32], [1],
+            [CRC32 calculation by avx512 hardware instructions enabled])
+fi
+AM_CONDITIONAL([USE_AVX512_CRC32],
+               [test $utils_cv_avx512_pclmul_intrinsic_exists = yes])
+CFLAGS=$ac_save_CFLAGS + CFLAGS="-mavx2 $CFLAGS" AC_MSG_CHECKING([for avx2 intrinsics]) AC_CACHE_VAL([utils_cv_avx2_intrinsic_exists],[ diff --git a/src/cksum.c b/src/cksum.c index a977bf2960..5900d141ec 100644 --- a/src/cksum.c +++ b/src/cksum.c @@ -140,22 +140,66 @@ main (void) /* Number of bytes to read at once. */ # define BUFLEN (1 << 16) -# if USE_PCLMUL_CRC32 static bool pclmul_supported (void) { - bool pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul") - && 0 < __builtin_cpu_supports ("avx")); + bool pclmul_enabled = false; +# if USE_PCLMUL_CRC32 + pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul") + && 0 < __builtin_cpu_supports ("avx")); if (cksum_debug) error (0, 0, "%s", (pclmul_enabled ? _("using pclmul hardware support") : _("pclmul support not detected"))); +# endif return pclmul_enabled; } -# endif /* USE_PCLMUL_CRC32 */ + +static bool +avx2_supported (void) +{ + /* AVX512 processors will not set vpclmulqdq unless they support + the avx512 version, but it implies that the avx2 version + is supported */ + bool avx2_enabled = false; +# if USE_AVX2_CRC32 + avx2_enabled = (0 < __builtin_cpu_supports ("vpclmulqdq") + && 0 < __builtin_cpu_supports ("avx2")); + + if (cksum_debug) + error (0, 0, "%s", + (avx2_enabled + ? _("using avx2 hardware support") + : _("avx2 support not detected"))); +# endif + + return avx2_enabled; +} + +static bool +avx512_supported (void) +{ + /* vpclmulqdq for multiplication + mavx512f for most of the avx512 functions we're using + mavx512bw for byte swapping */ + bool avx512_enabled = false; +# if USE_AVX512_CRC32 + avx512_enabled = (0 < __builtin_cpu_supports ("vpclmulqdq") + && 0 < __builtin_cpu_supports ("avx512bw") + && 0 < __builtin_cpu_supports ("avx512f")); + + if (cksum_debug) + error (0, 0, "%s", + (avx512_enabled + ? 
_("using avx512 hardware support") + : _("avx512 support not detected"))); +# endif + + return avx512_enabled; +} static bool cksum_slice8 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) @@ -220,13 +264,18 @@ crc_sum_stream (FILE *stream, void *resstream, uintmax_t *length) uintmax_t total_bytes = 0; uint_fast32_t crc = 0; -# if USE_PCLMUL_CRC32 static bool (*cksum_fp) (FILE *, uint_fast32_t *, uintmax_t *); if (! cksum_fp) - cksum_fp = pclmul_supported () ? cksum_pclmul : cksum_slice8; -# else - bool (*cksum_fp) (FILE *, uint_fast32_t *, uintmax_t *) = cksum_slice8; -# endif + { + if (avx512_supported ()) + cksum_fp = cksum_avx512; + else if (avx2_supported ()) + cksum_fp = cksum_avx2; + else if (pclmul_supported ()) + cksum_fp = cksum_pclmul; + else + cksum_fp = cksum_slice8; + } if (! cksum_fp (stream, &crc, &total_bytes)) return -1; diff --git a/src/cksum.h b/src/cksum.h index f8b2799736..6e8a5d0080 100644 --- a/src/cksum.h +++ b/src/cksum.h @@ -17,6 +17,12 @@ output_crc (char const *file, int binary_file, void const *digest, bool raw, extern bool cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); +extern bool +cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); + +extern bool +cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); + extern uint_fast32_t const crctab[8][256]; #endif diff --git a/src/cksum_avx2.c b/src/cksum_avx2.c new file mode 100644 index 0000000000..252e01d6c8 --- /dev/null +++ b/src/cksum_avx2.c @@ -0,0 +1,200 @@ +/* cksum -- calculate and print POSIX checksums and sizes of files + Copyright (C) 1992-2024 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <x86intrin.h>
+#include "system.h"
+
+/* Number of bytes to read at once.  */
+#define BUFLEN (1 << 16)
+
+extern uint_fast32_t const crctab[8][256];
+
+extern bool
+cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+
+bool
+cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
+{
+  __m256i buf[BUFLEN / sizeof (__m256i)];
+  uint_fast32_t crc = 0;
+  uintmax_t length = 0;
+  size_t bytes_read;
+  __m256i single_mult_constant;
+  __m256i four_mult_constant;
+  __m256i shuffle_constant;
+
+  if (!fp || !crc_out || !length_out)
+    return false;
+
+  /* These constants and general algorithms are taken from the Intel whitepaper
+     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+   */
+  single_mult_constant = _mm256_set_epi64x (0x569700E5, 0x75BE46B7,
+                                            0x569700E5, 0x75BE46B7);
+  four_mult_constant = _mm256_set_epi64x (0x10BD4D7C, 0x567FDDEB,
+                                          0x10BD4D7C, 0x567FDDEB);
+
+  /* Constant to byteswap a full AVX2 register */
+  shuffle_constant = _mm256_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15,
+                                      0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15);
+  while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
+    {
+      __m256i data;
+      __m256i data2;
+      __m256i data3;
+      __m256i data4;
+      __m256i data5;
+      __m256i data6;
+      __m256i data7;
+      __m256i data8;
+      __m256i fold_data;
+      __m256i xor_crc;
+
+      __m256i *datap;
+
+      if (length + bytes_read < length)
+        {
+          errno = EOVERFLOW;
+          return false;
+        }
+      length += bytes_read;
+
+      datap = (__m256i *)buf;
+
+      /* Fold in parallel 16x 16-byte blocks into 8x 16-byte blocks */
+      if
(bytes_read >= 16 * 8 * 2) + { + data = _mm256_loadu_si256 (datap); + data = _mm256_shuffle_epi8 (data, shuffle_constant); + /* XOR in initial CRC value (for us 0 so no effect), or CRC value + calculated for previous BUFLEN buffer from fread */ + xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0); + crc = 0; + data = _mm256_xor_si256 (data, xor_crc); + data3 = _mm256_loadu_si256 (datap + 1); + data3 = _mm256_shuffle_epi8 (data3, shuffle_constant); + data5 = _mm256_loadu_si256 (datap + 2); + data5 = _mm256_shuffle_epi8 (data5, shuffle_constant); + data7 = _mm256_loadu_si256 (datap + 3); + data7 = _mm256_shuffle_epi8 (data7, shuffle_constant); + + while (bytes_read >= 16 * 8 * 2) + { + datap += 4; + + /* Do multiplication here for 8x consecutive 16 byte blocks */ + data2 = _mm256_clmulepi64_epi128 (data, four_mult_constant, + 0x00); + data = _mm256_clmulepi64_epi128 (data, four_mult_constant, + 0x11); + data4 = _mm256_clmulepi64_epi128 (data3, four_mult_constant, + 0x00); + data3 = _mm256_clmulepi64_epi128 (data3, four_mult_constant, + 0x11); + data6 = _mm256_clmulepi64_epi128 (data5, four_mult_constant, + 0x00); + data5 = _mm256_clmulepi64_epi128 (data5, four_mult_constant, + 0x11); + data8 = _mm256_clmulepi64_epi128 (data7, four_mult_constant, + 0x00); + data7 = _mm256_clmulepi64_epi128 (data7, four_mult_constant, + 0x11); + + /* Now multiplication results for the 8x blocks is xor:ed with + next 8x 16 byte blocks from the buffer. This effectively + "consumes" the first 8x blocks from the buffer. + Keep xor result in variables for multiplication in next + round of loop. 
*/ + data = _mm256_xor_si256 (data, data2); + data2 = _mm256_loadu_si256 (datap); + data2 = _mm256_shuffle_epi8 (data2, shuffle_constant); + data = _mm256_xor_si256 (data, data2); + + data3 = _mm256_xor_si256 (data3, data4); + data4 = _mm256_loadu_si256 (datap + 1); + data4 = _mm256_shuffle_epi8 (data4, shuffle_constant); + data3 = _mm256_xor_si256 (data3, data4); + + data5 = _mm256_xor_si256 (data5, data6); + data6 = _mm256_loadu_si256 (datap + 2); + data6 = _mm256_shuffle_epi8 (data6, shuffle_constant); + data5 = _mm256_xor_si256 (data5, data6); + + data7 = _mm256_xor_si256 (data7, data8); + data8 = _mm256_loadu_si256 (datap + 3); + data8 = _mm256_shuffle_epi8 (data8, shuffle_constant); + data7 = _mm256_xor_si256 (data7, data8); + + bytes_read -= (16 * 4 * 2); + } + /* At end of loop we write out results from variables back into + the buffer, for use in single fold loop */ + data = _mm256_shuffle_epi8 (data, shuffle_constant); + _mm256_storeu_si256 (datap, data); + data3 = _mm256_shuffle_epi8 (data3, shuffle_constant); + _mm256_storeu_si256 (datap + 1, data3); + data5 = _mm256_shuffle_epi8 (data5, shuffle_constant); + _mm256_storeu_si256 (datap + 2, data5); + data7 = _mm256_shuffle_epi8 (data7, shuffle_constant); + _mm256_storeu_si256 (datap + 3, data7); + } + + /* Fold two 32-byte blocks into one 32-byte block */ + if (bytes_read >= 64) + { + data = _mm256_loadu_si256 (datap); + data = _mm256_shuffle_epi8 (data, shuffle_constant); + xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0); + crc = 0; + data = _mm256_xor_si256 (data, xor_crc); + while (bytes_read >= 64) + { + datap++; + + data2 = _mm256_clmulepi64_epi128 (data, single_mult_constant, + 0x00); + data = _mm256_clmulepi64_epi128 (data, single_mult_constant, + 0x11); + fold_data = _mm256_loadu_si256 (datap); + fold_data = _mm256_shuffle_epi8 (fold_data, shuffle_constant); + data = _mm256_xor_si256 (data, data2); + data = _mm256_xor_si256 (data, fold_data); + bytes_read -= 32; + } + data = 
_mm256_shuffle_epi8 (data, shuffle_constant);
+          _mm256_storeu_si256 (datap, data);
+        }
+
+      /* And finish up last 0-63 bytes in a byte by byte fashion */
+      unsigned char *cp = (unsigned char *)datap;
+      while (bytes_read--)
+        crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
+      if (feof (fp))
+        break;
+    }
+
+  *crc_out = crc;
+  *length_out = length;
+
+  return !ferror (fp);
+}
diff --git a/src/cksum_avx512.c b/src/cksum_avx512.c
new file mode 100644
index 0000000000..5f8ff2375b
--- /dev/null
+++ b/src/cksum_avx512.c
@@ -0,0 +1,210 @@
+/* cksum -- calculate and print POSIX checksums and sizes of files
+   Copyright (C) 1992-2024 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <x86intrin.h>
+#include "system.h"
+
+/* Number of bytes to read at once.
*/ +#define BUFLEN (1 << 16) + +extern uint_fast32_t const crctab[8][256]; + +extern bool +cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); + +bool +cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) +{ + __m512i buf[BUFLEN / sizeof (__m512i)]; + uint_fast32_t crc = 0; + uintmax_t length = 0; + size_t bytes_read; + __m512i single_mult_constant; + __m512i four_mult_constant; + __m512i shuffle_constant; + + if (!fp || !crc_out || !length_out) + return false; + + /* These constants and general algorithms are taken from the Intel whitepaper + "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + */ + single_mult_constant = _mm512_set_epi64 (0x8833794C, 0xE6228B11, + 0x8833794C, 0xE6228B11, + 0x8833794C, 0xE6228B11, + 0x8833794C, 0xE6228B11); + four_mult_constant = _mm512_set_epi64 (0xCBCF3BCB, 0x88FE2237, + 0xCBCF3BCB, 0x88FE2237, + 0xCBCF3BCB, 0x88FE2237, + 0xCBCF3BCB, 0x88FE2237); + + /* Constant to byteswap a full AVX512 register */ + shuffle_constant = _mm512_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15); + while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) + { + __m512i data; + __m512i data2; + __m512i data3; + __m512i data4; + __m512i data5; + __m512i data6; + __m512i data7; + __m512i data8; + __m512i fold_data; + __m512i xor_crc; + + __m512i *datap; + + if (length + bytes_read < length) + { + errno = EOVERFLOW; + return false; + } + length += bytes_read; + + datap = (__m512i *)buf; + + /* Fold in parallel 32x 16-byte blocks into 16x 16-byte blocks */ + if (bytes_read >= 16 * 8 * 4) + { + data = _mm512_loadu_si512 (datap); + data = _mm512_shuffle_epi8 (data, shuffle_constant); + /* XOR in initial CRC value (for us 0 so no effect), or CRC value + calculated for previous BUFLEN buffer from fread */ + 
xor_crc = _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, crc, 0, 0, 0); + crc = 0; + data = _mm512_xor_si512 (data, xor_crc); + data3 = _mm512_loadu_si512 (datap + 1); + data3 = _mm512_shuffle_epi8 (data3, shuffle_constant); + data5 = _mm512_loadu_si512 (datap + 2); + data5 = _mm512_shuffle_epi8 (data5, shuffle_constant); + data7 = _mm512_loadu_si512 (datap + 3); + data7 = _mm512_shuffle_epi8 (data7, shuffle_constant); + + while (bytes_read >= 16 * 8 * 4) + { + datap += 4; + + /* Do multiplication here for 16x consecutive 16 byte blocks */ + data2 = _mm512_clmulepi64_epi128 (data, four_mult_constant, + 0x00); + data = _mm512_clmulepi64_epi128 (data, four_mult_constant, + 0x11); + data4 = _mm512_clmulepi64_epi128 (data3, four_mult_constant, + 0x00); + data3 = _mm512_clmulepi64_epi128 (data3, four_mult_constant, + 0x11); + data6 = _mm512_clmulepi64_epi128 (data5, four_mult_constant, + 0x00); + data5 = _mm512_clmulepi64_epi128 (data5, four_mult_constant, + 0x11); + data8 = _mm512_clmulepi64_epi128 (data7, four_mult_constant, + 0x00); + data7 = _mm512_clmulepi64_epi128 (data7, four_mult_constant, + 0x11); + + /* Now multiplication results for the 16x blocks is xor:ed with + next 16x 16 byte blocks from the buffer. This effectively + "consumes" the first 16x blocks from the buffer. + Keep xor result in variables for multiplication in next + round of loop. 
*/ + data = _mm512_xor_si512 (data, data2); + data2 = _mm512_loadu_si512 (datap); + data2 = _mm512_shuffle_epi8 (data2, shuffle_constant); + data = _mm512_xor_si512 (data, data2); + + data3 = _mm512_xor_si512 (data3, data4); + data4 = _mm512_loadu_si512 (datap + 1); + data4 = _mm512_shuffle_epi8 (data4, shuffle_constant); + data3 = _mm512_xor_si512 (data3, data4); + + data5 = _mm512_xor_si512 (data5, data6); + data6 = _mm512_loadu_si512 (datap + 2); + data6 = _mm512_shuffle_epi8 (data6, shuffle_constant); + data5 = _mm512_xor_si512 (data5, data6); + + data7 = _mm512_xor_si512 (data7, data8); + data8 = _mm512_loadu_si512 (datap + 3); + data8 = _mm512_shuffle_epi8 (data8, shuffle_constant); + data7 = _mm512_xor_si512 (data7, data8); + + bytes_read -= (16 * 4 * 4); + } + /* At end of loop we write out results from variables back into + the buffer, for use in single fold loop */ + data = _mm512_shuffle_epi8 (data, shuffle_constant); + _mm512_storeu_si512 (datap, data); + data3 = _mm512_shuffle_epi8 (data3, shuffle_constant); + _mm512_storeu_si512 (datap + 1, data3); + data5 = _mm512_shuffle_epi8 (data5, shuffle_constant); + _mm512_storeu_si512 (datap + 2, data5); + data7 = _mm512_shuffle_epi8 (data7, shuffle_constant); + _mm512_storeu_si512 (datap + 3, data7); + } + + /* Fold two 64-byte blocks into one 64-byte block */ + if (bytes_read >= 128) + { + data = _mm512_loadu_si512 (datap); + data = _mm512_shuffle_epi8 (data, shuffle_constant); + xor_crc = _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, crc, 0, 0, 0); + crc = 0; + data = _mm512_xor_si512 (data, xor_crc); + while (bytes_read >= 128) + { + datap++; + + data2 = _mm512_clmulepi64_epi128 (data, single_mult_constant, + 0x00); + data = _mm512_clmulepi64_epi128 (data, single_mult_constant, + 0x11); + fold_data = _mm512_loadu_si512 (datap); + fold_data = _mm512_shuffle_epi8 (fold_data, shuffle_constant); + data = _mm512_xor_si512 (data, data2); + data = _mm512_xor_si512 (data, fold_data); + bytes_read -= 64; 
+ } + data = _mm512_shuffle_epi8 (data, shuffle_constant); + _mm512_storeu_si512 (datap, data); + } + + /* And finish up last 0-127 bytes in a byte by byte fashion */ + unsigned char *cp = (unsigned char *)datap; + while (bytes_read--) + crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF]; + if (feof (fp)) + break; + } + + *crc_out = crc; + *length_out = length; + + return !ferror (fp); +} diff --git a/src/local.mk b/src/local.mk index feea9a6c97..90c98cc507 100644 --- a/src/local.mk +++ b/src/local.mk @@ -439,6 +439,21 @@ src_b2sum_SOURCES = src/digest.c \ src_cksum_SOURCES = $(src_b2sum_SOURCES) src/sum.c src/sum.h \ src/cksum.c src/cksum.h src/crctab.c src_cksum_CPPFLAGS = -DHASH_ALGO_CKSUM=1 -DHAVE_CONFIG_H $(AM_CPPFLAGS) + +if USE_AVX512_CRC32 +noinst_LIBRARIES += src/libcksum_avx512.a +src_libcksum_avx512_a_SOURCES = src/cksum_avx512.c src/cksum.h +cksum_avx512_ldadd = src/libcksum_avx512.a +src_cksum_LDADD += $(cksum_avx512_ldadd) +src_libcksum_avx512_a_CFLAGS = -mavx512bw -mavx512f -mvpclmulqdq $(AM_CFLAGS) +endif +if USE_AVX2_CRC32 +noinst_LIBRARIES += src/libcksum_avx2.a +src_libcksum_avx2_a_SOURCES = src/cksum_avx2.c src/cksum.h +cksum_avx2_ldadd = src/libcksum_avx2.a +src_cksum_LDADD += $(cksum_avx2_ldadd) +src_libcksum_avx2_a_CFLAGS = -mpclmul -mavx -mavx2 -mvpclmulqdq $(AM_CFLAGS) +endif if USE_PCLMUL_CRC32 noinst_LIBRARIES += src/libcksum_pclmul.a src_libcksum_pclmul_a_SOURCES = src/cksum_pclmul.c src/cksum.h