From: Sam Russell <sam.h.russell@gmail.com>
Date: Mon, 25 Nov 2024 23:25:00 +0000 (+0100)
Subject: cksum: use AVX2 and AVX512 for speedup
X-Git-Tag: v9.6~53
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=df71ac83436e6876f098495ea58f0df9ee7c31cd;p=thirdparty%2Fcoreutils.git

cksum: use AVX2 and AVX512 for speedup

* configure.ac: Add checks for avx2 and avx512 support.
* src/cksum_avx2.c: AVX2 implementation.
* src/cksum_avx512.c: AVX512 implementation.
* src/local.mk: Add build flags for avx2 and avx512.
* src/cksum.c: Add avx2 and avx512 detection functions.
* src/cksum.h: Add avx2 and avx512 implementation declarations.
* NEWS: Mention the AVX2 and AVX512 improvement.
---

diff --git a/NEWS b/NEWS
index 1852f311f9..5bd9199e43 100644
--- a/NEWS
+++ b/NEWS
@@ -66,6 +66,9 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 ** Improvements
 
+  cksum -a crc, makes use of AVX2 and AVX512 extensions for time reductions
+  of 40% and 60% respectively.
+
   'head -c NUM', 'head -n NUM', 'nl -l NUM', 'nproc --ignore NUM',
   'tail -c NUM', 'tail -n NUM', and 'tail --max-unchanged-stats NUM'
   no longer fail merely because NUM stands for 2**64 or more.
diff --git a/configure.ac b/configure.ac
index f2557ce0d1..17fa23b455 100644
--- a/configure.ac
+++ b/configure.ac
@@ -649,6 +649,70 @@ AM_CONDITIONAL([USE_PCLMUL_CRC32],
                [test $utils_cv_pclmul_intrinsic_exists = yes])
 CFLAGS=$ac_save_CFLAGS
 
+ac_save_CFLAGS=$CFLAGS
+CFLAGS=" -mavx2 -mvpclmulqdq $CFLAGS"
+AC_MSG_CHECKING([if avx2 pclmul intrinsic exists])
+AC_CACHE_VAL([utils_cv_avx2_pclmul_intrinsic_exists],[
+AC_LINK_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <x86intrin.h>
+
+    int
+    main (void)
+    {
+      __m256i a, b;
+      a = _mm256_clmulepi64_epi128 (a, b, 0x00);
+      a = _mm256_shuffle_epi8 (a, b);
+      return __builtin_cpu_supports ("avx2") &&
+             __builtin_cpu_supports ("vpclmulqdq");
+    }
+  ]])
+  ],[
+  utils_cv_avx2_pclmul_intrinsic_exists=yes
+  ],[
+  utils_cv_avx2_pclmul_intrinsic_exists=no
+  ])])
+AC_MSG_RESULT([$utils_cv_avx2_pclmul_intrinsic_exists])
+if test $utils_cv_avx2_pclmul_intrinsic_exists = yes; then
+  AC_DEFINE([USE_AVX2_CRC32], [1],
+            [CRC32 calculation by avx2 hardware instructions enabled])
+fi
+AM_CONDITIONAL([USE_AVX2_CRC32],
+               [test $utils_cv_avx2_pclmul_intrinsic_exists = yes])
+CFLAGS=$ac_save_CFLAGS
+
+ac_save_CFLAGS=$CFLAGS
+CFLAGS=" -mavx512bw -mavx512f -mvpclmulqdq $CFLAGS"
+AC_MSG_CHECKING([if avx512 pclmul intrinsic exists])
+AC_CACHE_VAL([utils_cv_avx512_pclmul_intrinsic_exists],[
+AC_LINK_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <x86intrin.h>
+
+    int
+    main (void)
+    {
+      __m512i a, b;
+      a = _mm512_clmulepi64_epi128 (a, b, 0x00);
+      a = _mm512_shuffle_epi8 (a, b);
+      return __builtin_cpu_supports ("avx512bw") &&
+             __builtin_cpu_supports ("avx512f");
+    }
+  ]])
+  ],[
+  utils_cv_avx512_pclmul_intrinsic_exists=yes
+  ],[
+  utils_cv_avx512_pclmul_intrinsic_exists=no
+  ])])
+AC_MSG_RESULT([$utils_cv_avx512_pclmul_intrinsic_exists])
+if test $utils_cv_avx512_pclmul_intrinsic_exists = yes; then
+  AC_DEFINE([USE_AVX512_CRC32], [1],
+            [CRC32 calculation by avx512 hardware instructions enabled])
+fi
+AM_CONDITIONAL([USE_AVX512_CRC32],
+               [test $utils_cv_avx512_pclmul_intrinsic_exists = yes])
+CFLAGS=$ac_save_CFLAGS + CFLAGS="-mavx2 $CFLAGS" AC_MSG_CHECKING([for avx2 intrinsics]) AC_CACHE_VAL([utils_cv_avx2_intrinsic_exists],[ diff --git a/src/cksum.c b/src/cksum.c index a977bf2960..5900d141ec 100644 --- a/src/cksum.c +++ b/src/cksum.c @@ -140,22 +140,66 @@ main (void) /* Number of bytes to read at once. */ # define BUFLEN (1 << 16) -# if USE_PCLMUL_CRC32 static bool pclmul_supported (void) { - bool pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul") - && 0 < __builtin_cpu_supports ("avx")); + bool pclmul_enabled = false; +# if USE_PCLMUL_CRC32 + pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul") + && 0 < __builtin_cpu_supports ("avx")); if (cksum_debug) error (0, 0, "%s", (pclmul_enabled ? _("using pclmul hardware support") : _("pclmul support not detected"))); +# endif return pclmul_enabled; } -# endif /* USE_PCLMUL_CRC32 */ + +static bool +avx2_supported (void) +{ + /* AVX512 processors will not set vpclmulqdq unless they support + the avx512 version, but it implies that the avx2 version + is supported */ + bool avx2_enabled = false; +# if USE_AVX2_CRC32 + avx2_enabled = (0 < __builtin_cpu_supports ("vpclmulqdq") + && 0 < __builtin_cpu_supports ("avx2")); + + if (cksum_debug) + error (0, 0, "%s", + (avx2_enabled + ? _("using avx2 hardware support") + : _("avx2 support not detected"))); +# endif + + return avx2_enabled; +} + +static bool +avx512_supported (void) +{ + /* vpclmulqdq for multiplication + mavx512f for most of the avx512 functions we're using + mavx512bw for byte swapping */ + bool avx512_enabled = false; +# if USE_AVX512_CRC32 + avx512_enabled = (0 < __builtin_cpu_supports ("vpclmulqdq") + && 0 < __builtin_cpu_supports ("avx512bw") + && 0 < __builtin_cpu_supports ("avx512f")); + + if (cksum_debug) + error (0, 0, "%s", + (avx512_enabled + ? 
_("using avx512 hardware support") + : _("avx512 support not detected"))); +# endif + + return avx512_enabled; +} static bool cksum_slice8 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) @@ -220,13 +264,18 @@ crc_sum_stream (FILE *stream, void *resstream, uintmax_t *length) uintmax_t total_bytes = 0; uint_fast32_t crc = 0; -# if USE_PCLMUL_CRC32 static bool (*cksum_fp) (FILE *, uint_fast32_t *, uintmax_t *); if (! cksum_fp) - cksum_fp = pclmul_supported () ? cksum_pclmul : cksum_slice8; -# else - bool (*cksum_fp) (FILE *, uint_fast32_t *, uintmax_t *) = cksum_slice8; -# endif + { + if (avx512_supported ()) + cksum_fp = cksum_avx512; + else if (avx2_supported ()) + cksum_fp = cksum_avx2; + else if (pclmul_supported ()) + cksum_fp = cksum_pclmul; + else + cksum_fp = cksum_slice8; + } if (! cksum_fp (stream, &crc, &total_bytes)) return -1; diff --git a/src/cksum.h b/src/cksum.h index f8b2799736..6e8a5d0080 100644 --- a/src/cksum.h +++ b/src/cksum.h @@ -17,6 +17,12 @@ output_crc (char const *file, int binary_file, void const *digest, bool raw, extern bool cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); +extern bool +cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); + +extern bool +cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); + extern uint_fast32_t const crctab[8][256]; #endif diff --git a/src/cksum_avx2.c b/src/cksum_avx2.c new file mode 100644 index 0000000000..252e01d6c8 --- /dev/null +++ b/src/cksum_avx2.c @@ -0,0 +1,200 @@ +/* cksum -- calculate and print POSIX checksums and sizes of files + Copyright (C) 1992-2024 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <x86intrin.h>
+#include "system.h"
+
+/* Number of bytes to read at once.  */
+#define BUFLEN (1 << 16)
+
+extern uint_fast32_t const crctab[8][256];
+
+extern bool
+cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+
+bool
+cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
+{
+  __m256i buf[BUFLEN / sizeof (__m256i)];
+  uint_fast32_t crc = 0;
+  uintmax_t length = 0;
+  size_t bytes_read;
+  __m256i single_mult_constant;
+  __m256i four_mult_constant;
+  __m256i shuffle_constant;
+
+  if (!fp || !crc_out || !length_out)
+    return false;
+
+  /* These constants and general algorithms are taken from the Intel whitepaper
+     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+   */
+  single_mult_constant = _mm256_set_epi64x (0x569700E5, 0x75BE46B7,
+                                            0x569700E5, 0x75BE46B7);
+  four_mult_constant = _mm256_set_epi64x (0x10BD4D7C, 0x567FDDEB,
+                                          0x10BD4D7C, 0x567FDDEB);
+
+  /* Constant to byteswap a full AVX2 register */
+  shuffle_constant = _mm256_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15,
+                                      0, 1, 2, 3, 4, 5, 6, 7, 8,
+                                      9, 10, 11, 12, 13, 14, 15);
+  while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
+    {
+      __m256i data;
+      __m256i data2;
+      __m256i data3;
+      __m256i data4;
+      __m256i data5;
+      __m256i data6;
+      __m256i data7;
+      __m256i data8;
+      __m256i fold_data;
+      __m256i xor_crc;
+
+      __m256i *datap;
+
+      if (length + bytes_read < length)
+        {
+          errno = EOVERFLOW;
+          return false;
+        }
+      length += bytes_read;
+
+      datap = (__m256i *)buf;
+
+      /* Fold in parallel 16x 16-byte blocks into 8x 16-byte blocks */
+      if
(bytes_read >= 16 * 8 * 2) + { + data = _mm256_loadu_si256 (datap); + data = _mm256_shuffle_epi8 (data, shuffle_constant); + /* XOR in initial CRC value (for us 0 so no effect), or CRC value + calculated for previous BUFLEN buffer from fread */ + xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0); + crc = 0; + data = _mm256_xor_si256 (data, xor_crc); + data3 = _mm256_loadu_si256 (datap + 1); + data3 = _mm256_shuffle_epi8 (data3, shuffle_constant); + data5 = _mm256_loadu_si256 (datap + 2); + data5 = _mm256_shuffle_epi8 (data5, shuffle_constant); + data7 = _mm256_loadu_si256 (datap + 3); + data7 = _mm256_shuffle_epi8 (data7, shuffle_constant); + + while (bytes_read >= 16 * 8 * 2) + { + datap += 4; + + /* Do multiplication here for 8x consecutive 16 byte blocks */ + data2 = _mm256_clmulepi64_epi128 (data, four_mult_constant, + 0x00); + data = _mm256_clmulepi64_epi128 (data, four_mult_constant, + 0x11); + data4 = _mm256_clmulepi64_epi128 (data3, four_mult_constant, + 0x00); + data3 = _mm256_clmulepi64_epi128 (data3, four_mult_constant, + 0x11); + data6 = _mm256_clmulepi64_epi128 (data5, four_mult_constant, + 0x00); + data5 = _mm256_clmulepi64_epi128 (data5, four_mult_constant, + 0x11); + data8 = _mm256_clmulepi64_epi128 (data7, four_mult_constant, + 0x00); + data7 = _mm256_clmulepi64_epi128 (data7, four_mult_constant, + 0x11); + + /* Now multiplication results for the 8x blocks is xor:ed with + next 8x 16 byte blocks from the buffer. This effectively + "consumes" the first 8x blocks from the buffer. + Keep xor result in variables for multiplication in next + round of loop. 
*/ + data = _mm256_xor_si256 (data, data2); + data2 = _mm256_loadu_si256 (datap); + data2 = _mm256_shuffle_epi8 (data2, shuffle_constant); + data = _mm256_xor_si256 (data, data2); + + data3 = _mm256_xor_si256 (data3, data4); + data4 = _mm256_loadu_si256 (datap + 1); + data4 = _mm256_shuffle_epi8 (data4, shuffle_constant); + data3 = _mm256_xor_si256 (data3, data4); + + data5 = _mm256_xor_si256 (data5, data6); + data6 = _mm256_loadu_si256 (datap + 2); + data6 = _mm256_shuffle_epi8 (data6, shuffle_constant); + data5 = _mm256_xor_si256 (data5, data6); + + data7 = _mm256_xor_si256 (data7, data8); + data8 = _mm256_loadu_si256 (datap + 3); + data8 = _mm256_shuffle_epi8 (data8, shuffle_constant); + data7 = _mm256_xor_si256 (data7, data8); + + bytes_read -= (16 * 4 * 2); + } + /* At end of loop we write out results from variables back into + the buffer, for use in single fold loop */ + data = _mm256_shuffle_epi8 (data, shuffle_constant); + _mm256_storeu_si256 (datap, data); + data3 = _mm256_shuffle_epi8 (data3, shuffle_constant); + _mm256_storeu_si256 (datap + 1, data3); + data5 = _mm256_shuffle_epi8 (data5, shuffle_constant); + _mm256_storeu_si256 (datap + 2, data5); + data7 = _mm256_shuffle_epi8 (data7, shuffle_constant); + _mm256_storeu_si256 (datap + 3, data7); + } + + /* Fold two 32-byte blocks into one 32-byte block */ + if (bytes_read >= 64) + { + data = _mm256_loadu_si256 (datap); + data = _mm256_shuffle_epi8 (data, shuffle_constant); + xor_crc = _mm256_set_epi32 (0, 0, 0, 0, crc, 0, 0, 0); + crc = 0; + data = _mm256_xor_si256 (data, xor_crc); + while (bytes_read >= 64) + { + datap++; + + data2 = _mm256_clmulepi64_epi128 (data, single_mult_constant, + 0x00); + data = _mm256_clmulepi64_epi128 (data, single_mult_constant, + 0x11); + fold_data = _mm256_loadu_si256 (datap); + fold_data = _mm256_shuffle_epi8 (fold_data, shuffle_constant); + data = _mm256_xor_si256 (data, data2); + data = _mm256_xor_si256 (data, fold_data); + bytes_read -= 32; + } + data = 
_mm256_shuffle_epi8 (data, shuffle_constant);
+          _mm256_storeu_si256 (datap, data);
+        }
+
+      /* And finish up last 0-63 bytes in a byte by byte fashion */
+      unsigned char *cp = (unsigned char *)datap;
+      while (bytes_read--)
+        crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
+      if (feof (fp))
+        break;
+    }
+
+  *crc_out = crc;
+  *length_out = length;
+
+  return !ferror (fp);
+}
diff --git a/src/cksum_avx512.c b/src/cksum_avx512.c
new file mode 100644
index 0000000000..5f8ff2375b
--- /dev/null
+++ b/src/cksum_avx512.c
@@ -0,0 +1,210 @@
+/* cksum -- calculate and print POSIX checksums and sizes of files
+   Copyright (C) 1992-2024 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <x86intrin.h>
+#include "system.h"
+
+/* Number of bytes to read at once.
*/ +#define BUFLEN (1 << 16) + +extern uint_fast32_t const crctab[8][256]; + +extern bool +cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out); + +bool +cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) +{ + __m512i buf[BUFLEN / sizeof (__m512i)]; + uint_fast32_t crc = 0; + uintmax_t length = 0; + size_t bytes_read; + __m512i single_mult_constant; + __m512i four_mult_constant; + __m512i shuffle_constant; + + if (!fp || !crc_out || !length_out) + return false; + + /* These constants and general algorithms are taken from the Intel whitepaper + "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + */ + single_mult_constant = _mm512_set_epi64 (0x8833794C, 0xE6228B11, + 0x8833794C, 0xE6228B11, + 0x8833794C, 0xE6228B11, + 0x8833794C, 0xE6228B11); + four_mult_constant = _mm512_set_epi64 (0xCBCF3BCB, 0x88FE2237, + 0xCBCF3BCB, 0x88FE2237, + 0xCBCF3BCB, 0x88FE2237, + 0xCBCF3BCB, 0x88FE2237); + + /* Constant to byteswap a full AVX512 register */ + shuffle_constant = _mm512_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15); + while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) + { + __m512i data; + __m512i data2; + __m512i data3; + __m512i data4; + __m512i data5; + __m512i data6; + __m512i data7; + __m512i data8; + __m512i fold_data; + __m512i xor_crc; + + __m512i *datap; + + if (length + bytes_read < length) + { + errno = EOVERFLOW; + return false; + } + length += bytes_read; + + datap = (__m512i *)buf; + + /* Fold in parallel 32x 16-byte blocks into 16x 16-byte blocks */ + if (bytes_read >= 16 * 8 * 4) + { + data = _mm512_loadu_si512 (datap); + data = _mm512_shuffle_epi8 (data, shuffle_constant); + /* XOR in initial CRC value (for us 0 so no effect), or CRC value + calculated for previous BUFLEN buffer from fread */ + 
xor_crc = _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, crc, 0, 0, 0); + crc = 0; + data = _mm512_xor_si512 (data, xor_crc); + data3 = _mm512_loadu_si512 (datap + 1); + data3 = _mm512_shuffle_epi8 (data3, shuffle_constant); + data5 = _mm512_loadu_si512 (datap + 2); + data5 = _mm512_shuffle_epi8 (data5, shuffle_constant); + data7 = _mm512_loadu_si512 (datap + 3); + data7 = _mm512_shuffle_epi8 (data7, shuffle_constant); + + while (bytes_read >= 16 * 8 * 4) + { + datap += 4; + + /* Do multiplication here for 16x consecutive 16 byte blocks */ + data2 = _mm512_clmulepi64_epi128 (data, four_mult_constant, + 0x00); + data = _mm512_clmulepi64_epi128 (data, four_mult_constant, + 0x11); + data4 = _mm512_clmulepi64_epi128 (data3, four_mult_constant, + 0x00); + data3 = _mm512_clmulepi64_epi128 (data3, four_mult_constant, + 0x11); + data6 = _mm512_clmulepi64_epi128 (data5, four_mult_constant, + 0x00); + data5 = _mm512_clmulepi64_epi128 (data5, four_mult_constant, + 0x11); + data8 = _mm512_clmulepi64_epi128 (data7, four_mult_constant, + 0x00); + data7 = _mm512_clmulepi64_epi128 (data7, four_mult_constant, + 0x11); + + /* Now multiplication results for the 16x blocks is xor:ed with + next 16x 16 byte blocks from the buffer. This effectively + "consumes" the first 16x blocks from the buffer. + Keep xor result in variables for multiplication in next + round of loop. 
*/ + data = _mm512_xor_si512 (data, data2); + data2 = _mm512_loadu_si512 (datap); + data2 = _mm512_shuffle_epi8 (data2, shuffle_constant); + data = _mm512_xor_si512 (data, data2); + + data3 = _mm512_xor_si512 (data3, data4); + data4 = _mm512_loadu_si512 (datap + 1); + data4 = _mm512_shuffle_epi8 (data4, shuffle_constant); + data3 = _mm512_xor_si512 (data3, data4); + + data5 = _mm512_xor_si512 (data5, data6); + data6 = _mm512_loadu_si512 (datap + 2); + data6 = _mm512_shuffle_epi8 (data6, shuffle_constant); + data5 = _mm512_xor_si512 (data5, data6); + + data7 = _mm512_xor_si512 (data7, data8); + data8 = _mm512_loadu_si512 (datap + 3); + data8 = _mm512_shuffle_epi8 (data8, shuffle_constant); + data7 = _mm512_xor_si512 (data7, data8); + + bytes_read -= (16 * 4 * 4); + } + /* At end of loop we write out results from variables back into + the buffer, for use in single fold loop */ + data = _mm512_shuffle_epi8 (data, shuffle_constant); + _mm512_storeu_si512 (datap, data); + data3 = _mm512_shuffle_epi8 (data3, shuffle_constant); + _mm512_storeu_si512 (datap + 1, data3); + data5 = _mm512_shuffle_epi8 (data5, shuffle_constant); + _mm512_storeu_si512 (datap + 2, data5); + data7 = _mm512_shuffle_epi8 (data7, shuffle_constant); + _mm512_storeu_si512 (datap + 3, data7); + } + + /* Fold two 64-byte blocks into one 64-byte block */ + if (bytes_read >= 128) + { + data = _mm512_loadu_si512 (datap); + data = _mm512_shuffle_epi8 (data, shuffle_constant); + xor_crc = _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, crc, 0, 0, 0); + crc = 0; + data = _mm512_xor_si512 (data, xor_crc); + while (bytes_read >= 128) + { + datap++; + + data2 = _mm512_clmulepi64_epi128 (data, single_mult_constant, + 0x00); + data = _mm512_clmulepi64_epi128 (data, single_mult_constant, + 0x11); + fold_data = _mm512_loadu_si512 (datap); + fold_data = _mm512_shuffle_epi8 (fold_data, shuffle_constant); + data = _mm512_xor_si512 (data, data2); + data = _mm512_xor_si512 (data, fold_data); + bytes_read -= 64; 
+ } + data = _mm512_shuffle_epi8 (data, shuffle_constant); + _mm512_storeu_si512 (datap, data); + } + + /* And finish up last 0-127 bytes in a byte by byte fashion */ + unsigned char *cp = (unsigned char *)datap; + while (bytes_read--) + crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF]; + if (feof (fp)) + break; + } + + *crc_out = crc; + *length_out = length; + + return !ferror (fp); +} diff --git a/src/local.mk b/src/local.mk index feea9a6c97..90c98cc507 100644 --- a/src/local.mk +++ b/src/local.mk @@ -439,6 +439,21 @@ src_b2sum_SOURCES = src/digest.c \ src_cksum_SOURCES = $(src_b2sum_SOURCES) src/sum.c src/sum.h \ src/cksum.c src/cksum.h src/crctab.c src_cksum_CPPFLAGS = -DHASH_ALGO_CKSUM=1 -DHAVE_CONFIG_H $(AM_CPPFLAGS) + +if USE_AVX512_CRC32 +noinst_LIBRARIES += src/libcksum_avx512.a +src_libcksum_avx512_a_SOURCES = src/cksum_avx512.c src/cksum.h +cksum_avx512_ldadd = src/libcksum_avx512.a +src_cksum_LDADD += $(cksum_avx512_ldadd) +src_libcksum_avx512_a_CFLAGS = -mavx512bw -mavx512f -mvpclmulqdq $(AM_CFLAGS) +endif +if USE_AVX2_CRC32 +noinst_LIBRARIES += src/libcksum_avx2.a +src_libcksum_avx2_a_SOURCES = src/cksum_avx2.c src/cksum.h +cksum_avx2_ldadd = src/libcksum_avx2.a +src_cksum_LDADD += $(cksum_avx2_ldadd) +src_libcksum_avx2_a_CFLAGS = -mpclmul -mavx -mavx2 -mvpclmulqdq $(AM_CFLAGS) +endif if USE_PCLMUL_CRC32 noinst_LIBRARIES += src/libcksum_pclmul.a src_libcksum_pclmul_a_SOURCES = src/cksum_pclmul.c src/cksum.h