git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Split crc32 pclmulqdq and vpclmulqdq implementations
authorVladislav Shchapov <vladislav@shchapov.ru>
Fri, 17 Feb 2023 16:41:46 +0000 (21:41 +0500)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Fri, 24 Feb 2023 12:25:54 +0000 (13:25 +0100)
Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
12 files changed:
CMakeLists.txt
arch/x86/Makefile.in
arch/x86/crc32_fold_pclmulqdq_tpl.h
arch/x86/crc32_fold_vpclmulqdq.c [deleted file]
arch/x86/crc32_fold_vpclmulqdq_tpl.h
arch/x86/crc32_pclmulqdq.c [new file with mode: 0644]
arch/x86/crc32_pclmulqdq_tpl.h [moved from arch/x86/crc32_fold_pclmulqdq.c with 92% similarity]
arch/x86/crc32_vpclmulqdq.c [new file with mode: 0644]
configure
cpu_features.h
functable.c
win32/Makefile.msc

index d85ccf010edd0a839545043ae0260c61fe4e7a8d..265772dd261008f459fd86b2ea167049a2970c2d 100644 (file)
@@ -847,7 +847,7 @@ if(WITH_OPTIM)
             check_pclmulqdq_intrinsics()
             if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN)
                 add_definitions(-DX86_PCLMULQDQ_CRC)
-                set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_pclmulqdq.c)
+                set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c)
                 add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG}\"")
                 list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
                 set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
@@ -856,10 +856,10 @@ if(WITH_OPTIM)
                     check_vpclmulqdq_intrinsics()
                     if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN)
                         add_definitions(-DX86_VPCLMULQDQ_CRC)
-                        set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_vpclmulqdq.c)
+                        set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
                         add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${VPCLMULFLAG} ${AVX512FLAG}\"")
                         list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
-                        set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+                        set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
                     else()
                         set(WITH_VPCLMULQDQ OFF)
                     endif()
index a3d5283b91b1b603d6d863420fe9695ea2dccaa8..4cebe55531aa0d2d81f4d4807b565b588921d333 100644 (file)
@@ -37,8 +37,8 @@ all: \
        compare256_avx2.o compare256_avx2.lo \
        compare256_sse2.o compare256_sse2.lo \
        insert_string_sse42.o insert_string_sse42.lo \
-       crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
-       crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
+       crc32_pclmulqdq.o crc32_pclmulqdq.lo \
+       crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
        slide_hash_avx2.o slide_hash_avx2.lo \
        slide_hash_sse2.o slide_hash_sse2.lo
 
@@ -84,17 +84,17 @@ insert_string_sse42.o:
 insert_string_sse42.lo:
        $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
 
-crc32_fold_pclmulqdq.o:
-       $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
+crc32_pclmulqdq.o:
+       $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
 
-crc32_fold_pclmulqdq.lo:
-       $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
+crc32_pclmulqdq.lo:
+       $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
 
-crc32_fold_vpclmulqdq.o:
-       $(CC) $(CFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+crc32_vpclmulqdq.o:
+       $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
 
-crc32_fold_vpclmulqdq.lo:
-       $(CC) $(SFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+crc32_vpclmulqdq.lo:
+       $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
 
 slide_hash_avx2.o:
        $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
index 47bbc0111be6f102b36614025c4e11626d2e5a7b..da1810bf7bc19d8d2babfe4a94da1f1378fa1046 100644 (file)
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#ifdef X86_PCLMULQDQ_CRC
-
 #ifdef COPY
-Z_INTERNAL void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL void CRC32_FOLD_COPY_NAME (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
 #else
-Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
+Z_INTERNAL void CRC32_FOLD_NAME (crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
 #endif
     unsigned long algn_diff;
     __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
@@ -61,7 +59,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
         _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
         dst += algn_diff;
 #else
-        XOR_INITIAL(xmm_crc_part);
+        XOR_INITIAL128(xmm_crc_part);
 
         if (algn_diff < 4 && init_crc != 0) {
             xmm_t0 = xmm_crc_part;
@@ -79,8 +77,8 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
         len -= algn_diff;
     }
 
-#ifdef X86_VPCLMULQDQ_CRC
-    if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
+#ifdef X86_VPCLMULQDQ
+    if (len >= 256) {
 #ifdef COPY
         size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
         dst += n;
@@ -110,7 +108,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
         _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
         dst += 64;
 #else
-        XOR_INITIAL(xmm_t0);
+        XOR_INITIAL128(xmm_t0);
 #endif
 
         xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
@@ -135,7 +133,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
         _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
         dst += 48;
 #else
-        XOR_INITIAL(xmm_t0);
+        XOR_INITIAL128(xmm_t0);
 #endif
         fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
@@ -153,7 +151,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
         _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
         dst += 32;
 #else
-        XOR_INITIAL(xmm_t0);
+        XOR_INITIAL128(xmm_t0);
 #endif
         fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
@@ -167,7 +165,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
         _mm_storeu_si128((__m128i *)dst, xmm_t0);
         dst += 16;
 #else
-        XOR_INITIAL(xmm_t0);
+        XOR_INITIAL128(xmm_t0);
 #endif
         fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
@@ -186,4 +184,3 @@ partial:
 
     crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 }
-#endif
diff --git a/arch/x86/crc32_fold_vpclmulqdq.c b/arch/x86/crc32_fold_vpclmulqdq.c
deleted file mode 100644 (file)
index d9c43be..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-/* crc32_fold_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation.
- * Copyright Wangyang Guo (wangyang.guo@intel.com)
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifdef X86_VPCLMULQDQ_CRC
-#include "../../zbuild.h"
-#include "../../fallback_builtins.h"
-
-#include <immintrin.h>
-
-#define ONCE(op)            if (first) { first = 0; op; }
-#define XOR_INITIAL(where)  ONCE(where = _mm512_xor_si512(where, zmm_initial))
-
-#include "crc32_fold_vpclmulqdq_tpl.h"
-#define COPY
-#include "crc32_fold_vpclmulqdq_tpl.h"
-
-#endif
index 3d27cb3dfb651668df81ef6f4e1df2e98a67d7be..67f08e12818093785a178c3dc496de0e83c11389 100644 (file)
@@ -4,10 +4,10 @@
  */
 
 #ifdef COPY
-Z_INTERNAL size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
+static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
     __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
 #else
-Z_INTERNAL size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
     __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
     __m128i init_crc, int32_t first) {
     __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
@@ -25,7 +25,7 @@ Z_INTERNAL size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
     zmm_crc0 = _mm512_setzero_si512();
     zmm_t0 = _mm512_loadu_si512((__m512i *)src);
 #ifndef COPY
-    XOR_INITIAL(zmm_t0);
+    XOR_INITIAL512(zmm_t0);
 #endif
     zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
     zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
diff --git a/arch/x86/crc32_pclmulqdq.c b/arch/x86/crc32_pclmulqdq.c
new file mode 100644 (file)
index 0000000..b4cdeb3
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ *     doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
+ *     Jim Guilford    <james.guilford@intel.com>
+ *     Vinodh Gopal    <vinodh.gopal@intel.com>
+ *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
+ *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_PCLMULQDQ_CRC
+
+#define CRC32_FOLD_COPY_NAME  crc32_fold_pclmulqdq_copy
+#define CRC32_FOLD_NAME       crc32_fold_pclmulqdq
+#define CRC32_FOLD_RESET_NAME crc32_fold_pclmulqdq_reset
+#define CRC32_FOLD_FINAL_NAME crc32_fold_pclmulqdq_final
+#define CRC32_NAME            crc32_pclmulqdq
+
+#include "crc32_pclmulqdq_tpl.h"
+
+#endif
similarity index 92%
rename from arch/x86/crc32_fold_pclmulqdq.c
rename to arch/x86/crc32_pclmulqdq_tpl.h
index ecee0c578cceb909a9c42ab5f1d8b79810d6e112..6726e34919e5a55f6f5ce56200a68b5b816fd107 100644 (file)
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
-#ifdef X86_PCLMULQDQ_CRC
 #include "../../zbuild.h"
 
 #include <immintrin.h>
 #include <wmmintrin.h>
 #include <smmintrin.h> // _mm_extract_epi32
-
-#include "x86_features.h"
-#include "cpu_features.h"
+#ifdef X86_VPCLMULQDQ
+#include <immintrin.h>
+#endif
 
 #include "../../crc32_fold.h"
 #include "../../crc32_braid_p.h"
+#include "../../fallback_builtins.h"
 #include <assert.h>
 
-#ifdef X86_VPCLMULQDQ_CRC
-extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+#ifdef X86_VPCLMULQDQ
+static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
     __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
     int32_t first);
-extern size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
+static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
     __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
 
@@ -246,18 +246,27 @@ static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __
     _mm_storeu_si128(fold + 3, *fold3);
 }
 
-Z_INTERNAL uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc) {
+Z_INTERNAL uint32_t CRC32_FOLD_RESET_NAME (crc32_fold *crc) {
     __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
     __m128i xmm_zero = _mm_setzero_si128();
     crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
     return 0;
 }
 
-#define ONCE(op)            if (first) { first = 0; op; }
-#define XOR_INITIAL(where)  ONCE(where = _mm_xor_si128(where, xmm_initial))
+#define ONCE(op)               if (first) { first = 0; op; }
+#define XOR_INITIAL128(where)  ONCE(where = _mm_xor_si128(where, xmm_initial))
+#ifdef X86_VPCLMULQDQ
+#define XOR_INITIAL512(where)  ONCE(where = _mm512_xor_si512(where, zmm_initial))
+#endif
 
+#ifdef X86_VPCLMULQDQ
+#include "crc32_fold_vpclmulqdq_tpl.h"
+#endif
 #include "crc32_fold_pclmulqdq_tpl.h"
 #define COPY
+#ifdef X86_VPCLMULQDQ
+#include "crc32_fold_vpclmulqdq_tpl.h"
+#endif
 #include "crc32_fold_pclmulqdq_tpl.h"
 
 static const unsigned ALIGNED_(16) crc_k[] = {
@@ -277,7 +286,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
     0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
 };
 
-Z_INTERNAL uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc) {
+Z_INTERNAL uint32_t CRC32_FOLD_FINAL_NAME (crc32_fold *crc) {
     const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
     const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
     __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
@@ -342,15 +351,14 @@ Z_INTERNAL uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc) {
     return crc->value;
 }
 
-Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len) {
+Z_INTERNAL uint32_t CRC32_NAME (uint32_t crc32, const uint8_t *buf, size_t len) {
     /* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
      * these short lengths might also prove to be effective */
     if (len < 64)
         return PREFIX(crc32_braid)(crc32, buf, len);
 
     crc32_fold ALIGNED_(16) crc_state;
-    crc32_fold_pclmulqdq_reset(&crc_state);
-    crc32_fold_pclmulqdq(&crc_state, buf, len, crc32);
-    return crc32_fold_pclmulqdq_final(&crc_state);
+    CRC32_FOLD_RESET_NAME (&crc_state);
+    CRC32_FOLD_NAME (&crc_state, buf, len, crc32);
+    return CRC32_FOLD_FINAL_NAME (&crc_state);
 }
-#endif
diff --git a/arch/x86/crc32_vpclmulqdq.c b/arch/x86/crc32_vpclmulqdq.c
new file mode 100644 (file)
index 0000000..b05ddb9
--- /dev/null
@@ -0,0 +1,17 @@
+/* crc32_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+
+#define X86_VPCLMULQDQ
+#define CRC32_FOLD_COPY_NAME  crc32_fold_vpclmulqdq_copy
+#define CRC32_FOLD_NAME       crc32_fold_vpclmulqdq
+#define CRC32_FOLD_RESET_NAME crc32_fold_vpclmulqdq_reset
+#define CRC32_FOLD_FINAL_NAME crc32_fold_vpclmulqdq_final
+#define CRC32_NAME            crc32_vpclmulqdq
+
+#include "crc32_pclmulqdq_tpl.h"
+
+#endif
index fdb5b69d7efb6647a012f2a913e93cf4d5783017..5dd146386c35938818eeecb67ee893001931913b 100755 (executable)
--- a/configure
+++ b/configure
@@ -1629,8 +1629,8 @@ case "${ARCH}" in
             if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC"
                 SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_pclmulqdq.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_pclmulqdq.lo"
 
                 if test $buildvpclmulqdq -eq 1; then
                     check_vpclmulqdq_intrinsics
@@ -1638,8 +1638,8 @@ case "${ARCH}" in
                     if test ${HAVE_VPCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX512_INTRIN} -eq 1; then
                         CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_CRC"
                         SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_CRC"
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_vpclmulqdq.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_vpclmulqdq.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq.lo"
                     fi
                 fi
             fi
index c098ee2d342762677be8816ff78f9f8d3026d59a..22d70da3d926db79364813df8e52d27e21425311 100644 (file)
@@ -70,6 +70,13 @@ extern void     crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
 extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
 extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
 #endif
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
+extern void     crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern void     crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
+extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
+#endif
 
 /* memory chunking */
 extern uint32_t chunksize_c(void);
index 23106b33d2d6adeb36710fcebbf54b993a23a4d0..da9d10ec5bd9aa4beb49106f858fa3789fd4d59f 100644 (file)
@@ -130,6 +130,16 @@ static void init_functable(void) {
         ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
     }
 #endif
+    // X86 - VPCLMULQDQ
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+    if (x86_cpu_has_pclmulqdq && x86_cpu_has_avx512 && x86_cpu_has_vpclmulqdq) {
+        ft.crc32 = &crc32_vpclmulqdq;
+        ft.crc32_fold = &crc32_fold_vpclmulqdq;
+        ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
+        ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
+        ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
+    }
+#endif
 
 
     // ARM - NEON
index f2f0631a15fa94a15729ec8d819e82328780dc9c..d2a98d6f0a1b94386cd30eb54d4d1d32c4d494c3 100644 (file)
@@ -64,7 +64,7 @@ OBJS = \
        crc32_braid.obj \
        crc32_braid_comb.obj \
        crc32_fold.obj \
-       crc32_fold_pclmulqdq.obj \
+       crc32_pclmulqdq.obj \
        deflate.obj \
        deflate_fast.obj \
        deflate_huff.obj \
@@ -206,7 +206,8 @@ cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
 crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
 crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
-crc32_fold_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq.c $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h
+crc32_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_pclmulqdq.c $(SRCDIR)/arch/x86/crc32_pclmulqdq_tpl.h $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq_tpl.h \
+                                $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h
 deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
 deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h