From fdb87d63a5963e2d7112e10151d9ca413bbefb62 Mon Sep 17 00:00:00 2001 From: Vladislav Shchapov Date: Fri, 17 Feb 2023 21:41:46 +0500 Subject: [PATCH] Split crc32 pclmulqdq and vpclmulqdq implementations Signed-off-by: Vladislav Shchapov --- CMakeLists.txt | 6 +-- arch/x86/Makefile.in | 20 +++++----- arch/x86/crc32_fold_pclmulqdq_tpl.h | 21 +++++----- arch/x86/crc32_fold_vpclmulqdq.c | 19 --------- arch/x86/crc32_fold_vpclmulqdq_tpl.h | 6 +-- arch/x86/crc32_pclmulqdq.c | 30 ++++++++++++++ ...fold_pclmulqdq.c => crc32_pclmulqdq_tpl.h} | 40 +++++++++++-------- arch/x86/crc32_vpclmulqdq.c | 17 ++++++++ configure | 8 ++-- cpu_features.h | 7 ++++ functable.c | 10 +++++ win32/Makefile.msc | 5 ++- 12 files changed, 120 insertions(+), 69 deletions(-) delete mode 100644 arch/x86/crc32_fold_vpclmulqdq.c create mode 100644 arch/x86/crc32_pclmulqdq.c rename arch/x86/{crc32_fold_pclmulqdq.c => crc32_pclmulqdq_tpl.h} (92%) create mode 100644 arch/x86/crc32_vpclmulqdq.c diff --git a/CMakeLists.txt b/CMakeLists.txt index d85ccf010..265772dd2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -847,7 +847,7 @@ if(WITH_OPTIM) check_pclmulqdq_intrinsics() if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN) add_definitions(-DX86_PCLMULQDQ_CRC) - set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_pclmulqdq.c) + set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c) add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS}) set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}") @@ -856,10 +856,10 @@ if(WITH_OPTIM) check_vpclmulqdq_intrinsics() if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN) add_definitions(-DX86_VPCLMULQDQ_CRC) - set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_vpclmulqdq.c) + set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c) add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${VPCLMULFLAG} ${AVX512FLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS}) - set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}") + set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}") else() set(WITH_VPCLMULQDQ OFF) endif() diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index a3d5283b9..4cebe5553 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -37,8 +37,8 @@ all: \ compare256_avx2.o compare256_avx2.lo \ compare256_sse2.o compare256_sse2.lo \ insert_string_sse42.o insert_string_sse42.lo \ - crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \ - crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \ + crc32_pclmulqdq.o crc32_pclmulqdq.lo \ + crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \ slide_hash_avx2.o slide_hash_avx2.lo \ slide_hash_sse2.o slide_hash_sse2.lo @@ -84,17 +84,17 @@ insert_string_sse42.o: insert_string_sse42.lo: $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c -crc32_fold_pclmulqdq.o: - $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c +crc32_pclmulqdq.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c -crc32_fold_pclmulqdq.lo: - $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c +crc32_pclmulqdq.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c -crc32_fold_vpclmulqdq.o: - $(CC) $(CFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c +crc32_vpclmulqdq.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c -crc32_fold_vpclmulqdq.lo: - $(CC) $(SFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c +crc32_vpclmulqdq.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c slide_hash_avx2.o: $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c diff --git a/arch/x86/crc32_fold_pclmulqdq_tpl.h b/arch/x86/crc32_fold_pclmulqdq_tpl.h index 47bbc0111..da1810bf7 100644 --- a/arch/x86/crc32_fold_pclmulqdq_tpl.h +++ b/arch/x86/crc32_fold_pclmulqdq_tpl.h @@ -17,12 +17,10 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef X86_PCLMULQDQ_CRC - #ifdef COPY -Z_INTERNAL void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { +Z_INTERNAL void CRC32_FOLD_COPY_NAME (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { #else -Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) { +Z_INTERNAL void CRC32_FOLD_NAME (crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) { #endif unsigned long algn_diff; __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; @@ -61,7 +59,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t _mm_storeu_si128((__m128i *)dst, xmm_crc_part); dst += algn_diff; #else - XOR_INITIAL(xmm_crc_part); + XOR_INITIAL128(xmm_crc_part); if (algn_diff < 4 && init_crc != 0) { xmm_t0 = xmm_crc_part; @@ -79,8 +77,8 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len -= algn_diff; } -#ifdef X86_VPCLMULQDQ_CRC - if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) { +#ifdef X86_VPCLMULQDQ + if (len >= 256) { #ifdef COPY size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len); dst += n; @@ -110,7 +108,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); dst += 64; #else - XOR_INITIAL(xmm_t0); + XOR_INITIAL128(xmm_t0); #endif xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); @@ -135,7 +133,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); dst += 48; #else - XOR_INITIAL(xmm_t0); + XOR_INITIAL128(xmm_t0); #endif fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); @@ -153,7 +151,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); dst += 32; #else - XOR_INITIAL(xmm_t0); + XOR_INITIAL128(xmm_t0); #endif fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); @@ -167,7 +165,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t _mm_storeu_si128((__m128i *)dst, xmm_t0); dst += 16; #else - XOR_INITIAL(xmm_t0); + XOR_INITIAL128(xmm_t0); #endif fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); @@ -186,4 +184,3 @@ partial: crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); } -#endif diff --git a/arch/x86/crc32_fold_vpclmulqdq.c b/arch/x86/crc32_fold_vpclmulqdq.c deleted file mode 100644 index d9c43be74..000000000 --- a/arch/x86/crc32_fold_vpclmulqdq.c +++ /dev/null @@ -1,19 +0,0 @@ -/* crc32_fold_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation. - * Copyright Wangyang Guo (wangyang.guo@intel.com) - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#ifdef X86_VPCLMULQDQ_CRC -#include "../../zbuild.h" -#include "../../fallback_builtins.h" - -#include - -#define ONCE(op) if (first) { first = 0; op; } -#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial)) - -#include "crc32_fold_vpclmulqdq_tpl.h" -#define COPY -#include "crc32_fold_vpclmulqdq_tpl.h" - -#endif diff --git a/arch/x86/crc32_fold_vpclmulqdq_tpl.h b/arch/x86/crc32_fold_vpclmulqdq_tpl.h index 3d27cb3df..67f08e128 100644 --- a/arch/x86/crc32_fold_vpclmulqdq_tpl.h +++ b/arch/x86/crc32_fold_vpclmulqdq_tpl.h @@ -4,10 +4,10 @@ */ #ifdef COPY -Z_INTERNAL size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1, +static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) { #else -Z_INTERNAL size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, +static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc, int32_t first) { __m512i zmm_initial = _mm512_zextsi128_si512(init_crc); @@ -25,7 +25,7 @@ Z_INTERNAL size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, zmm_crc0 = _mm512_setzero_si512(); zmm_t0 = _mm512_loadu_si512((__m512i *)src); #ifndef COPY - XOR_INITIAL(zmm_t0); + XOR_INITIAL512(zmm_t0); #endif zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); diff --git a/arch/x86/crc32_pclmulqdq.c b/arch/x86/crc32_pclmulqdq.c new file mode 100644 index 000000000..b4cdeb360 --- /dev/null +++ b/arch/x86/crc32_pclmulqdq.c @@ -0,0 +1,30 @@ +/* + * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ + * instruction. + * + * A white paper describing this algorithm can be found at: + * doc/crc-pclmulqdq.pdf + * + * Copyright (C) 2013 Intel Corporation. All rights reserved. + * Copyright (C) 2016 Marian Beermann (support for initial value) + * Authors: + * Wajdi Feghali + * Jim Guilford + * Vinodh Gopal + * Erdinc Ozturk + * Jim Kukunas + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_PCLMULQDQ_CRC + +#define CRC32_FOLD_COPY_NAME crc32_fold_pclmulqdq_copy +#define CRC32_FOLD_NAME crc32_fold_pclmulqdq +#define CRC32_FOLD_RESET_NAME crc32_fold_pclmulqdq_reset +#define CRC32_FOLD_FINAL_NAME crc32_fold_pclmulqdq_final +#define CRC32_NAME crc32_pclmulqdq + +#include "crc32_pclmulqdq_tpl.h" + +#endif diff --git a/arch/x86/crc32_fold_pclmulqdq.c b/arch/x86/crc32_pclmulqdq_tpl.h similarity index 92% rename from arch/x86/crc32_fold_pclmulqdq.c rename to arch/x86/crc32_pclmulqdq_tpl.h index ecee0c578..6726e3491 100644 --- a/arch/x86/crc32_fold_pclmulqdq.c +++ b/arch/x86/crc32_pclmulqdq_tpl.h @@ -17,25 +17,25 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifdef X86_PCLMULQDQ_CRC #include "../../zbuild.h" #include #include #include // _mm_extract_epi32 - -#include "x86_features.h" -#include "cpu_features.h" +#ifdef X86_VPCLMULQDQ +#include +#endif #include "../../crc32_fold.h" #include "../../crc32_braid_p.h" +#include "../../fallback_builtins.h" #include -#ifdef X86_VPCLMULQDQ_CRC -extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, +#ifdef X86_VPCLMULQDQ +static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc, int32_t first); -extern size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1, +static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len); #endif @@ -246,18 +246,27 @@ static inline void crc32_fold_save(__m128i *fold, const __m128i *fold0, const __ _mm_storeu_si128(fold + 3, *fold3); } -Z_INTERNAL uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc) { +Z_INTERNAL uint32_t CRC32_FOLD_RESET_NAME (crc32_fold *crc) { __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); __m128i xmm_zero = _mm_setzero_si128(); crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero); return 0; } -#define ONCE(op) if (first) { first = 0; op; } -#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial)) +#define ONCE(op) if (first) { first = 0; op; } +#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial)) +#ifdef X86_VPCLMULQDQ +#define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial)) +#endif +#ifdef X86_VPCLMULQDQ +#include "crc32_fold_vpclmulqdq_tpl.h" +#endif #include "crc32_fold_pclmulqdq_tpl.h" #define COPY +#ifdef X86_VPCLMULQDQ +#include "crc32_fold_vpclmulqdq_tpl.h" +#endif #include "crc32_fold_pclmulqdq_tpl.h" static const unsigned ALIGNED_(16) crc_k[] = { @@ -277,7 +286,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; -Z_INTERNAL uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc) { +Z_INTERNAL uint32_t CRC32_FOLD_FINAL_NAME (crc32_fold *crc) { const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask); const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2); __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; @@ -342,15 +351,14 @@ Z_INTERNAL uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc) { return crc->value; } -Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len) { +Z_INTERNAL uint32_t CRC32_NAME (uint32_t crc32, const uint8_t *buf, size_t len) { /* For lens < 64, crc32_braid method is faster. The CRC32 instruction for * these short lengths might also prove to be effective */ if (len < 64) return PREFIX(crc32_braid)(crc32, buf, len); crc32_fold ALIGNED_(16) crc_state; - crc32_fold_pclmulqdq_reset(&crc_state); - crc32_fold_pclmulqdq(&crc_state, buf, len, crc32); - return crc32_fold_pclmulqdq_final(&crc_state); + CRC32_FOLD_RESET_NAME (&crc_state); + CRC32_FOLD_NAME (&crc_state, buf, len, crc32); + return CRC32_FOLD_FINAL_NAME (&crc_state); } -#endif diff --git a/arch/x86/crc32_vpclmulqdq.c b/arch/x86/crc32_vpclmulqdq.c new file mode 100644 index 000000000..b05ddb9a2 --- /dev/null +++ b/arch/x86/crc32_vpclmulqdq.c @@ -0,0 +1,17 @@ +/* crc32_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation. + * Copyright Wangyang Guo (wangyang.guo@intel.com) + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC) + +#define X86_VPCLMULQDQ +#define CRC32_FOLD_COPY_NAME crc32_fold_vpclmulqdq_copy +#define CRC32_FOLD_NAME crc32_fold_vpclmulqdq +#define CRC32_FOLD_RESET_NAME crc32_fold_vpclmulqdq_reset +#define CRC32_FOLD_FINAL_NAME crc32_fold_vpclmulqdq_final +#define CRC32_NAME crc32_vpclmulqdq + +#include "crc32_pclmulqdq_tpl.h" + +#endif diff --git a/configure b/configure index fdb5b69d7..5dd146386 100755 --- a/configure +++ b/configure @@ -1629,8 +1629,8 @@ case "${ARCH}" in if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC" SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_pclmulqdq.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_pclmulqdq.lo" if test $buildvpclmulqdq -eq 1; then check_vpclmulqdq_intrinsics @@ -1638,8 +1638,8 @@ case "${ARCH}" in if test ${HAVE_VPCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX512_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_CRC" SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_CRC" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_vpclmulqdq.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_vpclmulqdq.lo" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq.lo" fi fi fi diff --git a/cpu_features.h b/cpu_features.h index c098ee2d3..22d70da3d 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -70,6 +70,13 @@ extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc); extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); #endif +#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC) +extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc); +extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); +extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc); +extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); +#endif /* memory chunking */ extern uint32_t chunksize_c(void); diff --git a/functable.c b/functable.c index 23106b33d..da9d10ec5 100644 --- a/functable.c +++ b/functable.c @@ -130,6 +130,16 @@ static void init_functable(void) { ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni; } #endif + // X86 - VPCLMULQDQ +#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC) + if (x86_cpu_has_pclmulqdq && x86_cpu_has_avx512 && x86_cpu_has_vpclmulqdq) { + ft.crc32 = &crc32_vpclmulqdq; + ft.crc32_fold = &crc32_fold_vpclmulqdq; + ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy; + ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final; + ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset; + } +#endif // ARM - NEON diff --git a/win32/Makefile.msc b/win32/Makefile.msc index f2f0631a1..d2a98d6f0 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -64,7 +64,7 @@ OBJS = \ crc32_braid.obj \ crc32_braid_comb.obj \ crc32_fold.obj \ - crc32_fold_pclmulqdq.obj \ + crc32_pclmulqdq.obj \ deflate.obj \ deflate_fast.obj \ deflate_huff.obj \ @@ -206,7 +206,8 @@ cpu_features.obj: $(SRCDIR)/cpu_features.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h -crc32_fold_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq.c $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h +crc32_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_pclmulqdq.c $(SRCDIR)/arch/x86/crc32_pclmulqdq_tpl.h $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq_tpl.h \ + $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h -- 2.47.3