From: Nathan Moinvaziri
Date: Wed, 15 Dec 2021 22:21:58 +0000 (-0800)
Subject: VPCLMULQDQ implementation for Intel's CRC32 folding.
X-Git-Tag: 2.1.0-beta1~451
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f20f9b610c286fe55e309c8ff24ac9ac795c78e5;p=thirdparty%2Fzlib-ng.git

VPCLMULQDQ implementation for Intel's CRC32 folding.

Based on PR https://github.com/jtkukunas/zlib/pull/28.

Co-authored-by: Wangyang Guo <wangyang.guo@intel.com>
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 956f73563..ecb9ac6b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -113,6 +113,7 @@ elseif(BASEARCH_X86_FOUND)
         option(WITH_SSE41 "Build with SSE41" ON)
         option(WITH_SSE42 "Build with SSE42" ON)
         option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON)
+        option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON)
     endif()
     option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF)
@@ -827,11 +828,28 @@ if(WITH_OPTIM)
             add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG}\"")
             list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
             set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
+
+            if(WITH_VPCLMULQDQ AND WITH_AVX512)
+                check_vpclmulqdq_intrinsics()
+                if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN)
+                    add_definitions(-DX86_VPCLMULQDQ_CRC)
+                    set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_vpclmulqdq.c)
+                    add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${VPCLMULFLAG} ${AVX512FLAG}\"")
+                    list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
+                    set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+                else()
+                    set(WITH_VPCLMULQDQ OFF)
+                endif()
+            else()
+                set(WITH_VPCLMULQDQ OFF)
+            endif()
         else()
             set(WITH_PCLMULQDQ OFF)
+            set(WITH_VPCLMULQDQ OFF)
         endif()
     else()
         set(WITH_PCLMULQDQ OFF)
+        set(WITH_VPCLMULQDQ OFF)
     endif()
 endif()
 endif()
 endif()
@@ -1458,6 +1476,7 @@ elseif(BASEARCH_X86_FOUND)
     add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
     add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
     add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
+    add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")
 endif()
 add_feature_info(INSTALL_UTILS INSTALL_UTILS "Copy minigzip and minideflate during install")

diff --git a/README.md b/README.md
index 44e63db30..79d6648bd 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ Features
  * Deflate medium and quick algorithms based on Intels zlib fork
  * Support for CPU intrinsics when available
  * Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
- * CRC32-B implementation using PCLMULQDQ & ACLE
+ * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, & ACLE
  * Hash table implementation using CRC32-C intrinsics on x86 and ARM
  * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
  * Compare256/258 implementations using SSE4.2 & AVX2
@@ -203,6 +203,7 @@ Advanced Build Options
 | WITH_SSE41 | | Build with SSE41 intrinsics | ON |
 | WITH_SSE42 | | Build with SSE42 intrinsics | ON |
 | WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
+| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
 | WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
 | WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
 | WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
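For context on the technique this patch widens: CRC folding keeps the running CRC as polynomial remainders in SIMD registers, and each step carry-less-multiplies the state by a precomputed constant (x^k mod P, for a shift of k bits) before XORing in the next block of input. A minimal single-register sketch of the fold step the existing PCLMULQDQ code uses, with our own names (fold_128 is hypothetical; the constants are the pair the patch loads into zmm_fold4; build with -mpclmul):

    #include <immintrin.h>
    #include <wmmintrin.h>

    /* Fold 128 bits of CRC state forward across 16 new input bytes.
     * imm 0x01 multiplies the high qword of crc by the low qword of k;
     * imm 0x10 multiplies the low qword of crc by the high qword of k. */
    static __m128i fold_128(__m128i crc, __m128i data) {
        const __m128i k = _mm_set_epi32(0x00000001, 0x54442bd4,
                                        0x00000001, 0xc6e41596);
        __m128i hi = _mm_clmulepi64_si128(crc, k, 0x01);
        __m128i lo = _mm_clmulepi64_si128(crc, k, 0x10);
        return _mm_xor_si128(_mm_xor_si128(hi, lo), data);
    }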
diff --git a/arch/x86/INDEX.md b/arch/x86/INDEX.md
index e20245a5e..af987a25c 100644
--- a/arch/x86/INDEX.md
+++ b/arch/x86/INDEX.md
@@ -5,4 +5,5 @@ Contents
 |:-|:-|
 |deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
 |crc32_fold_pclmulqdq.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
+|crc32_fold_vpclmulqdq.c|VPCLMULQDQ optimized CRC folding implementation|
 |slide_hash_sse2.c|SSE2 optimized slide_hash|

diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index f54a695c2..0a1dc0766 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -16,6 +16,7 @@ SSSE3FLAG=-mssse3
 SSE41FLAG=-msse4.1
 SSE42FLAG=-msse4.2
 PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
 NOLTOFLAG=

 SRCDIR=.
@@ -35,6 +36,7 @@ all: \
 	compare258_sse.o compare258_sse.lo \
 	insert_string_sse.o insert_string_sse.lo \
 	crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
+	crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
 	slide_hash_avx.o slide_hash_avx.lo \
 	slide_hash_sse.o slide_hash_sse.lo

@@ -80,6 +82,12 @@ crc32_fold_pclmulqdq.o:
 crc32_fold_pclmulqdq.lo:
 	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c

+crc32_fold_vpclmulqdq.o:
+	$(CC) $(CFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+
+crc32_fold_vpclmulqdq.lo:
+	$(CC) $(SFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+
 slide_hash_avx.o:
 	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx.c

diff --git a/arch/x86/crc32_fold_pclmulqdq.c b/arch/x86/crc32_fold_pclmulqdq.c
index d07ffb408..1434357a8 100644
--- a/arch/x86/crc32_fold_pclmulqdq.c
+++ b/arch/x86/crc32_fold_pclmulqdq.c
@@ -26,6 +26,11 @@
 #include "../../crc32_fold.h"

+#ifdef X86_VPCLMULQDQ_CRC
+extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
 static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
     const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
@@ -275,6 +280,16 @@ Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const u
         xmm_crc_part = _mm_setzero_si128();
     }

+#ifdef X86_VPCLMULQDQ_CRC
+    if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
+        size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
+
+        len -= n;
+        src += n;
+        dst += n;
+    }
+#endif
+
     while (len >= 64) {
         crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
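The dispatch hook above only hands off when both CPUID flags are set and at least 256 bytes remain; fold_16_vpclmulqdq (added below) returns how many bytes it consumed so the existing 64-byte loop can finish the tail. The widening itself is mechanical: _mm512_clmulepi64_epi128 performs four independent 128-bit carry-less multiplies, one per lane. A sketch of the per-register step, with our own names (assumes -mavx512f -mvpclmulqdq):

    #include <immintrin.h>

    /* One zmm fold step: four 128-bit lanes folded in parallel, so one
     * pair of multiplies advances the CRC state across 64 bytes. */
    static __m512i fold_512(__m512i crc, __m512i data, __m512i k) {
        __m512i hi = _mm512_clmulepi64_epi128(crc, k, 0x01);
        __m512i lo = _mm512_clmulepi64_epi128(crc, k, 0x10);
        return _mm512_xor_si512(_mm512_xor_si512(hi, lo), data);
    }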
diff --git a/arch/x86/crc32_fold_vpclmulqdq.c b/arch/x86/crc32_fold_vpclmulqdq.c
new file mode 100644
index 000000000..9ed54b37e
--- /dev/null
+++ b/arch/x86/crc32_fold_vpclmulqdq.c
@@ -0,0 +1,108 @@
+/* crc32_fold_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_CRC
+#include "../../zutil.h"
+
+#include <immintrin.h>
+
+size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+    __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
+    size_t len_tmp = len;
+    __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+    __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+    __m512i z0, z1, z2, z3;
+    z_const __m512i zmm_fold4 = _mm512_set4_epi32(
+        0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+    z_const __m512i zmm_fold16 = _mm512_set4_epi32(
+        0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+    // zmm register init
+    zmm_crc0 = _mm512_setzero_si512();
+    zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+    zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+    zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+    zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+    /* already have intermediate CRC in xmm registers
+     * fold4 with 4 xmm_crc to get zmm_crc0
+     */
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+    zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+
+    _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+    _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+    _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+    _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+    len -= 256;
+    src += 256;
+    dst += 256;
+
+    // fold-16 loops
+    while (len >= 256) {
+        zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+        zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+        zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+        zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+        z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+        z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+        z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+        z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+        zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+        zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+        zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+        zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+        zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+        zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
+        zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
+        zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
+
+        zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+        zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
+        zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
+        zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
+
+        _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+        _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+        _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+        _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+        len -= 256;
+        src += 256;
+        dst += 256;
+    }
+    // zmm_crc[0,1,2,3] -> zmm_crc0
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
+
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
+
+    z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+    zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+    zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+    zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
+
+    // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+    *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+    *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+    *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+    *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+    return (len_tmp - len); // return n bytes processed
+}
+#endif
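The imm8 selectors in the code above are easy to misread, so a scalar reference for what each 128-bit lane of VPCLMULQDQ computes can help when checking the fold constants. This helper is ours, not part of the patch:

    #include <stdint.h>

    /* Carry-less multiply of two 64-bit values; the 128-bit product comes
     * back as hi:lo. Addition becomes XOR, so no carries propagate. */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
        uint64_t h = 0, l = 0;
        for (int i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                l ^= a << i;
                if (i != 0)
                    h ^= a >> (64 - i);
            }
        }
        *hi = h;
        *lo = l;
    }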
diff --git a/arch/x86/x86.c b/arch/x86/x86.c
index 065a71703..32baf8a74 100644
--- a/arch/x86/x86.c
+++ b/arch/x86/x86.c
@@ -27,6 +27,7 @@ Z_INTERNAL int x86_cpu_has_ssse3;
 Z_INTERNAL int x86_cpu_has_sse41;
 Z_INTERNAL int x86_cpu_has_sse42;
 Z_INTERNAL int x86_cpu_has_pclmulqdq;
+Z_INTERNAL int x86_cpu_has_vpclmulqdq;
 Z_INTERNAL int x86_cpu_has_tzcnt;
 Z_INTERNAL int x86_cpu_well_suited_avx512;
@@ -98,9 +99,11 @@ void Z_INTERNAL x86_check_features(void) {
         x86_cpu_has_avx2 = ebx & 0x20;
         x86_cpu_has_avx512 = ebx & 0x00010000;
         x86_cpu_has_avx512vnni = ecx & 0x800;
+        x86_cpu_has_vpclmulqdq = ecx & 0x400;
     } else {
         x86_cpu_has_tzcnt = 0;
         x86_cpu_has_avx2 = 0;
+        x86_cpu_has_vpclmulqdq = 0;
     }

diff --git a/arch/x86/x86.h b/arch/x86/x86.h
index 80da6f32a..00f8d9efc 100644
--- a/arch/x86/x86.h
+++ b/arch/x86/x86.h
@@ -14,6 +14,7 @@ extern int x86_cpu_has_ssse3;
 extern int x86_cpu_has_sse41;
 extern int x86_cpu_has_sse42;
 extern int x86_cpu_has_pclmulqdq;
+extern int x86_cpu_has_vpclmulqdq;
 extern int x86_cpu_has_tzcnt;
 extern int x86_cpu_well_suited_avx512;
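The new flag comes from CPUID leaf 7, subleaf 0: bit 10 of ECX (0x400) reports VPCLMULQDQ, alongside the AVX512F bit 16 of EBX (0x00010000) that the existing x86_cpu_has_avx512 check reads. A standalone probe using GCC/Clang's cpuid helper, for illustration only:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        /* Leaf 7, subleaf 0: EBX bit 16 = AVX512F, ECX bit 10 = VPCLMULQDQ. */
        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            printf("AVX512F:    %s\n", (ebx & 0x00010000) ? "yes" : "no");
            printf("VPCLMULQDQ: %s\n", (ecx & 0x400) ? "yes" : "no");
        }
        return 0;
    }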
diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake
index 2facf5051..47d93d0c1 100644
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -168,6 +168,32 @@ macro(check_pclmulqdq_intrinsics)
     endif()
 endmacro()

+macro(check_vpclmulqdq_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            set(VPCLMULFLAG "-mvpclmulqdq")
+        endif()
+    endif()
+    # Check whether compiler supports VPCLMULQDQ intrinsics
+    if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
+        set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG}")
+        check_c_source_compile_or_run(
+            "#include <immintrin.h>
+            int main(void) {
+                __m512i a = _mm512_setzero_si512();
+                __m512i b = _mm512_setzero_si512();
+                __m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
+                (void)c;
+                return 0;
+            }"
+            HAVE_VPCLMULQDQ_INTRIN
+        )
+        set(CMAKE_REQUIRED_FLAGS)
+    else()
+        set(HAVE_VPCLMULQDQ_INTRIN OFF)
+    endif()
+endmacro()
+
 macro(check_ppc_intrinsics)
     # Check if compiler supports AltiVec
     set(CMAKE_REQUIRED_FLAGS "-maltivec")

diff --git a/configure b/configure
index 2c17507a8..143d6e4e6 100755
--- a/configure
+++ b/configure
@@ -90,6 +90,7 @@ compat=0
 cover=0
 build32=0
 build64=0
+buildvpclmulqdq=1
 buildacle=1
 buildaltivec=1
 buildpower8=1
@@ -112,6 +113,7 @@ ssse3flag="-mssse3"
 sse41flag="-msse4.1"
 sse42flag="-msse4.2"
 pclmulflag="-mpclmul"
+vpclmulflag="-mvpclmulqdq"
 acleflag=
 neonflag=
 noltoflag="-fno-lto"
@@ -194,6 +196,7 @@ case "$1" in
     --cover) cover=1; shift ;;
     -3* | --32) build32=1; shift ;;
     -6* | --64) build64=1; shift ;;
+    --without-vpclmulqdq) buildvpclmulqdq=0; shift ;;
     --without-acle) buildacle=0; shift ;;
     --without-neon) buildneon=0; shift ;;
     --without-altivec) buildaltivec=0 ; shift ;;
@@ -262,6 +265,7 @@ if test $native -eq 1; then
     sse4flag=""
     sse42flag=""
     pclmulflag=""
+    vpclmulflag=""
    noltoflag=""
 fi

@@ -1161,6 +1165,28 @@ EOF
     fi
 }

+check_vpclmulqdq_intrinsics() {
+    # Check whether compiler supports VPCLMULQDQ intrinsics
+    cat > $test.c << EOF
+#include <immintrin.h>
+#include <wmmintrin.h>
+int main(void) {
+    __m512i a = _mm512_setzero_si512();
+    __m512i b = _mm512_setzero_si512();
+    __m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
+    (void)c;
+    return 0;
+}
+EOF
+    if try ${CC} ${CFLAGS} ${vpclmulflag} $test.c; then
+        echo "Checking for VPCLMULQDQ intrinsics ... Yes." | tee -a configure.log
+        HAVE_VPCLMULQDQ_INTRIN=1
+    else
+        echo "Checking for VPCLMULQDQ intrinsics ... No." | tee -a configure.log
+        HAVE_VPCLMULQDQ_INTRIN=0
+    fi
+}
+
 check_ppc_intrinsics() {
     cat > $test.c << EOF
 #include <altivec.h>
@@ -1503,6 +1529,17 @@ case "${ARCH}" in
             SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC"
             ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o"
             ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo"
+
+            if test $buildvpclmulqdq -eq 1; then
+                check_vpclmulqdq_intrinsics
+
+                if test ${HAVE_VPCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX512_INTRIN} -eq 1; then
+                    CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_CRC"
+                    SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_CRC"
+                    ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_vpclmulqdq.o"
+                    ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_vpclmulqdq.lo"
+                fi
+            fi
         fi
     fi
     ;;
@@ -1912,6 +1949,7 @@ echo ssse3flag = $ssse3flag >> configure.log
 echo sse41flag = $sse41flag >> configure.log
 echo sse42flag = $sse42flag >> configure.log
 echo pclmulflag = $pclmulflag >> configure.log
+echo vpclmulflag = $vpclmulflag >> configure.log
 echo acleflag = $acleflag >> configure.log
 echo neonflag = $neonflag >> configure.log
 echo ARCHDIR = ${ARCHDIR} >> configure.log
@@ -2049,6 +2087,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
 /^SSE41FLAG *=/s#=.*#=$sse41flag#
 /^SSE42FLAG *=/s#=.*#=$sse42flag#
 /^PCLMULFLAG *=/s#=.*#=$pclmulflag#
+/^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#
 /^ACLEFLAG *=/s#=.*#=$acleflag#
 /^NEONFLAG *=/s#=.*#=$neonflag#
 /^NOLTOFLAG *=/s#=.*#=$noltoflag#
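Both build systems leave the feature on by default; cmake -DWITH_VPCLMULQDQ=OFF or ./configure --without-vpclmulqdq disables it, and runtime dispatch stays transparent to callers. The folding code is reached through deflate's gzip checksumming (crc32_fold_copy), so one way to exercise it end to end is to gzip-compress a large buffer and compare output against a build with the feature off. A minimal sketch, assuming a zlib-compat build of zlib-ng:

    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    int main(void) {
        static unsigned char in[1 << 20], out[1 << 20];  /* 1 MiB each */
        z_stream s;
        memset(in, 0xA5, sizeof(in));
        memset(&s, 0, sizeof(s));
        /* windowBits 15+16 selects the gzip wrapper, whose trailer CRC runs
         * through the crc32 folding path this patch accelerates. */
        if (deflateInit2(&s, 6, Z_DEFLATED, 15 + 16, 8, Z_DEFAULT_STRATEGY) != Z_OK)
            return 1;
        s.next_in = in;   s.avail_in = (unsigned int)sizeof(in);
        s.next_out = out; s.avail_out = (unsigned int)sizeof(out);
        if (deflate(&s, Z_FINISH) != Z_STREAM_END)
            return 1;
        printf("compressed %lu -> %lu bytes\n",
               (unsigned long)sizeof(in), (unsigned long)s.total_out);
        return deflateEnd(&s) == Z_OK ? 0 : 1;
    }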