Based on PR https://github.com/jtkukunas/zlib/pull/28.
Co-authored-by: Wangyang Guo <wangyang.guo@intel.com>
option(WITH_SSE41 "Build with SSE41" ON)
option(WITH_SSE42 "Build with SSE42" ON)
option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON)
+ option(WITH_VPCLMULQDQ "Build with VPCLMULQDQ" ON)
endif()
option(INSTALL_UTILS "Copy minigzip and minideflate during install" OFF)
add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
+
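+ # The 512-bit CRC folding path is only built when both the VPCLMULQDQ and AVX512 checks pass.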
+ if(WITH_VPCLMULQDQ AND WITH_AVX512)
+ check_vpclmulqdq_intrinsics()
+ if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN)
+ add_definitions(-DX86_VPCLMULQDQ_CRC)
+ set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_vpclmulqdq.c)
+ add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${VPCLMULFLAG} ${AVX512FLAG}\"")
+ list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
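+ # Per-source compile flags keep the AVX512/VPCLMULQDQ code generation limited to this file.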
+ set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+ else()
+ set(WITH_VPCLMULQDQ OFF)
+ endif()
+ else()
+ set(WITH_VPCLMULQDQ OFF)
+ endif()
else()
set(WITH_PCLMULQDQ OFF)
+ set(WITH_VPCLMULQDQ OFF)
endif()
else()
set(WITH_PCLMULQDQ OFF)
+ set(WITH_VPCLMULQDQ OFF)
endif()
endif()
endif()
add_feature_info(WITH_SSE41 WITH_SSE41 "Build with SSE41")
add_feature_info(WITH_SSE42 WITH_SSE42 "Build with SSE42")
add_feature_info(WITH_PCLMULQDQ WITH_PCLMULQDQ "Build with PCLMULQDQ")
+ add_feature_info(WITH_VPCLMULQDQ WITH_VPCLMULQDQ "Build with VPCLMULQDQ")
endif()
add_feature_info(INSTALL_UTILS INSTALL_UTILS "Copy minigzip and minideflate during install")
* Deflate medium and quick algorithms based on Intel's zlib fork
* Support for CPU intrinsics when available
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
- * CRC32-B implementation using PCLMULQDQ & ACLE
+ * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ & ACLE
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
* Compare256/258 implementations using SSE4.2 & AVX2
| WITH_SSE41 | | Build with SSE41 intrinsics | ON |
| WITH_SSE42 | | Build with SSE42 intrinsics | ON |
| WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
+| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
| WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
|:-|:-|
|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
|crc32_fold_pclmulqdq.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
+|crc32_fold_vpclmulqdq.c|VPCLMULQDQ + AVX512 optimized CRC folding implementation|
|slide_hash_sse2.c|SSE2 optimized slide_hash|
SSE41FLAG=-msse4.1
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
NOLTOFLAG=
SRCDIR=.
compare258_sse.o compare258_sse.lo \
insert_string_sse.o insert_string_sse.lo \
crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
+ crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
slide_hash_avx.o slide_hash_avx.lo \
slide_hash_sse.o slide_hash_sse.lo
crc32_fold_pclmulqdq.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
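+# crc32_fold_vpclmulqdq.c needs both $(VPCLMULFLAG) and $(AVX512FLAG); the
+# 512-bit carry-less multiply intrinsics also require AVX512F.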
+crc32_fold_vpclmulqdq.o:
+ $(CC) $(CFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+
+crc32_fold_vpclmulqdq.lo:
+ $(CC) $(SFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+
slide_hash_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx.c
#include "../../crc32_fold.h"
+#ifdef X86_VPCLMULQDQ_CRC
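+/* 512-bit (VPCLMULQDQ + AVX512) folding over 256-byte blocks, implemented in
+ * crc32_fold_vpclmulqdq.c: copies src to dst, updates the four 128-bit CRC
+ * accumulators in place, and returns the number of bytes consumed.
+ */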
+extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
xmm_crc_part = _mm_setzero_si128();
}
+#ifdef X86_VPCLMULQDQ_CRC
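+ /* For buffers of at least 256 bytes, fold with 512-bit vectors first; any
+  * remainder falls through to the 128-bit loop below. */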
+ if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
+ size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
+
+ len -= n;
+ src += n;
+ dst += n;
+ }
+#endif
+
while (len >= 64) {
crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
--- /dev/null
+/* crc32_fold_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_CRC
+#include "../../zutil.h"
+
+#include <immintrin.h>
+
+size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
+ size_t len_tmp = len;
+ __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+ __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+ __m512i z0, z1, z2, z3;
+ z_const __m512i zmm_fold4 = _mm512_set4_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ z_const __m512i zmm_fold16 = _mm512_set4_epi32(
+ 0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
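+ /* zmm_fold4 and zmm_fold16 shift the CRC state forward by 4 and by 16 128-bit
+  * lanes (512 and 2048 bits), following Intel's PCLMULQDQ CRC folding method. */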
+
+ // zmm register init
+ zmm_crc0 = _mm512_setzero_si512();
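+ /* load the first 256 bytes: the first 64 go into zmm_t0 (combined with the
+  * incoming CRC below), the next 192 seed zmm_crc1..3 directly */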
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+ /* the intermediate CRC is already held in the four xmm registers;
+  * pack them into zmm_crc0, fold by 4 and combine with the first 64 data bytes
+  */
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+
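+ /* copy the 256 source bytes just loaded to dst (this routine folds and copies) */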
+ _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+ _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+ _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+ _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+ len -= 256;
+ src += 256;
+ dst += 256;
+
+ // fold-16 loop: 256 bytes per iteration
+ while (len >= 256) {
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+ z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+ z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+ z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+ zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+ zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+ zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
+ zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
+ zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
+
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+ zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
+ zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
+ zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
+
+ _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+ _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+ _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+ _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+ len -= 256;
+ src += 256;
+ dst += 256;
+ }
+ // reduce zmm_crc[0,1,2,3] -> zmm_crc0: fold by 4 and XOR in the next accumulator, three times
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
+
+ // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+ *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+ *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+ *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+ *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+ return (len_tmp - len); // number of bytes processed
+}
+#endif
Z_INTERNAL int x86_cpu_has_sse41;
Z_INTERNAL int x86_cpu_has_sse42;
Z_INTERNAL int x86_cpu_has_pclmulqdq;
+Z_INTERNAL int x86_cpu_has_vpclmulqdq;
Z_INTERNAL int x86_cpu_has_tzcnt;
Z_INTERNAL int x86_cpu_well_suited_avx512;
x86_cpu_has_avx2 = ebx & 0x20;
x86_cpu_has_avx512 = ebx & 0x00010000;
x86_cpu_has_avx512vnni = ecx & 0x800;
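+ /* CPUID leaf 7, ECX bit 10: VPCLMULQDQ */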
+ x86_cpu_has_vpclmulqdq = ecx & 0x400;
} else {
x86_cpu_has_tzcnt = 0;
x86_cpu_has_avx2 = 0;
+ x86_cpu_has_vpclmulqdq = 0;
}
extern int x86_cpu_has_sse41;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
+extern int x86_cpu_has_vpclmulqdq;
extern int x86_cpu_has_tzcnt;
extern int x86_cpu_well_suited_avx512;
endif()
endmacro()
+macro(check_vpclmulqdq_intrinsics)
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ if(NOT NATIVEFLAG)
+ set(VPCLMULFLAG "-mvpclmulqdq")
+ endif()
+ endif()
+ # Check whether compiler supports VPCLMULQDQ intrinsics
+ if(NOT (APPLE AND "${ARCH}" MATCHES "i386"))
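+ # _mm512_clmulepi64_epi128 also needs AVX512F, so include the AVX512 flags in the test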
+ set(CMAKE_REQUIRED_FLAGS "${VPCLMULFLAG} ${AVX512FLAG}")
+ check_c_source_compile_or_run(
+ "#include <immintrin.h>
+ int main(void) {
+ __m512i a = _mm512_setzero_si512();
+ __m512i b = _mm512_setzero_si512();
+ __m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
+ (void)c;
+ return 0;
+ }"
+ HAVE_VPCLMULQDQ_INTRIN
+ )
+ set(CMAKE_REQUIRED_FLAGS)
+ else()
+ set(HAVE_VPCLMULQDQ_INTRIN OFF)
+ endif()
+endmacro()
+
macro(check_ppc_intrinsics)
# Check if compiler supports AltiVec
set(CMAKE_REQUIRED_FLAGS "-maltivec")
cover=0
build32=0
build64=0
+buildvpclmulqdq=1
buildacle=1
buildaltivec=1
buildpower8=1
sse41flag="-msse4.1"
sse42flag="-msse4.2"
pclmulflag="-mpclmul"
+vpclmulflag="-mvpclmulqdq"
acleflag=
neonflag=
noltoflag="-fno-lto"
--cover) cover=1; shift ;;
-3* | --32) build32=1; shift ;;
-6* | --64) build64=1; shift ;;
+ --without-vpclmulqdq) buildvpclmulqdq=0; shift ;;
--without-acle) buildacle=0; shift ;;
--without-neon) buildneon=0; shift ;;
--without-altivec) buildaltivec=0 ; shift ;;
sse4flag=""
sse42flag=""
pclmulflag=""
+ vpclmulflag=""
noltoflag=""
fi
fi
}
+check_vpclmulqdq_intrinsics() {
+ # Check whether compiler supports VPCLMULQDQ intrinsics
+ cat > $test.c << EOF
+#include <immintrin.h>
+#include <wmmintrin.h>
+int main(void) {
+ __m512i a = _mm512_setzero_si512();
+ __m512i b = _mm512_setzero_si512();
+ __m512i c = _mm512_clmulepi64_epi128(a, b, 0x10);
+ (void)c;
+ return 0;
+}
+EOF
+ if try ${CC} ${CFLAGS} ${vpclmulflag} ${avx512flag} $test.c; then
+ echo "Checking for VPCLMULQDQ intrinsics ... Yes." | tee -a configure.log
+ HAVE_VPCLMULQDQ_INTRIN=1
+ else
+ echo "Checking for VPCLMULQDQ intrinsics ... No." | tee -a configure.log
+ HAVE_VPCLMULQDQ_INTRIN=0
+ fi
+}
+
check_ppc_intrinsics() {
cat > $test.c << EOF
#include <altivec.h>
SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo"
+
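+ # enable the VPCLMULQDQ CRC path only when both the VPCLMULQDQ and AVX512 intrinsics compile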
+ if test $buildvpclmulqdq -eq 1; then
+ check_vpclmulqdq_intrinsics
+
+ if test ${HAVE_VPCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX512_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_CRC"
+ SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_CRC"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_vpclmulqdq.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_vpclmulqdq.lo"
+ fi
+ fi
fi
fi
;;
echo sse41flag = $sse41flag >> configure.log
echo sse42flag = $sse42flag >> configure.log
echo pclmulflag = $pclmulflag >> configure.log
+echo vpclmulflag = $vpclmulflag >> configure.log
echo acleflag = $acleflag >> configure.log
echo neonflag = $neonflag >> configure.log
echo ARCHDIR = ${ARCHDIR} >> configure.log
/^SSE41FLAG *=/s#=.*#=$sse41flag#
/^SSE42FLAG *=/s#=.*#=$sse42flag#
/^PCLMULFLAG *=/s#=.*#=$pclmulflag#
+/^VPCLMULFLAG *=/s#=.*#=$vpclmulflag#
/^ACLEFLAG *=/s#=.*#=$acleflag#
/^NEONFLAG *=/s#=.*#=$neonflag#
/^NOLTOFLAG *=/s#=.*#=$noltoflag#