check_pclmulqdq_intrinsics()
if(HAVE_PCLMULQDQ_INTRIN AND HAVE_SSSE3_INTRIN)
add_definitions(-DX86_PCLMULQDQ_CRC)
- set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_pclmulqdq.c)
+ set(PCLMULQDQ_SRCS ${ARCHDIR}/crc32_pclmulqdq.c)
add_feature_info(PCLMUL_CRC 1 "Support CRC hash generation using PCLMULQDQ, using \"${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${PCLMULQDQ_SRCS})
set_property(SOURCE ${PCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${NOLTOFLAG}")
check_vpclmulqdq_intrinsics()
if(HAVE_VPCLMULQDQ_INTRIN AND HAVE_AVX512_INTRIN)
add_definitions(-DX86_VPCLMULQDQ_CRC)
- set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_fold_vpclmulqdq.c)
+ set(VPCLMULQDQ_SRCS ${ARCHDIR}/crc32_vpclmulqdq.c)
add_feature_info(VPCLMUL_CRC 1 "Support CRC hash generation using VPCLMULQDQ, using \"${VPCLMULFLAG} ${AVX512FLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${VPCLMULQDQ_SRCS})
- set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
+ set_property(SOURCE ${VPCLMULQDQ_SRCS} PROPERTY COMPILE_FLAGS "${SSSE3FLAG} ${SSE42FLAG} ${PCLMULFLAG} ${VPCLMULFLAG} ${AVX512FLAG} ${NOLTOFLAG}")
else()
set(WITH_VPCLMULQDQ OFF)
endif()
compare256_avx2.o compare256_avx2.lo \
compare256_sse2.o compare256_sse2.lo \
insert_string_sse42.o insert_string_sse42.lo \
- crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
- crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
+ crc32_pclmulqdq.o crc32_pclmulqdq.lo \
+ crc32_vpclmulqdq.o crc32_vpclmulqdq.lo \
slide_hash_avx2.o slide_hash_avx2.lo \
slide_hash_sse2.o slide_hash_sse2.lo
insert_string_sse42.lo:
$(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
-crc32_fold_pclmulqdq.o:
- $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
+crc32_pclmulqdq.o:
+ $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
-crc32_fold_pclmulqdq.lo:
- $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
+crc32_pclmulqdq.lo:
+ $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_pclmulqdq.c
-crc32_fold_vpclmulqdq.o:
- $(CC) $(CFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+crc32_vpclmulqdq.o:
+ $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
-crc32_fold_vpclmulqdq.lo:
- $(CC) $(SFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
+crc32_vpclmulqdq.lo:
+ $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_vpclmulqdq.c
slide_hash_avx2.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#ifdef X86_PCLMULQDQ_CRC
-
#ifdef COPY
-Z_INTERNAL void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL void CRC32_FOLD_COPY_NAME(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
#else
-Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
+Z_INTERNAL void CRC32_FOLD_NAME(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
#endif
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
dst += algn_diff;
#else
- XOR_INITIAL(xmm_crc_part);
+ XOR_INITIAL128(xmm_crc_part);
if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
len -= algn_diff;
}
-#ifdef X86_VPCLMULQDQ_CRC
- if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
+#ifdef X86_VPCLMULQDQ
+ if (len >= 256) {
#ifdef COPY
size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
dst += n;
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
dst += 64;
#else
- XOR_INITIAL(xmm_t0);
+ XOR_INITIAL128(xmm_t0);
#endif
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
dst += 48;
#else
- XOR_INITIAL(xmm_t0);
+ XOR_INITIAL128(xmm_t0);
#endif
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
dst += 32;
#else
- XOR_INITIAL(xmm_t0);
+ XOR_INITIAL128(xmm_t0);
#endif
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
dst += 16;
#else
- XOR_INITIAL(xmm_t0);
+ XOR_INITIAL128(xmm_t0);
#endif
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
}
-#endif
+++ /dev/null
-/* crc32_fold_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation.
- * Copyright Wangyang Guo (wangyang.guo@intel.com)
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifdef X86_VPCLMULQDQ_CRC
-#include "../../zbuild.h"
-#include "../../fallback_builtins.h"
-
-#include <immintrin.h>
-
-#define ONCE(op) if (first) { first = 0; op; }
-#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
-
-#include "crc32_fold_vpclmulqdq_tpl.h"
-#define COPY
-#include "crc32_fold_vpclmulqdq_tpl.h"
-
-#endif
*/
#ifdef COPY
-Z_INTERNAL size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
+static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
#else
-Z_INTERNAL size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
__m128i init_crc, int32_t first) {
__m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
zmm_crc0 = _mm512_setzero_si512();
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
#ifndef COPY
- XOR_INITIAL(zmm_t0);
+ XOR_INITIAL512(zmm_t0);
#endif
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
--- /dev/null
+/*
+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
+ * instruction.
+ *
+ * A white paper describing this algorithm can be found at:
+ * doc/crc-pclmulqdq.pdf
+ *
+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
+ * Authors:
+ * Wajdi Feghali <wajdi.k.feghali@intel.com>
+ * Jim Guilford <james.guilford@intel.com>
+ * Vinodh Gopal <vinodh.gopal@intel.com>
+ * Erdinc Ozturk <erdinc.ozturk@intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_PCLMULQDQ_CRC
+
+#define CRC32_FOLD_COPY_NAME crc32_fold_pclmulqdq_copy
+#define CRC32_FOLD_NAME crc32_fold_pclmulqdq
+#define CRC32_FOLD_RESET_NAME crc32_fold_pclmulqdq_reset
+#define CRC32_FOLD_FINAL_NAME crc32_fold_pclmulqdq_final
+#define CRC32_NAME crc32_pclmulqdq
+
+#include "crc32_pclmulqdq_tpl.h"
+
+#endif
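
The new crc32_pclmulqdq.c above (and crc32_vpclmulqdq.c below) is only a thin shim: it picks the CRC32_*_NAME symbols, optionally defines X86_VPCLMULQDQ, and then includes the shared crc32_pclmulqdq_tpl.h so the same folding code is compiled once per instruction-set variant under distinct names. A minimal sketch of that shim-plus-template layout, using invented file and function names rather than the real zlib-ng sources:

/* sum_tpl.h -- hypothetical shared template; the including .c file must
 * define SUM_NAME to choose the symbol this instantiation produces. */
static unsigned SUM_NAME(const unsigned char *buf, size_t len) {
    unsigned s = 0;
    for (size_t i = 0; i < len; i++)
        s += buf[i];               /* stand-in for the per-variant SIMD work */
    return s;
}

/* sum_sse.c -- thin shim, analogous to crc32_pclmulqdq.c */
#include <stddef.h>
#define SUM_NAME sum_sse
#include "sum_tpl.h"

/* sum_avx512.c -- second shim, analogous to crc32_vpclmulqdq.c: same template,
 * a different name macro, plus an extra feature define and wider compile flags. */
#include <stddef.h>
#define SUM_WIDE_VARIANT           /* analogous to defining X86_VPCLMULQDQ */
#define SUM_NAME sum_avx512
#include "sum_tpl.h"

Each shim is then built with its own instruction-set flags (see the CMake set_property and Makefile rules above), which is why only the two .c files, never the template headers, appear in the object lists.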
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#ifdef X86_PCLMULQDQ_CRC
#include "../../zbuild.h"
#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
-
-#include "x86_features.h"
-#include "cpu_features.h"
+#ifdef X86_VPCLMULQDQ
+#include <immintrin.h>
+#endif
#include "../../crc32_fold.h"
#include "../../crc32_braid_p.h"
+#include "../../fallback_builtins.h"
#include <assert.h>
-#ifdef X86_VPCLMULQDQ_CRC
-extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+#ifdef X86_VPCLMULQDQ
+static size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
int32_t first);
-extern size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
+static size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
#endif
_mm_storeu_si128(fold + 3, *fold3);
}
-Z_INTERNAL uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc) {
+Z_INTERNAL uint32_t CRC32_FOLD_RESET_NAME(crc32_fold *crc) {
__m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
__m128i xmm_zero = _mm_setzero_si128();
crc32_fold_save((__m128i *)crc->fold, &xmm_crc0, &xmm_zero, &xmm_zero, &xmm_zero);
return 0;
}
-#define ONCE(op) if (first) { first = 0; op; }
-#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
+#define ONCE(op) if (first) { first = 0; op; }
+#define XOR_INITIAL128(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
+#ifdef X86_VPCLMULQDQ
+#define XOR_INITIAL512(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
+#endif
+#ifdef X86_VPCLMULQDQ
+#include "crc32_fold_vpclmulqdq_tpl.h"
+#endif
#include "crc32_fold_pclmulqdq_tpl.h"
#define COPY
+#ifdef X86_VPCLMULQDQ
+#include "crc32_fold_vpclmulqdq_tpl.h"
+#endif
#include "crc32_fold_pclmulqdq_tpl.h"
static const unsigned ALIGNED_(16) crc_k[] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
-Z_INTERNAL uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc) {
+Z_INTERNAL uint32_t CRC32_FOLD_FINAL_NAME(crc32_fold *crc) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
return crc->value;
}
-Z_INTERNAL uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len) {
+Z_INTERNAL uint32_t CRC32_NAME(uint32_t crc32, const uint8_t *buf, size_t len) {
/* For lens < 64, crc32_braid method is faster. The CRC32 instruction for
* these short lengths might also prove to be effective */
if (len < 64)
return PREFIX(crc32_braid)(crc32, buf, len);
crc32_fold ALIGNED_(16) crc_state;
- crc32_fold_pclmulqdq_reset(&crc_state);
- crc32_fold_pclmulqdq(&crc_state, buf, len, crc32);
- return crc32_fold_pclmulqdq_final(&crc_state);
+ CRC32_FOLD_RESET_NAME(&crc_state);
+ CRC32_FOLD_NAME(&crc_state, buf, len, crc32);
+ return CRC32_FOLD_FINAL_NAME(&crc_state);
}
-#endif
--- /dev/null
+/* crc32_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+
+#define X86_VPCLMULQDQ
+#define CRC32_FOLD_COPY_NAME crc32_fold_vpclmulqdq_copy
+#define CRC32_FOLD_NAME crc32_fold_vpclmulqdq
+#define CRC32_FOLD_RESET_NAME crc32_fold_vpclmulqdq_reset
+#define CRC32_FOLD_FINAL_NAME crc32_fold_vpclmulqdq_final
+#define CRC32_NAME crc32_vpclmulqdq
+
+#include "crc32_pclmulqdq_tpl.h"
+
+#endif
if test ${HAVE_PCLMULQDQ_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_PCLMULQDQ_CRC"
SFLAGS="${SFLAGS} -DX86_PCLMULQDQ_CRC"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_pclmulqdq.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_pclmulqdq.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_pclmulqdq.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_pclmulqdq.lo"
if test $buildvpclmulqdq -eq 1; then
check_vpclmulqdq_intrinsics
if test ${HAVE_VPCLMULQDQ_INTRIN} -eq 1 && test ${HAVE_AVX512_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_VPCLMULQDQ_CRC"
SFLAGS="${SFLAGS} -DX86_VPCLMULQDQ_CRC"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_fold_vpclmulqdq.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_fold_vpclmulqdq.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_vpclmulqdq.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_vpclmulqdq.lo"
fi
fi
fi
extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc);
extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
#endif
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+extern uint32_t crc32_fold_vpclmulqdq_reset(crc32_fold *crc);
+extern void crc32_fold_vpclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern void crc32_fold_vpclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
+extern uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc);
+extern uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len);
+#endif
/* memory chunking */
extern uint32_t chunksize_c(void);
ft.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
}
#endif
+ // X86 - VPCLMULQDQ
+#if defined(X86_PCLMULQDQ_CRC) && defined(X86_VPCLMULQDQ_CRC)
+ if (x86_cpu_has_pclmulqdq && x86_cpu_has_avx512 && x86_cpu_has_vpclmulqdq) {
+ ft.crc32 = &crc32_vpclmulqdq;
+ ft.crc32_fold = &crc32_fold_vpclmulqdq;
+ ft.crc32_fold_copy = &crc32_fold_vpclmulqdq_copy;
+ ft.crc32_fold_final = &crc32_fold_vpclmulqdq_final;
+ ft.crc32_fold_reset = &crc32_fold_vpclmulqdq_reset;
+ }
+#endif
// ARM - NEON
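
The functable hook above only overrides the CRC32 entries when all three CPU features (PCLMULQDQ, AVX-512, VPCLMULQDQ) are present, and it runs after the plain PCLMULQDQ block, so the widest usable implementation wins. A rough sketch of that "later, more specific check overrides the earlier default" selection, with invented names standing in for the real functable entries:

/* dispatch_sketch.c -- hypothetical illustration of feature-based selection;
 * the functions and flags below are stand-ins, not the zlib-ng API. */
#include <stdint.h>
#include <stddef.h>

typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, size_t len);

/* dummy implementations; each would be a real per-ISA variant in practice */
static uint32_t crc32_portable(uint32_t c, const uint8_t *b, size_t n) { (void)b; (void)n; return c; }
static uint32_t crc32_clmul128(uint32_t c, const uint8_t *b, size_t n) { (void)b; (void)n; return c; }
static uint32_t crc32_clmul512(uint32_t c, const uint8_t *b, size_t n) { (void)b; (void)n; return c; }

crc32_func select_crc32(int has_pclmulqdq, int has_avx512, int has_vpclmulqdq) {
    crc32_func f = crc32_portable;                     /* safe default */
    if (has_pclmulqdq)
        f = crc32_clmul128;                            /* 128-bit folding */
    if (has_pclmulqdq && has_avx512 && has_vpclmulqdq)
        f = crc32_clmul512;                            /* 512-bit folding needs all three features */
    return f;
}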
crc32_braid.obj \
crc32_braid_comb.obj \
crc32_fold.obj \
- crc32_fold_pclmulqdq.obj \
+ crc32_pclmulqdq.obj \
deflate.obj \
deflate_fast.obj \
deflate_huff.obj \
crc32_braid.obj: $(SRCDIR)/crc32_braid.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h
crc32_braid_comb.obj: $(SRCDIR)/crc32_braid_comb.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h $(SRCDIR)/crc32_braid_p.h $(SRCDIR)/crc32_braid_tbl.h $(SRCDIR)/crc32_braid_comb_p.h
crc32_fold.obj: $(SRCDIR)/crc32_fold.c $(SRCDIR)/zbuild.h
-crc32_fold_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq.c $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h
+crc32_pclmulqdq.obj: $(SRCDIR)/arch/x86/crc32_pclmulqdq.c $(SRCDIR)/arch/x86/crc32_pclmulqdq_tpl.h $(SRCDIR)/arch/x86/crc32_fold_pclmulqdq_tpl.h \
+ $(SRCDIR)/crc32_fold.h $(SRCDIR)/zbuild.h
deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
deflate_fast.obj: $(SRCDIR)/deflate_fast.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h
deflate_huff.obj: $(SRCDIR)/deflate_huff.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h