* https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
#include <smmintrin.h> // _mm_extract_epi32
#include "x86_features.h"
+#include "cpu_features.h"
#include "../../crc32_fold.h"
+#include "../../crc32_p.h"
+#include <assert.h>
#ifdef X86_VPCLMULQDQ_CRC
extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
+extern size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
+ int32_t first);
#endif
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
crc32_fold_save_partial((__m128i *)crc->fold, xmm_crc_part);
}
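+/* ONCE() executes op only on the first invocation; XOR_INITIAL() uses it to mix the
+ * initial CRC value into the first block of data that gets folded. */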
+#define ONCE(op) if (first) { \
+ first = 0; \
+ (op); \
+}
+#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
+
+Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
+ unsigned long algn_diff;
+ __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+ __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part;
+ __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
+ int32_t first = 1;
+
+    /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
+     * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, carrying
+     * it forward through the folded CRC requires 16 - (src % 16) + 16 bytes to be available, which
+     * can be up to 15 bytes plus one full vector load. */
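+    /* The worst case of the formula above is src % 16 == 1: 16 - 1 + 16 = 31 bytes. */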
+ assert(len >= 31);
+ crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
+ if (algn_diff) {
+ if (algn_diff >= 4) {
+ xmm_crc_part = _mm_loadu_si128((__m128i *)src);
+
+ src += algn_diff;
+ len -= algn_diff;
+
+ XOR_INITIAL(xmm_crc_part);
+ partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+ } else {
+ xmm_t0 = _mm_loadu_si128((__m128i*)src);
+ xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
+ XOR_INITIAL(xmm_t0);
+ fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+ partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+
+ src += (algn_diff + 16);
+ len -= (algn_diff + 16);
+ }
+ }
+
+ xmm_crc_part = _mm_setzero_si128();
+
+#ifdef X86_VPCLMULQDQ_CRC
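+    /* When the CPU supports VPCLMULQDQ and AVX-512 and enough input remains,
+     * fold 256 bytes per iteration in the 512-bit path. */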
+ if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
+ size_t n = fold_16_vpclmulqdq_nocp(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
+ xmm_initial, first);
+ first = 0;
+
+ len -= n;
+ src += n;
+ }
+#endif
+
+ while (len >= 64) {
+ crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
+ XOR_INITIAL(xmm_t0);
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+
+ src += 64;
+ len -= 64;
+ }
+
+    /*
+     * Fewer than 64 bytes remain; fold the remaining 48/32/16-byte chunks, then
+     * handle any tail with a partial fold.
+     */
+ if (len >= 48) {
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ XOR_INITIAL(xmm_t0);
+
+ fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+ len -= 48;
+ src += 48;
+ } else if (len >= 32) {
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ XOR_INITIAL(xmm_t0);
+
+ fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+
+ len -= 32;
+ src += 32;
+ } else if (len >= 16) {
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ XOR_INITIAL(xmm_t0);
+
+ fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+
+ len -= 16;
+ src += 16;
+ }
+
+ if (len) {
+ memcpy(&xmm_crc_part, src, len);
+ partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+ }
+
+ crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3);
+}
+
static const unsigned ALIGNED_(16) crc_k[] = {
0xccaa009e, 0x00000000, /* rk1 */
0x751997d0, 0x00000001, /* rk2 */
return crc->value;
}
+uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
+    /* For lengths < 64, the crc32_byfour method is faster. The CRC32 instruction
+     * might also prove effective for these short lengths. */
+ if (len < 64)
+ return crc32_byfour(crc32, buf, len);
+
+ crc32_fold ALIGNED_(16) crc_state;
+ crc32_fold_reset_pclmulqdq(&crc_state);
+ crc32_fold_pclmulqdq(&crc_state, buf, len, crc32);
+ return crc32_fold_final_pclmulqdq(&crc_state);
+}
+
#endif
#ifdef X86_VPCLMULQDQ_CRC
#include "../../zbuild.h"
+#include "../../fallback_builtins.h"
#include <immintrin.h>
+#define ONCE(op) if (first) { \
+ first = 0; \
+ (op); \
+}
+#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
+
size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
size_t len_tmp = len;
return (len_tmp - len); // return n bytes processed
}
+
+size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
+ __m128i init_crc, int32_t first) {
+ size_t len_tmp = len;
+ __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+ __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+ __m512i z0, z1, z2, z3;
+ __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
+ const __m512i zmm_fold4 = _mm512_set4_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ const __m512i zmm_fold16 = _mm512_set4_epi32(
+ 0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
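+    /* Folding constants for the zlib CRC-32 polynomial: zmm_fold4 folds a 128-bit
+     * lane forward by 4 vectors (64 bytes), zmm_fold16 by 16 vectors (256 bytes). */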
+
+ // zmm register init
+ zmm_crc0 = _mm512_setzero_si512();
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ XOR_INITIAL(zmm_t0);
+ zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+ /* already have intermediate CRC in xmm registers
+ * fold4 with 4 xmm_crc to get zmm_crc0
+ */
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
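+    /* Per 128-bit lane: imm 0x01 multiplies the high 64 bits of the CRC state by the
+     * low 64 bits of the fold constant, imm 0x10 multiplies the low 64 bits by the
+     * high 64 bits; XORing both products folds the state forward. */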
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+
+ len -= 256;
+ src += 256;
+
+ // fold-16 loops
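+    // each iteration folds the four 512-bit accumulators forward by 256 bytes
+    // and XORs in the next 256 bytes of input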
+ while (len >= 256) {
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+ z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+ z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+ z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+ zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+ zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+ zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
+ zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
+ zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
+
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+ zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
+ zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
+ zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
+
+ len -= 256;
+ src += 256;
+ }
+ // zmm_crc[0,1,2,3] -> zmm_crc0
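+    // three successive fold-by-4 steps merge zmm_crc1..zmm_crc3 into zmm_crc0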
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
+
+ // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+ *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+ *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+ *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+ *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+ return (len_tmp - len); // return n bytes processed
+}
#endif
#ifdef X86_PCLMULQDQ_CRC
extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
extern void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
+extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc);
extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc);
+extern uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len);
#endif
/* memory chunking */
#elif defined(S390_CRC32_VX)
if (s390_cpu_has_vx)
functable.crc32 = &s390_crc32_vx;
+#elif defined(X86_PCLMULQDQ_CRC)
+ if (x86_cpu_has_pclmulqdq)
+ functable.crc32 = &crc32_pclmulqdq;
#endif
return functable.crc32(crc, buf, len);
BENCHMARK_CRC32(vx, s390_crc32_vx, s390_cpu_has_vx);
#elif defined(X86_PCLMULQDQ_CRC)
-/* CRC32 fold does a memory copy while hashing */
-uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
- crc32_fold ALIGNED_(16) crc_state;
- crc32_fold_reset_pclmulqdq(&crc_state);
- crc32_fold_copy_pclmulqdq(&crc_state, (uint8_t *)buf, buf, len);
- return crc32_fold_final_pclmulqdq(&crc_state);
-}
BENCHMARK_CRC32(pclmulqdq, crc32_pclmulqdq, x86_cpu_has_pclmulqdq);
-#endif
\ No newline at end of file
+#endif