From: Nathan Moinvaziri Date: Fri, 15 Apr 2022 02:49:32 +0000 (-0700) Subject: Move crc32 fold functions into templates. Don't store xmm_crc_part between runs becau... X-Git-Tag: 2.1.0-beta1~229 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=843c16c87afd53a20bf915661012b02acdd30281;p=thirdparty%2Fzlib-ng.git Move crc32 fold functions into templates. Don't store xmm_crc_part between runs because it is automatically folded into the checksum in partial_fold. Co-authored-by: Adam Stylinski --- diff --git a/arch/x86/crc32_fold_pclmulqdq.c b/arch/x86/crc32_fold_pclmulqdq.c index 6bb2c985c..01c753b48 100644 --- a/arch/x86/crc32_fold_pclmulqdq.c +++ b/arch/x86/crc32_fold_pclmulqdq.c @@ -33,10 +33,10 @@ #ifdef X86_VPCLMULQDQ_CRC extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, - __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len); -extern size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc, int32_t first); +extern size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1, + __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len); #endif static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { @@ -245,271 +245,19 @@ static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1, _mm_storeu_si128(fold + 3, fold3); } -static inline void crc32_fold_save_partial(__m128i *fold, __m128i foldp) { - _mm_store_si128(fold + 4, foldp); -} - -Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) { +Z_INTERNAL uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc) { __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); __m128i xmm_zero = _mm_setzero_si128(); crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_zero, xmm_zero, xmm_zero); return 0; } -Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { - unsigned long algn_diff; - __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; - __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part; - char ALIGNED_(16) partial_buf[16] = { 0 }; - - crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - if (len < 16) { - if (len == 0) - return; - - memcpy(partial_buf, src, len); - xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf); - memcpy(dst, partial_buf, len); - goto partial; - } - - algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF; - if (algn_diff) { - xmm_crc_part = _mm_loadu_si128((__m128i *)src); - _mm_storeu_si128((__m128i *)dst, xmm_crc_part); - - dst += algn_diff; - src += algn_diff; - len -= algn_diff; - - partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); - } else { - xmm_crc_part = _mm_setzero_si128(); - } - -#ifdef X86_VPCLMULQDQ_CRC - if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) { - size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len); - - len -= n; - src += n; - dst += n; - } -#endif - - while (len >= 64) { - crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3); - - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - crc32_fold_save((__m128i *)dst, xmm_t0, xmm_t1, xmm_t2, xmm_t3); - - xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); - xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); - xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); - - src += 64; - 
dst += 64; - len -= 64; - } - - /* - * len = num bytes left - 64 - */ - if (len >= 48) { - xmm_t0 = _mm_load_si128((__m128i *)src); - xmm_t1 = _mm_load_si128((__m128i *)src + 1); - xmm_t2 = _mm_load_si128((__m128i *)src + 2); - - fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - - xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); - xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); - len -= 48; - if (len == 0) - goto done; - - dst += 48; - memcpy(&xmm_crc_part, (__m128i *)src + 3, len); - } else if (len >= 32) { - xmm_t0 = _mm_load_si128((__m128i *)src); - xmm_t1 = _mm_load_si128((__m128i *)src + 1); - - fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - - xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); +#define ONCE(op) if (first) { first = 0; op; } +#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial)) - len -= 32; - if (len == 0) - goto done; - - dst += 32; - memcpy(&xmm_crc_part, (__m128i *)src + 2, len); - } else if (len >= 16) { - xmm_t0 = _mm_load_si128((__m128i *)src); - - fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - _mm_storeu_si128((__m128i *)dst, xmm_t0); - - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); - - len -= 16; - if (len == 0) - goto done; - - dst += 16; - memcpy(&xmm_crc_part, (__m128i *)src + 1, len); - } else { - if (len == 0) - goto done; - memcpy(&xmm_crc_part, src, len); - } - - _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part); - memcpy(dst, partial_buf, len); - -partial: - partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); -done: - crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3); - crc32_fold_save_partial((__m128i *)crc->fold, xmm_crc_part); -} - -#define ONCE(op) if (first) { \ - first = 0; \ - (op); \ -} -#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial)) - -Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) { - unsigned long algn_diff; - __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; - __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part; - __m128i xmm_initial = _mm_cvtsi32_si128(init_crc); - xmm_crc_part = _mm_setzero_si128(); - int32_t first = init_crc != 0; - - /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31 - * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to - * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which - * by definition can be up to 15 bytes + one full vector load. 
*/ - assert(len >= 31 || first == 0); - crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - if (len < 16) { - goto partial_nocpy; - } - - algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF; - if (algn_diff) { - if (algn_diff >= 4 || init_crc == 0) { - xmm_crc_part = _mm_loadu_si128((__m128i *)src); - - src += algn_diff; - len -= algn_diff; - - XOR_INITIAL(xmm_crc_part); - partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); - } else { - xmm_t0 = _mm_loadu_si128((__m128i*)src); - xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1); - XOR_INITIAL(xmm_t0); - fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); - partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); - - src += (algn_diff + 16); - len -= (algn_diff + 16); - } - - xmm_crc_part = _mm_setzero_si128(); - } - -#ifdef X86_VPCLMULQDQ_CRC - if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) { - size_t n = fold_16_vpclmulqdq_nocp(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len, - xmm_initial, first); - first = 0; - - len -= n; - src += n; - } -#endif - - while (len >= 64) { - crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3); - XOR_INITIAL(xmm_t0); - fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); - xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); - xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); - - src += 64; - len -= 64; - } - - /* - * len = num bytes left - 64 - */ - if (len >= 48) { - xmm_t0 = _mm_load_si128((__m128i *)src); - xmm_t1 = _mm_load_si128((__m128i *)src + 1); - xmm_t2 = _mm_load_si128((__m128i *)src + 2); - XOR_INITIAL(xmm_t0); - - fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); - xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); - len -= 48; - src += 48; - } else if (len >= 32) { - xmm_t0 = _mm_load_si128((__m128i *)src); - xmm_t1 = _mm_load_si128((__m128i *)src + 1); - XOR_INITIAL(xmm_t0); - - fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); - - len -= 32; - src += 32; - } else if (len >= 16) { - xmm_t0 = _mm_load_si128((__m128i *)src); - XOR_INITIAL(xmm_t0); - - fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - - xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); - - len -= 16; - src += 16; - } - -partial_nocpy: - if (len) { - memcpy(&xmm_crc_part, src, len); - partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); - } - - crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3); -} +#include "crc32_fold_pclmulqdq_tpl.h" +#define COPY +#include "crc32_fold_pclmulqdq_tpl.h" static const unsigned ALIGNED_(16) crc_k[] = { 0xccaa009e, 0x00000000, /* rk1 */ @@ -528,7 +276,7 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; -Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) { +Z_INTERNAL uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc) { const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask); const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2); __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; @@ -600,9 +348,8 @@ uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) return crc32_braid(crc32, buf, len); 
crc32_fold ALIGNED_(16) crc_state; - crc32_fold_reset_pclmulqdq(&crc_state); + crc32_fold_pclmulqdq_reset(&crc_state); crc32_fold_pclmulqdq(&crc_state, buf, len, crc32); - return crc32_fold_final_pclmulqdq(&crc_state); + return crc32_fold_pclmulqdq_final(&crc_state); } - #endif diff --git a/arch/x86/crc32_fold_pclmulqdq_tpl.h b/arch/x86/crc32_fold_pclmulqdq_tpl.h new file mode 100644 index 000000000..12681b802 --- /dev/null +++ b/arch/x86/crc32_fold_pclmulqdq_tpl.h @@ -0,0 +1,189 @@ +/* + * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ + * instruction. + * + * A white paper describing this algorithm can be found at: + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + * + * Copyright (C) 2013 Intel Corporation. All rights reserved. + * Copyright (C) 2016 Marian Beermann (support for initial value) + * Authors: + * Wajdi Feghali + * Jim Guilford + * Vinodh Gopal + * Erdinc Ozturk + * Jim Kukunas + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_PCLMULQDQ_CRC + +#ifdef COPY +Z_INTERNAL void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { +#else +Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) { +#endif + unsigned long algn_diff; + __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; + __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; + __m128i xmm_crc_part = _mm_setzero_si128(); +#ifdef COPY + char ALIGNED_(16) partial_buf[16] = { 0 }; +#else + __m128i xmm_initial = _mm_cvtsi32_si128(init_crc); + int32_t first = init_crc != 0; + + /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31 + * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to + * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which + * by definition can be up to 15 bytes + one full vector load. 
*/ + assert(len >= 31 || first == 0); +#endif + crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + if (len < 16) { +#ifdef COPY + if (len == 0) + return; + + memcpy(partial_buf, src, len); + xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf); + memcpy(dst, partial_buf, len); +#endif + goto partial; + } + + algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF; + if (algn_diff) { + xmm_crc_part = _mm_loadu_si128((__m128i *)src); +#ifdef COPY + _mm_storeu_si128((__m128i *)dst, xmm_crc_part); + dst += algn_diff; +#else + XOR_INITIAL(xmm_crc_part); + + if (algn_diff < 4 && init_crc != 0) { + xmm_t0 = xmm_crc_part; + xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1); + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + src += 16; + len -= 16; + } +#endif + + partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); + + src += algn_diff; + len -= algn_diff; + } + +#ifdef X86_VPCLMULQDQ_CRC + if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) { +#ifdef COPY + size_t n = fold_16_vpclmulqdq_copy(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len); + dst += n; +#else + size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len, + xmm_initial, first); + first = 0; +#endif + len -= n; + src += n; + } +#endif + + while (len >= 64) { + len -= 64; + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + xmm_t3 = _mm_load_si128((__m128i *)src + 3); + src += 64; + + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); +#ifdef COPY + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + dst += 64; +#else + XOR_INITIAL(xmm_t0); +#endif + + xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); + } + + /* + * len = num bytes left - 64 + */ + if (len >= 48) { + len -= 48; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + src += 48; +#ifdef COPY + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); + dst += 48; +#else + XOR_INITIAL(xmm_t0); +#endif + fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); + } else if (len >= 32) { + len -= 32; + + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + src += 32; +#ifdef COPY + _mm_storeu_si128((__m128i *)dst, xmm_t0); + _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); + dst += 32; +#else + XOR_INITIAL(xmm_t0); +#endif + fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); + } else if (len >= 16) { + len -= 16; + xmm_t0 = _mm_load_si128((__m128i *)src); + src += 16; +#ifdef COPY + _mm_storeu_si128((__m128i *)dst, xmm_t0); + dst += 16; +#else + XOR_INITIAL(xmm_t0); +#endif + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + } + +partial: + if (len) { + memcpy(&xmm_crc_part, src, 
len); +#ifdef COPY + _mm_storeu_si128((__m128i *)partial_buf, xmm_crc_part); + memcpy(dst, partial_buf, len); +#endif + partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); + } + + crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3); +} +#endif diff --git a/arch/x86/crc32_fold_vpclmulqdq.c b/arch/x86/crc32_fold_vpclmulqdq.c index dfcdc8a8c..d9c43be74 100644 --- a/arch/x86/crc32_fold_vpclmulqdq.c +++ b/arch/x86/crc32_fold_vpclmulqdq.c @@ -9,198 +9,11 @@ #include -#define ONCE(op) if (first) { \ - first = 0; \ - (op); \ -} -#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial)) +#define ONCE(op) if (first) { first = 0; op; } +#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial)) -size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, - __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) { - size_t len_tmp = len; - __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; - __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; - __m512i z0, z1, z2, z3; - const __m512i zmm_fold4 = _mm512_set4_epi32( - 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); - const __m512i zmm_fold16 = _mm512_set4_epi32( - 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); +#include "crc32_fold_vpclmulqdq_tpl.h" +#define COPY +#include "crc32_fold_vpclmulqdq_tpl.h" - // zmm register init - zmm_crc0 = _mm512_setzero_si512(); - zmm_t0 = _mm512_loadu_si512((__m512i *)src); - zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); - zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); - zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); - - /* already have intermediate CRC in xmm registers - * fold4 with 4 xmm_crc to get zmm_crc0 - */ - zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0); - zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1); - zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2); - zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3); - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); - - _mm512_storeu_si512((__m512i *)dst, zmm_t0); - _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1); - _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2); - _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3); - len -= 256; - src += 256; - dst += 256; - - // fold-16 loops - while (len >= 256) { - zmm_t0 = _mm512_loadu_si512((__m512i *)src); - zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); - zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); - zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); - - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01); - z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01); - z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01); - z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01); - - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10); - zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10); - zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10); - zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10); - - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1); - zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2); - zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3); - - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); - zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1); - zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2); - zmm_crc3 = 
_mm512_xor_si512(zmm_crc3, zmm_t3); - - _mm512_storeu_si512((__m512i *)dst, zmm_t0); - _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1); - _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2); - _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3); - len -= 256; - src += 256; - dst += 256; - } - // zmm_crc[0,1,2,3] -> zmm_crc0 - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1); - - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2); - - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3); - - // zmm_crc0 -> xmm_crc[0, 1, 2, 3] - *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0); - *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1); - *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2); - *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3); - - return (len_tmp - len); // return n bytes processed -} - -size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1, - __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, - __m128i init_crc, int32_t first) { - size_t len_tmp = len; - __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; - __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; - __m512i z0, z1, z2, z3; - __m512i zmm_initial = _mm512_zextsi128_si512(init_crc); - const __m512i zmm_fold4 = _mm512_set4_epi32( - 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); - const __m512i zmm_fold16 = _mm512_set4_epi32( - 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); - - // zmm register init - zmm_crc0 = _mm512_setzero_si512(); - zmm_t0 = _mm512_loadu_si512((__m512i *)src); - XOR_INITIAL(zmm_t0); - zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); - zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); - zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); - - /* already have intermediate CRC in xmm registers - * fold4 with 4 xmm_crc to get zmm_crc0 - */ - zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0); - zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1); - zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2); - zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3); - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); - - len -= 256; - src += 256; - - // fold-16 loops - while (len >= 256) { - zmm_t0 = _mm512_loadu_si512((__m512i *)src); - zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); - zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); - zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); - - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01); - z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01); - z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01); - z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01); - - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10); - zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10); - zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10); - zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10); - - zmm_crc0 = _mm512_xor_si512(z0, 
zmm_crc0); - zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1); - zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2); - zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3); - - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); - zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1); - zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2); - zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3); - - len -= 256; - src += 256; - } - // zmm_crc[0,1,2,3] -> zmm_crc0 - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1); - - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2); - - z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); - zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); - zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); - zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3); - - // zmm_crc0 -> xmm_crc[0, 1, 2, 3] - *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0); - *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1); - *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2); - *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3); - - return (len_tmp - len); // return n bytes processed -} #endif diff --git a/arch/x86/crc32_fold_vpclmulqdq_tpl.h b/arch/x86/crc32_fold_vpclmulqdq_tpl.h new file mode 100644 index 000000000..89378aef9 --- /dev/null +++ b/arch/x86/crc32_fold_vpclmulqdq_tpl.h @@ -0,0 +1,116 @@ +/* crc32_fold_vpclmulqdq_tpl.h -- VPCMULQDQ-based CRC32 folding template. + * Copyright Wangyang Guo (wangyang.guo@intel.com) + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef COPY +size_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1, + __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) { +#else +size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, + __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, + __m128i init_crc, int32_t first) { + __m512i zmm_initial = _mm512_zextsi128_si512(init_crc); +#endif + __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; + __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; + __m512i z0, z1, z2, z3; + size_t len_tmp = len; + const __m512i zmm_fold4 = _mm512_set4_epi32( + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + const __m512i zmm_fold16 = _mm512_set4_epi32( + 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); + + // zmm register init + zmm_crc0 = _mm512_setzero_si512(); + zmm_t0 = _mm512_loadu_si512((__m512i *)src); +#ifndef COPY + XOR_INITIAL(zmm_t0); +#endif + zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); + + /* already have intermediate CRC in xmm registers + * fold4 with 4 xmm_crc to get zmm_crc0 + */ + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3); + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); + +#ifdef COPY + _mm512_storeu_si512((__m512i *)dst, zmm_t0); + _mm512_storeu_si512((__m512i *)dst + 1, 
zmm_crc1); + _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2); + _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3); + dst += 256; +#endif + len -= 256; + src += 256; + + // fold-16 loops + while (len >= 256) { + zmm_t0 = _mm512_loadu_si512((__m512i *)src); + zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01); + z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01); + z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01); + z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01); + + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10); + zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10); + zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10); + zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10); + + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1); + zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2); + zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3); + + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); + zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1); + zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2); + zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3); + +#ifdef COPY + _mm512_storeu_si512((__m512i *)dst, zmm_t0); + _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1); + _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2); + _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3); + dst += 256; +#endif + len -= 256; + src += 256; + } + // zmm_crc[0,1,2,3] -> zmm_crc0 + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3); + + // zmm_crc0 -> xmm_crc[0, 1, 2, 3] + *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0); + *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1); + *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2); + *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3); + + return (len_tmp - len); // return n bytes processed +} diff --git a/cpu_features.h b/cpu_features.h index ca1465d5d..fb9ffe7a3 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -64,10 +64,10 @@ extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, cons /* CRC32 folding */ #ifdef X86_PCLMULQDQ_CRC -extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc); -extern void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc); +extern void crc32_fold_pclmulqdq_copy(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); extern void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); -extern uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc); +extern uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc); extern uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len); #endif diff --git a/crc32_fold.h b/crc32_fold.h 
index ecfad454e..0d2ff6696 100644 --- a/crc32_fold.h +++ b/crc32_fold.h @@ -5,8 +5,8 @@ #ifndef CRC32_FOLD_H_ #define CRC32_FOLD_H_ -#define CRC32_FOLD_BUFFER_SIZE (16 * 5) -/* sizeof(__m128i) * (4 folds & 1 partial fold) */ +#define CRC32_FOLD_BUFFER_SIZE (16 * 4) +/* sizeof(__m128i) * (4 folds) */ typedef struct crc32_fold_s { uint8_t fold[CRC32_FOLD_BUFFER_SIZE]; diff --git a/functable.c b/functable.c index dbb8256b7..ca95df55d 100644 --- a/functable.c +++ b/functable.c @@ -228,7 +228,7 @@ Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) { cpu_check_features(); #ifdef X86_PCLMULQDQ_CRC if (x86_cpu_has_pclmulqdq) - functable.crc32_fold_reset = &crc32_fold_reset_pclmulqdq; + functable.crc32_fold_reset = &crc32_fold_pclmulqdq_reset; #endif return functable.crc32_fold_reset(crc); } @@ -238,7 +238,7 @@ Z_INTERNAL void crc32_fold_copy_stub(crc32_fold *crc, uint8_t *dst, const uint8_ cpu_check_features(); #ifdef X86_PCLMULQDQ_CRC if (x86_cpu_has_pclmulqdq) - functable.crc32_fold_copy = &crc32_fold_copy_pclmulqdq; + functable.crc32_fold_copy = &crc32_fold_pclmulqdq_copy; #endif functable.crc32_fold_copy(crc, dst, src, len); } @@ -258,7 +258,7 @@ Z_INTERNAL uint32_t crc32_fold_final_stub(crc32_fold *crc) { cpu_check_features(); #ifdef X86_PCLMULQDQ_CRC if (x86_cpu_has_pclmulqdq) - functable.crc32_fold_final = &crc32_fold_final_pclmulqdq; + functable.crc32_fold_final = &crc32_fold_pclmulqdq_final; #endif return functable.crc32_fold_final(crc); }
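
The refactor above hinges on a single-source template pattern: crc32_fold_pclmulqdq_tpl.h and crc32_fold_vpclmulqdq_tpl.h are each included twice, and the COPY macro decides at preprocessing time whether the emitted function also copies the input to dst, so the folding logic is written only once instead of being duplicated across the copy and non-copy variants. Below is a minimal sketch of the same pattern, using hypothetical names (checksum_tpl.h, checksum_update, checksum_update_copy) rather than the zlib-ng identifiers; it is an illustration of the technique, not zlib-ng code.

    /* checksum_tpl.h -- hypothetical template header, meant to be included twice
     * (no include guard on purpose). COPY selects which variant gets emitted. */
    #ifdef COPY
    static uint32_t checksum_update_copy(uint32_t sum, uint8_t *dst, const uint8_t *src, size_t len) {
    #else
    static uint32_t checksum_update(uint32_t sum, const uint8_t *src, size_t len) {
    #endif
        size_t i;
        for (i = 0; i < len; i++) {
            sum += src[i];       /* shared per-byte work appears exactly once */
    #ifdef COPY
            dst[i] = src[i];     /* only the _copy variant also writes to dst */
    #endif
        }
        return sum;
    }

    /* checksum.c -- one translation unit emits both variants from the template. */
    #include <stddef.h>
    #include <stdint.h>
    #include "checksum_tpl.h"    /* emits checksum_update() */
    #define COPY
    #include "checksum_tpl.h"    /* emits checksum_update_copy() */

    int main(void) {
        uint8_t in[4] = {1, 2, 3, 4}, out[4];
        uint32_t s1 = checksum_update(0, in, sizeof(in));
        uint32_t s2 = checksum_update_copy(0, out, in, sizeof(in));
        return (s1 == s2) ? 0 : 1;   /* both variants compute the same checksum */
    }

Compiling the sketch yields both functions from one body, which mirrors how crc32_fold_pclmulqdq/crc32_fold_pclmulqdq_copy and fold_16_vpclmulqdq/fold_16_vpclmulqdq_copy are now generated, with #ifdef COPY guarding only the dst stores. Independently of the templating, the partial block xmm_crc_part no longer needs to be saved between calls: it is always folded into xmm_crc0..xmm_crc3 by partial_fold before crc32_fold_save runs, which is why crc32_fold_save_partial is removed and CRC32_FOLD_BUFFER_SIZE shrinks from 16 * 5 to 16 * 4.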