From: Nathan Moinvaziri
Date: Sat, 3 Jul 2021 19:40:55 +0000 (-0700)
Subject: Use static inline functions for crc32 folding load/save.
X-Git-Tag: 2.1.0-beta1~518
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e128537d4db2a4bec431d75b92b7b8e01fbf8bd7;p=thirdparty%2Fzlib-ng.git

Use static inline functions for crc32 folding load/save.
---

diff --git a/arch/x86/crc32_fold_pclmulqdq.c b/arch/x86/crc32_fold_pclmulqdq.c
index 30a24ea86..9065298f5 100644
--- a/arch/x86/crc32_fold_pclmulqdq.c
+++ b/arch/x86/crc32_fold_pclmulqdq.c
@@ -25,16 +25,6 @@
 
 #include "../../crc32_fold.h"
 
-Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
-    /* CRC_SAVE */
-    _mm_storeu_si128((__m128i *)crc->fold + 0, _mm_cvtsi32_si128(0x9db42487));
-    _mm_storeu_si128((__m128i *)crc->fold + 1, _mm_setzero_si128());
-    _mm_storeu_si128((__m128i *)crc->fold + 2, _mm_setzero_si128());
-    _mm_storeu_si128((__m128i *)crc->fold + 3, _mm_setzero_si128());
-
-    return 0;
-}
-
 static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
     const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
 
@@ -227,24 +217,45 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
     *xmm_crc3 = _mm_castps_si128(ps_res);
 }
 
+static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
+    *fold0 = _mm_load_si128(fold + 0);
+    *fold1 = _mm_load_si128(fold + 1);
+    *fold2 = _mm_load_si128(fold + 2);
+    *fold3 = _mm_load_si128(fold + 3);
+}
+
+static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1, __m128i fold2, __m128i fold3) {
+    _mm_storeu_si128(fold + 0, fold0);
+    _mm_storeu_si128(fold + 1, fold1);
+    _mm_storeu_si128(fold + 2, fold2);
+    _mm_storeu_si128(fold + 3, fold3);
+}
+
+static inline void crc32_fold_save_partial(__m128i *fold, __m128i foldp) {
+    _mm_store_si128(fold + 4, foldp);
+}
+
+Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
+    __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+    __m128i xmm_zero = _mm_setzero_si128();
+    crc32_fold_save((__m128i *)&crc->fold, xmm_crc0, xmm_zero, xmm_zero, xmm_zero);
+    return 0;
+}
+
 Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
     unsigned long algn_diff;
     __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part;
 
     char ALIGNED_(16) partial_buf[16] = { 0 };
-    /* CRC_LOAD */
-    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
-    __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
-    __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
-    __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
-    __m128i xmm_crc_part;
+    crc32_fold_load((__m128i *)&crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
     if (len < 16) {
         if (len == 0)
             return;
 
         memcpy(partial_buf, src, len);
-        xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
+        xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
         memcpy(dst, partial_buf, len);
         goto partial;
     }
@@ -264,19 +275,11 @@ Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const u
     }
 
     while (len >= 64) {
-        /* CRC_LOAD */
-        xmm_t0 = _mm_load_si128((__m128i *)src);
-        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
-        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
-        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+        crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
 
         fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
-        /* CRC_SAVE */
-        _mm_storeu_si128((__m128i *)dst, xmm_t0);
-        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
-        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
-        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+        crc32_fold_save((__m128i *)dst, xmm_t0, xmm_t1, xmm_t2, xmm_t3);
 
         xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
         xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
@@ -356,12 +359,8 @@ Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const u
 partial:
     partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
 done:
-    /* CRC_SAVE */
-    _mm_storeu_si128((__m128i *)crc->fold + 0, xmm_crc0);
-    _mm_storeu_si128((__m128i *)crc->fold + 1, xmm_crc1);
-    _mm_storeu_si128((__m128i *)crc->fold + 2, xmm_crc2);
-    _mm_storeu_si128((__m128i *)crc->fold + 3, xmm_crc3);
-    _mm_storeu_si128((__m128i *)crc->fold + 4, xmm_crc_part);
+    crc32_fold_save((__m128i *)&crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3);
+    crc32_fold_save_partial((__m128i *)&crc->fold, xmm_crc_part);
 }
 
 static const unsigned ALIGNED_(16) crc_k[] = {
@@ -384,14 +383,10 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
 Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) {
     const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
     const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
-
+    __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
     __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
 
-    /* CRC_LOAD */
-    __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
-    __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
-    __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
-    __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
+    crc32_fold_load((__m128i *)&crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
     /*
      * k1
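
Editor's note: for readers skimming the diff, below is a minimal standalone sketch, not part of the patch, of how the new crc32_fold_load()/crc32_fold_save() helpers round-trip the four 128-bit fold registers. The fold[] array is a hypothetical stand-in for crc->fold; as a __m128i array it is 16-byte aligned, which the aligned _mm_load_si128() in the load path requires.

/* Build with any SSE2-capable compiler, e.g. cc -msse2 sketch.c */
#include <emmintrin.h>
#include <stdio.h>

static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1,
                                   __m128i *fold2, __m128i *fold3) {
    /* Aligned loads: the backing storage must be 16-byte aligned. */
    *fold0 = _mm_load_si128(fold + 0);
    *fold1 = _mm_load_si128(fold + 1);
    *fold2 = _mm_load_si128(fold + 2);
    *fold3 = _mm_load_si128(fold + 3);
}

static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1,
                                   __m128i fold2, __m128i fold3) {
    /* Unaligned stores, as in the patch: safe for any alignment. */
    _mm_storeu_si128(fold + 0, fold0);
    _mm_storeu_si128(fold + 1, fold1);
    _mm_storeu_si128(fold + 2, fold2);
    _mm_storeu_si128(fold + 3, fold3);
}

int main(void) {
    __m128i fold[4];              /* hypothetical stand-in for crc->fold */
    __m128i c0, c1, c2, c3;

    /* Save the same initial state crc32_fold_reset_pclmulqdq() writes. */
    crc32_fold_save(fold, _mm_cvtsi32_si128(0x9db42487), _mm_setzero_si128(),
                    _mm_setzero_si128(), _mm_setzero_si128());
    crc32_fold_load(fold, &c0, &c1, &c2, &c3);

    /* Prints 9db42487: the state survived the save/load round trip. */
    printf("%08x\n", (unsigned)_mm_cvtsi128_si32(c0));
    return 0;
}

One design point the sketch preserves from the patch: the helpers are asymmetric, using aligned _mm_load_si128() on the load side but unaligned _mm_storeu_si128() on the save side, so saves work for any alignment of crc->fold while loads assume 16-byte alignment.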