From: Adam Stylinski
Date: Sat, 27 Jan 2024 23:29:41 +0000 (+0100)
Subject: Add support for handling alignment correction for input buffers down to 16 bytes...
X-Git-Tag: 2.2.0~103
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=907604722a34c758ec8aa4fa9dcea41e778e8484;p=thirdparty%2Fzlib-ng.git

Add support for handling alignment correction for input buffers down to
16 bytes in crc32_fold_[v]pclmulqdq
---

diff --git a/arch/x86/crc32_fold_pclmulqdq_tpl.h b/arch/x86/crc32_fold_pclmulqdq_tpl.h
index 3e799283..1ffe201d 100644
--- a/arch/x86/crc32_fold_pclmulqdq_tpl.h
+++ b/arch/x86/crc32_fold_pclmulqdq_tpl.h
@@ -26,27 +26,26 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
     __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
     __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
     __m128i xmm_crc_part = _mm_setzero_si128();
-#ifdef COPY
     char ALIGNED_(16) partial_buf[16] = { 0 };
-#else
+#ifndef COPY
     __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
     int32_t first = init_crc != 0;
 
-    /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
-     * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
-     * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
-     * by definition can be up to 15 bytes + one full vector load. */
-    assert(len >= 31 || first == 0);
+    /* The CRC functions don't call this for input < 16, as a minimum of 16 bytes of input is needed
+     * for the aligning load that occurs. If there's an initial CRC, to carry it forward through
+     * the folded CRC there must be 16 - src % 16 + 16 bytes available, which by definition can be
+     * up to 15 bytes + one full vector load. */
+    assert(len >= 16 || first == 0);
 #endif
     crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
 
     if (len < 16) {
-#ifdef COPY
         if (len == 0)
             return;
 
         memcpy(partial_buf, src, len);
         xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
+#ifdef COPY
         memcpy(dst, partial_buf, len);
 #endif
         goto partial;
@@ -63,9 +62,23 @@ Z_INTERNAL void CRC32_FOLD(crc32_fold *crc, const uint8_t *src, size_t len, uint
 
         if (algn_diff < 4 && init_crc != 0) {
             xmm_t0 = xmm_crc_part;
-            xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
-            fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
-            xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+            if (len >= 32) {
+                xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
+                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+            } else {
+                memcpy(partial_buf, src + 16, len - 16);
+                xmm_crc_part = _mm_load_si128((__m128i*)partial_buf);
+                fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+                xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+                src += 16;
+                len -= 16;
+#ifdef COPY
+                dst -= algn_diff;
+#endif
+                goto partial;
+            }
+
             src += 16;
             len -= 16;
         }
diff --git a/arch/x86/crc32_pclmulqdq_tpl.h b/arch/x86/crc32_pclmulqdq_tpl.h
index 80a35b03..3a4f6af5 100644
--- a/arch/x86/crc32_pclmulqdq_tpl.h
+++ b/arch/x86/crc32_pclmulqdq_tpl.h
@@ -365,7 +365,7 @@ static inline uint32_t crc32_small(uint32_t crc, const uint8_t *buf, size_t len)
 Z_INTERNAL uint32_t CRC32(uint32_t crc32, const uint8_t *buf, size_t len) {
     /* For lens smaller than ~12, crc32_small method is faster.
      * But there are also minimum requirements for the pclmul functions due to alignment */
-    if (len < 32)
+    if (len < 16)
         return crc32_small(crc32, buf, len);
 
     crc32_fold ALIGNED_(16) crc_state;
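
The byte accounting behind the relaxed assert can be checked in isolation.
The standalone sketch below is illustrative only (algn_diff, rem, and needed
are names local to the example, not the zlib-ng source): with an initial CRC,
the fold consumes the unaligned head of up to 15 bytes plus one full 16-byte
vector load, i.e. at most 16 - src % 16 + 16 = 31 bytes, which is why inputs
of 16..31 bytes are now legal once the second load is staged.

/* Minimal sketch, plain C: verifies the worst-case byte count quoted in the
 * new comment (16 - src % 16 + 16, at most 31 bytes). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t buf[64];
    for (size_t off = 0; off < 16; off++) {
        const uint8_t *src = buf + off;
        size_t rem = (size_t)((uintptr_t)src & 15); /* src % 16 */
        size_t algn_diff = (16 - rem) & 15;         /* bytes to next 16-byte boundary */
        size_t needed = algn_diff + 16;             /* head + one full vector load */
        assert(needed <= 31);                       /* worst case: src % 16 == 1 */
        printf("src %% 16 = %2zu: algn_diff = %2zu, worst-case bytes = %2zu\n",
               rem, algn_diff, needed);
    }
    return 0;
}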
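The new else-branch covers the 16 <= len < 32 window with a staged load. The
sketch below shows that pattern under SSE2; load_tail is a hypothetical helper,
not zlib-ng API. As with partial_buf above, copying the short tail into a
zeroed, 16-byte-aligned scratch buffer lets a full aligned vector load happen
without reading past the end of src.

/* Minimal sketch, assuming SSE2 (compile with a C11 compiler on x86). */
#include <emmintrin.h>
#include <stdalign.h>
#include <stdint.h>
#include <string.h>

static __m128i load_tail(const uint8_t *src, size_t tail_len) {
    alignas(16) char partial[16] = { 0 }; /* zero padding, as in partial_buf */
    memcpy(partial, src, tail_len);       /* tail_len < 16 bytes copied */
    return _mm_load_si128((const __m128i *)partial);
}

int main(void) {
    const uint8_t data[20] = { 1, 2, 3, 4, 5 };
    /* Mirrors the patch's short-input path: after the first 16 bytes are
     * consumed, the remaining len - 16 bytes (here 4) go through the buffer. */
    __m128i tail = load_tail(data + 16, sizeof(data) - 16);
    (void)tail;
    return 0;
}

This is also why the entry check in crc32_pclmulqdq_tpl.h can drop from
len < 32 to len < 16: the 16..31-byte window no longer risks a vector load
past the end of the input.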