From: Adam Stylinski Date: Sat, 12 Mar 2022 21:09:02 +0000 (-0500) Subject: Leverage inline CRC + copy X-Git-Tag: 2.1.0-beta1~295 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8550a90de4dcb8589a7d48fe308c4c45bba5a466;p=thirdparty%2Fzlib-ng.git Leverage inline CRC + copy This brings back a bit of the performance that may have been sacrificed by reverting the reorganized inflate window. Doing a copy at the same time as a CRC is basically free. --- diff --git a/arch/x86/crc32_fold_pclmulqdq.c b/arch/x86/crc32_fold_pclmulqdq.c index 1a7e77f0e..9072a47e7 100644 --- a/arch/x86/crc32_fold_pclmulqdq.c +++ b/arch/x86/crc32_fold_pclmulqdq.c @@ -398,18 +398,23 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part; __m128i xmm_initial = _mm_cvtsi32_si128(init_crc); - int32_t first = 1; + xmm_crc_part = _mm_setzero_si128(); + int32_t first = init_crc != 0; /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31 * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which * by definition can be up to 15 bytes + one full vector load. 
*/ - assert(len >= 31); + assert(len >= 31 || first == 0); crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + if (len < 16) { + goto partial_nocpy; + } + algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF; if (algn_diff) { - if (algn_diff >= 4) { + if (algn_diff >= 4 || init_crc == 0) { xmm_crc_part = _mm_loadu_si128((__m128i *)src); src += algn_diff; @@ -428,9 +433,9 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t src += (algn_diff + 16); len -= (algn_diff + 16); } - } - xmm_crc_part = _mm_setzero_si128(); + xmm_crc_part = _mm_setzero_si128(); + } #ifdef X86_VPCLMULQDQ_CRC if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) { @@ -497,6 +502,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t src += 16; } +partial_nocpy: if (len) { memcpy(&xmm_crc_part, src, len); partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); diff --git a/crc32_fold.c b/crc32_fold.c index 1800ed0a0..b3072e374 100644 --- a/crc32_fold.c +++ b/crc32_fold.c @@ -17,6 +17,15 @@ Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t * memcpy(dst, src, len); } +Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) { + /* Note: while this is basically the same thing as the vanilla CRC function, we still need + * a functable entry for it so that we can generically dispatch to this function with the + * same arguments for the versions that _do_ do a folding CRC but we don't want a copy. 
The + * init_crc is an unused argument in this context */ + Z_UNUSED(init_crc); + crc->value = functable.crc32(crc->value, src, len); +} + Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc) { return crc->value; } diff --git a/crc32_fold.h b/crc32_fold.h index 8bd0b6bc2..ecfad454e 100644 --- a/crc32_fold.h +++ b/crc32_fold.h @@ -15,6 +15,7 @@ typedef struct crc32_fold_s { Z_INTERNAL uint32_t crc32_fold_reset_c(crc32_fold *crc); Z_INTERNAL void crc32_fold_copy_c(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); +Z_INTERNAL void crc32_fold_c(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); Z_INTERNAL uint32_t crc32_fold_final_c(crc32_fold *crc); #endif diff --git a/functable.c b/functable.c index bd1e4cea7..3e23c54e5 100644 --- a/functable.c +++ b/functable.c @@ -206,6 +206,16 @@ Z_INTERNAL void crc32_fold_copy_stub(crc32_fold *crc, uint8_t *dst, const uint8_ functable.crc32_fold_copy(crc, dst, src, len); } +Z_INTERNAL void crc32_fold_stub(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) { + functable.crc32_fold = &crc32_fold_c; + cpu_check_features(); +#ifdef X86_PCLMULQDQ_CRC + if (x86_cpu_has_pclmulqdq) + functable.crc32_fold = &crc32_fold_pclmulqdq; +#endif + functable.crc32_fold(crc, src, len, init_crc); +} + Z_INTERNAL uint32_t crc32_fold_final_stub(crc32_fold *crc) { functable.crc32_fold_final = &crc32_fold_final_c; cpu_check_features(); @@ -402,6 +412,7 @@ Z_INTERNAL Z_TLS struct functable_s functable = { crc32_stub, crc32_fold_reset_stub, crc32_fold_copy_stub, + crc32_fold_stub, crc32_fold_final_stub, compare256_stub, chunksize_stub, diff --git a/functable.h b/functable.h index a106c93aa..61dde2105 100644 --- a/functable.h +++ b/functable.h @@ -14,6 +14,7 @@ struct functable_s { uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len); uint32_t (* crc32_fold_reset) (crc32_fold *crc); void (* crc32_fold_copy) (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); + void (* 
crc32_fold) (crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc); uint32_t (* crc32_fold_final) (crc32_fold *crc); uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1); uint32_t (* chunksize) (void); diff --git a/inflate.c b/inflate.c index 11f7b5b08..1a0914859 100644 --- a/inflate.c +++ b/inflate.c @@ -15,9 +15,35 @@ /* function prototypes */ static int inflateStateCheck(PREFIX3(stream) *strm); -static int updatewindow(PREFIX3(stream) *strm, const unsigned char *end, uint32_t copy); +static int updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t copy); static uint32_t syncsearch(uint32_t *have, const unsigned char *buf, uint32_t len); +static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, + const uint8_t *src, uint32_t copy) { + struct inflate_state *state = (struct inflate_state*)strm->state; +#ifdef GUNZIP + if (state->flags) { + functable.crc32_fold_copy(&state->crc_fold, dst, src, copy); + } else +#endif + { + strm->adler = state->check = functable.adler32(state->check, src, copy); + memcpy(dst, src, copy); + } +} + +static inline void inf_chksum(PREFIX3(stream) *strm, const uint8_t *src, uint32_t len) { + struct inflate_state *state = (struct inflate_state*)strm->state; +#ifdef GUNZIP + if (state->flags) { + functable.crc32_fold(&state->crc_fold, src, len, 0); + } else +#endif + { + strm->adler = state->check = functable.adler32(state->check, src, len); + } +} + static int inflateStateCheck(PREFIX3(stream) *strm) { struct inflate_state *state; if (strm == NULL || strm->zalloc == NULL || strm->zfree == NULL) @@ -216,17 +242,39 @@ static int32_t updatewindow(PREFIX3(stream) *strm, const uint8_t *end, uint32_t /* copy state->wsize or less output bytes into the circular window */ if (copy >= state->wsize) { - memcpy(state->window, end - state->wsize, state->wsize); + /* Only do this if the caller specifies to checksum bytes AND the platform requires + * it (s/390 being the primary exception to this). 
Also, for now, do the adler checksums + * if not a gzip based header. The inline adler checksums will come in the near future, + * possibly the next commit */ + if (INFLATE_NEED_CHECKSUM(strm) && (state->wrap & 4)) { + /* We have to split the checksum over non-copied and copied bytes */ + if (copy > state->wsize) + inf_chksum(strm, end - copy, copy - state->wsize); + inf_chksum_cpy(strm, state->window, end - state->wsize, state->wsize); + } else { + memcpy(state->window, end - state->wsize, state->wsize); + } + state->wnext = 0; state->whave = state->wsize; } else { dist = state->wsize - state->wnext; - if (dist > copy) - dist = copy; - memcpy(state->window + state->wnext, end - copy, dist); + /* Only do this if the caller specifies to checksum bytes AND the platform requires + * it. We need to maintain the correct order here for the checksum */ + dist = MIN(dist, copy); + if (INFLATE_NEED_CHECKSUM(strm) && (state->wrap & 4)) { + inf_chksum_cpy(strm, state->window + state->wnext, end - copy, dist); + } else { + memcpy(state->window + state->wnext, end - copy, dist); + } copy -= dist; if (copy) { - memcpy(state->window, end - copy, copy); + if (INFLATE_NEED_CHECKSUM(strm) && (state->wrap & 4)) { + inf_chksum_cpy(strm, state->window, end - copy, copy); + } else { + memcpy(state->window, end - copy, copy); + } + state->wnext = copy; state->whave = state->wsize; } else { @@ -480,8 +528,9 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { len + copy > state->head->extra_max ? 
state->head->extra_max - len : copy); } - if ((state->flags & 0x0200) && (state->wrap & 4)) + if ((state->flags & 0x0200) && (state->wrap & 4)) { state->check = PREFIX(crc32)(state->check, next, copy); + } have -= copy; next += copy; state->length -= copy; @@ -547,7 +596,9 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { state->head->hcrc = (int)((state->flags >> 9) & 1); state->head->done = 1; } - strm->adler = state->check = CRC32_INITIAL_VALUE; + /* compute crc32 checksum if not in raw mode */ + if ((state->wrap & 4) && state->flags) + strm->adler = state->check = functable.crc32_fold_reset(&state->crc_fold); state->mode = TYPE; break; #endif @@ -946,8 +997,17 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { out -= left; strm->total_out += out; state->total += out; - if (INFLATE_NEED_CHECKSUM(strm) && (state->wrap & 4) && out) - strm->adler = state->check = UPDATE(state->check, put - out, out); + + /* compute crc32 checksum if not in raw mode */ + if (INFLATE_NEED_CHECKSUM(strm) && state->wrap & 4) { + if (out) { + inf_chksum(strm, put - out, out); + } +#ifdef GUNZIP + if (state->flags) + strm->adler = state->check = functable.crc32_fold_final(&state->crc_fold); +#endif + } out = left; if ((state->wrap & 4) && ( #ifdef GUNZIP @@ -1015,8 +1075,7 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { strm->total_in += in; strm->total_out += out; state->total += out; - if (INFLATE_NEED_CHECKSUM(strm) && (state->wrap & 4) && out) - strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); + strm->data_type = (int)state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0) + (state->mode == LEN_ || state->mode == COPY_ ? 
256 : 0); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) diff --git a/inflate.h b/inflate.h index 67041348f..3f57e7840 100644 --- a/inflate.h +++ b/inflate.h @@ -11,6 +11,8 @@ #ifndef INFLATE_H_ #define INFLATE_H_ +#include "crc32_fold.h" + /* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate(). NO_GZIP would be used to avoid linking in the crc code when it is not needed. For shared libraries, gzip decoding should be left enabled. */ @@ -101,6 +103,8 @@ struct inflate_state { uint32_t wnext; /* window write index */ unsigned char *window; /* allocated sliding window, if needed */ + struct crc32_fold_s ALIGNED_(16) crc_fold; + /* bit accumulator */ uint32_t hold; /* input bit accumulator */ unsigned bits; /* number of bits in "in" */