From: Nathan Moinvaziri Date: Mon, 23 Aug 2021 19:21:40 +0000 (-0700) Subject: Add back original version of inflate_fast for use with inflateBack. X-Git-Tag: 2.1.0-beta1~491 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2d2dde43b11c40cb58a339ff4a8425bca0091c31;p=thirdparty%2Fzlib-ng.git Add back original version of inflate_fast for use with inflateBack. --- diff --git a/infback.c b/infback.c index 8c43a7cef..cf0549ae1 100644 --- a/infback.c +++ b/infback.c @@ -338,6 +338,17 @@ int32_t Z_EXPORT PREFIX(inflateBack)(PREFIX3(stream) *strm, in_func in, void *in state->mode = LEN; case LEN: + /* use inflate_fast() if we have enough input and output */ + if (have >= INFLATE_FAST_MIN_HAVE && + left >= INFLATE_FAST_MIN_LEFT) { + RESTORE_BACK(); + if (state->whave < state->wsize) + state->whave = state->wsize - left; + zng_inflate_fast_back(strm, state->wsize); + LOAD_BACK(); + break; + } + /* get a literal, length, or end-of-block code */ for (;;) { here = state->lencode[BITS(state->lenbits)]; diff --git a/inffast.c b/inffast.c index 57031aeca..899534f0d 100644 --- a/inffast.c +++ b/inffast.c @@ -250,6 +250,258 @@ void Z_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm) { state->bits = bits; return; } +void Z_INTERNAL zng_inflate_fast_back(PREFIX3(stream) *strm, unsigned long start) { + /* start: inflate()'s starting value for strm->avail_out */ + struct inflate_state *state; + z_const unsigned char *in; /* local strm->next_in */ + const unsigned char *last; /* have enough input while in < last */ + unsigned char *out; /* local strm->next_out */ + unsigned char *beg; /* inflate()'s initial strm->next_out */ + unsigned char *end; /* while out < end, enough space available */ + unsigned char *safe; /* can use chunkcopy provided out < safe */ +#ifdef INFLATE_STRICT + unsigned dmax; /* maximum distance from zlib header */ +#endif + unsigned wsize; /* window size or zero if not using window */ + unsigned whave; /* valid bytes in the window */ + unsigned 
wnext; /* window write index */ + unsigned char *window; /* allocated sliding window, if wsize != 0 */ + + /* hold is a local copy of strm->hold. By default, hold satisfies the same + invariants that strm->hold does, namely that (hold >> bits) == 0. This + invariant is kept by loading bits into hold one byte at a time, like: + hold |= next_byte_of_input << bits; in++; bits += 8; + If we need to ensure that bits >= 15 then this code snippet is simply + repeated. Over one iteration of the outermost do/while loop, this + happens up to six times (48 bits of input), as described in the NOTES + above. + However, on some little endian architectures, it can be significantly + faster to load 64 bits once instead of 8 bits six times: + if (bits <= 16) { + hold |= next_8_bytes_of_input << bits; in += 6; bits += 48; + } + Unlike the simpler one byte load, shifting the next_8_bytes_of_input + by bits will overflow and lose those high bits, up to 2 bytes' worth. + The conservative estimate is therefore that we have read only 6 bytes + (48 bits). Again, as per the NOTES above, 48 bits is sufficient for the + rest of the iteration, and we will not need to load another 8 bytes. + Inside this function, we no longer satisfy (hold >> bits) == 0, but + this is not problematic, even if that overflow does not land on an 8 bit + byte boundary. Those excess bits will eventually shift down lower as the + Huffman decoder consumes input, and when new input bits need to be loaded + into the bits variable, the same input bits will be or'ed over those + existing bits. A bitwise or is idempotent: (a | b | b) equals (a | b). + Note that we therefore write that load operation as "hold |= etc" and not + "hold += etc". + Outside that loop, at the end of the function, hold is bitwise and'ed + with (1<<bits)-1 to drop those excess bits so that, on function exit, we + keep the invariant that (state->hold >> state->bits) == 0.
+ */ + uint64_t hold; /* local strm->hold */ + unsigned bits; /* local strm->bits */ + code const *lcode; /* local strm->lencode */ + code const *dcode; /* local strm->distcode */ + unsigned lmask; /* mask for first level of length codes */ + unsigned dmask; /* mask for first level of distance codes */ + const code *here; /* retrieved table entry */ + unsigned op; /* code bits, operation, extra bits, or */ + /* window position, window bytes to copy */ + unsigned len; /* match length, unused bytes */ + unsigned dist; /* match distance */ + unsigned char *from; /* where to copy match from */ + unsigned extra_safe; /* copy chunks safely in all cases */ + + /* copy state to local variables */ + state = (struct inflate_state *)strm->state; + in = strm->next_in; + last = in + (strm->avail_in - (INFLATE_FAST_MIN_HAVE - 1)); + out = strm->next_out; + beg = out - (start - strm->avail_out); + end = out + (strm->avail_out - (INFLATE_FAST_MIN_LEFT - 1)); + safe = out + strm->avail_out; +#ifdef INFLATE_STRICT + dmax = state->dmax; +#endif + wsize = state->wsize; + whave = state->whave; + wnext = state->wnext; + window = state->window; + hold = state->hold; + bits = state->bits; + lcode = state->lencode; + dcode = state->distcode; + lmask = (1U << state->lenbits) - 1; + dmask = (1U << state->distbits) - 1; + + /* Detect if out and window point to the same memory allocation. In this instance it is + necessary to use safe chunk copy functions to prevent overwriting the window. If the + window is overwritten then future matches with far distances will fail to copy correctly. 
*/ + extra_safe = (wsize != 0 && out >= window && out + INFLATE_FAST_MIN_LEFT <= window + wsize); + + /* decode literals and length/distances until end-of-block or not enough + input data or output space */ + do { + if (bits < 15) { + hold |= load_64_bits(in, bits); + in += 6; + bits += 48; + } + here = lcode + (hold & lmask); + dolen: + DROPBITS(here->bits); + op = here->op; + if (op == 0) { /* literal */ + Tracevv((stderr, here->val >= 0x20 && here->val < 0x7f ? + "inflate: literal '%c'\n" : + "inflate: literal 0x%02x\n", here->val)); + *out++ = (unsigned char)(here->val); + } else if (op & 16) { /* length base */ + len = here->val; + op &= 15; /* number of extra bits */ + if (bits < op) { + hold |= load_64_bits(in, bits); + in += 6; + bits += 48; + } + len += BITS(op); + DROPBITS(op); + Tracevv((stderr, "inflate: length %u\n", len)); + if (bits < 15) { + hold |= load_64_bits(in, bits); + in += 6; + bits += 48; + } + here = dcode + (hold & dmask); + dodist: + DROPBITS(here->bits); + op = here->op; + if (op & 16) { /* distance base */ + dist = here->val; + op &= 15; /* number of extra bits */ + if (bits < op) { + hold |= load_64_bits(in, bits); + in += 6; + bits += 48; + } + dist += BITS(op); +#ifdef INFLATE_STRICT + if (dist > dmax) { + SET_BAD("invalid distance too far back"); + break; + } +#endif + DROPBITS(op); + Tracevv((stderr, "inflate: distance %u\n", dist)); + op = (unsigned)(out - beg); /* max distance in output */ + if (dist > op) { /* see if copy from window */ + op = dist - op; /* distance back in window */ + if (op > whave) { + if (state->sane) { + SET_BAD("invalid distance too far back"); + break; + } +#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR + if (len <= op - whave) { + do { + *out++ = 0; + } while (--len); + continue; + } + len -= op - whave; + do { + *out++ = 0; + } while (--op > whave); + if (op == 0) { + from = out - dist; + do { + *out++ = *from++; + } while (--len); + continue; + } +#endif + } + from = window; + if (wnext == 0) { /* 
very common case */ + from += wsize - op; + } else if (wnext >= op) { /* contiguous in window */ + from += wnext - op; + } else { /* wrap around window */ + op -= wnext; + from += wsize - op; + if (op < len) { /* some from end of window */ + len -= op; + out = functable.chunkcopy_safe(out, from, op, safe); + from = window; /* more from start of window */ + op = wnext; + /* This (rare) case can create a situation where + the first chunkcopy below must be checked. + */ + } + } + if (op < len) { /* still need some from output */ + len -= op; + out = functable.chunkcopy_safe(out, from, op, safe); + out = functable.chunkunroll(out, &dist, &len); + out = functable.chunkcopy_safe(out, out - dist, len, safe); + } else { + out = functable.chunkcopy_safe(out, from, len, safe); + } + } else if (extra_safe) { + /* Whole reference is in range of current output. */ + if (dist >= len || dist >= state->chunksize) + out = functable.chunkcopy_safe(out, out - dist, len, safe); + else + out = functable.chunkmemset_safe(out, dist, len, (unsigned)((safe - out) + 1)); + } else { + /* Whole reference is in range of current output. No range checks are + necessary because we start with room for at least 258 bytes of output, + so unroll and roundoff operations can write beyond `out+len` so long + as they stay within 258 bytes of `out`. 
+ */ + if (dist >= len || dist >= state->chunksize) + out = functable.chunkcopy(out, out - dist, len); + else + out = functable.chunkmemset(out, dist, len); + } + } else if ((op & 64) == 0) { /* 2nd level distance code */ + here = dcode + here->val + BITS(op); + goto dodist; + } else { + SET_BAD("invalid distance code"); + break; + } + } else if ((op & 64) == 0) { /* 2nd level length code */ + here = lcode + here->val + BITS(op); + goto dolen; + } else if (op & 32) { /* end-of-block */ + Tracevv((stderr, "inflate: end of block\n")); + state->mode = TYPE; + break; + } else { + SET_BAD("invalid literal/length code"); + break; + } + } while (in < last && out < end); + + /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ + len = bits >> 3; + in -= len; + bits -= len << 3; + hold &= (UINT64_C(1) << bits) - 1; + + /* update state and return */ + strm->next_in = in; + strm->next_out = out; + strm->avail_in = (unsigned)(in < last ? (INFLATE_FAST_MIN_HAVE - 1) + (last - in) + : (INFLATE_FAST_MIN_HAVE - 1) - (in - last)); + strm->avail_out = (unsigned)(out < end ? 
(INFLATE_FAST_MIN_LEFT - 1) + (end - out) + : (INFLATE_FAST_MIN_LEFT - 1) - (out - end)); + + Assert(bits <= 32, "Remaining bits greater than 32"); + state->hold = (uint32_t)hold; + state->bits = bits; + return; +} /* inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): diff --git a/inffast.h b/inffast.h index cf9a3e2d2..e8d7e6e01 100644 --- a/inffast.h +++ b/inffast.h @@ -11,6 +11,7 @@ */ void Z_INTERNAL zng_inflate_fast(PREFIX3(stream) *strm); +void Z_INTERNAL zng_inflate_fast_back(PREFIX3(stream) *strm, unsigned long start); #define INFLATE_FAST_MIN_HAVE 8 #define INFLATE_FAST_MIN_LEFT 258 diff --git a/inflate_p.h b/inflate_p.h index 168f270fa..b2e8d8de2 100644 --- a/inflate_p.h +++ b/inflate_p.h @@ -70,6 +70,17 @@ bits = state->bits; \ } while (0) +/* Load registers with state in inflateBack() for speed */ +#define LOAD_BACK() \ + do { \ + put = strm->next_out; \ + left = strm->avail_out; \ + next = strm->next_in; \ + have = strm->avail_in; \ + hold = state->hold; \ + bits = state->bits; \ + } while (0) + /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ @@ -81,6 +92,17 @@ state->bits = bits; \ } while (0) +/* Restore state from registers in inflateBack() */ +#define RESTORE_BACK() \ + do { \ + strm->next_out = put; \ + strm->avail_out = left; \ + strm->next_in = (z_const unsigned char *)next; \ + strm->avail_in = have; \ + state->hold = hold; \ + state->bits = bits; \ + } while (0) + /* Clear the input bit accumulator */ #define INITBITS() \ do { \