*/
#include "zbuild.h"
+#include <stdlib.h>
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
    return sizeof(chunk_t);
}
-/* Behave like chunkcopy, but avoid writing beyond of legal output. */
-Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
- unsigned safelen = (unsigned)((safe - out) + 1);
- len = MIN(len, safelen);
-#if CHUNK_SIZE >= 32
- while (len >= 32) {
- memcpy(out, from, 32);
- out += 32;
- from += 32;
- len -= 32;
- }
-#endif
-#if CHUNK_SIZE >= 16
- while (len >= 16) {
- memcpy(out, from, 16);
- out += 16;
- from += 16;
- len -= 16;
- }
-#endif
-#if CHUNK_SIZE >= 8
- while (len >= 8) {
- zmemcpy_8(out, from);
- out += 8;
- from += 8;
- len -= 8;
- }
-#endif
- if (len >= 4) {
- zmemcpy_4(out, from);
- out += 4;
- from += 4;
- len -= 4;
- }
- if (len >= 2) {
- zmemcpy_2(out, from);
- out += 2;
- from += 2;
- len -= 2;
- }
- if (len == 1) {
- *out++ = *from++;
- }
- return out;
-}
-
/* Perform short copies until distance can be rewritten as being at least
   sizeof chunk_t. */
/* Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
- unsigned char *from = out - dist;
- chunk_t chunk;
- unsigned sz = sizeof(chunk);
- if (len < sz) {
- while (len != 0) {
- *out++ = *from++;
- --len;
- }
- return out;
- }
+ uint8_t *from = out - dist;
-#ifdef HAVE_CHUNKMEMSET_1
if (dist == 1) {
- chunkmemset_1(from, &chunk);
- } else
-#endif
+ memset(out, *from, len);
+ return out + len;
+ } else if (dist > sizeof(chunk_t)) {
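+        /* The source lags the destination by more than a full chunk_t, so each
+         * chunk-sized copy in CHUNKCOPY only reads bytes written by earlier
+         * iterations; no special overlap handling is needed here */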
+ return CHUNKCOPY(out, out - dist, len);
+ }
+
+ chunk_t chunk_load;
+ uint32_t chunk_mod = 0;
+
+ /* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
if (dist == 2) {
- chunkmemset_2(from, &chunk);
+ chunkmemset_2(from, &chunk_load);
} else
#endif
#ifdef HAVE_CHUNKMEMSET_4
if (dist == 4) {
- chunkmemset_4(from, &chunk);
+ chunkmemset_4(from, &chunk_load);
} else
#endif
#ifdef HAVE_CHUNKMEMSET_8
if (dist == 8) {
- chunkmemset_8(from, &chunk);
+ chunkmemset_8(from, &chunk_load);
+ } else if (dist == sizeof(chunk_t)) {
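+        /* The repeating unit is exactly one register wide, so it can be loaded as-is */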
+ loadchunk(from, &chunk_load);
} else
#endif
- if (dist == sz) {
- loadchunk(from, &chunk);
- } else if (dist < sz) {
- unsigned char *end = out + len - 1;
- while (len > dist) {
- out = CHUNKCOPY_SAFE(out, from, dist, end);
- len -= dist;
+ {
+        /* This code takes a string of length dist from "from" and repeats
+         * it as many times as will fit in a chunk_t (vector register) */
+ int32_t cpy_dist;
+ int32_t bytes_remaining = sizeof(chunk_t);
+ uint8_t *cur_chunk = (uint8_t*)&chunk_load;
+ while (bytes_remaining) {
+ cpy_dist = MIN(dist, bytes_remaining);
+ memcpy(cur_chunk, from, cpy_dist);
+ bytes_remaining -= cpy_dist;
+ cur_chunk += cpy_dist;
+            /* This allows us to bypass an expensive integer division, since we're effectively
+             * counting in this loop anyway. However, we may have to derive a similarly
+             * sensible solution in case we use a permutation table that allows us to construct
+             * this vector in one load and one permute instruction */
+ chunk_mod = cpy_dist;
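+            /* Illustrative example: with a 16-byte chunk_t and dist == 3, the loop copies
+             * 3+3+3+3+3+1 bytes, so chunk_mod is left as 1, the size of the final partial copy */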
}
- if (len > 0) {
- out = CHUNKCOPY_SAFE(out, from, len, end);
+ }
+
+    /* If we're lucky and dist happens to divide our vector length evenly, we can do two
+     * stores per loop iteration, which is beneficial on most ISAs, especially x86 */
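+    /* chunk_mod is only written by the generic fill loop above, so it is still 0 exactly
+     * when dist was handled by one of the specialized load paths */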
+ if (chunk_mod == 0) {
+ while (len >= (2 * sizeof(chunk_t))) {
+ storechunk(out, &chunk_load);
+ storechunk(out + sizeof(chunk_t), &chunk_load);
+ out += 2 * sizeof(chunk_t);
+ len -= 2 * sizeof(chunk_t);
}
- return out;
- } else {
- out = CHUNKUNROLL(out, &dist, &len);
- return CHUNKCOPY(out, out - dist, len);
}
- unsigned rem = len % sz;
- len -= rem;
- while (len) {
- storechunk(out, &chunk);
- out += sz;
- len -= sz;
+    /* If "dist" does not divide evenly into a vector register, we can still write the
+     * whole vector register, but we must only advance by the bytes of complete string
+     * copies that fit in our chunk_t. If dist did divide evenly (chunk_mod == 0),
+     * adv_amount is the full chunk_t size. */
+ uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
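+    /* For example, a 16-byte chunk_t with dist == 3 gives chunk_mod == 1 and adv_amount == 15:
+     * each store writes a full register, but only 15 bytes (five complete copies of the
+     * 3-byte string) count as progress */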
+ while (len >= sizeof(chunk_t)) {
+ storechunk(out, &chunk_load);
+ len -= adv_amount;
+ out += adv_amount;
}
- /* Last, deal with the case when LEN is not a multiple of SZ. */
- if (rem) {
- memcpy(out, from, rem);
- out += rem;
+ if (len) {
+ memcpy(out, &chunk_load, len);
+ out += len;
}
return out;
return functable.chunkcopy(out, from, len);
}
-Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
- // Initialize default
- functable.chunkcopy_safe = &chunkcopy_safe_c;
-
-#ifdef X86_SSE2_CHUNKSET
-# if !defined(__x86_64__) && !defined(_M_X64) && !defined(X86_NOCHECK_SSE2)
- if (x86_cpu_has_sse2)
-# endif
- functable.chunkcopy_safe = &chunkcopy_safe_sse2;
-#endif
-#ifdef X86_AVX_CHUNKSET
- if (x86_cpu_has_avx2)
- functable.chunkcopy_safe = &chunkcopy_safe_avx;
-#endif
-#ifdef ARM_NEON_CHUNKSET
- if (arm_cpu_has_neon)
- functable.chunkcopy_safe = &chunkcopy_safe_neon;
-#endif
-#ifdef POWER8_VSX_CHUNKSET
- if (power_cpu_has_arch_2_07)
- functable.chunkcopy_safe = &chunkcopy_safe_power8;
-#endif
-
- return functable.chunkcopy_safe(out, from, len, safe);
-}
-
Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len) {
// Initialize default
functable.chunkunroll = &chunkunroll_c;
compare256_stub,
chunksize_stub,
chunkcopy_stub,
- chunkcopy_safe_stub,
chunkunroll_stub,
chunkmemset_stub,
chunkmemset_safe_stub,
uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1);
uint32_t (* chunksize) (void);
uint8_t* (* chunkcopy) (uint8_t *out, uint8_t const *from, unsigned len);
- uint8_t* (* chunkcopy_safe) (uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
uint8_t* (* chunkunroll) (uint8_t *out, unsigned *dist, unsigned *len);
uint8_t* (* chunkmemset) (uint8_t *out, unsigned dist, unsigned len);
uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left);
#include "inflate_p.h"
#include "functable.h"
-
/* Load 64 bits from IN and place the bytes at offset BITS in the result. */
static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) {
uint64_t chunk;
from += wsize - op;
if (op < len) { /* some from end of window */
len -= op;
- out = functable.chunkcopy_safe(out, from, op, safe);
+ out = chunkcopy_safe(out, from, op, safe);
from = window; /* more from start of window */
op = wnext;
/* This (rare) case can create a situation where
}
if (op < len) { /* still need some from output */
len -= op;
- out = functable.chunkcopy_safe(out, from, op, safe);
+ out = chunkcopy_safe(out, from, op, safe);
out = functable.chunkunroll(out, &dist, &len);
- out = functable.chunkcopy_safe(out, out - dist, len, safe);
+ out = chunkcopy_safe(out, out - dist, len, safe);
} else {
- out = functable.chunkcopy_safe(out, from, len, safe);
+ out = chunkcopy_safe(out, from, len, safe);
}
} else if (extra_safe) {
/* Whole reference is in range of current output. */
if (dist >= len || dist >= state->chunksize)
- out = functable.chunkcopy_safe(out, out - dist, len, safe);
+ out = chunkcopy_safe(out, out - dist, len, safe);
else
out = functable.chunkmemset_safe(out, dist, len, (unsigned)((safe - out) + 1));
} else {
}
unsigned char *next_out = state->window + state->wsize + state->wnext;
if (copy <= state->offset) {
- functable.chunkcopy_safe(next_out, next_out - state->offset, copy, put + buf_left);
+ chunkcopy_safe(next_out, next_out - state->offset, copy, put + buf_left);
} else { /* copy from output */
functable.chunkmemset_safe(next_out, state->offset, copy, (uint32_t)buf_left);
}
state->whave += out_bytes;
state->whave = MIN(state->whave, state->wsize);
}
+
+/* Behave like chunkcopy, but avoid writing beyond the end of legal output. */
+static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe) {
+ uint32_t safelen = (uint32_t)((safe - out) + 1);
+ len = MIN(len, safelen);
+ int olap_src = from >= out && from < out + len;
+ int olap_dst = out >= from && out < from + len;
+ int tocopy;
+
+ /* For all cases without overlap, memcpy is ideal */
+ if (!(olap_src || olap_dst)) {
+ memcpy(out, from, len);
+ return out + len;
+ }
+
+ /* We are emulating a self-modifying copy loop here. To do this in a way that doesn't produce undefined behavior,
+ * we have to get a bit clever. First if the overlap is such that src falls between dst and dst+len, we can do the
+ * initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest
+ * atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look
+ * behind or lookahead distance */
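+    /* Illustrative example: if out - from == 3 and len == 10, we first copy the 3
+     * non-overlapping bytes, then copy the rest in blocks of at most 3 bytes, so no
+     * single memcpy ever has overlapping source and destination */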
+ int non_olap_size = (from > out) ? from - out : out - from;
+
+ memcpy(out, from, non_olap_size);
+ out += non_olap_size;
+ from += non_olap_size;
+ len -= non_olap_size;
+
+    /* So that this doesn't give us a worst case scenario of per-byte function calls in a loop,
+     * we instead break this down into copy blocks of fixed lengths */
+ while (len) {
+ tocopy = MIN(non_olap_size, len);
+ len -= tocopy;
+
+ while (tocopy >= 32) {
+ memcpy(out, from, 32);
+ out += 32;
+ from += 32;
+ tocopy -= 32;
+ }
+
+ if (tocopy >= 16) {
+ memcpy(out, from, 16);
+ out += 16;
+ from += 16;
+ tocopy -= 16;
+ }
+
+ if (tocopy >= 8) {
+ zmemcpy_8(out, from);
+ out += 8;
+ from += 8;
+ tocopy -= 8;
+ }
+
+ if (tocopy >= 4) {
+ zmemcpy_4(out, from);
+ out += 4;
+ from += 4;
+ tocopy -= 4;
+ }
+
+ if (tocopy >= 2) {
+ zmemcpy_2(out, from);
+ out += 2;
+ from += 2;
+ tocopy -= 2;
+ }
+
+ if (tocopy) {
+ *out++ = *from++;
+ }
+ }
+
+ return out;
+}