#ifdef ARM_NEON
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_neon(void);
-uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZLL
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_c(void);
-uint8_t* chunkmemset_safe_c(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_c(PREFIX3(stream) *strm, uint32_t start);
uint32_t PREFIX(crc32_braid)(uint32_t crc, const uint8_t *buf, size_t len);
#ifdef POWER8_VSX
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t chunksize_power8(void);
-uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
void slide_hash_power8(deflate_state *s);
void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start);
uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_rvv(void);
-uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
+#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK
return ret_vec;
}
-static inline void halfchunkmemset_2(uint8_t *from, halfchunk_t *chunk) {
- int16_t tmp;
- memcpy(&tmp, from, sizeof(tmp));
- *chunk = _mm_set1_epi16(tmp);
-}
-
-static inline void halfchunkmemset_4(uint8_t *from, halfchunk_t *chunk) {
- int32_t tmp;
- memcpy(&tmp, from, sizeof(tmp));
- *chunk = _mm_set1_epi32(tmp);
-}
-
-static inline void halfchunkmemset_8(uint8_t *from, halfchunk_t *chunk) {
- int64_t tmp;
- memcpy(&tmp, from, sizeof(tmp));
- *chunk = _mm_set1_epi64x(tmp);
-}
-
static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
*chunk = _mm_loadu_si128((__m128i *)s);
}
_mm_storeu_si128((__m128i *)out, *chunk);
}
-static inline chunk_t halfchunk2whole(halfchunk_t chunk) {
+static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
 * unlikely to actually be written to or read from */
- return _mm256_zextsi128_si256(chunk);
+ return _mm256_zextsi128_si256(*chunk);
}
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
#ifdef X86_SSE2
uint32_t chunksize_sse2(void);
-uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
-uint8_t* chunkmemset_safe_ssse3(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t chunksize_avx2(void);
-uint8_t* chunkmemset_safe_avx2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
# ifdef HAVE_BUILTIN_CTZ
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
#include "zbuild.h"
#include <stdlib.h>
+#include <stdio.h>
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
/* This code takes a string of length dist from "buf" and repeats
 * it as many times as will fit in a chunk_t (vector register) */
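/* Worked example, assuming a 16-byte chunk_t: for dist == 3 the 3-byte pattern is
 * copied five times plus one extra byte, and chunk_rem is left at 1, the length of
 * the final partial copy, without ever performing a division */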
- uint32_t cpy_dist;
- uint32_t bytes_remaining = sizeof(chunk_t);
+ uint64_t cpy_dist;
+ uint64_t bytes_remaining = sizeof(chunk_t);
chunk_t chunk_load;
uint8_t *cur_chunk = (uint8_t *)&chunk_load;
while (bytes_remaining) {
cpy_dist = MIN(dist, bytes_remaining);
- memcpy(cur_chunk, buf, cpy_dist);
+ memcpy(cur_chunk, buf, (size_t)cpy_dist);
bytes_remaining -= cpy_dist;
cur_chunk += cpy_dist;
/* This allows us to bypass an expensive integer division since we're effectively
* counting in this loop, anyway */
- *chunk_rem = cpy_dist;
+ *chunk_rem = (uint32_t)cpy_dist;
}
return chunk_load;
/* Copy DIST = (OUT - FROM) bytes from FROM into OUT + DIST * k, for 0 <= k < LEN/DIST.
   Return OUT + LEN. */
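/* For example, with FROM == OUT - 3 and LEN == 10, the 3-byte pattern at FROM is
   replicated to fill all 10 output bytes, the usual overlapping LZ77 match case. */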
-static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
+static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
- Assert(dist > 0, "chunkmemset cannot have a distance 0");
+ Assert(from != out, "chunkmemset cannot have a distance 0");
- uint8_t *from = out - dist;
chunk_t chunk_load;
uint32_t chunk_mod = 0;
uint32_t adv_amount;
+ int64_t sdist = out - from;
+ uint64_t dist = llabs(sdist);
+
+    /* We are supporting the case where we are reading bytes from ahead in the buffer.
+     * We now have to handle this, though it wasn't _quite_ clear if this rare circumstance
+     * always needed to be handled here or if we're only seeing it now because we are
+     * dispatching to this function more often */
+ if (sdist < 0 && dist < len) {
+ /* Here the memmove semantics match perfectly, as when this happens we are
+ * effectively sliding down the contents of memory by dist bytes */
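+        /* For example, if from == out + 2 and len == 10, the ten bytes starting two
+         * positions ahead of out are shifted down by two, exactly the overlapping-copy
+         * behavior that memmove guarantees */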
+ memmove(out, from, len);
+ return out + len;
+ }
if (dist == 1) {
memset(out, *from, len);
return out + len;
- } else if (dist > sizeof(chunk_t)) {
- return CHUNKCOPY(out, out - dist, len);
+ } else if (dist >= sizeof(chunk_t)) {
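+        /* The pattern is at least one chunk wide, so a chunk-sized load from "from"
+         * can never overlap the store that follows it; plain chunk copies are safe */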
+ return CHUNKCOPY(out, from, len);
}
/* Only relevant for AVX2, which has both 128-bit and 256-bit vectors. Allowing the
 * shorter vector here makes the code a little smaller. */
#ifdef HAVE_HALF_CHUNK
if (len <= sizeof(halfchunk_t)) {
- if (dist > sizeof(halfchunk_t)) {
- return HALFCHUNKCOPY(out, out - dist, len);
- }
+ if (dist >= sizeof(halfchunk_t))
+ return HALFCHUNKCOPY(out, from, len);
- halfchunk_t halfchunk_load;
-
- if (dist == 2) {
- halfchunkmemset_2(from, &halfchunk_load);
- } else if (dist == 4) {
- halfchunkmemset_4(from, &halfchunk_load);
- } else if (dist == 8) {
- halfchunkmemset_8(from, &halfchunk_load);
- } else if (dist == 16) {
- loadhalfchunk(from, &halfchunk_load);
- } else {
- halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
- }
+ if ((dist % 2) != 0 || dist == 6) {
+ halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, (unsigned)dist);
- adv_amount = sizeof(halfchunk_t) - chunk_mod;
- while (len >= sizeof(halfchunk_t)) {
- storehalfchunk(out, &halfchunk_load);
- len -= adv_amount;
- out += adv_amount;
- }
+ adv_amount = sizeof(halfchunk_t) - chunk_mod;
+ if (len == sizeof(halfchunk_t)) {
+ storehalfchunk(out, &halfchunk_load);
+ len -= adv_amount;
+ out += adv_amount;
+ }
- chunk_load = halfchunk2whole(halfchunk_load);
- goto rem_bytes;
+ chunk_load = halfchunk2whole(&halfchunk_load);
+ goto rem_bytes;
+ }
}
#endif
chunkmemset_16(from, &chunk_load);
} else
#endif
- if (dist == sizeof(chunk_t)) {
- loadchunk(from, &chunk_load);
- } else {
- chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
- }
+ chunk_load = GET_CHUNK_MAG(from, &chunk_mod, (unsigned)dist);
adv_amount = sizeof(chunk_t) - chunk_mod;
return out;
}
-Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) {
+Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len, unsigned left) {
#if !defined(UNALIGNED64_OK)
# if !defined(UNALIGNED_OK)
static const uint32_t align_mask = 7;
#endif
len = MIN(len, left);
- uint8_t *from = out - dist;
+
#if !defined(UNALIGNED64_OK)
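/* On targets without fast unaligned 64-bit access, copy a byte at a time until the
 * output pointer is aligned enough for the chunked stores below */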
while (((uintptr_t)out & align_mask) && (len > 0)) {
*out++ = *from++;
--len;
--left;
}
#endif
- if (left < (unsigned)(3 * sizeof(chunk_t))) {
+ if (UNLIKELY(left < sizeof(chunk_t))) {
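+        /* Not enough room left in the output buffer for even one chunk-sized store,
+         * so finish the copy a byte at a time */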
while (len > 0) {
*out++ = *from++;
--len;
}
+
return out;
}
+
if (len)
- return CHUNKMEMSET(out, dist, len);
+ out = CHUNKMEMSET(out, from, len);
return out;
}
+
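+/* Copy LEN bytes from FROM to OUT, clamping LEN to the distance to SAFE. When either
+ * FROM or OUT is within one chunk_t of the SAFE boundary the copy is done a byte at a
+ * time, since the chunked path works in whole chunk_t units; otherwise it is delegated
+ * to CHUNKMEMSET. */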
+static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, unsigned len, uint8_t *safe) {
+ if (out == from)
+ return out + len;
+
+ uint64_t safelen = (safe - out);
+ len = MIN(len, (unsigned)safelen);
+
+ uint64_t from_dist = (uint64_t)llabs(safe - from);
+ if (UNLIKELY(from_dist < sizeof(chunk_t) || safelen < sizeof(chunk_t))) {
+ while (len--) {
+ *out++ = *from++;
+ }
+
+ return out;
+ }
+
+ return CHUNKMEMSET(out, from, len);
+}
return functable.adler32_fold_copy(adler, dst, src, len);
}
-static uint8_t* chunkmemset_safe_stub(uint8_t* out, unsigned dist, unsigned len, unsigned left) {
+static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
init_functable();
- return functable.chunkmemset_safe(out, dist, len, left);
+ return functable.chunkmemset_safe(out, from, len, left);
}
static uint32_t chunksize_stub(void) {
void (* force_init) (void);
uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len);
uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
- uint8_t* (* chunkmemset_safe) (uint8_t *out, unsigned dist, unsigned len, unsigned left);
+ uint8_t* (* chunkmemset_safe) (uint8_t *out, uint8_t *from, unsigned len, unsigned left);
uint32_t (* chunksize) (void);
uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1);
uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len);
from += wsize - op;
if (op < len) { /* some from end of window */
len -= op;
- out = chunkcopy_safe(out, from, op, safe);
+ out = CHUNKCOPY_SAFE(out, from, op, safe);
from = window; /* more from start of window */
op = wnext;
/* This (rare) case can create a situation where
}
if (op < len) { /* still need some from output */
len -= op;
- out = chunkcopy_safe(out, from, op, safe);
- if (!extra_safe)
+ if (!extra_safe) {
+ out = CHUNKCOPY_SAFE(out, from, op, safe);
out = CHUNKUNROLL(out, &dist, &len);
- out = chunkcopy_safe(out, out - dist, len, safe);
+ out = CHUNKCOPY_SAFE(out, out - dist, len, safe);
+ } else {
+ out = chunkcopy_safe(out, from, op, safe);
+ out = chunkcopy_safe(out, out - dist, len, safe);
+ }
} else {
- out = chunkcopy_safe(out, from, len, safe);
+ if (!extra_safe)
+ out = CHUNKCOPY_SAFE(out, from, len, safe);
+ else
+ out = chunkcopy_safe(out, from, len, safe);
}
} else if (extra_safe) {
/* Whole reference is in range of current output. */
- if (dist >= len || dist >= state->chunksize)
out = chunkcopy_safe(out, out - dist, len, safe);
- else
- out = CHUNKMEMSET_SAFE(out, dist, len, (unsigned)((safe - out)));
} else {
/* Whole reference is in range of current output. No range checks are
necessary because we start with room for at least 258 bytes of output,
if (dist >= len || dist >= state->chunksize)
out = CHUNKCOPY(out, out - dist, len);
else
- out = CHUNKMEMSET(out, dist, len);
+ out = CHUNKMEMSET(out, out - dist, len);
}
} else if ((op & 64) == 0) { /* 2nd level distance code */
here = dcode + here->val + BITS(op);
} else {
copy = MIN(state->length, left);
- put = FUNCTABLE_CALL(chunkmemset_safe)(put, state->offset, copy, left);
+ put = FUNCTABLE_CALL(chunkmemset_safe)(put, put - state->offset, copy, left);
}
left -= copy;
state->length -= copy;
/* Behave like chunkcopy, but avoid writing beyond the end of legal output. */
static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe) {
- uint64_t safelen = (safe - out) + 1;
+ uint64_t safelen = safe - out;
len = MIN(len, safelen);
int32_t olap_src = from >= out && from < out + len;
int32_t olap_dst = out >= from && out < from + len;