#ifdef ARM_NEON
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_neon(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_neon(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start);
uint32_t longest_match_neon(deflate_state *const s, uint32_t cur_match);
vst1q_u8(out, *chunk);
}
-static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
*chunk_rem = lut_rem.remval;
uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_c(uint8_t *out, uint8_t *from, size_t len, size_t left);
#ifdef WITH_ALL_FALLBACKS
uint32_t compare256_8(const uint8_t *src0, const uint8_t *src1);
__lasx_xvst(*chunk, out, 0);
}
-static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m256i ret_vec;
/* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
return lasx_zext_128(*chunk);
}
-static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
__msan_unpoison(buf + dist, 16 - dist);
__lsx_vst(*chunk, out, 0);
}
-static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
/* Important to note:
#ifdef LOONGARCH_LSX
uint32_t adler32_lsx(uint32_t adler, const uint8_t *src, size_t len);
uint32_t adler32_copy_lsx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_lsx(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_lsx(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_lsx(PREFIX3(stream) *strm, uint32_t start);
uint32_t longest_match_lsx(deflate_state *const s, uint32_t cur_match);
#ifdef LOONGARCH_LASX
uint32_t adler32_lasx(uint32_t adler, const uint8_t *src, size_t len);
uint32_t adler32_copy_lasx(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_lasx(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_lasx(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_lasx(PREFIX3(stream) *strm, uint32_t start);
uint32_t longest_match_lasx(deflate_state *const s, uint32_t cur_match);
#ifdef POWER8_VSX
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_power8(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len);
-uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_power8(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t crc32_power8(uint32_t crc, const uint8_t *buf, size_t len);
uint32_t crc32_copy_power8(uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
void slide_hash_power8(deflate_state *s);
* After using a single memcpy to copy N chunks, we have to use series of
* loadchunk and storechunk to ensure the result is correct.
*/
-static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
Assert(len > 0, "chunkcopy should never have a length 0");
- ptrdiff_t dist = out - from;
- if (dist < 0 || dist >= len) {
+ size_t dist = out - from;
+ if (out < from || dist >= len) {
memcpy(out, from, len);
out += len;
from += len;
return out;
}
- int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
+ size_t align = ((len - 1) % sizeof(chunk_t)) + 1;
memcpy(out, from, sizeof(chunk_t));
out += align;
from += align;
#ifdef RISCV_RVV
uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_rvv(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
uint32_t longest_match_rvv(deflate_state *const s, uint32_t cur_match);
_mm256_storeu_si256((__m256i *)out, *chunk);
}
-static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m256i ret_vec;
/* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
return _mm256_zextsi128_si256(*chunk);
}
-static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
__msan_unpoison(buf + dist, 16 - dist);
#define HAVE_CHUNKCOPY
#define HAVE_HALFCHUNKCOPY
-static inline halfmask_t gen_half_mask(unsigned len) {
- return (halfmask_t)_bzhi_u32(0xFFFF, len);
+static inline halfmask_t gen_half_mask(size_t len) {
+ return (halfmask_t)_bzhi_u32(0xFFFF, (unsigned)len);
}
-static inline mask_t gen_mask(unsigned len) {
- return (mask_t)_bzhi_u32(0xFFFFFFFF, len);
+static inline mask_t gen_mask(size_t len) {
+ return (mask_t)_bzhi_u32(0xFFFFFFFF, (unsigned)len);
}
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
_mm256_storeu_si256((__m256i *)out, *chunk);
}
-static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
Assert(len > 0, "chunkcopy should never have a length 0");
chunk_t chunk;
- uint32_t rem = len % sizeof(chunk_t);
+ size_t rem = len % sizeof(chunk_t);
if (len < sizeof(chunk_t)) {
mask_t rem_mask = gen_mask(rem);
#if defined(_MSC_VER) && _MSC_VER < 1943
# pragma optimize("", off)
#endif
-static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m256i ret_vec;
*chunk_rem = lut_rem.remval;
return _mm256_zextsi128_si256(*chunk);
}
-static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
halfmask_t load_mask = gen_half_mask(dist);
return ret_vec;
}
-static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
Assert(len > 0, "chunkcopy should never have a length 0");
halfchunk_t chunk;
- uint32_t rem = len % sizeof(halfchunk_t);
+ size_t rem = len % sizeof(halfchunk_t);
if (rem == 0) {
rem = sizeof(halfchunk_t);
}
_mm_storeu_si128((__m128i *)out, *chunk);
}
-static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
__m128i perm_vec, ret_vec;
/* Important to note:
#endif
#ifdef X86_SSE2
-uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_sse2(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_sse2(PREFIX3(stream)* strm, uint32_t start);
uint32_t longest_match_sse2(deflate_state *const s, uint32_t cur_match);
#ifdef X86_SSSE3
uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_ssse3(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_ssse3(uint8_t *out, uint8_t *from, size_t len, size_t left);
void inflate_fast_ssse3(PREFIX3(stream) *strm, uint32_t start);
#endif
#ifdef X86_AVX2
uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_avx2(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_avx2(PREFIX3(stream)* strm, uint32_t start);
uint32_t longest_match_avx2(deflate_state *const s, uint32_t cur_match);
#ifdef X86_AVX512
uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
uint32_t adler32_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+uint8_t* chunkmemset_safe_avx512(uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t compare256_avx512(const uint8_t *src0, const uint8_t *src1);
void inflate_fast_avx512(PREFIX3(stream)* strm, uint32_t start);
uint32_t longest_match_avx512(deflate_state *const s, uint32_t cur_match);
without iteration, which will hopefully make the branch prediction more
reliable. */
#ifndef HAVE_CHUNKCOPY
-static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
Assert(len > 0, "chunkcopy should never have a length 0");
chunk_t chunk;
- int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
+ size_t align = ((len - 1) % sizeof(chunk_t)) + 1;
loadchunk(from, &chunk);
storechunk(out, &chunk);
out += align;
#ifndef HAVE_CHUNK_MAG
/* Loads a magazine to feed into memory of the pattern */
-static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, size_t *chunk_rem, size_t dist) {
/* This code takes string of length dist from "from" and repeats
* it for as many times as can fit in a chunk_t (vector register) */
- uint64_t cpy_dist;
- uint64_t bytes_remaining = sizeof(chunk_t);
+ size_t cpy_dist;
+ size_t bytes_remaining = sizeof(chunk_t);
chunk_t chunk_load;
uint8_t *cur_chunk = (uint8_t *)&chunk_load;
while (bytes_remaining) {
cpy_dist = MIN(dist, bytes_remaining);
- memcpy(cur_chunk, buf, (size_t)cpy_dist);
+ memcpy(cur_chunk, buf, cpy_dist);
bytes_remaining -= cpy_dist;
cur_chunk += cpy_dist;
/* This allows us to bypass an expensive integer division since we're effectively
* counting in this loop, anyway */
- *chunk_rem = (uint32_t)cpy_dist;
+ *chunk_rem = cpy_dist;
}
return chunk_load;
#endif
#if defined(HAVE_HALF_CHUNK) && !defined(HAVE_HALFCHUNKCOPY)
-static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, size_t len) {
+ Assert(len > 0, "halfchunkcopy should never have a length 0");
halfchunk_t chunk;
- int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
+ size_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
loadhalfchunk(from, &chunk);
storehalfchunk(out, &chunk);
out += align;
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN. */
-static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, unsigned len) {
+static inline uint8_t* CHUNKMEMSET(uint8_t *out, uint8_t *from, size_t len) {
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(from != out, "chunkmemset cannot have a distance 0");
chunk_t chunk_load;
- uint32_t chunk_mod = 0;
- uint32_t adv_amount;
- int64_t sdist = out - from;
- uint64_t dist = llabs(sdist);
+ size_t chunk_mod = 0;
+ size_t adv_amount;
+ size_t dist = (size_t)ABS(out - from);
/* We are supporting the case for when we are reading bytes from ahead in the buffer.
* We now have to handle this, though it wasn't _quite_ clear if this rare circumstance
* always needed to be handled here or if we're just now seeing it because we are
* dispatching to this function, more */
- if (sdist < 0 && dist < len) {
+ if (out < from && dist < len) {
#ifdef HAVE_MASKED_READWRITE
/* We can still handle this case if we can mitigate over writing _and_ we
* fit the entirety of the copy length with one load */
return HALFCHUNKCOPY(out, from, len);
if ((dist % 2) != 0 || dist == 6) {
- halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, (unsigned)dist);
+ halfchunk_t halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
if (len == sizeof(halfchunk_t)) {
storehalfchunk(out, &halfchunk_load);
chunkmemset_16(from, &chunk_load);
} else
#endif
- chunk_load = GET_CHUNK_MAG(from, &chunk_mod, (unsigned)dist);
+ chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
adv_amount = sizeof(chunk_t) - chunk_mod;
return out;
}
-Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, unsigned len, unsigned left) {
+Z_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, uint8_t *from, size_t len, size_t left) {
#if OPTIMAL_CMP < 32
- static const uint32_t align_mask = 7;
+ static const uintptr_t align_mask = 7;
#elif OPTIMAL_CMP == 32
- static const uint32_t align_mask = 3;
+ static const uintptr_t align_mask = 3;
#endif
len = MIN(len, left);
return out;
}
-static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe)
+static inline uint8_t *CHUNKCOPY_SAFE(uint8_t *out, uint8_t *from, size_t len, uint8_t *safe)
{
if (out == from)
return out + len;
- uint64_t safelen = (safe - out);
+ size_t safelen = (safe - out);
len = MIN(len, safelen);
#ifndef HAVE_MASKED_READWRITE
- uint64_t from_dist = (uint64_t)llabs(safe - from);
+ size_t from_dist = (size_t)ABS(safe - from);
if (UNLIKELY(from_dist < sizeof(chunk_t) || safelen < sizeof(chunk_t))) {
while (len--) {
*out++ = *from++;
}
#endif
- return CHUNKMEMSET(out, from, (unsigned)len);
+ return CHUNKMEMSET(out, from, len);
}
return functable.adler32_copy(adler, dst, src, len);
}
-static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, unsigned len, unsigned left) {
+static uint8_t* chunkmemset_safe_stub(uint8_t* out, uint8_t *from, size_t len, size_t left) {
FUNCTABLE_INIT_ABORT;
return functable.chunkmemset_safe(out, from, len, left);
}
int (* force_init) (void);
uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len);
uint32_t (* adler32_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
- uint8_t* (* chunkmemset_safe) (uint8_t *out, uint8_t *from, unsigned len, unsigned left);
+ uint8_t* (* chunkmemset_safe) (uint8_t *out, uint8_t *from, size_t len, size_t left);
uint32_t (* compare256) (const uint8_t *src0, const uint8_t *src1);
uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len);
uint32_t (* crc32_copy) (uint32_t crc, uint8_t *dst, const uint8_t *src, size_t len);
}
/* Behave like chunkcopy, but avoid writing beyond of legal output. */
-static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, uint8_t *safe) {
- uint64_t safelen = safe - out;
+static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, size_t len, uint8_t *safe) {
+ size_t safelen = safe - out;
len = MIN(len, safelen);
int32_t olap_src = from >= out && from < out + len;
int32_t olap_dst = out >= from && out < from + len;
- uint64_t tocopy;
+ size_t tocopy;
/* For all cases without overlap, memcpy is ideal */
if (!(olap_src || olap_dst)) {
- memcpy(out, from, (size_t)len);
+ memcpy(out, from, len);
return out + len;
}
* initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest
* atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look
* behind or lookahead distance. */
- uint64_t non_olap_size = llabs(from - out); // llabs vs labs for compatibility with windows
+ size_t non_olap_size = (size_t)ABS(from - out);
/* So this doesn't give use a worst case scenario of function calls in a loop,
* we want to instead break this down into copy blocks of fixed lengths
#define MIN(a, b) ((a) > (b) ? (b) : (a))
/* Maximum of a and b. */
#define MAX(a, b) ((a) < (b) ? (b) : (a))
+/* Absolute value of a. */
+#define ABS(a) ((a) < 0 ? -(a) : (a))
/* Ignore unused variable warning */
#define Z_UNUSED(var) (void)(var)