From: Sebastian Pop Date: Wed, 6 Mar 2019 20:16:20 +0000 (-0600) Subject: define and use chunkmemset instead of byte_memset for INFFAST_CHUNKSIZE X-Git-Tag: 1.9.9-b1~506 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=20ca64fa5d2d8a7421ed86b68709ef971dcfbddf;p=thirdparty%2Fzlib-ng.git define and use chunkmemset instead of byte_memset for INFFAST_CHUNKSIZE --- diff --git a/inffast.c b/inffast.c index 045242af..f2811ef8 100644 --- a/inffast.c +++ b/inffast.c @@ -262,18 +262,10 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) { if (op < len) { /* still need some from output */ len -= op; out = chunkcopysafe(out, from, op, safe); - if (dist == 1) { - out = byte_memset(out, len); - } else { - out = chunkunroll(out, &dist, &len); - out = chunkcopysafe(out, out - dist, len, safe); - } + out = chunkunroll(out, &dist, &len); + out = chunkcopysafe(out, out - dist, len, safe); } else { - if (from - out == 1) { - out = byte_memset(out, len); - } else { - out = chunkcopysafe(out, from, len, safe); - } + out = chunkcopysafe(out, from, len, safe); } #else from = window; @@ -319,18 +311,16 @@ void ZLIB_INTERNAL inflate_fast(PREFIX3(stream) *strm, unsigned long start) { #endif } else { #ifdef INFFAST_CHUNKSIZE - if (dist == 1 && len >= sizeof(uint64_t)) { - out = byte_memset(out, len); - } else { - /* Whole reference is in range of current output. No - range checks are necessary because we start with room - for at least 258 bytes of output, so unroll and roundoff - operations can write beyond `out+len` so long as they - stay within 258 bytes of `out`. - */ - out = chunkunroll(out, &dist, &len); + /* Whole reference is in range of current output. No + range checks are necessary because we start with room + for at least 258 bytes of output, so unroll and roundoff + operations can write beyond `out+len` so long as they + stay within 258 bytes of `out`. + */ + if (dist >= len || dist >= INFFAST_CHUNKSIZE) out = chunkcopy(out, out - dist, len); - } + else + out = chunkmemset(out, dist, len); #else if (len < sizeof(uint64_t)) out = set_bytes(out, out - dist, dist, len); diff --git a/memcopy.h b/memcopy.h index b294711e..817bb5bc 100644 --- a/memcopy.h +++ b/memcopy.h @@ -78,9 +78,24 @@ static inline unsigned char* chunkcopy(unsigned char *out, unsigned char const * */ static inline unsigned char* chunkcopysafe(unsigned char *out, unsigned char const *from, unsigned len, unsigned char *safe) { - if (out > safe) { - while (len-- > 0) { - *out++ = *from++; + if ((safe - out) < (ptrdiff_t)INFFAST_CHUNKSIZE) { + if (len & 8) { + memcpy(out, from, 8); + out += 8; + from += 8; + } + if (len & 4) { + memcpy(out, from, 4); + out += 4; + from += 4; + } + if (len & 2) { + memcpy(out, from, 2); + out += 2; + from += 2; + } + if (len & 1) { + *out++ = *from++; } return out; } @@ -107,6 +122,191 @@ static inline unsigned char* chunkunroll(unsigned char *out, unsigned *dist, uns } return out; } + +static inline inffast_chunk_t chunkmemset_1(unsigned char *from) { + #if defined(X86_SSE2) + int8_t c; + memcpy(&c, from, sizeof(c)); + return _mm_set1_epi8(c); + #elif defined(__ARM_NEON__) || defined(__ARM_NEON) + return vld1q_dup_u8(from); + #endif +} + +static inline inffast_chunk_t chunkmemset_2(unsigned char *from) { + int16_t c; + memcpy(&c, from, sizeof(c)); + #if defined(X86_SSE2) + return _mm_set1_epi16(c); + #elif defined(__ARM_NEON__) || defined(__ARM_NEON) + return vreinterpretq_u8_s16(vdupq_n_s16(c)); + #endif +} + +static inline inffast_chunk_t chunkmemset_4(unsigned char *from) { + int32_t c; + memcpy(&c, from, sizeof(c)); + #if defined(X86_SSE2) + return _mm_set1_epi32(c); + #elif defined(__ARM_NEON__) || defined(__ARM_NEON) + return vreinterpretq_u8_s32(vdupq_n_s32(c)); + #endif +} + +static inline inffast_chunk_t chunkmemset_8(unsigned char *from) { + #if defined(X86_SSE2) + int64_t c; + memcpy(&c, from, sizeof(c)); + return _mm_set1_epi64x(c); + #elif defined(__ARM_NEON__) || defined(__ARM_NEON) + return vcombine_u8(vld1_u8(from), vld1_u8(from)); + #endif +} + + #if defined(__ARM_NEON__) || defined(__ARM_NEON) +static inline unsigned char *chunkmemset_3(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) { + uint8x8x3_t chunks; + unsigned sz = sizeof(chunks); + if (len < sz) { + out = chunkunroll(out, &dist, &len); + return chunkcopy(out, out - dist, len); + } + + /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes: + chunks[0] = {a,a,a,a,a,a,a,a} + chunks[1] = {b,b,b,b,b,b,b,b} + chunks[2] = {c,c,c,c,c,c,c,c}. */ + chunks = vld3_dup_u8(from); + + unsigned rem = len % sz; + len -= rem; + while (len) { + /* Store "a,b,c, ..., a,b,c". */ + vst3_u8(out, chunks); + out += sz; + len -= sz; + } + + if (!rem) + return out; + + /* Last, deal with the case when LEN is not a multiple of SZ. */ + out = chunkunroll(out, &dist, &rem); + return chunkcopy(out, out - dist, rem); +} + #endif + + #if defined(__aarch64__) +static inline unsigned char *chunkmemset_6(unsigned char *out, unsigned char *from, unsigned dist, unsigned len) { + uint16x8x3_t chunks; + unsigned sz = sizeof(chunks); + if (len < sz) { + out = chunkunroll(out, &dist, &len); + return chunkcopy(out, out - dist, len); + } + + /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes: + chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab} + chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd} + chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */ + chunks = vld3q_dup_u16((unsigned short *)from); + + unsigned rem = len % sz; + len -= rem; + while (len) { + /* Store "ab,cd,ef, ..., ab,cd,ef". */ + vst3q_u16((unsigned short *)out, chunks); + out += sz; + len -= sz; + } + + if (rem) + return out; + + /* Last, deal with the case when LEN is not a multiple of SZ. */ + out = chunkunroll(out, &dist, &rem); + return chunkcopy(out, out - dist, rem); +} + #endif + +/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */ +static inline unsigned char *chunkmemset(unsigned char *out, unsigned dist, unsigned len) { + Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); + Assert(dist > 0, "cannot have a distance 0"); + + unsigned char *from = out - dist; + inffast_chunk_t chunk; + unsigned sz = sizeof(chunk); + if (len < sz) { + do { + *out++ = *from++; + --len; + } while (len != 0); + return out; + } + + switch (dist) { + case 1: { + chunk = chunkmemset_1(from); + break; + } + case 2: { + chunk = chunkmemset_2(from); + break; + } + #if defined(__ARM_NEON__) || defined(__ARM_NEON) + case 3: + return chunkmemset_3(out, from, dist, len); + #endif + case 4: { + chunk = chunkmemset_4(from); + break; + } + #if defined(__aarch64__) + case 6: + return chunkmemset_6(out, from, dist, len); + #endif + case 8: { + chunk = chunkmemset_8(from); + break; + } + case 16: + memcpy(&chunk, from, sz); + break; + + default: + out = chunkunroll(out, &dist, &len); + return chunkcopy(out, out - dist, len); + } + + unsigned rem = len % sz; + len -= rem; + while (len) { + memcpy(out, &chunk, sz); + out += sz; + len -= sz; + } + + /* Last, deal with the case when LEN is not a multiple of SZ. */ + if (rem) + memcpy(out, &chunk, rem); + out += rem; + return out; +} + +static inline unsigned char* chunkmemsetsafe(unsigned char *out, unsigned dist, unsigned len, unsigned left) { + if (left < (unsigned)(3 * INFFAST_CHUNKSIZE)) { + while (len > 0) { + *out = *(out - dist); + out++; + --len; + } + return out; + } + + return chunkmemset(out, dist, len); +} + #endif /* INFFAST_CHUNKSIZE */ static inline unsigned char *copy_1_bytes(unsigned char *out, unsigned char *from) { @@ -466,5 +666,4 @@ static inline unsigned char *chunk_copy(unsigned char *out, unsigned char *from, return chunk_memcpy(out, from, len); } - #endif /* MEMCOPY_H_ */