From: Adam Stylinski Date: Sun, 10 Apr 2022 17:01:22 +0000 (-0400) Subject: Improved chunkset substantially where it's heavily used X-Git-Tag: 2.1.0-beta1~245 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ef0cf5ca17057062a895b187ff8bef9ad7762c2b;p=thirdparty%2Fzlib-ng.git Improved chunkset substantially where it's heavily used For most realistic use cases, this doesn't make a ton of difference. However, for things which are highly compressible and enjoy very large run length encodes in the window, this is a huge win. We leverage a permutation table to swizzle the contents of the memory chunk into a vector register and then splat that over memory with a fast copy loop. In essence, where this helps, it helps a lot. Where it doesn't, it does no measurable damage to the runtime. This commit also simplifies a chunkcopy_safe call for determining a distance. Using labs is enough to give the same behavior as before, with the added benefit that no predication is required _and_, most importantly, static analysis by GCC's string fortification can't throw a fit because it conveys better to the compiler that the input into builtin_memcpy will always be in range. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c45fd844..24fe98b3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -769,6 +769,9 @@ if(WITH_OPTIM) check_sse41_intrinsics() if(HAVE_SSE41_INTRIN) add_definitions(-DX86_SSE41) + list(APPEND SSE41_SRCS ${ARCHDIR}/chunkset_sse41.c) + list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS}) + set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}") else() set(WITH_SSE41 OFF) endif() diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index 05cf144b3..689e3a0c2 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -31,6 +31,7 @@ all: \ adler32_ssse3.o adler32_ssse3.lo \ chunkset_avx.o chunkset_avx.lo \ chunkset_sse2.o chunkset_sse2.lo \ + chunkset_sse41.o chunkset_sse41.lo \ compare256_avx2.o compare256_avx2.lo \ compare256_sse2.o compare256_sse2.lo \ insert_string_sse42.o insert_string_sse42.lo \ @@ -57,6 +58,12 @@ chunkset_sse2.o: chunkset_sse2.lo: $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c +chunkset_sse41.o: + $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c + +chunkset_sse41.lo: + $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c + compare256_avx2.o: $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c diff --git a/arch/x86/chunk_permute_table.h b/arch/x86/chunk_permute_table.h new file mode 100644 index 000000000..c7b2d2de7 --- /dev/null +++ b/arch/x86/chunk_permute_table.h @@ -0,0 +1,53 @@ +/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CHUNK_PERMUTE_TABLE_H_ +#define CHUNK_PERMUTE_TABLE_H_ + +#include "zbuild.h" + +/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */ +static const ALIGNED_(32) uint8_t permute_table[26*32] = { + 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */ + 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */ + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */ + 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */ + + /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute + * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual + * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity, + * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but, + * this is what we're dealt. 
+ */ + + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */ + 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */ + 16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */ + 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */ + 16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */ + 16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */ + 16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */ + 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */ +}; + +typedef struct lut_rem_pair_s { + uint16_t idx; + uint16_t remval; +} lut_rem_pair; + +#endif diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c index 6643b0468..91aaa458e 100644 --- a/arch/x86/chunkset_avx.c +++ b/arch/x86/chunkset_avx.c @@ -5,6 +5,7 @@ #ifdef X86_AVX_CHUNKSET #include +#include "chunk_permute_table.h" typedef __m256i chunk_t; @@ -13,6 +14,41 @@ typedef __m256i chunk_t; #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG + +/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can + * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */ +static const lut_rem_pair perm_idx_lut[29] = { + { 0, 2}, /* 3 */ + { 0, 0}, /* don't care */ + { 1 * 32, 2}, /* 5 */ + { 2 * 32, 2}, /* 6 */ + { 3 * 32, 4}, /* 7 */ + { 0 * 32, 0}, /* don't care */ + { 4 * 32, 5}, /* 9 */ + { 5 * 32, 22}, /* 10 */ + { 6 * 32, 21}, /* 11 */ + { 7 * 32, 20}, /* 12 */ + { 8 * 32, 6}, /* 13 */ + { 9 * 32, 4}, /* 14 */ + {10 * 32, 2}, /* 15 */ + { 0 * 32, 0}, /* don't care */ + {11 * 32, 15}, /* 17 */ + {11 * 32 + 16, 14}, /* 18 */ + {11 * 32 + 16 * 2, 13}, /* 19 */ + {11 * 32 + 16 * 3, 12}, /* 20 */ + {11 * 32 + 16 * 4, 11}, /* 21 */ + {11 * 32 + 16 * 5, 10}, /* 22 */ + {11 * 32 + 16 * 6, 9}, /* 23 */ + {11 * 32 + 16 * 7, 8}, /* 24 */ + {11 * 32 + 16 * 8, 7}, /* 25 */ + {11 * 32 + 16 * 9, 6}, /* 26 */ + {11 * 32 + 16 * 10, 5}, /* 27 */ + {11 * 32 + 16 * 11, 4}, /* 28 */ + {11 * 32 + 16 * 12, 3}, /* 29 */ + {11 * 32 + 16 * 13, 2}, /* 30 */ + {11 * 32 + 16 * 14, 1} /* 31 */ +}; static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { int16_t tmp; @@ -40,6 +76,50 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { _mm256_storeu_si256((__m256i *)out, *chunk); } +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m256i ret_vec; + /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is + * compiling this to a shared load for all branches, preferring the simpler code. 
Given that the buf value isn't in + * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */ + *chunk_rem = lut_rem.remval; + +#ifdef Z_MEMORY_SANITIZER + /* See note in chunkset_sse4.c for why this is ok */ + __msan_unpoison(buf + dist, 32 - dist); +#endif + + if (dist < 16) { + /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after + * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate + * shuffles and combining the halves later */ + const __m256i permute_xform = + _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); + __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx)); + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + perm_vec = _mm256_add_epi8(perm_vec, permute_xform); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec); + } else if (dist == 16) { + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + } else { + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16)); + /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */ + __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1); + __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1); + /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_ + * shuffle those values */ + __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1); + } + + return ret_vec; +} + #define CHUNKSIZE chunksize_avx #define CHUNKCOPY chunkcopy_avx #define CHUNKCOPY_SAFE chunkcopy_safe_avx diff --git a/arch/x86/chunkset_sse41.c b/arch/x86/chunkset_sse41.c new file mode 100644 index 000000000..c6f982183 --- /dev/null +++ b/arch/x86/chunkset_sse41.c @@ -0,0 +1,98 @@ +/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" + +/* This requires SSE2 support. 
While it's implicit with SSE4, we can minimize + * code size by sharing the chunkcopy functions, which will certainly compile + * to identical machine code */ +#if defined(X86_SSE41) && defined(X86_SSE2) +#include +#include "chunk_permute_table.h" + +typedef __m128i chunk_t; + +#define CHUNK_SIZE 16 + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG +#define HAVE_CHUNKCOPY +#define HAVE_CHUNKUNROLL + +static const lut_rem_pair perm_idx_lut[13] = { + {0, 1}, /* 3 */ + {0, 0}, /* don't care */ + {1 * 32, 1}, /* 5 */ + {2 * 32, 4}, /* 6 */ + {3 * 32, 2}, /* 7 */ + {0 * 32, 0}, /* don't care */ + {4 * 32, 7}, /* 9 */ + {5 * 32, 6}, /* 10 */ + {6 * 32, 5}, /* 11 */ + {7 * 32, 4}, /* 12 */ + {8 * 32, 3}, /* 13 */ + {9 * 32, 2}, /* 14 */ + {10 * 32, 1},/* 15 */ +}; + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + int16_t tmp; + zmemcpy_2(&tmp, from); + *chunk = _mm_set1_epi16(tmp); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + int32_t tmp; + zmemcpy_4(&tmp, from); + *chunk = _mm_set1_epi32(tmp); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + int64_t tmp; + zmemcpy_8(&tmp, from); + *chunk = _mm_set1_epi64x(tmp); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; +#ifdef Z_MEMORY_SANITIZER + /* Important to note: + * This is _not_ to subvert the memory sanitizer but to instead unpoison some + * bytes we willingly and purposefully load unitialized that we swizzle over + * in a vector register, anyway. If what we assume is wrong about what is used, + * the memory sanitizer will still usefully flag it */ + __msan_unpoison(buf + dist, 16 - dist); +#endif + ret_vec = _mm_loadu_si128((__m128i*)buf); + *chunk_rem = lut_rem.remval; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); + +#define CHUNKSIZE chunksize_sse41 +#define CHUNKMEMSET chunkmemset_sse41 +#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41 +#define CHUNKCOPY(a, b, c) chunkcopy_sse2(a, b, c) +#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c) + +#include "chunkset_tpl.h" + +#endif diff --git a/chunkset_tpl.h b/chunkset_tpl.h index 25ee95596..f70ef42cd 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -5,6 +5,10 @@ #include "zbuild.h" #include +#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2) +extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len); +#endif + /* Returns the chunk size */ Z_INTERNAL uint32_t CHUNKSIZE(void) { return sizeof(chunk_t); @@ -20,6 +24,7 @@ Z_INTERNAL uint32_t CHUNKSIZE(void) { (chunk_t bytes or fewer) will fall straight through the loop without iteration, which will hopefully make the branch prediction more reliable. 
*/ +#ifndef HAVE_CHUNKCOPY Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { Assert(len > 0, "chunkcopy should never have a length 0"); chunk_t chunk; @@ -38,6 +43,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { } return out; } +#endif /* Perform short copies until distance can be rewritten as being at least sizeof chunk_t. @@ -47,6 +53,7 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { This assumption holds because inflate_fast() starts every iteration with at least 258 bytes of output space available (258 being the maximum length output from a single token; see inflate_fast()'s assumptions below). */ +#ifndef HAVE_CHUNKUNROLL Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { unsigned char const *from = out - *dist; chunk_t chunk; @@ -59,6 +66,30 @@ Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { } return out; } +#endif + +#ifndef HAVE_CHUNK_MAG +/* Loads a magazine to feed into memory of the pattern */ +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + /* This code takes string of length dist from "from" and repeats + * it for as many times as can fit in a chunk_t (vector register) */ + uint32_t cpy_dist; + uint32_t bytes_remaining = sizeof(chunk_t); + chunk_t chunk_load; + uint8_t *cur_chunk = (uint8_t *)&chunk_load; + while (bytes_remaining) { + cpy_dist = MIN(dist, bytes_remaining); + memcpy(cur_chunk, buf, cpy_dist); + bytes_remaining -= cpy_dist; + cur_chunk += cpy_dist; + /* This allows us to bypass an expensive integer division since we're effectively + * counting in this loop, anyway */ + *chunk_rem = cpy_dist; + } + + return chunk_load; +} +#endif /* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. Return OUT + LEN. */ @@ -66,6 +97,12 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { /* Debug performance related issues when len < sizeof(uint64_t): Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ Assert(dist > 0, "chunkmemset cannot have a distance 0"); + /* Only AVX2 */ +#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2) + if (len <= 16) { + return chunkmemset_sse41(out, dist, len); + } +#endif uint8_t *from = out - dist; @@ -98,22 +135,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { } else #endif { - /* This code takes string of length dist from "from" and repeats - * it for as many times as can fit in a chunk_t (vector register) */ - uint32_t cpy_dist; - uint32_t bytes_remaining = sizeof(chunk_t); - uint8_t *cur_chunk = (uint8_t *)&chunk_load; - while (bytes_remaining) { - cpy_dist = MIN(dist, bytes_remaining); - memcpy(cur_chunk, from, cpy_dist); - bytes_remaining -= cpy_dist; - cur_chunk += cpy_dist; - /* This allows us to bypass an expensive integer division since we're effectively - * counting in this loop, anyway. 
However, we may have to derive a similarly - * sensible solution for if we use a permutation table that allows us to construct - * this vector in one load and one permute instruction */ - chunk_mod = cpy_dist; - } + chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist); } /* If we're lucky enough and dist happens to be an even modulus of our vector length, diff --git a/configure b/configure index ff657e698..6f79825a7 100755 --- a/configure +++ b/configure @@ -1487,6 +1487,9 @@ case "${ARCH}" in if test ${HAVE_SSE41_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE41" SFLAGS="${SFLAGS} -DX86_SSE41" + + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} chunkset_sse41.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} chunkset_sse41.lo" fi check_sse42_intrinsics diff --git a/cpu_features.h b/cpu_features.h index 861ae0c4d..d3df33b9f 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -71,6 +71,10 @@ extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len); extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif +#ifdef X86_SSE41 +extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len); +extern uint8_t* chunkmemset_safe_sse41(uint8_t *out, unsigned dist, unsigned len, unsigned left); +#endif #ifdef X86_AVX_CHUNKSET extern uint32_t chunksize_avx(void); extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len); diff --git a/functable.c b/functable.c index 64992bc7b..960c51f1b 100644 --- a/functable.c +++ b/functable.c @@ -331,6 +331,10 @@ Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) # endif functable.chunkmemset = &chunkmemset_sse2; #endif +#if defined(X86_SSE41) && defined(X86_SSE2) + if (x86_cpu_has_sse41) + functable.chunkmemset = &chunkmemset_sse41; +#endif #ifdef X86_AVX_CHUNKSET if (x86_cpu_has_avx2) functable.chunkmemset = &chunkmemset_avx; @@ -358,6 +362,10 @@ Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned # endif functable.chunkmemset_safe = &chunkmemset_safe_sse2; #endif +#if defined(X86_SSE41) && defined(X86_SSE2) + if (x86_cpu_has_sse41) + functable.chunkmemset_safe = &chunkmemset_safe_sse41; +#endif #ifdef X86_AVX_CHUNKSET if (x86_cpu_has_avx2) functable.chunkmemset_safe = &chunkmemset_safe_avx; diff --git a/inflate_p.h b/inflate_p.h index a0b6aa8fb..65bbd4448 100644 --- a/inflate_p.h +++ b/inflate_p.h @@ -5,6 +5,8 @@ #ifndef INFLATE_P_H #define INFLATE_P_H +#include + /* Architecture-specific hooks. */ #ifdef S390_DFLTCC_INFLATE # include "arch/s390/dfltcc_inflate.h" @@ -145,8 +147,8 @@ static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, size_t len, u * we have to get a bit clever. First if the overlap is such that src falls between dst and dst+len, we can do the * initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest * atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look - * behind or lookahead distance */ - size_t non_olap_size = ((from > out) ? from - out : out - from); + * behind or lookahead distance. */ + size_t non_olap_size = llabs(from - out); // llabs vs labs for compatibility with windows memcpy(out, from, non_olap_size); out += non_olap_size;
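
Illustrative sketch (not part of the patch): the permutation-table idea can be exercised in a tiny standalone C program. The scalar_chunk_mag() helper below mirrors the generic GET_CHUNK_MAG fallback loop added to chunkset_tpl.h, and the SSE4.1 path shows how one unaligned load plus one pshufb with the corresponding permute_table row produces the same replicated chunk. The file name, helper name, and demo buffer are invented for illustration only; the real code reads from the inflate window, covers every distance, and additionally handles the AVX2 cross-lane cases shown above.

/* chunk_mag_demo.c -- hypothetical standalone demo; build with: gcc -O2 -msse4.1 chunk_mag_demo.c */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <smmintrin.h>   /* SSE4.1 header; also pulls in SSSE3's _mm_shuffle_epi8 */

#define CHUNK_SIZE 16

/* Same row layout as arch/x86/chunk_permute_table.h, shown only for dist 3:
 * indices 0,1,2 repeated across the 16-byte register. */
static const uint8_t perm_dist3[16] = {
    0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0
};

/* Scalar reference: the fallback GET_CHUNK_MAG loop from chunkset_tpl.h. */
static void scalar_chunk_mag(const uint8_t *buf, uint8_t *chunk,
                             uint32_t *chunk_rem, uint32_t dist) {
    uint32_t bytes_remaining = CHUNK_SIZE;
    uint8_t *cur = chunk;
    while (bytes_remaining) {
        uint32_t cpy_dist = dist < bytes_remaining ? dist : bytes_remaining;
        memcpy(cur, buf, cpy_dist);
        bytes_remaining -= cpy_dist;
        cur += cpy_dist;
        /* The last (possibly partial) copy length is the remainder, so no
         * integer division by dist is ever needed. */
        *chunk_rem = cpy_dist;
    }
}

int main(void) {
    /* 16 readable bytes so the unaligned vector load below is safe; only the
     * first dist (3) bytes carry the pattern, mirroring how the real code
     * over-reads the window and unpoisons the tail for MSan. */
    uint8_t pattern[CHUNK_SIZE] = { 'a', 'b', 'c' };
    uint8_t ref[CHUNK_SIZE], vec_out[CHUNK_SIZE];
    uint32_t rem = 0;

    scalar_chunk_mag(pattern, ref, &rem, 3);

    /* SSE4.1 path: one load and one byte shuffle replace the whole loop. */
    __m128i src  = _mm_loadu_si128((const __m128i *)pattern);
    __m128i perm = _mm_loadu_si128((const __m128i *)perm_dist3);
    __m128i vec  = _mm_shuffle_epi8(src, perm);
    _mm_storeu_si128((__m128i *)vec_out, vec);

    printf("scalar: %.16s rem=%u\n", (const char *)ref, (unsigned)rem);
    printf("pshufb: %.16s\n", (const char *)vec_out);
    printf("match : %s\n", memcmp(ref, vec_out, CHUNK_SIZE) == 0 ? "yes" : "no");
    return 0;
}

Running it should print two identical "abcabcabcabcabca" chunks and rem=1, which lines up with the {0, 1} entry for dist 3 in the SSE4.1 perm_idx_lut above.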