--- /dev/null
+/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with the chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_PERMUTE_TABLE_H_
+#define CHUNK_PERMUTE_TABLE_H_
+
+#include "zbuild.h"
+
+/* Need entries for every distance that is not an even divisor of the vector widths, i.e. everything except 1, 2, 4, 8, 16 & 32 */
+static const ALIGNED_(32) uint8_t permute_table[26*32] = {
+ 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
+ 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
+ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
+ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
+
+ /* Distances beyond 15 mean we have to permute from a vector wider than a single __m128i. Because AVX2 cannot
+ * permute sub 4-byte elements across its 128-bit lanes (that only arrived with AVX512), we have to do some math
+ * here for an eventual blend with a comparison. That means we need to wrap the indices with yet another derived
+ * table. For simplicity, we'll use absolute indexing here to derive a blend vector. This is actually a lot
+ * simpler with ARM's TBL, but it is the hand we're dealt.
+ */
+
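+ /* Worked example, dist 17: output bytes 16..31 must be buf[16], buf[0], buf[1], ..., buf[14], so the row below
+ * is {16, 0, 1, ..., 14}. Entries below 16 get shuffled out of the first 16-byte load, while entries of 16 and
+ * above are resolved by a compare + blend against the second 16-byte load (see the AVX2 GET_CHUNK_MAG). */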
+ 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
+ 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
+ 16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
+ 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
+ 16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
+ 16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
+ 16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
+};
+
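+/* Layout note: the 32-byte rows above cover dists 3 through 15 (skipping 4 and 8), and the fifteen packed 16-byte
+ * rows for dists 17 through 31 start at byte offset 11 * 32. Each consumer records those byte offsets, along with
+ * the pattern remainder for its chunk width, in a small table of the pair type below. */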
+typedef struct lut_rem_pair_s {
+ uint16_t idx;
+ uint16_t remval;
+} lut_rem_pair;
+
+#endif
#ifdef X86_AVX_CHUNKSET
#include <immintrin.h>
+#include "chunk_permute_table.h"
typedef __m256i chunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+
+/* Populate don't-cares so that this is a direct lookup (with some indirection into the permute table). Since dist
+ * can never be 0..2, we start at an offset and subtract 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+ { 0, 2}, /* 3 */
+ { 0, 0}, /* don't care */
+ { 1 * 32, 2}, /* 5 */
+ { 2 * 32, 2}, /* 6 */
+ { 3 * 32, 4}, /* 7 */
+ { 0 * 32, 0}, /* don't care */
+ { 4 * 32, 5}, /* 9 */
+ { 5 * 32, 22}, /* 10 */
+ { 6 * 32, 21}, /* 11 */
+ { 7 * 32, 20}, /* 12 */
+ { 8 * 32, 6}, /* 13 */
+ { 9 * 32, 4}, /* 14 */
+ {10 * 32, 2}, /* 15 */
+ { 0 * 32, 0}, /* don't care */
+ {11 * 32, 15}, /* 17 */
+ {11 * 32 + 16, 14}, /* 18 */
+ {11 * 32 + 16 * 2, 13}, /* 19 */
+ {11 * 32 + 16 * 3, 12}, /* 20 */
+ {11 * 32 + 16 * 4, 11}, /* 21 */
+ {11 * 32 + 16 * 5, 10}, /* 22 */
+ {11 * 32 + 16 * 6, 9}, /* 23 */
+ {11 * 32 + 16 * 7, 8}, /* 24 */
+ {11 * 32 + 16 * 8, 7}, /* 25 */
+ {11 * 32 + 16 * 9, 6}, /* 26 */
+ {11 * 32 + 16 * 10, 5}, /* 27 */
+ {11 * 32 + 16 * 11, 4}, /* 28 */
+ {11 * 32 + 16 * 12, 3}, /* 29 */
+ {11 * 32 + 16 * 13, 2}, /* 30 */
+ {11 * 32 + 16 * 14, 1} /* 31 */
+};
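+
+/* Example lookup: dist 19 hits perm_idx_lut[19 - 3] = {11 * 32 + 16 * 2, 13}, i.e. the third of the packed 16-byte
+ * rows, with 13 leftover bytes since 32 = 1 * 19 + 13. The leftover count is reported back through chunk_rem so the
+ * caller in chunkset_tpl.h knows how many trailing bytes of the chunk are only a partial copy of the pattern. */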
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    zmemcpy_2(&tmp, from);
    *chunk = _mm256_set1_epi16(tmp);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm256_storeu_si256((__m256i *)out, *chunk);
}
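+/* Replicate the first dist bytes of buf across a 32-byte chunk and report the leftover byte count through
+ * chunk_rem. Three cases: dist < 16 broadcasts the low 16 bytes to both lanes and shuffles them, dist == 16 is a
+ * plain broadcast, and dist > 16 shuffles the first 16-byte load and blends it with the second for the upper half. */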
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ /* While technically we only need to read 4 or 8 bytes into this vector register for many of these cases, GCC
+ * compiles this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't
+ * in GPRs to begin with, the 256-bit load is _probably_ just as inexpensive */
+ *chunk_rem = lut_rem.remval;
+
+#ifdef Z_MEMORY_SANITIZER
+ /* See note in chunkset_sse4.c for why this is ok */
+ __msan_unpoison(buf + dist, 32 - dist);
+#endif
+
+ if (dist < 16) {
+ /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+ * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+ * shuffles and combining the halves later */
+ const __m256i permute_xform =
+ _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else if (dist == 16) {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ } else {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
+ /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
+ __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
+ /* Since we can't wrap twice, we can simply keep the latter half exactly as it is instead of having to _also_
+ * shuffle those values */
+ __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+
#define CHUNKSIZE chunksize_avx
#define CHUNKCOPY chunkcopy_avx
#define CHUNKCOPY_SAFE chunkcopy_safe_avx
--- /dev/null
+/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+/* This requires SSE2 support. While it's implicit with SSE4, we can minimize
+ * code size by sharing the chunkcopy functions, which will certainly compile
+ * to identical machine code */
+#if defined(X86_SSE41) && defined(X86_SSE2)
+#include <immintrin.h>
+#include "chunk_permute_table.h"
+
+typedef __m128i chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+#define HAVE_CHUNKCOPY
+#define HAVE_CHUNKUNROLL
+
+static const lut_rem_pair perm_idx_lut[13] = {
+ {0, 1}, /* 3 */
+ {0, 0}, /* don't care */
+ {1 * 32, 1}, /* 5 */
+ {2 * 32, 4}, /* 6 */
+ {3 * 32, 2}, /* 7 */
+ {0 * 32, 0}, /* don't care */
+ {4 * 32, 7}, /* 9 */
+ {5 * 32, 6}, /* 10 */
+ {6 * 32, 5}, /* 11 */
+ {7 * 32, 4}, /* 12 */
+ {8 * 32, 3}, /* 13 */
+ {9 * 32, 2}, /* 14 */
+ {10 * 32, 1}, /* 15 */
+};
+
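+/* Same idea as the AVX2 table but for a 16-byte chunk: e.g. dist 6 maps to {2 * 32, 4} because its permute row
+ * starts at the third 32-byte row and 16 = 2 * 6 + 4 leaves 4 leftover pattern bytes in the register. */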
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ int16_t tmp;
+ zmemcpy_2(&tmp, from);
+ *chunk = _mm_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ int32_t tmp;
+ zmemcpy_4(&tmp, from);
+ *chunk = _mm_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ int64_t tmp;
+ zmemcpy_8(&tmp, from);
+ *chunk = _mm_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
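+/* Load 16 bytes at buf and use the permute table to replicate the first dist bytes across the whole register.
+ * For dist 5 the permute row is {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0}: three full copies of the
+ * pattern plus one leftover byte, so *chunk_rem is set to 1 (the remval stored in perm_idx_lut). */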
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+#ifdef Z_MEMORY_SANITIZER
+ /* Important to note:
+ * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+ * bytes we willingly and purposefully load uninitialized that we swizzle over
+ * in a vector register, anyway. If what we assume is wrong about what is used,
+ * the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+#endif
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = lut_rem.remval;
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
+
+#define CHUNKSIZE chunksize_sse41
+#define CHUNKMEMSET chunkmemset_sse41
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41
+#define CHUNKCOPY(a, b, c) chunkcopy_sse2(a, b, c)
+#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c)
+
+#include "chunkset_tpl.h"
+
+#endif
#include "zbuild.h"
#include <stdlib.h>
+#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
+extern uint8_t* chunkmemset_sse41(uint8_t *out, unsigned dist, unsigned len);
+#endif
+
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
return sizeof(chunk_t);
(chunk_t bytes or fewer) will fall straight through the loop
without iteration, which will hopefully make the branch prediction more
reliable. */
+#ifndef HAVE_CHUNKCOPY
Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
chunk_t chunk;
}
return out;
}
+#endif
/* Perform short copies until distance can be rewritten as being at least
sizeof chunk_t.
This assumption holds because inflate_fast() starts every iteration with at
least 258 bytes of output space available (258 being the maximum length
output from a single token; see inflate_fast()'s assumptions below). */
+#ifndef HAVE_CHUNKUNROLL
Z_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) {
unsigned char const *from = out - *dist;
chunk_t chunk;
}
return out;
}
+#endif
+
+#ifndef HAVE_CHUNK_MAG
+/* Loads a "magazine", i.e. a chunk_t filled with as many repetitions of the pattern as fit */
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ /* This code takes a string of length dist from "buf" and repeats
+ * it as many times as will fit in a chunk_t (vector register) */
+ uint32_t cpy_dist;
+ uint32_t bytes_remaining = sizeof(chunk_t);
+ chunk_t chunk_load;
+ uint8_t *cur_chunk = (uint8_t *)&chunk_load;
+ while (bytes_remaining) {
+ cpy_dist = MIN(dist, bytes_remaining);
+ memcpy(cur_chunk, buf, cpy_dist);
+ bytes_remaining -= cpy_dist;
+ cur_chunk += cpy_dist;
+ /* This allows us to bypass an expensive integer division since we're effectively
+ * counting in this loop, anyway */
+ *chunk_rem = cpy_dist;
+ }
+
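+ /* At this point chunk_rem holds the size of the last (possibly partial) copy; e.g. with a 16-byte chunk_t and
+ * dist 6 the loop copies 6, 6 and then 4 bytes, so chunk_rem ends up as 4, matching the remval the SIMD
+ * variants read straight from their lookup tables. */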
+ return chunk_load;
+}
+#endif
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN. */
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
+ /* Only AVX2 */
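+ /* For lengths this short a 16-byte chunk already covers the whole copy, so deferring to the SSE4.1 build
+ * presumably avoids the wider 32-byte load/permute/store where it can't be amortized. */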
+#if CHUNK_SIZE == 32 && defined(X86_SSE41) && defined(X86_SSE2)
+ if (len <= 16) {
+ return chunkmemset_sse41(out, dist, len);
+ }
+#endif
uint8_t *from = out - dist;
} else
#endif
{
- /* This code takes string of length dist from "from" and repeats
- * it for as many times as can fit in a chunk_t (vector register) */
- uint32_t cpy_dist;
- uint32_t bytes_remaining = sizeof(chunk_t);
- uint8_t *cur_chunk = (uint8_t *)&chunk_load;
- while (bytes_remaining) {
- cpy_dist = MIN(dist, bytes_remaining);
- memcpy(cur_chunk, from, cpy_dist);
- bytes_remaining -= cpy_dist;
- cur_chunk += cpy_dist;
- /* This allows us to bypass an expensive integer division since we're effectively
- * counting in this loop, anyway. However, we may have to derive a similarly
- * sensible solution for if we use a permutation table that allows us to construct
- * this vector in one load and one permute instruction */
- chunk_mod = cpy_dist;
- }
+ chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
}
/* If we're lucky enough and dist happens to be an even modulus of our vector length,