#ifdef X86_AVX2
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
+#include "x86_intrins.h"
typedef __m256i chunk_t;
-
-#define CHUNK_SIZE 32
+typedef __m128i halfchunk_t;
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNK_MAG
+#define HAVE_HALF_CHUNK
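+/* A halfchunk is a 128 bit vector; it lets short copies (len <= sizeof(halfchunk_t)) stay in this
+ * translation unit instead of dispatching to the SSSE3 chunkset (see CHUNKMEMSET below). */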
/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table).
 * Since dist can never be 0..2, we start with an offset, subtracting 3 from the input */
{11 * 32 + 16 * 14, 1} /* 31 */
};
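+/* 16 % dist for dist 3..15, indexed by dist - 3: the number of leftover bytes when a 16 byte
+ * halfchunk is filled with whole copies of a dist byte string. */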
+static const uint16_t half_rem_vals[13] = {
+ 1, 0, 1, 4, 2, 0, 7, 6, 5, 4, 3, 2, 1
+};
+
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    int16_t tmp;
    memcpy(&tmp, from, sizeof(tmp));
    *chunk = _mm256_set1_epi16(tmp);
}
+static inline void halfchunkmemset_2(uint8_t *from, halfchunk_t *chunk) {
+ int16_t tmp;
+ memcpy(&tmp, from, sizeof(tmp));
+ *chunk = _mm_set1_epi16(tmp);
+}
+
+static inline void halfchunkmemset_4(uint8_t *from, halfchunk_t *chunk) {
+ int32_t tmp;
+ memcpy(&tmp, from, sizeof(tmp));
+ *chunk = _mm_set1_epi32(tmp);
+}
+
+static inline void halfchunkmemset_8(uint8_t *from, halfchunk_t *chunk) {
+ int64_t tmp;
+ memcpy(&tmp, from, sizeof(tmp));
+ *chunk = _mm_set1_epi64x(tmp);
+}
+
+static inline void loadhalfchunk(uint8_t const *s, halfchunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
+static inline chunk_t halfchunk2whole(halfchunk_t chunk) {
+ /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
+ * unlikely to be actually written or read from */
+ return _mm256_zextsi128_si256(chunk);
+}
+
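+/* Load the dist byte string at buf and permute it so the 128 bit vector holds as many whole
+ * copies of the string as fit; *chunk_rem receives the number of leftover bytes (16 % dist). */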
+static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
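+    /* The 16 byte load below reads up to 16 - dist bytes past the end of the string; tell MSan
+     * they are initialized. The shuffle only keeps byte indices < dist, so they are never observed. */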
+ __msan_unpoison(buf + dist, 16 - dist);
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = half_rem_vals[dist - 3];
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
#define CHUNKSIZE chunksize_avx2
#define CHUNKCOPY chunkcopy_avx2
#define CHUNKUNROLL chunkunroll_avx2
#include "zbuild.h"
#include <stdlib.h>
-#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
-extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
-#endif
-
/* Returns the chunk size */
Z_INTERNAL uint32_t CHUNKSIZE(void) {
return sizeof(chunk_t);
}
#endif
+#ifdef HAVE_HALF_CHUNK
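+/* Copy len bytes from "from" to "out" using 16 byte stores. The first store handles the
+ * len % 16 remainder (a full halfchunk when len divides evenly), so the remaining length is a
+ * multiple of 16; stores may write up to sizeof(halfchunk_t) - 1 bytes past out + len, which
+ * callers must allow for. */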
+static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
+ halfchunk_t chunk;
+ int32_t align = ((len - 1) % sizeof(halfchunk_t)) + 1;
+ loadhalfchunk(from, &chunk);
+ storehalfchunk(out, &chunk);
+ out += align;
+ from += align;
+ len -= align;
+ while (len > 0) {
+ loadhalfchunk(from, &chunk);
+ storehalfchunk(out, &chunk);
+ out += sizeof(halfchunk_t);
+ from += sizeof(halfchunk_t);
+ len -= sizeof(halfchunk_t);
+ }
+ return out;
+}
+#endif
+
/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST.
Return OUT + LEN. */
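/* e.g. dist = 3, len = 10: the three bytes at OUT - 3 ("abc") are repeated to give "abcabcabca". */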
-Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
+static inline uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
/* Debug performance related issues when len < sizeof(uint64_t):
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
- /* Only AVX2 */
-#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
- if (len <= 16) {
- return chunkmemset_ssse3(out, dist, len);
- }
-#endif
uint8_t *from = out - dist;
+ chunk_t chunk_load;
+ uint32_t chunk_mod = 0;
+ uint32_t adv_amount;
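+    /* chunk_mod is the tail remainder when dist does not evenly divide the vector width;
+     * adv_amount (set below) is the number of bytes of whole string copies per store. */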
    if (dist == 1) {
        memset(out, *from, len);
        return out + len;
    } else if (dist > sizeof(chunk_t)) {
        return CHUNKCOPY(out, out - dist, len);
    }
- chunk_t chunk_load;
- uint32_t chunk_mod = 0;
+    /* Only AVX2 for now, since it has both 128 bit and 256 bit vectors. We allow the shorter
+     * vector length because it lets more cases fall into chunkcopy, as the distance for the
+     * shorter length is still deemed a safe distance. We rewrite this here rather than calling
+     * the ssse3 variant directly, because doing so required dispatching to another function and
+     * broke inlining of this function entirely. It also lets us merge an assert and some
+     * remainder peeling behavior into the same code blocks, making the code a little smaller. */
+#ifdef HAVE_HALF_CHUNK
+ if (len <= sizeof(halfchunk_t)) {
+ if (dist > sizeof(halfchunk_t)) {
+ return HALFCHUNKCOPY(out, out - dist, len);
+ }
+
+ halfchunk_t halfchunk_load;
+
+ if (dist == 2) {
+ halfchunkmemset_2(from, &halfchunk_load);
+ } else if (dist == 4) {
+ halfchunkmemset_4(from, &halfchunk_load);
+ } else if (dist == 8) {
+ halfchunkmemset_8(from, &halfchunk_load);
+ } else if (dist == 16) {
+ loadhalfchunk(from, &halfchunk_load);
+ } else {
+ halfchunk_load = GET_HALFCHUNK_MAG(from, &chunk_mod, dist);
+ }
+
+ adv_amount = sizeof(halfchunk_t) - chunk_mod;
+ while (len >= sizeof(halfchunk_t)) {
+ storehalfchunk(out, &halfchunk_load);
+ len -= adv_amount;
+ out += adv_amount;
+ }
+
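+        /* Widen to a full chunk_t so the shared remainder handling below can copy from chunk_load. */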
+ chunk_load = halfchunk2whole(halfchunk_load);
+ goto rem_bytes;
+ }
+#endif
- /* TODO: possibly build up a permutation table for this if not an even modulus */
#ifdef HAVE_CHUNKMEMSET_2
    if (dist == 2) {
        chunkmemset_2(from, &chunk_load);
    } else
#endif
    /* ... analogous branches for dist == 4, 8, 16 and a full chunk_t width load ... */
    {
        chunk_load = GET_CHUNK_MAG(from, &chunk_mod, dist);
    }
- /* If we're lucky enough and dist happens to be an even modulus of our vector length,
- * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */
- if (chunk_mod == 0) {
- while (len >= (2 * sizeof(chunk_t))) {
- storechunk(out, &chunk_load);
- storechunk(out + sizeof(chunk_t), &chunk_load);
- out += 2 * sizeof(chunk_t);
- len -= 2 * sizeof(chunk_t);
- }
+ adv_amount = sizeof(chunk_t) - chunk_mod;
+
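+    /* Each store writes a full chunk_t but we only advance by adv_amount, the number of bytes of
+     * whole string copies in the vector, so the repeating pattern stays contiguous even when dist
+     * does not divide the vector length (chunk_mod != 0). */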
+ while (len >= (2 * sizeof(chunk_t))) {
+ storechunk(out, &chunk_load);
+ storechunk(out + adv_amount, &chunk_load);
+ out += 2 * adv_amount;
+ len -= 2 * adv_amount;
}
/* If we don't have a "dist" length that divides evenly into a vector
* register, we can write the whole vector register but we need only
* advance by the amount of the whole string that fits in our chunk_t.
     * If we do divide evenly into the vector length, adv_amount = chunk_t size. */
- uint32_t adv_amount = sizeof(chunk_t) - chunk_mod;
while (len >= sizeof(chunk_t)) {
storechunk(out, &chunk_load);
len -= adv_amount;
out += adv_amount;
}
+#ifdef HAVE_HALF_CHUNK
+rem_bytes:
+#endif
if (len) {
memcpy(out, &chunk_load, len);
out += len;