#include "adler32_avx2_p.h"
#include "x86_intrins.h"
-#ifdef X86_SSE42
extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
-#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
-#define sub32(a, b, c) adler32_ssse3(a, b, c)
-#else
-#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
-#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
-#endif
-
static inline uint32_t adler32_fold_copy_impl(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len, const int COPY) {
if (src == NULL) return 1L;
if (len == 0) return adler;
}
} else if (len < 32) {
if (COPY) {
- return copy_sub32(adler, dst, src, len);
+ return adler32_fold_copy_sse42(adler, dst, src, len);
} else {
- return sub32(adler, src, len);
+ return adler32_ssse3(adler, src, len);
}
}
_mm512_mask_storeu_epi8(dst, storemask, copy_vec);
}
-#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
-#elif defined(X86_SSSE3)
- return adler32_ssse3(adler, src, len);
-#else
- return adler32_len_16(adler0, src, len, adler1);
-#endif
}
__m512i vbuf, vs1_0, vs3;
rem_peel:
if (len < 32)
-#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
-#else
- return adler32_len_16(adler0, src, len, adler1);
-#endif
if (len < 64)
-#ifdef X86_AVX2
return adler32_avx2(adler, src, len);
-#elif defined(X86_SSE3)
- return adler32_ssse3(adler, src, len);
-#else
- return adler32_len_16(adler0, src, len, adler1);
-#endif
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
__m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
_mm256_mask_storeu_epi8(dst, storemask, copy_vec);
-#if defined(X86_SSSE3)
return adler32_ssse3(adler, src, len);
-#else
- return adler32_len_16(adler0, src, len, adler1);
-#endif
}
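/* Illustrative sketch, not part of the patch: the masked load/store pair above
 * copies a sub-vector remainder in one step by setting only the low `len` bits
 * of the mask, so no scalar tail loop is needed. copy_tail_avx512vl() is a
 * hypothetical helper name; it assumes AVX512VL+AVX512BW support and len <= 32. */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static inline void copy_tail_avx512vl(uint8_t *dst, const uint8_t *src, size_t len) {
    __mmask32 k = (len >= 32) ? (__mmask32)0xFFFFFFFFu
                              : (__mmask32)((1u << len) - 1u); /* low len bits set */
    __m256i v = _mm256_maskz_loadu_epi8(k, src);  /* bytes past len are loaded as zero */
    _mm256_mask_storeu_epi8(dst, k, v);           /* only the low len bytes are written */
}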
const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
/* This requires SSE2 support. Since SSE2 is implied by SSSE3, we can minimize
 * code size by sharing the chunkcopy functions, which will compile to
 * identical machine code either way. */
-#if defined(X86_SSSE3) && defined(X86_SSE2)
+#if defined(X86_SSSE3)
#include <immintrin.h>
#include "../generic/chunk_permute_table.h"
#include "zbuild.h"
#include <stdlib.h>
-#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
extern uint8_t* chunkmemset_ssse3(uint8_t *out, unsigned dist, unsigned len);
#endif
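/* Illustrative sketch, not part of the patch: any CPU (and any compiler flag
 * combination) that provides SSSE3 also provides SSE2, so the extra
 * defined(X86_SSE2) test dropped in the guards above could never change the
 * outcome. Stated with the compiler's own feature macros, the redundancy is: */
#if defined(__SSSE3__) && !defined(__SSE2__)
#  error "SSSE3 without SSE2 should be impossible on x86 toolchains"
#endif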
Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */
Assert(dist > 0, "chunkmemset cannot have a distance 0");
/* Only AVX2 */
-#if CHUNK_SIZE == 32 && defined(X86_SSSE3) && defined(X86_SSE2)
+#if CHUNK_SIZE == 32 && defined(X86_SSSE3)
if (len <= 16) {
return chunkmemset_ssse3(out, dist, len);
}
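/* Scalar reference sketch (hypothetical helper, not the patched code), assuming
 * the usual zlib-style back-reference semantics: copy len bytes from dist bytes
 * behind the output pointer, byte by byte so overlapping regions repeat the
 * pattern, and return the advanced output pointer. The vectorized chunkmemset
 * variants above reproduce this contract in wider chunks. */
#include <stdint.h>

static uint8_t *chunkmemset_scalar(uint8_t *out, unsigned dist, unsigned len) {
    uint8_t *from = out - dist;  /* source trails the output by dist bytes */
    while (len--)
        *out++ = *from++;        /* overlap repeats the pattern, as LZ77 requires */
    return out;
}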
#ifdef X86_SSSE3
if (cf.x86.has_ssse3) {
ft.adler32 = &adler32_ssse3;
-# ifdef X86_SSE2
ft.chunkmemset_safe = &chunkmemset_safe_ssse3;
ft.inflate_fast = &inflate_fast_ssse3;
-# endif
}
#endif
// X86 - SSE4.2