From: Yann Collet
Date: Tue, 21 Dec 2021 23:24:45 +0000 (-0800)
Subject: attempt a sse2/avx2 branch of the lazy match detector
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f6d673a5149e8e80543feced6fd57273d59422e8;p=thirdparty%2Fzstd.git

attempt a sse2/avx2 branch of the lazy match detector
---

diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index 516930c01..1c3bee5bc 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -190,6 +190,9 @@

 /* compile time determination of SIMD support */
 #if !defined(ZSTD_NO_INTRINSICS)
+#  if defined(__AVX2__)
+#    define ZSTD_ARCH_X86_AVX2
+#  endif
 #  if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
 #    define ZSTD_ARCH_X86_SSE2
 #  endif
@@ -197,9 +200,13 @@
 #    define ZSTD_ARCH_ARM_NEON
 #  endif
 #
+#  if defined(ZSTD_ARCH_X86_AVX2)
+#    include <immintrin.h>
+#  endif
 #  if defined(ZSTD_ARCH_X86_SSE2)
 #    include <emmintrin.h>
-#  elif defined(ZSTD_ARCH_ARM_NEON)
+#  endif
+#  if defined(ZSTD_ARCH_ARM_NEON)
 #    include <arm_neon.h>
 #  endif
 #endif
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index c40473cad..d4df5d1b2 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -1002,6 +1002,27 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
 }
 #endif

+#if defined(ZSTD_ARCH_X86_AVX2)
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getAVXMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
+{
+    if (nbChunks==1) return ZSTD_row_getSSEMask(1, src, tag, head);
+    {   const __m256i comparisonMask = _mm256_set1_epi8((char)tag);
+        int matches[2] = {0};
+        int i;
+        assert(nbChunks == 2 || nbChunks == 4);
+        for (i=0; i<(nbChunks/2); i++) {
+            const __m256i chunk = _mm256_loadu_si256((const __m256i*)(const void*)(src + 32*i));
+            const __m256i equalMask = _mm256_cmpeq_epi8(chunk, comparisonMask);
+            matches[i] = _mm256_movemask_epi8(equalMask);
+        }
+        if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[0], head);
+        assert(nbChunks == 4);
+        return ZSTD_rotateRight_U64((U64)matches[1] << 32 | (U64)matches[0], head);
+    }
+}
+#endif
+
 /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
  * the hash at the nth position in a row of the tagTable.
  * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
@@ -1013,7 +1034,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
     assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
     assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);

-#if defined(ZSTD_ARCH_X86_SSE2)
+#if defined(ZSTD_ARCH_X86_AVX2)
+
+    return ZSTD_row_getAVXMask(rowEntries / 16, src, tag, head);
+
+#elif defined(ZSTD_ARCH_X86_SSE2)

     return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
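
Note (not part of the patch): the added ZSTD_row_getAVXMask uses the compare-and-movemask idiom to test a whole row of one-byte tags at once, then rotates the resulting bitmask by "head" because each row is a circular buffer. The standalone sketch below illustrates that idea for a single 32-entry row; it assumes an AVX2-capable compiler (e.g. built with -mavx2), and the names rotate_right_u32 and row_match_mask_avx2 are hypothetical helpers, not zstd symbols.

    /* Standalone sketch of the AVX2 tag-matching idea used in the patch. */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Rotate a 32-bit mask right so that bit 0 corresponds to the row's
     * "head" slot of the circular buffer (same role as ZSTD_rotateRight_U32). */
    static uint32_t rotate_right_u32(uint32_t value, uint32_t count)
    {
        count &= 31;
        return (value >> count) | (value << ((32 - count) & 31));
    }

    /* Compare all 32 one-byte tags in `row` against `tag` in one shot and
     * return a bitmask where bit n is set when row[(n + head) % 32] == tag. */
    static uint32_t row_match_mask_avx2(const uint8_t* row, uint8_t tag, uint32_t head)
    {
        const __m256i chunk     = _mm256_loadu_si256((const __m256i*)(const void*)row);
        const __m256i cmpTarget = _mm256_set1_epi8((char)tag);
        const __m256i equalMask = _mm256_cmpeq_epi8(chunk, cmpTarget);
        const uint32_t matches  = (uint32_t)_mm256_movemask_epi8(equalMask);
        return rotate_right_u32(matches, head);
    }

    int main(void)
    {
        uint8_t row[32];
        memset(row, 0xAA, sizeof(row));
        row[3]  = 0x5C;   /* plant two matching tags */
        row[20] = 0x5C;
        /* With head == 3, slot 3 maps to bit 0 of the rotated mask. */
        printf("mask = 0x%08x\n", row_match_mask_avx2(row, 0x5C, 3));
        return 0;
    }

The patch applies the same pattern per 32-byte chunk: for 64-entry rows it runs two such compares and concatenates the two 32-bit movemasks into a 64-bit mask before rotating.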