/* compile time determination of SIMD support */
#if !defined(ZSTD_NO_INTRINSICS)
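+/* Unlike __SSE2__, the __AVX2__ macro is also defined by MSVC (with /arch:AVX2 or higher),
+ * so a single compiler-macro check covers gcc, clang and MSVC. */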
+# if defined(__AVX2__)
+# define ZSTD_ARCH_X86_AVX2
+# endif
# if defined(__SSE2__) || defined(_M_AMD64) || (defined (_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2))
# define ZSTD_ARCH_X86_SSE2
# endif
# if defined(__ARM_NEON) || defined(_M_ARM64)
#  define ZSTD_ARCH_ARM_NEON
# endif
#
+# if defined(ZSTD_ARCH_X86_AVX2)
+# include <immintrin.h>
+# endif
# if defined(ZSTD_ARCH_X86_SSE2)
# include <emmintrin.h>
-# elif defined(ZSTD_ARCH_ARM_NEON)
+# endif
+# if defined(ZSTD_ARCH_ARM_NEON)
# include <arm_neon.h>
# endif
#endif
}
#endif
+#if defined(ZSTD_ARCH_X86_AVX2)
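+/* Returns a ZSTD_VecMask with one bit set per row entry whose stored tag byte equals `tag`,
+ * rotated right by `head` like the SSE2 and NEON variants.
+ * nbChunks is rowEntries/16: 16-entry rows reuse the SSE2 path, 32-entry rows need a single
+ * 32-byte compare, and 64-entry rows combine two 32-bit masks into a U64. */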
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getAVXMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
+{
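+    /* a 16-entry row fits in a single 128-bit vector: reuse the SSE2 path (AVX2 implies SSE2) */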
+ if (nbChunks==1) return ZSTD_row_getSSEMask(1, src, tag, head);
+ { const __m256i comparisonMask = _mm256_set1_epi8((char)tag);
+ int matches[2] = {0};
+ int i;
+ assert(nbChunks == 2 || nbChunks == 4);
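+        /* compare 32 tag bytes per chunk; movemask packs one match bit per byte */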
+ for (i=0; i<(nbChunks/2); i++) {
+ const __m256i chunk = _mm256_loadu_si256((const __m256i*)(const void*)(src + 32*i));
+ const __m256i equalMask = _mm256_cmpeq_epi8(chunk, comparisonMask);
+ matches[i] = _mm256_movemask_epi8(equalMask);
+ }
+ if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[0], head);
+ assert(nbChunks == 4);
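+        /* cast the signed movemask results through U32 so they zero-extend when widened to U64 */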
+        return ZSTD_rotateRight_U64((U64)(U32)matches[1] << 32 | (U64)(U32)matches[0], head);
+ }
+}
+#endif
+
/* Returns a ZSTD_VecMask (U64) that has the nth bit set to 1 if the newly-computed "tag" matches
* the hash at the nth position in a row of the tagTable.
 * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
 * to match up with the actual layout of the entries within the hashTable */
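/* For example, with head == 5 the match bit for physical position 5 lands at bit 0 of the
 * returned mask: bit n of the result corresponds to physical position (n + head) % rowEntries. */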
assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
-#if defined(ZSTD_ARCH_X86_SSE2)
+#if defined(ZSTD_ARCH_X86_AVX2)
+
+ return ZSTD_row_getAVXMask(rowEntries / 16, src, tag, head);
+
+#elif defined(ZSTD_ARCH_X86_SSE2)
return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);