* perf improvements for zstd decode
tldr: 7.5% average decode speedup on the silesia corpus at compression levels 1-3 (Sandy Bridge)
Background: while investigating zstd perf differences between clang and gcc, I noticed that even though gcc vectorizes the loop in wildcopy, it does not do so as well as hand-written code. The sites where wildcopy is invoked have an interesting distribution of lengths to be copied: the loop trip count is rarely above 1, yet long copies are common enough to make their performance important. The code in zstd_decompress.c that invokes wildcopy handles the latter well, but the gcc autovectorizer introduces a needlessly expensive startup check for vectorization.
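To make the trade-off concrete, here is a standalone, hypothetical sketch of the copy strategy this diff adopts (names and the 16-byte threshold mirror the patch below, but this is an illustration, not the zstd code): short or closely-overlapping copies proceed 8 bytes at a time, everything else 16 bytes at a time, with over-copy bounded at 8 bytes.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Illustrative wildcopy: may write up to 8 bytes past dst+length,
 * so callers must reserve slack after the destination. */
static void copy8 (void* d, const void* s) { memcpy(d, s,  8); }
static void copy16(void* d, const void* s) { memcpy(d, s, 16); }

static void wildcopy_sketch(void* dst, const void* src, ptrdiff_t length)
{
    const uint8_t* ip = (const uint8_t*)src;
    uint8_t* op = (uint8_t*)dst;
    uint8_t* const oend = op + length;
    ptrdiff_t const diff = op - ip;

    if (length < 16 || (diff > 0 && diff < 16)) {
        /* short or closely-overlapping copy: 8 bytes per iteration */
        do { copy8(op, ip); op += 8; ip += 8; } while (op < oend);
    } else {
        /* when bit 3 of length is clear, one extra 8-byte copy keeps
         * the over-copy of the 16-byte loop bounded by 8 bytes */
        if ((length & 8) == 0) { copy8(op, ip); op += 8; ip += 8; }
        do { copy16(op, ip); op += 16; ip += 16; } while (op < oend);
    }
}
```

Because the trip count is rarely above 1, the 8-byte branch usually runs once; the 16-byte branch keeps long copies fast without the vectorizer's startup checks.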
See how GCC autovectorizes the loop here:
https://godbolt.org/z/apr0x0
Here is the code after this diff has been applied (left-hand side is the optimized version, right is with the vectorizer on):
After: https://godbolt.org/z/OwO4F8
Note that autovectorization still does not do a good job on the optimized version, so it is turned off via attribute and flag. I found that neither the attribute nor the command-line flag alone was entirely successful in turning off vectorization, which is why both are used.
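The attribute half is the DONT_VECTORIZE macro added in this diff; a minimal standalone demo (the summing function is illustrative, not zstd code):

```c
#include <stddef.h>

/* Same guard as in the patch: clang defines __GNUC__ too, but rejects
 * GCC's optimize attribute, so clang must be excluded explicitly.
 * The Makefile additionally passes -fno-tree-vectorize for the one
 * object file, since neither mechanism alone proved sufficient. */
#if !defined(__clang__) && defined(__GNUC__)
#  define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
#else
#  define DONT_VECTORIZE
#endif

/* Illustrative loop that GCC would normally auto-vectorize at -O3;
 * with the attribute it stays scalar. */
DONT_VECTORIZE
static long sum_bytes(const unsigned char* p, size_t n)
{
    long s = 0;
    size_t i;
    for (i = 0; i < n; i++) s += p[i];
    return s;
}
```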
silesia benchmark data - second triad of each file is with the original code:

level#file      orig ->  compressed (ratio)   encode       decode       change
1#dickens   10192446 ->  4268865 (2.388),  198.9 MB/s,  709.6 MB/s
2#dickens   10192446 ->  3876126 (2.630),  128.7 MB/s,  552.5 MB/s
3#dickens   10192446 ->  3682956 (2.767),  104.6 MB/s,  537   MB/s
1#dickens   10192446 ->  4268865 (2.388),  195.4 MB/s,  659.5 MB/s,  7.60%
2#dickens   10192446 ->  3876126 (2.630),  127   MB/s,  516.3 MB/s,  7.01%
3#dickens   10192446 ->  3682956 (2.767),  105   MB/s,  479.5 MB/s, 11.99%
1#mozilla   51220480 -> 20117517 (2.546),  285.4 MB/s,  734.9 MB/s
2#mozilla   51220480 -> 19067018 (2.686),  220.8 MB/s,  686.3 MB/s
3#mozilla   51220480 -> 18508283 (2.767),  152.2 MB/s,  669.4 MB/s
1#mozilla   51220480 -> 20117517 (2.546),  283.4 MB/s,  697.9 MB/s,  5.30%
2#mozilla   51220480 -> 19067018 (2.686),  225.9 MB/s,  665   MB/s,  3.20%
3#mozilla   51220480 -> 18508283 (2.767),  154.5 MB/s,  640.6 MB/s,  4.50%
1#mr         9970564 ->  3840242 (2.596),  262.4 MB/s,  899.8 MB/s
2#mr         9970564 ->  3600976 (2.769),  181.2 MB/s,  717.9 MB/s
3#mr         9970564 ->  3563987 (2.798),  116.3 MB/s,  620   MB/s
1#mr         9970564 ->  3840242 (2.596),  253.2 MB/s,  827.3 MB/s,  8.76%
2#mr         9970564 ->  3600976 (2.769),  177.4 MB/s,  655.4 MB/s,  9.54%
3#mr         9970564 ->  3563987 (2.798),  111.2 MB/s,  564.2 MB/s,  9.89%
1#nci       33553445 ->  2849306 (11.78),  575.2 MB/s, 1335.8 MB/s
2#nci       33553445 ->  2890166 (11.61),  509.3 MB/s, 1238.1 MB/s
3#nci       33553445 ->  2857408 (11.74),  431   MB/s, 1210.7 MB/s
1#nci       33553445 ->  2849306 (11.78),  565.4 MB/s, 1220.2 MB/s,  9.47%
2#nci       33553445 ->  2890166 (11.61),  508.2 MB/s, 1128.4 MB/s,  9.72%
3#nci       33553445 ->  2857408 (11.74),  429.1 MB/s, 1097.7 MB/s, 10.29%
1#ooffice    6152192 ->  3590954 (1.713),  231.4 MB/s,  662.6 MB/s
2#ooffice    6152192 ->  3323931 (1.851),  162.8 MB/s,  592.6 MB/s
3#ooffice    6152192 ->  3145625 (1.956),   99.9 MB/s,  549.6 MB/s
1#ooffice    6152192 ->  3590954 (1.713),  224.7 MB/s,  624.2 MB/s,  6.15%
2#ooffice    6152192 ->  3323931 (1.851),  155   MB/s,  564.5 MB/s,  4.98%
3#ooffice    6152192 ->  3145625 (1.956),  101.1 MB/s,  521.2 MB/s,  5.45%
1#osdb      10085684 ->  3739042 (2.697),  271.9 MB/s,  876.4 MB/s
2#osdb      10085684 ->  3493875 (2.887),  208.2 MB/s,  857   MB/s
3#osdb      10085684 ->  3515831 (2.869),  135.3 MB/s,  805.4 MB/s
1#osdb      10085684 ->  3739042 (2.697),  257.4 MB/s,  793.8 MB/s, 10.41%
2#osdb      10085684 ->  3493875 (2.887),  209.7 MB/s,  776.1 MB/s, 10.42%
3#osdb      10085684 ->  3515831 (2.869),  130.6 MB/s,  727.7 MB/s, 10.68%
1#reymont    6627202 ->  2152771 (3.078),  198.9 MB/s,  696.2 MB/s
2#reymont    6627202 ->  2071140 (3.200),  170   MB/s,  595.2 MB/s
3#reymont    6627202 ->  1953597 (3.392),  128.5 MB/s,  609.7 MB/s
1#reymont    6627202 ->  2152771 (3.078),  199.6 MB/s,  655.2 MB/s,  6.26%
2#reymont    6627202 ->  2071140 (3.200),  168.2 MB/s,  554.4 MB/s,  7.36%
3#reymont    6627202 ->  1953597 (3.392),  128.7 MB/s,  557.4 MB/s,  9.38%
1#samba     21606400 ->  5510994 (3.921),  338.1 MB/s, 1066   MB/s
2#samba     21606400 ->  5240208 (4.123),  258.7 MB/s,  992.3 MB/s
3#samba     21606400 ->  5003358 (4.318),  200.2 MB/s,  991.1 MB/s
1#samba     21606400 ->  5510994 (3.921),  330.8 MB/s,  974   MB/s,  9.45%
2#samba     21606400 ->  5240208 (4.123),  257.9 MB/s,  919.4 MB/s,  7.93%
3#samba     21606400 ->  5003358 (4.318),  198.5 MB/s,  908.9 MB/s,  9.04%
1#sao        7251944 ->  6256401 (1.159),  194.6 MB/s,  602.2 MB/s
2#sao        7251944 ->  5808761 (1.248),  128.2 MB/s,  532.1 MB/s
3#sao        7251944 ->  5556318 (1.305),   73   MB/s,  509.4 MB/s
1#sao        7251944 ->  6256401 (1.159),  198.7 MB/s,  580.7 MB/s,  3.70%
2#sao        7251944 ->  5808761 (1.248),  129.1 MB/s,  502.7 MB/s,  5.85%
3#sao        7251944 ->  5556318 (1.305),   74.6 MB/s,  493.1 MB/s,  3.31%
1#webster   41458703 -> 13692222 (3.028),  222.3 MB/s,  752   MB/s
2#webster   41458703 -> 12842646 (3.228),  157.6 MB/s,  532.2 MB/s
3#webster   41458703 -> 12191964 (3.400),  124   MB/s,  468.5 MB/s
1#webster   41458703 -> 13692222 (3.028),  219.7 MB/s,  697   MB/s,  7.89%
2#webster   41458703 -> 12842646 (3.228),  153.9 MB/s,  495.4 MB/s,  7.43%
3#webster   41458703 -> 12191964 (3.400),  124.8 MB/s,  444.8 MB/s,  5.33%
1#xml        5345280 ->   696652 (7.673),  485   MB/s, 1333.9 MB/s
2#xml        5345280 ->   681492 (7.843),  405.2 MB/s, 1237.5 MB/s
3#xml        5345280 ->   639057 (8.364),  328.5 MB/s, 1281.3 MB/s
1#xml        5345280 ->   696652 (7.673),  473.1 MB/s, 1232.4 MB/s,  8.24%
2#xml        5345280 ->   681492 (7.843),  398.6 MB/s, 1145.9 MB/s,  7.99%
3#xml        5345280 ->   639057 (8.364),  327.1 MB/s, 1175   MB/s,  9.05%
1#x-ray      8474240 ->  6772557 (1.251),  521.3 MB/s,  762.6 MB/s
2#x-ray      8474240 ->  6684531 (1.268),  230.5 MB/s,  688.5 MB/s
3#x-ray      8474240 ->  6166679 (1.374),   68.7 MB/s,  478.8 MB/s
1#x-ray      8474240 ->  6772557 (1.251),  502.8 MB/s,  736.7 MB/s,  3.52%
2#x-ray      8474240 ->  6684531 (1.268),  224.4 MB/s,  662   MB/s,  4.00%
3#x-ray      8474240 ->  6166679 (1.374),   67.3 MB/s,  437.8 MB/s,  9.37%

average decode speedup: 7.51%
* makefile changed to only pass -fno-tree-vectorize to gcc
* Don't add "no-tree-vectorize" attribute on clang (which defines __GNUC__)
* fix for warning/error with subtraction of void* pointers
* fix c90 conformance issue - ISO C90 forbids mixed declarations and code
* Fix assert for negative diff, only when there is no overlap
* fix overflow revealed in fuzzing tests
* tweak for small speed increase
LIBVER_PATCH := $(shell echo $(LIBVER_PATCH_SCRIPT))
LIBVER := $(shell echo $(LIBVER_SCRIPT))
VERSION?= $(LIBVER)
+CCVER := $(shell $(CC) --version)
CPPFLAGS+= -I. -I./common -DXXH_NAMESPACE=ZSTD_
ifeq ($(OS),Windows_NT) # MinGW assumed
ZDEPR_FILES := $(sort $(wildcard deprecated/*.c))
ZSTD_FILES := $(ZSTDCOMMON_FILES)
+ifeq ($(findstring GCC,$(CCVER)),GCC)
+decompress/zstd_decompress_block.o : CFLAGS+=-fno-tree-vectorize
+endif
+
ZSTD_LEGACY_SUPPORT ?= 5
ZSTD_LIB_COMPRESSION ?= 1
ZSTD_LIB_DECOMPRESSION ?= 1
} \
}
+/* vectorization */
+#if !defined(__clang__) && defined(__GNUC__)
+# define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
+#else
+# define DONT_VECTORIZE
+#endif
+
/* disable warnings */
#ifdef _MSC_VER /* Visual Studio */
# include <intrin.h> /* For Visual 2005 */
#endif
#include "xxhash.h" /* XXH_reset, update, digest */
-
#if defined (__cplusplus)
extern "C" {
#endif
* Shared functions to include for inlining
*********************************************/
static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
+
#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
+#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
+
+#define WILDCOPY_OVERLENGTH 8
+#define VECLEN 16
+
+typedef enum {
+ ZSTD_no_overlap,
+ ZSTD_overlap_src_before_dst,
+ /* ZSTD_overlap_dst_before_src, */
+} ZSTD_overlap_e;
/*! ZSTD_wildcopy() :
* custom version of memcpy(), can overwrite up to WILDCOPY_OVERLENGTH bytes (if length==0) */
-#define WILDCOPY_OVERLENGTH 8
-MEM_STATIC void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length)
+MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE
+void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype)
{
+ ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
const BYTE* ip = (const BYTE*)src;
BYTE* op = (BYTE*)dst;
BYTE* const oend = op + length;
- do
- COPY8(op, ip)
- while (op < oend);
+
+ assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8));
+ if (length < VECLEN || (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN)) {
+ do
+ COPY8(op, ip)
+ while (op < oend);
+ }
+ else {
+ if ((length & 8) == 0)
+ COPY8(op, ip);
+ do {
+ COPY16(op, ip);
+ }
+ while (op < oend);
+ }
+}
+
+/*! ZSTD_wildcopy_16min() :
+ * same semantics as ZSTD_wildcopy() except guaranteed to be able to copy 16 bytes at the start */
+MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE
+void ZSTD_wildcopy_16min(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype)
+{
+ ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* op = (BYTE*)dst;
+ BYTE* const oend = op + length;
+
+ assert(length >= 8);
+ assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8));
+
+ if (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN) {
+ do
+ COPY8(op, ip)
+ while (op < oend);
+ }
+ else {
+ if ((length & 8) == 0)
+ COPY8(op, ip);
+ do {
+ COPY16(op, ip);
+ }
+ while (op < oend);
+ }
}
MEM_STATIC void ZSTD_wildcopy_e(void* dst, const void* src, void* dstEnd) /* should be faster for decoding, but strangely, not verified on all platforms */
/* copy Literals */
assert(seqStorePtr->maxNbLit <= 128 KB);
assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
- ZSTD_wildcopy(seqStorePtr->lit, literals, litLength);
+ ZSTD_wildcopy(seqStorePtr->lit, literals, litLength, ZSTD_no_overlap);
seqStorePtr->lit += litLength;
/* literal Length */
if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
/* copy Literals */
- ZSTD_copy8(op, *litPtr);
if (sequence.litLength > 8)
- ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+ ZSTD_wildcopy_16min(op, (*litPtr), sequence.litLength, ZSTD_no_overlap); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+ else
+ ZSTD_copy8(op, *litPtr);
op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */
if (oMatchEnd > oend-(16-MINMATCH)) {
if (op < oend_w) {
- ZSTD_wildcopy(op, match, oend_w - op);
+ ZSTD_wildcopy(op, match, oend_w - op, ZSTD_overlap_src_before_dst);
match += oend_w - op;
op = oend_w;
}
while (op < oMatchEnd) *op++ = *match++;
} else {
- ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8); /* works even if matchLength < 8 */
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); /* works even if matchLength < 8 */
}
return sequenceLength;
}
if (oLitEnd > oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, prefixStart, dictStart, dictEnd);
/* copy Literals */
- ZSTD_copy8(op, *litPtr); /* note : op <= oLitEnd <= oend_w == oend - 8 */
if (sequence.litLength > 8)
- ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+ ZSTD_wildcopy_16min(op, *litPtr, sequence.litLength, ZSTD_no_overlap); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */
+ else
+ ZSTD_copy8(op, *litPtr); /* note : op <= oLitEnd <= oend_w == oend - 8 */
+
op = oLitEnd;
*litPtr = iLitEnd; /* update for next sequence */
if (oMatchEnd > oend-(16-MINMATCH)) {
if (op < oend_w) {
- ZSTD_wildcopy(op, match, oend_w - op);
+ ZSTD_wildcopy(op, match, oend_w - op, ZSTD_overlap_src_before_dst);
match += oend_w - op;
op = oend_w;
}
while (op < oMatchEnd) *op++ = *match++;
} else {
- ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8); /* works even if matchLength < 8 */
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); /* works even if matchLength < 8 */
}
return sequenceLength;
}
}
FORCE_INLINE_TEMPLATE size_t
+DONT_VECTORIZE
ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
void* dst, size_t maxDstSize,
const void* seqStart, size_t seqSize, int nbSeq,
#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
static TARGET_ATTRIBUTE("bmi2") size_t
+DONT_VECTORIZE
ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
void* dst, size_t maxDstSize,
const void* seqStart, size_t seqSize, int nbSeq,