From: Yann Collet <cyan@fb.com>
Date: Thu, 8 Nov 2018 20:47:46 +0000 (-0800)
Subject: improve long-range decoder speed
X-Git-Tag: v1.3.8~52^2~2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9126da5b5c74dac824b9cd9467923c36d8792fb6;p=thirdparty%2Fzstd.git

improve long-range decoder speed

on enwik9 at level 22 (which is almost a worst case scenario),
speed improves by +7% on my laptop (415 -> 445 MB/s)
---

diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index 07f875e4d..cc830b2b4 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -89,23 +89,21 @@
 #endif
 
 /* prefetch
- * can be disabled, by declaring NO_PREFETCH macro
- * All prefetch invocations use a single default locality 2,
- * generating instruction prefetcht1,
- * which, according to Intel, means "load data into L2 cache".
- * This is a good enough "middle ground" for the time being,
- * though in theory, it would be better to specialize locality depending on data being prefetched.
- * Tests could not determine any sensible difference based on locality value. */
+ * can be disabled, by declaring NO_PREFETCH build macro */
 #if defined(NO_PREFETCH)
+#  define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
 #  define PREFETCH(ptr)     (void)(ptr)  /* disabled */
 #else
 #  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
 #    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
-#    define PREFETCH(ptr)   _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#    define PREFETCH(ptr)     _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
 #  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
-#    define PREFETCH(ptr)   __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#    define PREFETCH(ptr)     __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
 #  else
-#    define PREFETCH(ptr)   (void)(ptr)  /* disabled */
+#    define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
+#    define PREFETCH(ptr)    (void)(ptr)  /* disabled */
 #  endif
 #endif  /* NO_PREFETCH */
 
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index 68298772e..ef424877b 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -1054,7 +1054,7 @@ ZSTD_decompressSequencesLong_body(
             seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
             size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
             if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
-            PREFETCH(sequence.match);  /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+            PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
             sequences[seqNb & STORED_SEQS_MASK] = sequence;
             op += oneSeqSize;
         }
diff --git a/programs/bench.c b/programs/bench.c
index caa803572..2bbaa9d0a 100644
--- a/programs/bench.c
+++ b/programs/bench.c
@@ -601,7 +601,11 @@ BMK_benchMemAdvancedNoAlloc(
                 cPtr += cCapacities[nbBlocks];
                 resPtr += thisBlockSize;
                 remaining -= thisBlockSize;
-                if (BMK_decodeOnly) { assert(nbBlocks==0); cSizes[nbBlocks] = thisBlockSize; }
+                if (BMK_decodeOnly) {
+                    assert(nbBlocks==0);
+                    cSizes[nbBlocks] = thisBlockSize;
+                    benchResult.cSize = thisBlockSize;
+                }
             }
         }
     }