From 483759a3de5545188eac00797fe46b5d99250aa6 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Thu, 8 Nov 2018 17:00:23 -0800
Subject: [PATCH] Improves decompression speed when using cold dictionary

by triggering the prefetching decoder path
(which used to be dedicated to long-range offsets only).

Figures on my laptop :
no content prefetch : ~300 MB/s (for reference)
full content prefetch : ~325 MB/s (before this patch)
new prefetch path : ~375 MB/s (after this patch)

The benchmark speed is already significant,
but another side-effect is that this version
prefetch less data into memory,
since it only prefetches what's needed, instead of the full dictionary.

This is supposed to help highly active environments
such as active databases,
that can't be properly measured in benchmark environment (too clean).

Also :
fixed the largeNbDict test program
which was working improperly when setting nbBlocks > nbFiles.
---
 contrib/largeNbDicts/Makefile          |  3 ++-
 contrib/largeNbDicts/largeNbDicts.c    | 10 +++++----
 lib/decompress/zstd_ddict.c            |  1 -
 lib/decompress/zstd_decompress.c       |  1 -
 lib/decompress/zstd_decompress_block.c | 31 +++++++++++---------------
 5 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/contrib/largeNbDicts/Makefile b/contrib/largeNbDicts/Makefile
index 3b19d49af..730250f96 100644
--- a/contrib/largeNbDicts/Makefile
+++ b/contrib/largeNbDicts/Makefile
@@ -33,7 +33,7 @@ largeNbDicts: util.o bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)
 
 .PHONY: $(LIBZSTD)
 $(LIBZSTD):
-	$(MAKE) -C $(LIBDIR) libzstd.a
+	$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"
 
 bench.o  : $(PROGDIR)/bench.c
 	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
@@ -50,4 +50,5 @@ xxhash.o : $(LIBDIR)/common/xxhash.c
 
 clean:
 	$(RM) *.o
+	$(MAKE) -C $(LIBDIR) clean > /dev/null
 	$(RM) largeNbDicts
diff --git a/contrib/largeNbDicts/largeNbDicts.c b/contrib/largeNbDicts/largeNbDicts.c
index e0bc55380..d7639fc40 100644
--- a/contrib/largeNbDicts/largeNbDicts.c
+++ b/contrib/largeNbDicts/largeNbDicts.c
@@ -49,6 +49,7 @@
 
 
 /*---  Macros  ---*/
+
 #define CONTROL(c)   { if (!(c)) abort(); }
 #undef MIN
 #define MIN(a,b)     ((a) < (b) ? (a) : (b))
@@ -594,6 +595,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     if (blockSize)
         DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
     DISPLAYLEVEL(3, "\n");
+    size_t const totalSrcSlicesSize = sliceCollection_totalCapacity(srcSlices);
 
 
     size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
@@ -625,8 +627,8 @@ int bench(const char** fileNameTable, unsigned nbFiles,
 
     /* dictionary determination */
     buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
-                                srcBuffer.ptr,
-                                srcSlices.capacities, nbBlocks,
+                                srcs.buffer.ptr,
+                                srcs.slices.capacities, srcs.slices.nbSlices,
                                 DICTSIZE);
     CONTROL(dictBuffer.ptr != NULL);
 
@@ -637,7 +639,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     CONTROL(cTotalSizeNoDict != 0);
     DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f  (%u bytes) \n",
                     clevel,
-                    (double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
+                    (double)totalSrcSlicesSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
 
     size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
     CONTROL(cSizes != NULL);
@@ -646,7 +648,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     CONTROL(cTotalSize != 0);
     DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f  (%u bytes) \n",
                     (unsigned)dictBuffer.size,
-                    (double)srcSize / cTotalSize, (unsigned)cTotalSize);
+                    (double)totalSrcSlicesSize / cTotalSize, (unsigned)cTotalSize);
 
     /* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
     shrinkSizes(dstSlices, cSizes);
diff --git a/lib/decompress/zstd_ddict.c b/lib/decompress/zstd_ddict.c
index 6569eeed1..2ad044068 100644
--- a/lib/decompress/zstd_ddict.c
+++ b/lib/decompress/zstd_ddict.c
@@ -15,7 +15,6 @@
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "compiler.h"    /* prefetch */
 #include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c
index 986ebff9e..549cc7582 100644
--- a/lib/decompress/zstd_decompress.c
+++ b/lib/decompress/zstd_decompress.c
@@ -56,7 +56,6 @@
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "compiler.h"    /* prefetch */
 #include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index 71bb8e97b..869bdd9aa 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -507,16 +507,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
         }
     }
 
-    /* prefetch dictionary content */
-    if (dctx->ddictIsCold) {
-        size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
-        size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
-        size_t const pSize = MIN(psmin, 128 KB /* protection */ );
-        const void* const pStart = (const char*)dctx->dictEnd - pSize;
-        PREFETCH_AREA(pStart, pSize);
-        dctx->ddictIsCold = 0;
-    }
-
     return ip-istart;
 }
 
@@ -1046,6 +1036,7 @@ ZSTD_decompressSequencesLong_body(
         /* prepare in advance */
         for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
             sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+            PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
         }
         if (seqNb<seqAdvance) return ERROR(corruption_detected);
 
@@ -1070,9 +1061,6 @@ ZSTD_decompressSequencesLong_body(
 
         /* save reps for next block */
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
-#undef STORED_SEQS
-#undef STORED_SEQS_MASK
-#undef ADVANCED_SEQS
     }
 
     /* last literal segment */
@@ -1213,20 +1201,27 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
     }
 
     /* Build Decoding Tables */
-    {   int nbSeq;
+    {   int usePrefetchDecoder = dctx->ddictIsCold;
+        int nbSeq;
         size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
         if (ZSTD_isError(seqHSize)) return seqHSize;
         ip += seqHSize;
         srcSize -= seqHSize;
 
-        if ( (!frame || (dctx->fParams.windowSize > (1<<24)))
-          && (nbSeq>0) ) {  /* could probably use a larger nbSeq limit */
+        if ( !usePrefetchDecoder
+          && (!frame || (dctx->fParams.windowSize > (1<<24)))
+          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
             U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
             U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
-            if (shareLongOffsets >= minShare)
-                return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+            usePrefetchDecoder = (shareLongOffsets >= minShare);
         }
 
+        dctx->ddictIsCold = 0;
+
+        if (usePrefetchDecoder)
+            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+
+        /* else */
         return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
     }
 }
-- 
2.47.3