From: Danielle Rozenblit <drozenblit@fb.com>
Date: Thu, 27 Oct 2022 17:20:44 +0000 (-0700)
Subject: Speed optimizations with macro
X-Git-Tag: v1.5.4^2~66^2~6
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=db74d043d6de8268d7c23c8781c26ecef60a86b7;p=thirdparty%2Fzstd.git

Speed optimizations with macro
---

diff --git a/contrib/freestanding_lib/freestanding.py b/contrib/freestanding_lib/freestanding.py
index 4a02dea14..4e0a944f1 100755
--- a/contrib/freestanding_lib/freestanding.py
+++ b/contrib/freestanding_lib/freestanding.py
@@ -431,7 +431,7 @@ class Freestanding(object):
             external_xxhash: bool, xxh64_state: Optional[str],
             xxh64_prefix: Optional[str], rewritten_includes: [(str, str)],
             defs: [(str, Optional[str])], replaces: [(str, str)],
-            undefs: [str], excludes: [str], seds: [str],
+            undefs: [str], excludes: [str], seds: [str], spdx: bool,
     ):
         self._zstd_deps = zstd_deps
         self._mem = mem
@@ -446,6 +446,7 @@ class Freestanding(object):
         self._undefs = undefs
         self._excludes = excludes
         self._seds = seds
+        self._spdx = spdx
 
     def _dst_lib_file_paths(self):
         """
@@ -640,6 +641,27 @@ class Freestanding(object):
         for sed in self._seds:
             self._process_sed(sed)
 
+    def _process_spdx(self):
+        if not self._spdx:
+            return
+        self._log("Processing spdx")
+        SPDX_C = "// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause\n"
+        SPDX_H_S = "/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */\n"
+        for filepath in self._dst_lib_file_paths():
+            file = FileLines(filepath)
+            if file.lines[0] == SPDX_C or file.lines[0] == SPDX_H_S:
+                continue
+            for line in file.lines:
+                if "SPDX-License-Identifier" in line:
+                    raise RuntimeError(f"Unexpected SPDX license identifier: {file.filename} {repr(line)}")
+            if file.filename.endswith(".c"):
+                file.lines.insert(0, SPDX_C)
+            elif file.filename.endswith(".h") or file.filename.endswith(".S"):
+                file.lines.insert(0, SPDX_H_S)
+            else:
+                raise RuntimeError(f"Unexpected file extension: {file.filename}")
+            file.write()
+
 
 
     def go(self):
@@ -651,6 +673,7 @@ class Freestanding(object):
         self._rewrite_includes()
         self._replace_xxh64_prefix()
         self._process_seds()
+        self._process_spdx()
 
 
 def parse_optional_pair(defines: [str]) -> [(str, Optional[str])]:
@@ -689,6 +712,7 @@ def main(name, args):
     parser.add_argument("--xxh64-prefix", default=None, help="Alternate XXH64 function prefix (excluding _) e.g. --xxh64-prefix=xxh64")
     parser.add_argument("--rewrite-include", default=[], dest="rewritten_includes", action="append", help="Rewrite an include REGEX=NEW (e.g. '<stddef\\.h>=<linux/types.h>')")
     parser.add_argument("--sed", default=[], dest="seds", action="append", help="Apply a sed replacement. Format: `s/REGEX/FORMAT/[g]`. REGEX is a Python regex. FORMAT is a Python format string formatted by the regex dict.")
+    parser.add_argument("--spdx", action="store_true", help="Add SPDX License Identifiers")
     parser.add_argument("-D", "--define", default=[], dest="defs", action="append", help="Pre-define this macro (can be passed multiple times)")
     parser.add_argument("-U", "--undefine", default=[], dest="undefs", action="append", help="Pre-undefine this macro (can be passed multiple times)")
     parser.add_argument("-R", "--replace", default=[], dest="replaces", action="append", help="Pre-define this macro and replace the first ifndef block with its definition")
@@ -743,6 +767,7 @@ def main(name, args):
         args.undefs,
         args.excludes,
         args.seds,
+        args.spdx,
     ).go()
 
 if __name__ == "__main__":
diff --git a/contrib/linux-kernel/Makefile b/contrib/linux-kernel/Makefile
index 47a431740..baa1f24c6 100644
--- a/contrib/linux-kernel/Makefile
+++ b/contrib/linux-kernel/Makefile
@@ -26,6 +26,7 @@ libzstd:
 		--rewrite-include '"(\.\./)?zstd_errors.h"=<linux/zstd_errors.h>' \
 		--sed 's,/\*\*\*,/* *,g' \
 		--sed 's,/\*\*,/*,g' \
+		--spdx \
 		-DZSTD_NO_INTRINSICS \
 		-DZSTD_NO_UNUSED_FUNCTIONS \
 		-DZSTD_LEGACY_SUPPORT=0 \
@@ -55,10 +56,13 @@ libzstd:
 		-DZSTD_HAVE_WEAK_SYMBOLS=0 \
 		-DZSTD_TRACE=0 \
 		-DZSTD_NO_TRACE \
+		-DZSTD_DISABLE_ASM \
 		-DZSTD_LINUX_KERNEL
+	rm linux/lib/zstd/decompress/huf_decompress_amd64.S
 	mv linux/lib/zstd/zstd.h linux/include/linux/zstd_lib.h
 	mv linux/lib/zstd/zstd_errors.h linux/include/linux/
 	cp linux_zstd.h linux/include/linux/zstd.h
+	cp zstd_common_module.c linux/lib/zstd
 	cp zstd_compress_module.c linux/lib/zstd
 	cp zstd_decompress_module.c linux/lib/zstd
 	cp decompress_sources.h linux/lib/zstd
@@ -102,4 +106,5 @@ test: libzstd
 
 .PHONY: clean
 clean:
-	$(RM) -rf linux test/test test/static_test
+	$(RM) -rf linux
+	$(MAKE) -C test clean
diff --git a/contrib/linux-kernel/linux.mk b/contrib/linux-kernel/linux.mk
index f6f3a8983..20f08c644 100644
--- a/contrib/linux-kernel/linux.mk
+++ b/contrib/linux-kernel/linux.mk
@@ -10,16 +10,10 @@
 # ################################################################
 obj-$(CONFIG_ZSTD_COMPRESS) += zstd_compress.o
 obj-$(CONFIG_ZSTD_DECOMPRESS) += zstd_decompress.o
-
-ccflags-y += -Wno-error=deprecated-declarations
+obj-$(CONFIG_ZSTD_COMMON) += zstd_common.o
 
 zstd_compress-y := \
 		zstd_compress_module.o \
-		common/debug.o \
-		common/entropy_common.o \
-		common/error_private.o \
-		common/fse_decompress.o \
-		common/zstd_common.o \
 		compress/fse_compress.o \
 		compress/hist.o \
 		compress/huf_compress.o \
@@ -35,13 +29,15 @@ zstd_compress-y := \
 
 zstd_decompress-y := \
 		zstd_decompress_module.o \
+		decompress/huf_decompress.o \
+		decompress/zstd_ddict.o \
+		decompress/zstd_decompress.o \
+		decompress/zstd_decompress_block.o \
+
+zstd_common-y := \
+		zstd_common_module.o \
 		common/debug.o \
 		common/entropy_common.o \
 		common/error_private.o \
 		common/fse_decompress.o \
 		common/zstd_common.o \
-		decompress/huf_decompress.o \
-		decompress/huf_decompress_amd64.o \
-		decompress/zstd_ddict.o \
-		decompress/zstd_decompress.o \
-		decompress/zstd_decompress_block.o \
diff --git a/contrib/linux-kernel/test/Makefile b/contrib/linux-kernel/test/Makefile
index be82b3fba..53b0c2a65 100644
--- a/contrib/linux-kernel/test/Makefile
+++ b/contrib/linux-kernel/test/Makefile
@@ -45,4 +45,5 @@ clean:
 	$(RM) -f $(LINUX_ZSTDLIB)/*.o
 	$(RM) -f $(LINUX_ZSTDLIB)/**/*.o
 	$(RM) -f *.o *.a
+	$(RM) -f static_test
 	$(RM) -f test
diff --git a/contrib/linux-kernel/test/include/linux/module.h b/contrib/linux-kernel/test/include/linux/module.h
index be6d20dae..63a28d57b 100644
--- a/contrib/linux-kernel/test/include/linux/module.h
+++ b/contrib/linux-kernel/test/include/linux/module.h
@@ -12,6 +12,8 @@
 
 #define EXPORT_SYMBOL(symbol)                                                  \
   void* __##symbol = symbol
+#define EXPORT_SYMBOL_GPL(symbol)                                              \
+  void* __##symbol = symbol
 #define MODULE_LICENSE(license)
 #define MODULE_DESCRIPTION(description)
 
diff --git a/contrib/linux-kernel/zstd_common_module.c b/contrib/linux-kernel/zstd_common_module.c
new file mode 100644
index 000000000..22686e367
--- /dev/null
+++ b/contrib/linux-kernel/zstd_common_module.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+/*
+ * Copyright (c) Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#include <linux/module.h>
+
+#include "common/huf.h"
+#include "common/fse.h"
+#include "common/zstd_internal.h"
+
+// Export symbols shared by compress and decompress into a common module
+
+#undef ZSTD_isError   /* defined within zstd_internal.h */
+EXPORT_SYMBOL_GPL(FSE_readNCount);
+EXPORT_SYMBOL_GPL(HUF_readStats);
+EXPORT_SYMBOL_GPL(HUF_readStats_wksp);
+EXPORT_SYMBOL_GPL(ZSTD_isError);
+EXPORT_SYMBOL_GPL(ZSTD_getErrorName);
+EXPORT_SYMBOL_GPL(ZSTD_getErrorCode);
+EXPORT_SYMBOL_GPL(ZSTD_customMalloc);
+EXPORT_SYMBOL_GPL(ZSTD_customCalloc);
+EXPORT_SYMBOL_GPL(ZSTD_customFree);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("Zstd Common");
diff --git a/lib/common/compiler.h b/lib/common/compiler.h
index 6c7100e83..42f289e0b 100644
--- a/lib/common/compiler.h
+++ b/lib/common/compiler.h
@@ -165,6 +165,12 @@
 #define UNLIKELY(x) (x)
 #endif
 
+#if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+#  define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); }
+#else
+#  define ZSTD_UNREACHABLE { assert(0); }
+#endif
+
 /* disable warnings */
 #ifdef _MSC_VER    /* Visual Studio */
 #  include <intrin.h>                    /* For Visual 2005 */
diff --git a/lib/common/huf.h b/lib/common/huf.h
index dee99da77..595b2f6db 100644
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@@ -173,7 +173,7 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
 /* ****************************************
  *  HUF detailed API
  * ****************************************/
-#define HUF_OPTIMAL_DEPTH_THRESHOLD 3
+#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra
 typedef enum {
    HUF_depth_fast, /** Use heuristic to find the table depth**/
    HUF_depth_optimal /** Test possible table depths to find the one that produces the smallest header + encoded size**/
diff --git a/lib/common/pool.c b/lib/common/pool.c
index 5c1d07d35..bf21c57ed 100644
--- a/lib/common/pool.c
+++ b/lib/common/pool.c
@@ -12,7 +12,7 @@
 /* ======   Dependencies   ======= */
 #include "zstd_deps.h" /* size_t */
 #include "debug.h"     /* assert */
-#include "zstd_internal.h"  /* ZSTD_customMalloc, ZSTD_customFree */
+#include "zstd_internal.h"  /* ZSTD_customCalloc, ZSTD_customFree */
 #include "pool.h"
 
 /* ======   Compiler specifics   ====== */
@@ -126,7 +126,7 @@ POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
      * empty and full queues.
      */
     ctx->queueSize = queueSize + 1;
-    ctx->queue = (POOL_job*)ZSTD_customMalloc(ctx->queueSize * sizeof(POOL_job), customMem);
+    ctx->queue = (POOL_job*)ZSTD_customCalloc(ctx->queueSize * sizeof(POOL_job), customMem);
     ctx->queueHead = 0;
     ctx->queueTail = 0;
     ctx->numThreadsBusy = 0;
@@ -140,7 +140,7 @@ POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
     }
     ctx->shutdown = 0;
     /* Allocate space for the thread handles */
-    ctx->threads = (ZSTD_pthread_t*)ZSTD_customMalloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
+    ctx->threads = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
     ctx->threadCapacity = 0;
     ctx->customMem = customMem;
     /* Check for errors */
@@ -220,7 +220,7 @@ static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads)
         return 0;
     }
     /* numThreads > threadCapacity */
-    {   ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customMalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
+    {   ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_customCalloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
         if (!threadPool) return 1;
         /* replace existing thread pool */
         ZSTD_memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool));
diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c
index 4c3af94b7..a6a5a257c 100644
--- a/lib/compress/huf_compress.c
+++ b/lib/compress/huf_compress.c
@@ -1253,63 +1253,54 @@ unsigned HUF_minTableLog(unsigned symbolCardinality)
     return minBitsSymbols;
 }
 
+#define ESTIMATE_TOTAL_SIZE(huffLog) {\
+            maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, huffLog, workSpace, wkspSize);\
+            if (ERR_isError(maxBits)) continue;\
+            hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);\
+            if (ERR_isError(hSize)) continue;\
+            newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;\
+            }\
+
 unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, size_t wkspSize, HUF_CElt* table, const unsigned* count, HUF_depth_mode depthMode)
 {
-    unsigned optLogGuess = FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
-    unsigned optLog = optLogGuess;
+    unsigned optLog = FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
 
     if (depthMode == HUF_depth_optimal) { /** Test valid depths and return optimal **/
         BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
         size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
-        size_t optSize = ((size_t) ~0);
-        unsigned huffLog;
         size_t maxBits, hSize, newSize;
         const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
         const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
+        size_t optSize = ((size_t) ~0);
+        unsigned optLogGuess = optLog;
 
-        if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) return optLog;
-
-        /* Search left of guess until size increases */
-        for (huffLog = optLogGuess; huffLog >= minTableLog; huffLog--) {
-            maxBits = HUF_buildCTable_wksp(table, count,
-                                            maxSymbolValue, huffLog,
-                                            workSpace, wkspSize);
-            if (ERR_isError(maxBits)) continue;
-
-            hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits,
-                                              workSpace, wkspSize);
-            if (ERR_isError(hSize)) continue;
+        if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) return optLog; /** Assert workspace is large enough **/
 
-            newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
+        /* Search below estimate log until size increases */
+        for (; optLogGuess >= minTableLog; optLogGuess--) {
+            ESTIMATE_TOTAL_SIZE(optLogGuess);
 
             if (newSize > optSize) {
                 break;
-            } else {
-                optSize = newSize;
-                optLog = huffLog;
             }
+            optSize = newSize;
+            optLog = optLogGuess;
         }
 
-        /* Search right of estimate until size increases */
-        for (huffLog = optLogGuess + 1; huffLog <= maxTableLog; huffLog++) {
-            maxBits = HUF_buildCTable_wksp(table, count,
-                                            maxSymbolValue, huffLog,
-                                            workSpace, wkspSize);
-            if (ERR_isError(maxBits)) continue;
-
-            hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits,
-                                              workSpace, wkspSize);
-            if (ERR_isError(hSize)) continue;
+        if (optSize < ((size_t) ~0)) {
+            return optLog;
+        }
 
-            newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
+        /* Search above estimate log until size increases */
+        for (; optLogGuess <= maxTableLog; optLogGuess++) {
+            ESTIMATE_TOTAL_SIZE(optLogGuess);
 
             if (newSize > optSize) {
                 break;
-            } else {
-                optSize = newSize;
-                optLog = huffLog;
             }
+            optSize = newSize;
+            optLog = optLogGuess;
         }
     }
     assert(optLog <= HUF_TABLELOG_MAX);
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 3e2ee1dda..2ee9cf630 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -1317,14 +1317,10 @@ size_t ZSTD_RowFindBestMatch(
 }
 
 
-typedef size_t (*searchMax_f)(
-                    ZSTD_matchState_t* ms,
-                    const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-
 /**
- * This struct contains the functions necessary for lazy to search.
- * Currently, that is only searchMax. However, it is still valuable to have the
- * VTable because this makes it easier to add more functions to the VTable later.
+ * Generate search functions templated on (dictMode, mls, rowLog).
+ * These functions are outlined for code size & compilation time.
+ * ZSTD_searchMax() dispatches to the correct implementation function.
  *
  * TODO: The start of the search function involves loading and calculating a
  * bunch of constants from the ZSTD_matchState_t. These computations could be
@@ -1342,25 +1338,25 @@ typedef size_t (*searchMax_f)(
  * the single segment loop. It should go in searchMax instead of its own
  * function to avoid having multiple virtual function calls per search.
  */
-typedef struct {
-    searchMax_f searchMax;
-} ZSTD_LazyVTable;
 
-#define GEN_ZSTD_BT_VTABLE(dictMode, mls)                                             \
-    static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls(                            \
-            ZSTD_matchState_t* ms,                                                    \
-            const BYTE* ip, const BYTE* const iLimit,                                 \
-            size_t* offBasePtr)                                                       \
-    {                                                                                 \
-        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                          \
-        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode);\
-    }                                                                                 \
-    static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = {                 \
-        ZSTD_BtFindBestMatch_##dictMode##_##mls                                       \
-    };
+#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls
+#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls
+#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog
+
+#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE
+
+#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls)                                           \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)(                      \
+            ZSTD_matchState_t* ms,                                                     \
+            const BYTE* ip, const BYTE* const iLimit,                                  \
+            size_t* offBasePtr)                                                        \
+    {                                                                                  \
+        assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                           \
+        return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \
+    }                                                                                  \
 
-#define GEN_ZSTD_HC_VTABLE(dictMode, mls)                                             \
-    static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls(                            \
+#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls)                                          \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)(                     \
             ZSTD_matchState_t* ms,                                                    \
             const BYTE* ip, const BYTE* const iLimit,                                 \
             size_t* offsetPtr)                                                        \
@@ -1368,12 +1364,9 @@ typedef struct {
         assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls);                          \
         return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
     }                                                                                 \
-    static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = {                 \
-        ZSTD_HcFindBestMatch_##dictMode##_##mls                                       \
-    };
 
-#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog)                                             \
-    static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog(                         \
+#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)                                          \
+    ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(                     \
             ZSTD_matchState_t* ms,                                                             \
             const BYTE* ip, const BYTE* const iLimit,                                          \
             size_t* offsetPtr)                                                                 \
@@ -1382,9 +1375,6 @@ typedef struct {
         assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog);                               \
         return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
     }                                                                                          \
-    static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = {              \
-        ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog                                    \
-    };
 
 #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
     X(dictMode, mls, 4)                        \
@@ -1407,84 +1397,103 @@ typedef struct {
     X(__VA_ARGS__, dictMatchState)      \
     X(__VA_ARGS__, dedicatedDictSearch)
 
-/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
-/* Generate Binary Tree VTables for each combination of (dictMode, mls) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
-/* Generate Hash Chain VTables for each combination of (dictMode, mls) */
-ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
-
-#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
-    {                                      \
-        &ZSTD_BtVTable_##dictMode##_4,     \
-        &ZSTD_BtVTable_##dictMode##_5,     \
-        &ZSTD_BtVTable_##dictMode##_6      \
-    }
-
-#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
-    {                                      \
-        &ZSTD_HcVTable_##dictMode##_4,     \
-        &ZSTD_HcVTable_##dictMode##_5,     \
-        &ZSTD_HcVTable_##dictMode##_6      \
-    }
-
-#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
-    {                                             \
-        &ZSTD_RowVTable_##dictMode##_##mls##_4,   \
-        &ZSTD_RowVTable_##dictMode##_##mls##_5,   \
-        &ZSTD_RowVTable_##dictMode##_##mls##_6    \
-    }
+/* Generate row search fns for each combination of (dictMode, mls, rowLog) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN)
+/* Generate binary Tree search fns for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN)
+/* Generate hash chain search fns for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN)
 
-#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode)      \
-    {                                            \
-        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
-        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
-        GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6)  \
-    }
+typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
 
-#define GEN_ZSTD_VTABLE_ARRAY(X) \
-    {                            \
-        X(noDict),               \
-        X(extDict),              \
-        X(dictMatchState),       \
-        X(dedicatedDictSearch)   \
+#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls)                         \
+    case mls:                                                             \
+        return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls)                         \
+    case mls:                                                             \
+        return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr);
+#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog)                         \
+    case rowLog:                                                                   \
+        return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr);
+
+#define ZSTD_SWITCH_MLS(X, dictMode)   \
+    switch (mls) {                     \
+        ZSTD_FOR_EACH_MLS(X, dictMode) \
     }
 
-/* *******************************
-*  Common parser - lazy strategy
-*********************************/
-typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
+#define ZSTD_SWITCH_ROWLOG(dictMode, mls)                                    \
+    case mls:                                                                \
+        switch (rowLog) {                                                    \
+            ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \
+        }                                                                    \
+        ZSTD_UNREACHABLE;                                                    \
+        break;
+
+#define ZSTD_SWITCH_SEARCH_METHOD(dictMode)                       \
+    switch (searchMethod) {                                       \
+        case search_hashChain:                                    \
+            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \
+            break;                                                \
+        case search_binaryTree:                                   \
+            ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \
+            break;                                                \
+        case search_rowHash:                                      \
+            ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode)         \
+            break;                                                \
+    }                                                             \
+    ZSTD_UNREACHABLE;
 
 /**
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
- * by the two searchMethod_e values. NULLs are placed for configurations
- * that should never occur (extDict modes go to the other implementation
- * below and there is no DDSS for binary tree search yet).
+ * Searches for the longest match at @p ip.
+ * Dispatches to the correct implementation function based on the
+ * (searchMethod, dictMode, mls, rowLog). We use switch statements
+ * here instead of using an indirect function call through a function
+ * pointer because after Spectre and Meltdown mitigations, indirect
+ * function calls can be very costly, especially in the kernel.
+ *
+ * NOTE: dictMode and searchMethod should be templated, so those switch
+ * statements should be optimized out. Only the mls & rowLog switches
+ * should be left.
+ *
+ * @param ms The match state.
+ * @param ip The position to search at.
+ * @param iend The end of the input data.
+ * @param[out] offsetPtr Stores the match offset into this pointer.
+ * @param mls The minimum search length, in the range [4, 6].
+ * @param rowLog The row log (if applicable), in the range [4, 6].
+ * @param searchMethod The search method to use (templated).
+ * @param dictMode The dictMode (templated).
+ *
+ * @returns The length of the longest match found, or < mls if no match is found.
+ * If a match is found its offset is stored in @p offsetPtr.
  */
-
-static ZSTD_LazyVTable const*
-ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
+FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
+    ZSTD_matchState_t* ms,
+    const BYTE* ip,
+    const BYTE* iend,
+    size_t* offsetPtr,
+    U32 const mls,
+    U32 const rowLog,
+    searchMethod_e const searchMethod,
+    ZSTD_dictMode_e const dictMode)
 {
-    /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
-    ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
-    ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
-    /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
-    ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
-
-    U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
-    U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
-    switch (searchMethod) {
-        case search_hashChain:
-            return hcVTables[dictMode][mls - 4];
-        case search_binaryTree:
-            return btVTables[dictMode][mls - 4];
-        case search_rowHash:
-            return rowVTables[dictMode][mls - 4][rowLog - 4];
-        default:
-            return NULL;
+    if (dictMode == ZSTD_noDict) {
+        ZSTD_SWITCH_SEARCH_METHOD(noDict)
+    } else if (dictMode == ZSTD_extDict) {
+        ZSTD_SWITCH_SEARCH_METHOD(extDict)
+    } else if (dictMode == ZSTD_dictMatchState) {
+        ZSTD_SWITCH_SEARCH_METHOD(dictMatchState)
+    } else if (dictMode == ZSTD_dedicatedDictSearch) {
+        ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch)
     }
+    ZSTD_UNREACHABLE;
+    return 0;
 }
 
+/* *******************************
+*  Common parser - lazy strategy
+*********************************/
+
 FORCE_INLINE_TEMPLATE size_t
 ZSTD_compressBlock_lazy_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -1501,8 +1510,9 @@ ZSTD_compressBlock_lazy_generic(
     const BYTE* const base = ms->window.base;
     const U32 prefixLowestIndex = ms->window.dictLimit;
     const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
-    searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
     U32 offset_1 = rep[0], offset_2 = rep[1];
     U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
@@ -1519,8 +1529,6 @@ ZSTD_compressBlock_lazy_generic(
                                      0;
     const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
 
-    assert(searchMax != NULL);
-
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
     ip += (dictAndPrefixLength == 0);
     if (dictMode == ZSTD_noDict) {
@@ -1538,7 +1546,6 @@ ZSTD_compressBlock_lazy_generic(
     }
 
     if (searchMethod == search_rowHash) {
-        const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
         ZSTD_row_fillHashCache(ms, base, rowLog,
                             MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
                             ms->nextToUpdate, ilimit);
@@ -1579,7 +1586,7 @@ ZSTD_compressBlock_lazy_generic(
 
         /* first search (depth 0) */
         {   size_t offbaseFound = 999999999;
-            size_t const ml2 = searchMax(ms, ip, iend, &offbaseFound);
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
             if (ml2 > matchLength)
                 matchLength = ml2, start = ip, offBase = offbaseFound;
         }
@@ -1618,7 +1625,7 @@ ZSTD_compressBlock_lazy_generic(
                 }
             }
             {   size_t ofbCandidate=999999999;
-                size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -1654,7 +1661,7 @@ ZSTD_compressBlock_lazy_generic(
                     }
                 }
                 {   size_t ofbCandidate=999999999;
-                    size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -1899,9 +1906,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     const BYTE* const dictEnd  = dictBase + dictLimit;
     const BYTE* const dictStart  = dictBase + ms->window.lowLimit;
     const U32 windowLog = ms->cParams.windowLog;
-    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+    const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
+    const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
-    searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
     U32 offset_1 = rep[0], offset_2 = rep[1];
 
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
@@ -1943,7 +1950,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
         /* first search (depth 0) */
         {   size_t ofbCandidate = 999999999;
-            size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
             if (ml2 > matchLength)
                 matchLength = ml2, start = ip, offBase = ofbCandidate;
         }
@@ -1978,7 +1985,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
             /* search match, depth 1 */
             {   size_t ofbCandidate = 999999999;
-                size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
                 int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                 int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
@@ -2010,7 +2017,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
                 /* search match, depth 2 */
                 {   size_t ofbCandidate = 999999999;
-                    size_t const ml2 = searchMax(ms, ip, iend, &ofbCandidate);
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
                     int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
                     int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
diff --git a/lib/decompress/zstd_ddict.c b/lib/decompress/zstd_ddict.c
index 889764a5e..6ffa35f6e 100644
--- a/lib/decompress/zstd_ddict.c
+++ b/lib/decompress/zstd_ddict.c
@@ -240,5 +240,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
 unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
 {
     if (ddict==NULL) return 0;
-    return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+    return ddict->dictID;
 }
diff --git a/programs/zstd.1 b/programs/zstd.1
index a2bf7fd2f..5cf3c6bb1 100644
--- a/programs/zstd.1
+++ b/programs/zstd.1
@@ -144,7 +144,7 @@ Note: cannot use both this and \-D together Note: \fB\-\-long\fR mode will be au
 \fB\-\-no\-dictID\fR: do not store dictionary ID within frame header (dictionary compression)\. The decoder will have to rely on implicit knowledge about which dictionary to use, it won\'t be able to check if it\'s correct\.
 .
 .IP "\(bu" 4
-\fB\-M#\fR, \fB\-\-memory=#\fR: Set a memory usage limit\. By default, Zstandard uses 128 MB for decompression as the maximum amount of memory the decompressor is allowed to use, but you can override this manually if need be in either direction (i\.e\. you can increase or decrease it)\.
+\fB\-M#\fR, \fB\-\-memory=#\fR: Set a memory usage limit\. By default, \fBzstd\fR uses 128 MB for decompression as the maximum amount of memory the decompressor is allowed to use, but you can override this manually if need be in either direction (i\.e\. you can increase or decrease it)\.
 .
 .IP
 This is also used during compression when using with \-\-patch\-from=\. In this case, this parameter overrides that maximum size allowed for a dictionary\. (128 MB)\.
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index 3ab8404ee..37c2ba187 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -183,7 +183,7 @@ the last one takes effect.
     The decoder will have to rely on implicit knowledge about which dictionary to use,
     it won't be able to check if it's correct.
 * `-M#`, `--memory=#`:
-    Set a memory usage limit. By default, Zstandard uses 128 MB for decompression
+    Set a memory usage limit. By default, `zstd` uses 128 MB for decompression
     as the maximum amount of memory the decompressor is allowed to use, but you can
     override this manually if need be in either direction (i.e. you can increase or
     decrease it).
diff --git a/tests/fuzz/zstd_helpers.c b/tests/fuzz/zstd_helpers.c
index 4f8727df9..b4a6509db 100644
--- a/tests/fuzz/zstd_helpers.c
+++ b/tests/fuzz/zstd_helpers.c
@@ -89,8 +89,13 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, FUZZ_dataProducer
     setRand(cctx, ZSTD_c_ldmHashRateLog, ZSTD_LDM_HASHRATELOG_MIN,
             ZSTD_LDM_HASHRATELOG_MAX, producer);
     /* Set misc parameters */
+#ifndef ZSTD_MULTITHREAD
+    setRand(cctx, ZSTD_c_nbWorkers, 0, 0, producer);
+    setRand(cctx, ZSTD_c_rsyncable, 0, 0, producer);
+#else
     setRand(cctx, ZSTD_c_nbWorkers, 0, 2, producer);
     setRand(cctx, ZSTD_c_rsyncable, 0, 1, producer);
+#endif
     setRand(cctx, ZSTD_c_useRowMatchFinder, 0, 2, producer);
     setRand(cctx, ZSTD_c_enableDedicatedDictSearch, 0, 1, producer);
     setRand(cctx, ZSTD_c_forceMaxWindow, 0, 1, producer);