From: Nick Terrell <terrelln@fb.com>
Date: Tue, 3 Nov 2020 01:52:29 +0000 (-0800)
Subject: Add and integrate lazy row hash strategy
X-Git-Tag: v1.5.0^2~60^2~1
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4694423c4f73e7d6101e44c949cfaaa0205f1500;p=thirdparty%2Fzstd.git

Add and integrate lazy row hash strategy
---

diff --git a/.gitignore b/.gitignore
index ae277e932..ea574d747 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,3 +50,5 @@ googletest/
 *.code-workspace
 compile_commands.json
 .clangd
+perf.data
+perf.data.old
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 2eb9cb7ac..cb58ca294 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -72,6 +72,10 @@ struct ZSTD_CDict_s {
     ZSTD_customMem customMem;
     U32 dictID;
     int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */
+    ZSTD_useRowMatchFinderMode_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use
+                                                     * row-based matchfinder. Unless the cdict is reloaded, we will use
+                                                     * the same greedy/lazy matchfinder at compression time.
+                                                     */
 };  /* typedef'd to ZSTD_CDict within "zstd.h" */
 
 ZSTD_CCtx* ZSTD_createCCtx(void)
@@ -202,6 +206,49 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
 /* private API call, for dictBuilder only */
 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); }
 
+/* Returns true if the strategy supports using a row based matchfinder */
+static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) {
+    return (strategy >= ZSTD_greedy && strategy <= ZSTD_lazy2);
+}
+
+/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder
+ * for this compression.
+ */
+static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_useRowMatchFinderMode_e mode) {
+    assert(mode != ZSTD_urm_auto);
+    return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_urm_enableRowMatchFinder);
+}
+
+/* Returns row matchfinder usage enum given an initial mode and cParams */
+static ZSTD_useRowMatchFinderMode_e ZSTD_resolveRowMatchFinderMode(ZSTD_useRowMatchFinderMode_e mode,
+                                                                   const ZSTD_compressionParameters* const cParams) {
+#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(__ARM_NEON))
+    int const kHasSIMD128 = 1;
+#else
+    int const kHasSIMD128 = 0;
+#endif
+    if (mode != ZSTD_urm_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */
+    mode = ZSTD_urm_disableRowMatchFinder;
+    if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode;
+    if (kHasSIMD128) {
+        if (cParams->windowLog > 14) mode = ZSTD_urm_enableRowMatchFinder;
+    } else {
+        if (cParams->windowLog > 17) mode = ZSTD_urm_enableRowMatchFinder;
+    }
+    return mode;
+}
+
+/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */
+static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
+                                   const ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+                                   const U32 forDDSDict) {
+    assert(useRowMatchFinder != ZSTD_urm_auto);
+    /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate.
+     * We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder.
+     */
+    return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
+}
+
 /* Returns 1 if compression parameters are such that we should
  * enable long distance matching (wlog >= 27, strategy >= btopt).
  * Returns 0 otherwise.
@@ -241,6 +288,7 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
         cctxParams.splitBlocks = 1;
     }
 
+    cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
     assert(!ZSTD_checkCParams(cParams));
     return cctxParams;
 }
@@ -299,6 +347,8 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
      * But, set it for tracing anyway.
      */
     cctxParams->compressionLevel = compressionLevel;
+    cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
+    DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d", cctxParams->useRowMatchFinder);
 }
 
 size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
@@ -498,12 +548,17 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         bounds.lowerBound = 0;
         bounds.upperBound = 1;
         return bounds;
-    
+
     case ZSTD_c_splitBlocks:
         bounds.lowerBound = 0;
         bounds.upperBound = 1;
         return bounds;
 
+    case ZSTD_c_useRowMatchFinder:
+        bounds.lowerBound = (int)ZSTD_urm_auto;
+        bounds.upperBound = (int)ZSTD_urm_enableRowMatchFinder;
+        return bounds;
+
     default:
         bounds.error = ERROR(parameter_unsupported);
         return bounds;
@@ -566,6 +621,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
     case ZSTD_c_blockDelimiters:
     case ZSTD_c_validateSequences:
     case ZSTD_c_splitBlocks:
+    case ZSTD_c_useRowMatchFinder:
     default:
         return 0;
     }
@@ -619,6 +675,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
     case ZSTD_c_blockDelimiters:
     case ZSTD_c_validateSequences:
     case ZSTD_c_splitBlocks:
+    case ZSTD_c_useRowMatchFinder:
         break;
 
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
@@ -835,6 +892,11 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         CCtxParams->splitBlocks = value;
         return CCtxParams->splitBlocks;
 
+    case ZSTD_c_useRowMatchFinder:
+        BOUNDCHECK(ZSTD_c_useRowMatchFinder, value);
+        CCtxParams->useRowMatchFinder = (ZSTD_useRowMatchFinderMode_e)value;
+        return CCtxParams->useRowMatchFinder;
+
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
 }
@@ -961,6 +1023,9 @@ size_t ZSTD_CCtxParams_getParameter(
     case ZSTD_c_splitBlocks :
         *value = (int)CCtxParams->splitBlocks;
         break;
+    case ZSTD_c_useRowMatchFinder :
+        *value = (int)CCtxParams->useRowMatchFinder;
+        break;
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
     return 0;
@@ -1327,9 +1392,14 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
 
 static size_t
 ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+                       const ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+                       const U32 enableDedicatedDictSearch,
                        const U32 forCCtx)
 {
-    size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+    /* chain table size should be 0 for fast or row-hash strategies */
+    size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder, enableDedicatedDictSearch && !forCCtx)
+                                ? ((size_t)1 << cParams->chainLog)
+                                : 0;
     size_t const hSize = ((size_t)1) << cParams->hashLog;
     U32    const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
     size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
@@ -1345,6 +1415,9 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
       + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32))
       + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
       + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+    size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
+                                            ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
+                                            : 0;
     size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
                                 ? optPotentialSpace
                                 : 0;
@@ -1352,16 +1425,18 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
 
     /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */
     ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4);
+    assert(useRowMatchFinder != ZSTD_urm_auto);
 
     DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
                 (U32)chainSize, (U32)hSize, (U32)h3Size);
-    return tableSpace + optSpace + slackSpace;
+    return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
 }
 
 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         const ZSTD_compressionParameters* cParams,
         const ldmParams_t* ldmParams,
         const int isStatic,
+        const ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
         const size_t buffInSize,
         const size_t buffOutSize,
         const U64 pledgedSrcSize)
@@ -1375,7 +1450,7 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
                             + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
     size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE);
     size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
-    size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, /* forCCtx */ 1);
+    size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1);
 
     size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams);
     size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize);
@@ -1406,19 +1481,32 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
 {
     ZSTD_compressionParameters const cParams =
                 ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+    ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
+                                                                                         &cParams);
 
     RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
     /* estimateCCtxSize is for one-shot compression. So no buffers should
      * be needed. However, we still allocate two 0-sized buffers, which can
      * take space under ASAN. */
     return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-        &cParams, &params->ldmParams, 1, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
+        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
 }
 
 size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
 {
-    ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
-    return ZSTD_estimateCCtxSize_usingCCtxParams(&params);
+    ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams);
+    if (ZSTD_rowMatchFinderSupported(cParams.strategy)) {
+        /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */
+        size_t noRowCCtxSize;
+        size_t rowCCtxSize;
+        initialParams.useRowMatchFinder = ZSTD_urm_disableRowMatchFinder;
+        noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+        initialParams.useRowMatchFinder = ZSTD_urm_enableRowMatchFinder;
+        rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+        return MAX(noRowCCtxSize, rowCCtxSize);
+    } else {
+        return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
+    }
 }
 
 static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
@@ -1458,17 +1546,29 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
         size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered)
                 ? ZSTD_compressBound(blockSize) + 1
                 : 0;
+        ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
 
         return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-            &cParams, &params->ldmParams, 1, inBuffSize, outBuffSize,
+            &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
             ZSTD_CONTENTSIZE_UNKNOWN);
     }
 }
 
 size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
 {
-    ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
-    return ZSTD_estimateCStreamSize_usingCCtxParams(&params);
+    ZSTD_CCtx_params initialParams = ZSTD_makeCCtxParamsFromCParams(cParams);
+    if (ZSTD_rowMatchFinderSupported(cParams.strategy)) {
+        /* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */
+        size_t noRowCCtxSize;
+        size_t rowCCtxSize;
+        initialParams.useRowMatchFinder = ZSTD_urm_disableRowMatchFinder;
+        noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+        initialParams.useRowMatchFinder = ZSTD_urm_enableRowMatchFinder;
+        rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+        return MAX(noRowCCtxSize, rowCCtxSize);
+    } else {
+        return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
+    }
 }
 
 static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
@@ -1593,20 +1693,27 @@ typedef enum {
     ZSTD_resetTarget_CCtx
 } ZSTD_resetTarget_e;
 
+
 static size_t
 ZSTD_reset_matchState(ZSTD_matchState_t* ms,
                       ZSTD_cwksp* ws,
                 const ZSTD_compressionParameters* cParams,
+                const ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
                 const ZSTD_compResetPolicy_e crp,
                 const ZSTD_indexResetPolicy_e forceResetIndex,
                 const ZSTD_resetTarget_e forWho)
 {
-    size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+    /* disable chain table allocation for fast or row-based strategies */
+    size_t const chainSize = ZSTD_allocateChainTable(cParams->strategy, useRowMatchFinder,
+                                                     ms->dedicatedDictSearch && (forWho == ZSTD_resetTarget_CDict))
+                                ? ((size_t)1 << cParams->chainLog)
+                                : 0;
     size_t const hSize = ((size_t)1) << cParams->hashLog;
     U32    const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
     size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
 
     DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
+    assert(useRowMatchFinder != ZSTD_urm_auto);
     if (forceResetIndex == ZSTDirp_reset) {
         ZSTD_window_init(&ms->window);
         ZSTD_cwksp_mark_tables_dirty(ws);
@@ -1645,11 +1752,23 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
         ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
     }
 
+    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+        {   /* Row match finder needs an additional table of hashes ("tags") */
+            size_t const tagTableSize = hSize*sizeof(U16);
+            ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+            if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
+        }
+        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
+            U32 const rowLog = cParams->searchLog < 5 ? 4 : 5;
+            assert(cParams->hashLog > rowLog);
+            ms->rowHashLog = ZSTD_highbit32((U32)1 << (cParams->hashLog - rowLog));
+        }
+    }
+
     ms->cParams = *cParams;
 
     RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
                     "failed a workspace allocation in ZSTD_reset_matchState");
-
     return 0;
 }
 
@@ -1675,12 +1794,13 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
                                       ZSTD_buffered_policy_e const zbuff)
 {
     ZSTD_cwksp* const ws = &zc->workspace;
-    DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u",
-                (U32)pledgedSrcSize, params.cParams.windowLog);
+    DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d",
+                (U32)pledgedSrcSize, params.cParams.windowLog, (int)params.useRowMatchFinder);
     assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
 
     zc->isFirstBlock = 1;
 
+    assert(params.useRowMatchFinder != ZSTD_urm_auto);
     if (params.ldmParams.enableLdm) {
         /* Adjust long distance matching parameters */
         ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
@@ -1706,7 +1826,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
         size_t const neededSpace =
             ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-                &params.cParams, &params.ldmParams, zc->staticSize != 0,
+                &params.cParams, &params.ldmParams, zc->staticSize != 0, params.useRowMatchFinder,
                 buffInSize, buffOutSize, pledgedSrcSize);
         int resizeWorkspace;
 
@@ -1802,6 +1922,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
             &zc->blockState.matchState,
             ws,
             &params.cParams,
+            params.useRowMatchFinder,
             crp,
             needsIndexReset,
             ZSTD_resetTarget_CCtx), "");
@@ -1878,6 +1999,7 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
                         U64 pledgedSrcSize,
                         ZSTD_buffered_policy_e zbuff)
 {
+    DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=%zu", pledgedSrcSize);
     {
         ZSTD_compressionParameters adjusted_cdict_cParams = cdict->matchState.cParams;
         unsigned const windowLog = params.cParams.windowLog;
@@ -1893,6 +2015,7 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
         params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
                                                      cdict->dictContentSize, ZSTD_cpm_attachDict);
         params.cParams.windowLog = windowLog;
+        params.useRowMatchFinder = cdict->useRowMatchFinder;    /* cdict overrides */
         FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
                                                  ZSTDcrp_makeClean, zbuff), "");
         assert(cctx->appliedParams.cParams.strategy == adjusted_cdict_cParams.strategy);
@@ -1937,14 +2060,14 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
     const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
 
     assert(!cdict->matchState.dedicatedDictSearch);
-
-    DEBUGLOG(4, "copying dictionary into context");
+    DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=%zu", pledgedSrcSize);
 
     {   unsigned const windowLog = params.cParams.windowLog;
         assert(windowLog != 0);
         /* Copy only compression parameters related to tables. */
         params.cParams = *cdict_cParams;
         params.cParams.windowLog = windowLog;
+        params.useRowMatchFinder = cdict->useRowMatchFinder;
         FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
                                                  ZSTDcrp_leaveDirty, zbuff), "");
         assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
@@ -1953,17 +2076,30 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
     }
 
     ZSTD_cwksp_mark_tables_dirty(&cctx->workspace);
+    assert(params.useRowMatchFinder != ZSTD_urm_auto);
 
     /* copy tables */
-    {   size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog);
+    {   size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */)
+                                                            ? ((size_t)1 << cdict_cParams->chainLog)
+                                                            : 0;
         size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
 
         ZSTD_memcpy(cctx->blockState.matchState.hashTable,
                cdict->matchState.hashTable,
                hSize * sizeof(U32));
-        ZSTD_memcpy(cctx->blockState.matchState.chainTable,
+        /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
+        if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
+            ZSTD_memcpy(cctx->blockState.matchState.chainTable,
                cdict->matchState.chainTable,
                chainSize * sizeof(U32));
+        }
+        /* copy tag table */
+        if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
+            size_t const tagTableSize = hSize*sizeof(U16);
+            ZSTD_memcpy(cctx->blockState.matchState.tagTable,
+                cdict->matchState.tagTable,
+                tagTableSize);
+        }
     }
 
     /* Zero the hashTable3, since the cdict never fills it */
@@ -2027,14 +2163,15 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
                             U64 pledgedSrcSize,
                             ZSTD_buffered_policy_e zbuff)
 {
-    DEBUGLOG(5, "ZSTD_copyCCtx_internal");
     RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong,
                     "Can't copy a ctx that's not in init stage.");
-
+    DEBUGLOG(5, "ZSTD_copyCCtx_internal");
     ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
     {   ZSTD_CCtx_params params = dstCCtx->requestedParams;
         /* Copy only compression parameters related to tables. */
         params.cParams = srcCCtx->appliedParams.cParams;
+        assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_urm_auto);
+        params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder;
         params.fParams = fParams;
         ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize,
                                 ZSTDcrp_leaveDirty, zbuff);
@@ -2048,7 +2185,11 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
     ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace);
 
     /* copy tables */
-    {   size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog);
+    {   size_t const chainSize = ZSTD_allocateChainTable(srcCCtx->appliedParams.cParams.strategy,
+                                                         srcCCtx->appliedParams.useRowMatchFinder,
+                                                         0 /* forDDSDict */)
+                                    ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog)
+                                    : 0;
         size_t const hSize =  (size_t)1 << srcCCtx->appliedParams.cParams.hashLog;
         int const h3log = srcCCtx->blockState.matchState.hashLog3;
         size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
@@ -2239,7 +2380,7 @@ typedef struct {
 /* ZSTD_buildSequencesStatistics():
  * Returns the size of the statistics for a given set of sequences, or a ZSTD error code,
  * Also modifies LLtype, Offtype, MLtype, and lastNCount to the appropriate values.
- * 
+ *
  * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
  */
 static ZSTD_symbolEncodingTypeStats_t
@@ -2510,7 +2651,7 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
 /* ZSTD_selectBlockCompressor() :
  * Not static, but internal use only (used by long distance matcher)
  * assumption : strat is a valid strategy */
-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode)
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_useRowMatchFinderMode_e useRowMatchFinder, ZSTD_dictMode_e dictMode)
 {
     static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = {
         { ZSTD_compressBlock_fast  /* default for 0 */,
@@ -2558,7 +2699,28 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMo
     ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);
 
     assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
-    selectedCompressor = blockCompressor[(int)dictMode][(int)strat];
+    DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder);
+    if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) {
+        static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = {
+            { ZSTD_compressBlock_greedy_row,
+            ZSTD_compressBlock_lazy_row,
+            ZSTD_compressBlock_lazy2_row },
+            { ZSTD_compressBlock_greedy_extDict_row,
+            ZSTD_compressBlock_lazy_extDict_row,
+            ZSTD_compressBlock_lazy2_extDict_row },
+            { ZSTD_compressBlock_greedy_dictMatchState_row,
+            ZSTD_compressBlock_lazy_dictMatchState_row,
+            ZSTD_compressBlock_lazy2_dictMatchState_row },
+            { ZSTD_compressBlock_greedy_dedicatedDictSearch_row,
+            ZSTD_compressBlock_lazy_dedicatedDictSearch_row,
+            ZSTD_compressBlock_lazy2_dedicatedDictSearch_row }
+        };
+        DEBUGLOG(4, "Selecting a row-based matchfinder");
+        assert(useRowMatchFinder != ZSTD_urm_auto);
+        selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy];
+    } else {
+        selectedCompressor = blockCompressor[(int)dictMode][(int)strat];
+    }
     assert(selectedCompressor != NULL);
     return selectedCompressor;
 }
@@ -2627,6 +2789,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                 ZSTD_ldm_blockCompress(&zc->externSeqStore,
                                        ms, &zc->seqStore,
                                        zc->blockState.nextCBlock->rep,
+                                       zc->appliedParams.useRowMatchFinder,
                                        src, srcSize);
             assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
         } else if (zc->appliedParams.ldmParams.enableLdm) {
@@ -2643,10 +2806,13 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                 ZSTD_ldm_blockCompress(&ldmSeqStore,
                                        ms, &zc->seqStore,
                                        zc->blockState.nextCBlock->rep,
+                                       zc->appliedParams.useRowMatchFinder,
                                        src, srcSize);
             assert(ldmSeqStore.pos == ldmSeqStore.size);
         } else {   /* not long range mode */
-            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode);
+            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
+                                                                                    zc->appliedParams.useRowMatchFinder,
+                                                                                    dictMode);
             ms->ldmSeqStore = NULL;
             lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
         }
@@ -2954,7 +3120,7 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
 /** ZSTD_buildBlockEntropyStats() :
  *  Builds entropy for the block.
  *  Requires workspace size ENTROPY_WORKSPACE_SIZE
- * 
+ *
  *  @return : 0 on success or error code
  */
 size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
@@ -3219,7 +3385,7 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
 
 /* ZSTD_compressSeqStore_singleBlock():
  * Compresses a seqStore into a block with a block header, into the buffer dst.
- * 
+ *
  * Returns the total size of that block (including header) or a ZSTD error code.
  */
 static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
@@ -3298,11 +3464,11 @@ typedef struct {
  * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
  * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then
  * we do not recurse.
- * 
+ *
  * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING.
  * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
  * In practice, recursion depth usually doesn't go beyond 4.
- * 
+ *
  * Furthermore, the number of splits is capped by MAX_NB_SPLITS. At MAX_NB_SPLITS == 196 with the current existing blockSize
  * maximum of 128 KB, this value is actually impossible to reach.
  */
@@ -3357,7 +3523,7 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
 
 /* ZSTD_compressBlock_splitBlock():
  * Attempts to split a given block into multiple blocks to improve compression ratio.
- * 
+ *
  * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
  */
 static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
@@ -3921,6 +4087,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
     const BYTE* ip = (const BYTE*) src;
     const BYTE* const iend = ip + srcSize;
 
+    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
     ZSTD_window_update(&ms->window, src, srcSize);
     ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
 
@@ -3956,11 +4123,24 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         case ZSTD_greedy:
         case ZSTD_lazy:
         case ZSTD_lazy2:
-            if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch) {
-                assert(chunk == remaining); /* must load everything in one go */
-                ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HASH_READ_SIZE);
-            } else if (chunk >= HASH_READ_SIZE) {
-                ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE);
+            if (chunk >= HASH_READ_SIZE) {
+                if (ms->dedicatedDictSearch) {
+                    assert(chunk == remaining); /* must load everything in one go */
+                    assert(ms->chainTable != NULL);
+                    ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HASH_READ_SIZE);
+                } else {
+                    assert(params->useRowMatchFinder != ZSTD_urm_auto);
+                    if (params->useRowMatchFinder == ZSTD_urm_enableRowMatchFinder) {
+                        size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16);
+                        if (ip == src)
+                            ZSTD_memset(ms->tagTable, 0, tagTableSize);
+                        ZSTD_row_update(ms, ichunk-HASH_READ_SIZE);
+                        DEBUGLOG(4, "Using row-based hash table for lazy dict");
+                    } else {
+                        ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE);
+                        DEBUGLOG(4, "Using chain-based hash table for lazy dict");
+                    }
+                }
             }
             break;
 
@@ -4115,7 +4295,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
     const BYTE* const dictEnd = dictPtr + dictSize;
     size_t dictID;
     size_t eSize;
-
     ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
     assert(dictSize >= 8);
     assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
@@ -4454,7 +4633,10 @@ size_t ZSTD_estimateCDictSize_advanced(
     DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict));
     return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
          + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
-         + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0)
+         /* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small
+          * in case we are using DDS with row-hash. */
+         + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_urm_auto, &cParams),
+                                  /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0)
          + (dictLoadMethod == ZSTD_dlm_byRef ? 0
             : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *))));
 }
@@ -4508,6 +4690,7 @@ static size_t ZSTD_initCDict_internal(
         &cdict->matchState,
         &cdict->workspace,
         &params.cParams,
+        params.useRowMatchFinder,
         ZSTDcrp_makeClean,
         ZSTDirp_reset,
         ZSTD_resetTarget_CDict), "");
@@ -4531,14 +4714,17 @@ static size_t ZSTD_initCDict_internal(
 
 static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize,
                                       ZSTD_dictLoadMethod_e dictLoadMethod,
-                                      ZSTD_compressionParameters cParams, ZSTD_customMem customMem)
+                                      ZSTD_compressionParameters cParams,
+                                      ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+                                      U32 enableDedicatedDictSearch,
+                                      ZSTD_customMem customMem)
 {
     if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL;
 
     {   size_t const workspaceSize =
             ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +
             ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) +
-            ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) +
+            ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) +
             (dictLoadMethod == ZSTD_dlm_byRef ? 0
              : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))));
         void* const workspace = ZSTD_customMalloc(workspaceSize, customMem);
@@ -4557,7 +4743,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize,
         ZSTD_cwksp_move(&cdict->workspace, &ws);
         cdict->customMem = customMem;
         cdict->compressionLevel = ZSTD_NO_CLEVEL; /* signals advanced API usage */
-
+        cdict->useRowMatchFinder = useRowMatchFinder;
         return cdict;
     }
 }
@@ -4609,10 +4795,13 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2(
             &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict);
     }
 
+    DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch);
     cctxParams.cParams = cParams;
+    cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
 
     cdict = ZSTD_createCDict_advanced_internal(dictSize,
                         dictLoadMethod, cctxParams.cParams,
+                        cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch,
                         customMem);
 
     if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
@@ -4681,7 +4870,9 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
                                  ZSTD_dictContentType_e dictContentType,
                                  ZSTD_compressionParameters cParams)
 {
-    size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0);
+    ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_urm_auto, &cParams);
+    /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */
+    size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0);
     size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
                             + (dictLoadMethod == ZSTD_dlm_byRef ? 0
                                : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))))
@@ -4706,6 +4897,8 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
 
     ZSTD_CCtxParams_init(&params, 0);
     params.cParams = cParams;
+    params.useRowMatchFinder = useRowMatchFinder;
+    cdict->useRowMatchFinder = useRowMatchFinder;
 
     if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
                                               dict, dictSize,
@@ -5242,6 +5435,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
         params.splitBlocks = 1;
     }
 
+    params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
+
 #ifdef ZSTD_MULTITHREAD
     if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) {
         params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */
@@ -6051,6 +6246,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,
     else row = compressionLevel;
 
     {   ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
+        DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u strat: %u", tableID, row, (U32)cp.strategy);
         /* acceleration factor */
         if (compressionLevel < 0) {
             int const clampedCompressionLevel = MAX(ZSTD_minCLevel(), compressionLevel);
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index 447115478..3c488c61c 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -196,6 +196,9 @@ typedef struct {
 } ZSTD_window_t;
 
 typedef struct ZSTD_matchState_t ZSTD_matchState_t;
+
+#define ZSTD_ROW_HASH_CACHE_SIZE 8       /* Size of prefetching hash cache for row-based matchfinder */
+
 struct ZSTD_matchState_t {
     ZSTD_window_t window;   /* State for window round buffer management */
     U32 loadedDictEnd;      /* index of end of dictionary, within context's referential.
@@ -207,9 +210,15 @@ struct ZSTD_matchState_t {
                              */
     U32 nextToUpdate;       /* index from which to continue table update */
     U32 hashLog3;           /* dispatch table for matches of len==3 : larger == faster, more memory */
+
+    U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
+    U16* tagTable;                           /* For row-based matchFinder: A row-based table containing the hashes and head index. */
+    U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
+
     U32* hashTable;
     U32* hashTable3;
     U32* chainTable;
+
     int dedicatedDictSearch;  /* Indicates whether this matchState is using the
                                * dedicated dictionary search structure.
                                */
@@ -305,6 +314,9 @@ struct ZSTD_CCtx_params_s {
     /* Block splitting */
     int splitBlocks;
 
+    /* Param for deciding whether to use row-based matchfinder */
+    ZSTD_useRowMatchFinderMode_e useRowMatchFinder;
+
     /* Internal use, for createCCtxParams() and freeCCtxParams() only */
     ZSTD_customMem customMem;
 };  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
@@ -420,7 +432,7 @@ typedef enum {
 typedef size_t (*ZSTD_blockCompressor) (
         ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode);
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_useRowMatchFinderMode_e rowMatchfinderMode, ZSTD_dictMode_e dictMode);
 
 
 MEM_STATIC U32 ZSTD_LLcode(U32 litLength)
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 75745a79b..5cc242880 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -438,43 +438,9 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
     }
 }
 
-
-
-/* *********************************
-*  Hash Chain
+/***********************************
+* Dedicated dict search
 ***********************************/
-#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]
-
-/* Update chains up to ip (excluded)
-   Assumption : always within prefix (i.e. not within extDict) */
-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
-                        ZSTD_matchState_t* ms,
-                        const ZSTD_compressionParameters* const cParams,
-                        const BYTE* ip, U32 const mls)
-{
-    U32* const hashTable  = ms->hashTable;
-    const U32 hashLog = cParams->hashLog;
-    U32* const chainTable = ms->chainTable;
-    const U32 chainMask = (1 << cParams->chainLog) - 1;
-    const BYTE* const base = ms->window.base;
-    const U32 target = (U32)(ip - base);
-    U32 idx = ms->nextToUpdate;
-
-    while(idx < target) { /* catch up */
-        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
-        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
-        hashTable[h] = idx;
-        idx++;
-    }
-
-    ms->nextToUpdate = target;
-    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
-}
-
-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
-    const ZSTD_compressionParameters* const cParams = &ms->cParams;
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
-}
 
 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
 {
@@ -500,7 +466,6 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
     U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
     U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
     U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
-
     U32 hashIdx;
 
     assert(ms->cParams.chainLog <= 24);
@@ -591,6 +556,139 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
     ms->nextToUpdate = target;
 }
 
+/* Returns the longest match length found in the dedicated dict search structure.
+ * If none are longer than the argument ml, then ml will be returned.
+ */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
+                                            const ZSTD_matchState_t* const dms,
+                                            const BYTE* const ip, const BYTE* const iLimit,
+                                            const BYTE* const prefixStart, const U32 curr,
+                                            const U32 dictLimit, const size_t ddsIdx) {
+    const U32 ddsLowestIndex  = dms->window.dictLimit;
+    const BYTE* const ddsBase = dms->window.base;
+    const BYTE* const ddsEnd  = dms->window.nextSrc;
+    const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
+    const U32 ddsIndexDelta   = dictLimit - ddsSize;
+    const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
+    const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
+    U32 ddsAttempt;
+    U32 matchIndex;
+
+    for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
+        PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
+    }
+
+    {
+        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+        U32 const chainIndex = chainPackedPointer >> 8;
+
+        PREFETCH_L1(&dms->chainTable[chainIndex]);
+    }
+
+    for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
+        size_t currentMl=0;
+        const BYTE* match;
+        matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
+        match = ddsBase + matchIndex;
+
+        if (!matchIndex) {
+            return ml;
+        }
+
+        /* guaranteed by table construction */
+        (void)ddsLowestIndex;
+        assert(matchIndex >= ddsLowestIndex);
+        assert(match+4 <= ddsEnd);
+        if (MEM_read32(match) == MEM_read32(ip)) {
+            /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+            currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+        }
+
+        /* save best solution */
+        if (currentMl > ml) {
+            ml = currentMl;
+            *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+            if (ip+currentMl == iLimit) {
+                /* best possible, avoids read overflow on next attempt */
+                return ml;
+            }
+        }
+    }
+
+    {
+        U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+        U32 chainIndex = chainPackedPointer >> 8;
+        U32 const chainLength = chainPackedPointer & 0xFF;
+        U32 const chainAttempts = nbAttempts - ddsAttempt;
+        U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+        U32 chainAttempt;
+
+        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+            PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+        }
+
+        for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+            size_t currentMl=0;
+            const BYTE* match;
+            matchIndex = dms->chainTable[chainIndex];
+            match = ddsBase + matchIndex;
+
+            /* guaranteed by table construction */
+            assert(matchIndex >= ddsLowestIndex);
+            assert(match+4 <= ddsEnd);
+            if (MEM_read32(match) == MEM_read32(ip)) {
+                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+            }
+
+            /* save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+        }
+    }
+    return ml;
+}
+
+
+/* *********************************
+*  Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask)   chainTable[(d) & (mask)]
+
+/* Update chains up to ip (excluded)
+   Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+                        ZSTD_matchState_t* ms,
+                        const ZSTD_compressionParameters* const cParams,
+                        const BYTE* ip, U32 const mls)
+{
+    U32* const hashTable  = ms->hashTable;
+    const U32 hashLog = cParams->hashLog;
+    U32* const chainTable = ms->chainTable;
+    const U32 chainMask = (1 << cParams->chainLog) - 1;
+    const BYTE* const base = ms->window.base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+
+    while(idx < target) { /* catch up */
+        size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+        NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+        hashTable[h] = idx;
+        idx++;
+    }
+
+    ms->nextToUpdate = target;
+    return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+}
 
 /* inlining is important to hardwire a hot branch (template emulation) */
 FORCE_INLINE_TEMPLATE
@@ -661,90 +759,8 @@ size_t ZSTD_HcFindBestMatch_generic (
     }
 
     if (dictMode == ZSTD_dedicatedDictSearch) {
-        const U32 ddsLowestIndex  = dms->window.dictLimit;
-        const BYTE* const ddsBase = dms->window.base;
-        const BYTE* const ddsEnd  = dms->window.nextSrc;
-        const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
-        const U32 ddsIndexDelta   = dictLimit - ddsSize;
-        const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
-        const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
-        U32 ddsAttempt;
-
-        for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
-            PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
-        }
-
-        {
-            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
-            U32 const chainIndex = chainPackedPointer >> 8;
-
-            PREFETCH_L1(&dms->chainTable[chainIndex]);
-        }
-
-        for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
-            size_t currentMl=0;
-            const BYTE* match;
-            matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
-            match = ddsBase + matchIndex;
-
-            if (!matchIndex) {
-                return ml;
-            }
-
-            /* guaranteed by table construction */
-            (void)ddsLowestIndex;
-            assert(matchIndex >= ddsLowestIndex);
-            assert(match+4 <= ddsEnd);
-            if (MEM_read32(match) == MEM_read32(ip)) {
-                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
-            }
-
-            /* save best solution */
-            if (currentMl > ml) {
-                ml = currentMl;
-                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
-                if (ip+currentMl == iLimit) {
-                    /* best possible, avoids read overflow on next attempt */
-                    return ml;
-                }
-            }
-        }
-
-        {
-            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
-            U32 chainIndex = chainPackedPointer >> 8;
-            U32 const chainLength = chainPackedPointer & 0xFF;
-            U32 const chainAttempts = nbAttempts - ddsAttempt;
-            U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
-            U32 chainAttempt;
-
-            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
-                PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
-            }
-
-            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
-                size_t currentMl=0;
-                const BYTE* match;
-                matchIndex = dms->chainTable[chainIndex];
-                match = ddsBase + matchIndex;
-
-                /* guaranteed by table construction */
-                assert(matchIndex >= ddsLowestIndex);
-                assert(match+4 <= ddsEnd);
-                if (MEM_read32(match) == MEM_read32(ip)) {
-                    /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
-                }
-
-                /* save best solution */
-                if (currentMl > ml) {
-                    ml = currentMl;
-                    *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
-                    if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
-                }
-            }
-        }
+        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
+                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
     } else if (dictMode == ZSTD_dictMatchState) {
         const U32* const dmsChainTable = dms->chainTable;
         const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
@@ -845,11 +861,657 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
     }
 }
 
+/* *********************************
+* (SIMD) Row-based matchfinder
+***********************************/
+/* Constants for row-based hash */
+#define ZSTD_ROW_HASH_TAG_OFFSET 1                               /* byte offset of hashes in the match state's tagTable from the beginning of a row */
+#define ZSTD_ROW_HASH_TAG_BITS 8                                 /* nb bits to use for the tag */
+#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
+
+#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
+
+typedef U32 ZSTD_VecMask;   /* Clarifies when we are interacting with a U32 representing a mask of matches */
+
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
+
+#include <emmintrin.h>
+typedef __m128i ZSTD_Vec128;
+
+/* Returns a 128-bit container with 128-bits from src */
+static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+  return _mm_loadu_si128((ZSTD_Vec128 const*)src);
+}
+
+/* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
+static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+  return _mm_set1_epi8((char)val);
+}
+
+/* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
+ * into a 32-bit mask that is the MSB of each byte.
+ * */
+static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+  return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
+}
+
+typedef struct {
+  __m128i fst;
+  __m128i snd;
+} ZSTD_Vec256;
+
+static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
+  ZSTD_Vec256 v;
+  v.fst = ZSTD_Vec128_read(ptr);
+  v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
+  return v;
+}
+
+static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+  ZSTD_Vec256 v;
+  v.fst = ZSTD_Vec128_set8(val);
+  v.snd = ZSTD_Vec128_set8(val);
+  return v;
+}
+
+static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+  ZSTD_VecMask fstMask;
+  ZSTD_VecMask sndMask;
+  fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
+  sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
+  return fstMask | (sndMask << 16);
+}
+
+#elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
+
+#include <arm_neon.h>
+typedef uint8x16_t ZSTD_Vec128;
+
+static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+  return vld1q_u8((const BYTE* const)src);
+}
+
+static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+  return vdupq_n_u8(val);
+}
+
+/* Mimics '_mm_movemask_epi8()' from SSE */
+static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
+    /* Shift out everything but the MSB bits in each byte */
+    uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
+    /* Merge the even lanes together with vsra (right shift and add) */
+    uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
+    uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+    uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+    /* Extract the low 8 bits from each lane, merge */
+    return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
+}
+
+static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+  return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
+}
+
+typedef struct {
+    uint8x16_t fst;
+    uint8x16_t snd;
+} ZSTD_Vec256;
+
+static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
+  ZSTD_Vec256 v;
+  v.fst = ZSTD_Vec128_read(ptr);
+  v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
+  return v;
+}
+
+static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+  ZSTD_Vec256 v;
+  v.fst = ZSTD_Vec128_set8(val);
+  v.snd = ZSTD_Vec128_set8(val);
+  return v;
+}
+
+static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+  ZSTD_VecMask fstMask;
+  ZSTD_VecMask sndMask;
+  fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
+  sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
+  return fstMask | (sndMask << 16);
+}
+
+#else /* Scalar fallback version */
+
+#define VEC128_NB_SIZE_T (16 / sizeof(size_t))
+typedef struct {
+    size_t vec[VEC128_NB_SIZE_T];
+} ZSTD_Vec128;
+
+static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+    ZSTD_Vec128 ret;
+    ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
+    return ret;
+}
+
+static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+    ZSTD_Vec128 ret = { {0} };
+    int startBit = sizeof(size_t) * 8 - 8;
+    for (;startBit >= 0; startBit -= 8) {
+        unsigned j = 0;
+        for (;j < VEC128_NB_SIZE_T; ++j) {
+            ret.vec[j] |= ((size_t)val << startBit);
+        }
+    }
+    return ret;
+}
+
+/* Compare x to y, byte by byte, generating a "matches" bitfield */
+static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+    ZSTD_VecMask res = 0;
+    unsigned i = 0;
+    unsigned l = 0;
+    for (; i < VEC128_NB_SIZE_T; ++i) {
+        const size_t cmp1 = x.vec[i];
+        const size_t cmp2 = y.vec[i];
+        unsigned j = 0;
+        for (; j < sizeof(size_t); ++j, ++l) {
+            if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
+                res |= ((U32)1 << (j+i*sizeof(size_t)));
+            }
+        }
+    }
+    return res;
+}
+
+#define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
+typedef struct {
+    size_t vec[VEC256_NB_SIZE_T];
+} ZSTD_Vec256;
+
+static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
+    ZSTD_Vec256 ret;
+    ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
+    return ret;
+}
+
+static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+    ZSTD_Vec256 ret = { {0} };
+    int startBit = sizeof(size_t) * 8 - 8;
+    for (;startBit >= 0; startBit -= 8) {
+        unsigned j = 0;
+        for (;j < VEC256_NB_SIZE_T; ++j) {
+            ret.vec[j] |= ((size_t)val << startBit);
+        }
+    }
+    return ret;
+}
+
+/* Compare x to y, byte by byte, generating a "matches" bitfield */
+static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+    ZSTD_VecMask res = 0;
+    unsigned i = 0;
+    unsigned l = 0;
+    for (; i < VEC256_NB_SIZE_T; ++i) {
+        const size_t cmp1 = x.vec[i];
+        const size_t cmp2 = y.vec[i];
+        unsigned j = 0;
+        for (; j < sizeof(size_t); ++j, ++l) {
+            if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
+                res |= ((U32)1 << (j+i*sizeof(size_t)));
+            }
+        }
+    }
+    return res;
+}
+
+#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
+
+/* ZSTD_VecMask_next():
+ * Starting from the LSB, returns the idx of the next non-zero bit.
+ * Basically counting the nb of trailing zeroes.
+ */
+static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+#   if defined(_MSC_VER)   /* Visual */
+    unsigned long r=0;
+    return _BitScanForward(&r, val) ? (U32)r : 0;
+#   elif defined(__GNUC__) && (__GNUC__ >= 3)
+    return (U32)__builtin_ctz(val);
+#   else
+    /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
+    static const U32 multiplyDeBruijnBitPosition[32] =
+    {
+        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+    };
+	return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27];
+#   endif
+}
+
+/* ZSTD_VecMask_rotateRight():
+ * Rotates a bitfield to the right by "rotation" bits.
+ * If the rotation is greater than totalBits, the returned mask is 0.
+ */
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
+  if (rotation == 0)
+    return mask;
+  switch (totalBits) {
+    default:
+      assert(0);
+    case 16:
+      return (mask >> rotation) | (U16)(mask << (16 - rotation));
+    case 32:
+      return (mask >> rotation) | (U32)(mask << (32 - rotation));
+  }
+}
+
+/* ZSTD_row_nextIndex():
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
+ * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ */
+FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
+  U32 const next = (*tagRow - 1) & rowMask;
+  *tagRow = (BYTE)next;
+  return next;
+}
+
+/* ZSTD_isAligned():
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
+ */
+MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
+    assert((align & (align - 1)) == 0);
+    return (((size_t)ptr) & (align - 1)) == 0;
+}
+
+/* ZSTD_row_prefetch():
+ * Performs prefetching for the hashTable and tagTable at a given row.
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+    PREFETCH_L1(hashTable + relRow);
+    if (rowLog == 5) {
+        PREFETCH_L1(hashTable + relRow + 16);
+    }
+    PREFETCH_L1(tagTable + relRow);
+    assert(rowLog == 4 || rowLog == 5);
+    assert(ZSTD_isAligned(hashTable + relRow, 64));                 /* prefetched hash row always 64-byte aligned */
+    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
+}
+
+/* ZSTD_row_fillHashCache():
+ * Fill up the hash cache starting at idx, prefetching ZSTD_ROW_HASH_CACHE_SIZE entries.
+ */
+static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+                                   U32 const rowLog, U32 const mls,
+                                   U32 idx, const BYTE* const iend)
+{
+    U32 const* const hashTable = ms->hashTable;
+    U16 const* const tagTable = ms->tagTable;
+    U32 const hashLog = ms->rowHashLog;
+    U32 const maxElemsToPrefetch = (base + idx) >= iend ? 0 : (U32)(iend - (base + idx));
+    U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
+
+    for (; idx < lim; ++idx) {
+        U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+        ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
+    }
+
+    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
+                                                     ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
+                                                     ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
+}
+
+/* ZSTD_row_nextCachedHash():
+ * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
+ */
+FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+                                                  U16 const* tagTable, BYTE const* base,
+                                                  U32 idx, U32 const hashLog,
+                                                  U32 const rowLog, U32 const mls)
+{
+    U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+    U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+    ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+    {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
+        cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
+        return hash;
+    }
+}
+
+/* ZSTD_row_update_internal():
+ * Inserts the byte at ip into the appropriate position in the hash table.
+ * Determines the relative row, and the position within the {16, 32} entry row to insert at.
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+                                                    U32 const mls, U32 const rowLog,
+                                                    U32 const rowMask, U32 const useCache)
+{
+    U32* const hashTable = ms->hashTable;
+    U16* const tagTable = ms->tagTable;
+    U32 const hashLog = ms->rowHashLog;
+    const BYTE* const base = ms->window.base;
+    const U32 target = (U32)(ip - base);
+    U32 idx = ms->nextToUpdate;
+
+    DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
+    for (; idx < target; ++idx) {
+        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
+                                  : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        U32* const row = hashTable + relRow;
+        BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
+                                                       Explicit cast allows us to get exact desired position within each row */
+        U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+
+        assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
+        ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+        row[pos] = idx;
+    }
+    ms->nextToUpdate = target;
+}
+
+/* ZSTD_row_update():
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
+ * processing.
+ */
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+    const U32 rowMask = (1u << rowLog) - 1;
+    const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
+
+    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+}
+
+/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
+ * the hash at the nth position in a row of the tagTable.
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
+    ZSTD_VecMask matches = 0;
+    if (rowEntries == 16) {
+        ZSTD_Vec128 hashes        = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
+        ZSTD_Vec128 expandedTags  = ZSTD_Vec128_set8(tag);
+        matches                   = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
+    } else if (rowEntries == 32) {
+        ZSTD_Vec256 hashes        = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
+        ZSTD_Vec256 expandedTags  = ZSTD_Vec256_set8(tag);
+        matches                   = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
+    } else {
+        assert(0);
+    }
+    /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
+        to match up with the actual layout of the entries within the hashTable */
+    return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
+}
+
+/* The high-level approach of the SIMD row based match finder is as follows:
+ * - Figure out where to insert the new entry:
+ *      - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
+ *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
+ *        which row to insert into.
+ *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
+ *        be considered as a circular buffer with a "head" index that resides in the tagTable.
+ *      - Also insert the "tag" into the equivalent row and position in the tagTable.
+ *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
+ *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
+ *                  for alignment/performance reasons, leaving some bytes unused.
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
+ *   generate a bitfield that we can cycle through to check the collisions in the hash table.
+ * - Pick the longest match.
+ */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_RowFindBestMatch_generic (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* const ip, const BYTE* const iLimit,
+                        size_t* offsetPtr,
+                        const U32 mls, const ZSTD_dictMode_e dictMode,
+                        const U32 rowLog)
+{
+    U32* const hashTable = ms->hashTable;
+    U16* const tagTable = ms->tagTable;
+    U32* const hashCache = ms->hashCache;
+    const U32 hashLog = ms->rowHashLog;
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    const BYTE* const base = ms->window.base;
+    const BYTE* const dictBase = ms->window.dictBase;
+    const U32 dictLimit = ms->window.dictLimit;
+    const BYTE* const prefixStart = base + dictLimit;
+    const BYTE* const dictEnd = dictBase + dictLimit;
+    const U32 curr = (U32)(ip-base);
+    const U32 maxDistance = 1U << cParams->windowLog;
+    const U32 lowestValid = ms->window.lowLimit;
+    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+    const U32 isDictionary = (ms->loadedDictEnd != 0);
+    const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+    const U32 rowEntries = (1U << rowLog);
+    const U32 rowMask = rowEntries - 1;
+    const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+    U32 nbAttempts = 1U << cappedSearchLog;
+    size_t ml=4-1;
+
+    /* DMS/DDS variables that may be referenced laster */
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    size_t ddsIdx;
+    U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
+    U32 dmsTag;
+    U32* dmsRow;
+    BYTE* dmsTagRow;
+
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+        {   /* Prefetch DDS hashtable entry */
+            ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
+            PREFETCH_L1(&dms->hashTable[ddsIdx]);
+        }
+        ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - 5) : 0;
+    }
+
+    if (dictMode == ZSTD_dictMatchState) {
+        /* Prefetch DMS rows */
+        U32* const dmsHashTable = dms->hashTable;
+        U16* const dmsTagTable = dms->tagTable;
+        U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
+        dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
+        dmsRow = dmsHashTable + dmsRelRow;
+        ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
+    }
+
+    /* Update the hashTable and tagTable up to (but not including) ip */
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+    {   /* Get the hash for ip, compute the appropriate row */
+        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
+        U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+        U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+        U32* const row = hashTable + relRow;
+        BYTE* tagRow = (BYTE*)(tagTable + relRow);
+        U32 const head = *tagRow & rowMask;
+        U32 matchBuffer[32 /* maximum nb entries per row */];
+        size_t numMatches = 0;
+        size_t currMatch = 0;
+        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+
+        /* Cycle through the matches and prefetch */
+        for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+            U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+            U32 const matchIndex = row[matchPos];
+            assert(numMatches < rowEntries);
+            if (matchIndex < lowLimit)
+                break;
+            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+                PREFETCH_L1(base + matchIndex);
+            } else {
+                PREFETCH_L1(dictBase + matchIndex);
+            }
+            matchBuffer[numMatches++] = matchIndex;
+        }
+
+        /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+           in ZSTD_row_update_internal() at the next search. */
+        {
+            U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+            tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+            row[pos] = ms->nextToUpdate++;
+        }
+
+        /* Return the longest match */
+        for (; currMatch < numMatches; ++currMatch) {
+            U32 const matchIndex = matchBuffer[currMatch];
+            size_t currentMl=0;
+            assert(matchIndex < curr);
+            assert(matchIndex >= lowLimit);
+
+            if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+                const BYTE* const match = base + matchIndex;
+                assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
+                if (match[ml] == ip[ml])   /* potentially better */
+                    currentMl = ZSTD_count(ip, match, iLimit);
+            } else {
+                const BYTE* const match = dictBase + matchIndex;
+                assert(match+4 <= dictEnd);
+                if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+            }
+
+            /* Save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+            }
+        }
+    }
+
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
+                                                  ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+    } else if (dictMode == ZSTD_dictMatchState) {
+        /* TODO: Measure and potentially add prefetching to DMS */
+        const U32 dmsLowestIndex       = dms->window.dictLimit;
+        const BYTE* const dmsBase      = dms->window.base;
+        const BYTE* const dmsEnd       = dms->window.nextSrc;
+        const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
+        const U32 dmsIndexDelta        = dictLimit - dmsSize;
+
+        {   /* Get the hash for ip, compute the appropriate row */
+            U32 const head = *dmsTagRow & rowMask;
+            U32 matchBuffer[32 /* maximum nb row entries */];
+            size_t numMatches = 0;
+            size_t currMatch = 0;
+            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+
+            for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+                U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+                U32 const matchIndex = dmsRow[matchPos];
+                if (matchIndex < dmsLowestIndex)
+                    break;
+                PREFETCH_L1(dmsBase + matchIndex);
+                matchBuffer[numMatches++] = matchIndex;
+            }
+
+            /* Return the longest match */
+            for (; currMatch < numMatches; ++currMatch) {
+                U32 const matchIndex = matchBuffer[currMatch];
+                size_t currentMl=0;
+                assert(matchIndex >= dmsLowestIndex);
+                assert(matchIndex < curr);
+
+                {   const BYTE* const match = dmsBase + matchIndex;
+                    assert(match+4 <= dmsEnd);
+                    if (MEM_read32(match) == MEM_read32(ip))
+                        currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+                }
+
+                if (currentMl > ml) {
+                    ml = currentMl;
+                    *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+                    if (ip+currentMl == iLimit) break;
+                }
+            }
+        }
+    }
+    return ml;
+}
+
+/* Inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
+{
+    switch(ms->cParams.minMatch)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
+    case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
+    case 7 :
+    case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+    switch(cappedSearchLog)
+    {
+    default :
+    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
+    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+    switch(cappedSearchLog)
+    {
+    default :
+    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
+    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+    switch(cappedSearchLog)
+    {
+    default :
+    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
+    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
+    }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+    switch(cappedSearchLog)
+    {
+    default :
+    case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
+    case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
+    }
+}
+
 
 /* *******************************
 *  Common parser - lazy strategy
 *********************************/
-typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
+typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
 
 FORCE_INLINE_TEMPLATE size_t
 ZSTD_compressBlock_lazy_generic(
@@ -863,10 +1525,11 @@ ZSTD_compressBlock_lazy_generic(
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
     const BYTE* const iend = istart + srcSize;
-    const BYTE* const ilimit = iend - 8;
+    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 16 : iend - 8;
     const BYTE* const base = ms->window.base;
     const U32 prefixLowestIndex = ms->window.dictLimit;
     const BYTE* const prefixLowest = base + prefixLowestIndex;
+    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
 
     typedef size_t (*searchMax_f)(
                         ZSTD_matchState_t* ms,
@@ -878,26 +1541,30 @@ ZSTD_compressBlock_lazy_generic(
      * that should never occur (extDict modes go to the other implementation
      * below and there is no DDSS for binary tree search yet).
      */
-    const searchMax_f searchFuncs[4][2] = {
+    const searchMax_f searchFuncs[4][3] = {
         {
             ZSTD_HcFindBestMatch_selectMLS,
-            ZSTD_BtFindBestMatch_selectMLS
+            ZSTD_BtFindBestMatch_selectMLS,
+            ZSTD_RowFindBestMatch_selectRowLog
         },
         {
+            NULL,
             NULL,
             NULL
         },
         {
             ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
-            ZSTD_BtFindBestMatch_dictMatchState_selectMLS
+            ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
+            ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
         },
         {
             ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
-            NULL
+            NULL,
+            ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
         }
     };
 
-    searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree];
+    searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
     U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
 
     const int isDMS = dictMode == ZSTD_dictMatchState;
@@ -915,9 +1582,7 @@ ZSTD_compressBlock_lazy_generic(
 
     assert(searchMax != NULL);
 
-    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
-
-    /* init */
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
     ip += (dictAndPrefixLength == 0);
     if (dictMode == ZSTD_noDict) {
         U32 const curr = (U32)(ip - base);
@@ -933,6 +1598,12 @@ ZSTD_compressBlock_lazy_generic(
         assert(offset_2 <= dictAndPrefixLength);
     }
 
+    if (searchMethod == search_rowHash) {
+        ZSTD_row_fillHashCache(ms, base, rowLog,
+                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+                            ms->nextToUpdate, ilimit);
+    }
+
     /* Match Loop */
 #if defined(__GNUC__) && defined(__x86_64__)
     /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
@@ -1198,6 +1869,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
 }
 
+/* Row-based matchfinder */
+size_t ZSTD_compressBlock_lazy2_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
+}
+
+
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+}
 
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_lazy_extDict_generic(
@@ -1210,7 +1945,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     const BYTE* ip = istart;
     const BYTE* anchor = istart;
     const BYTE* const iend = istart + srcSize;
-    const BYTE* const ilimit = iend - 8;
+    const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 16 : iend - 8;
     const BYTE* const base = ms->window.base;
     const U32 dictLimit = ms->window.dictLimit;
     const BYTE* const prefixStart = base + dictLimit;
@@ -1218,18 +1953,28 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     const BYTE* const dictEnd  = dictBase + dictLimit;
     const BYTE* const dictStart  = dictBase + ms->window.lowLimit;
     const U32 windowLog = ms->cParams.windowLog;
+    const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
 
     typedef size_t (*searchMax_f)(
                         ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-    searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
-
+    const searchMax_f searchFuncs[3] = {
+        ZSTD_HcFindBestMatch_extDict_selectMLS,
+        ZSTD_BtFindBestMatch_extDict_selectMLS,
+        ZSTD_RowFindBestMatch_extDict_selectRowLog
+    };
+    searchMax_f searchMax = searchFuncs[(int)searchMethod];
     U32 offset_1 = rep[0], offset_2 = rep[1];
 
-    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
 
     /* init */
     ip += (ip == prefixStart);
+    if (searchMethod == search_rowHash) {
+        ZSTD_row_fillHashCache(ms, base, rowLog,
+                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+                               ms->nextToUpdate, ilimit);
+    }
 
     /* Match Loop */
 #if defined(__GNUC__) && defined(__x86_64__)
@@ -1410,3 +2155,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
 {
     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
 }
+
+size_t ZSTD_compressBlock_greedy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+
+{
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+}
diff --git a/lib/compress/zstd_lazy.h b/lib/compress/zstd_lazy.h
index b75f7e8ae..150f7b390 100644
--- a/lib/compress/zstd_lazy.h
+++ b/lib/compress/zstd_lazy.h
@@ -26,6 +26,7 @@ extern "C" {
 #define ZSTD_LAZY_DDSS_BUCKET_LOG 2
 
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
 
 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip);
 
@@ -43,6 +44,15 @@ size_t ZSTD_compressBlock_lazy(
 size_t ZSTD_compressBlock_greedy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_btlazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -56,6 +66,15 @@ size_t ZSTD_compressBlock_lazy_dictMatchState(
 size_t ZSTD_compressBlock_greedy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -66,6 +85,15 @@ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
 size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 
 size_t ZSTD_compressBlock_greedy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -76,9 +104,19 @@ size_t ZSTD_compressBlock_lazy_extDict(
 size_t ZSTD_compressBlock_lazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict_row(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
+        
 
 #if defined (__cplusplus)
 }
diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c
index 66dbc2875..7dea97aa8 100644
--- a/lib/compress/zstd_ldm.c
+++ b/lib/compress/zstd_ldm.c
@@ -622,12 +622,13 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {
 
 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
     ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+    ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
     void const* src, size_t srcSize)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
     unsigned const minMatch = cParams->minMatch;
     ZSTD_blockCompressor const blockCompressor =
-        ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms));
+        ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms));
     /* Input bounds */
     BYTE const* const istart = (BYTE const*)src;
     BYTE const* const iend = istart + srcSize;
diff --git a/lib/compress/zstd_ldm.h b/lib/compress/zstd_ldm.h
index 7e69cc9b4..393466fa9 100644
--- a/lib/compress/zstd_ldm.h
+++ b/lib/compress/zstd_ldm.h
@@ -66,6 +66,7 @@ size_t ZSTD_ldm_generateSequences(
  */
 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
             ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+            ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
             void const* src, size_t srcSize);
 
 /**
diff --git a/lib/zstd.h b/lib/zstd.h
index 3a4a755e6..b95959a34 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -420,6 +420,7 @@ typedef enum {
      * ZSTD_c_blockDelimiters
      * ZSTD_c_validateSequences
      * ZSTD_c_splitBlocks
+     * ZSTD_c_useRowMatchFinder
      * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
      * note : never ever use experimentalParam? names directly;
      *        also, the enums values themselves are unstable and can still change.
@@ -436,7 +437,8 @@ typedef enum {
      ZSTD_c_experimentalParam10=1007,
      ZSTD_c_experimentalParam11=1008,
      ZSTD_c_experimentalParam12=1009,
-     ZSTD_c_experimentalParam13=1010
+     ZSTD_c_experimentalParam13=1010,
+     ZSTD_c_experimentalParam14=1011
 } ZSTD_cParameter;
 
 typedef struct {
@@ -1272,6 +1274,11 @@ typedef enum {
   ZSTD_lcm_uncompressed = 2   /**< Always emit uncompressed literals. */
 } ZSTD_literalCompressionMode_e;
 
+typedef enum {
+  ZSTD_urm_auto = 0,                   /* Automatically determine whether or not we use row matchfinder */
+  ZSTD_urm_disableRowMatchFinder = 1,  /* Never use row matchfinder */
+  ZSTD_urm_enableRowMatchFinder = 2    /* Always use row matchfinder when applicable */
+} ZSTD_useRowMatchFinderMode_e;
 
 /***************************************
 *  Frame size functions
@@ -1843,6 +1850,19 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
  */
 #define ZSTD_c_splitBlocks ZSTD_c_experimentalParam13
 
+/* ZSTD_c_useRowMatchFinder
+ * Default is ZSTD_urm_auto.
+ * Controlled with ZSTD_useRowMatchFinderMode_e enum.
+ * 
+ * By default, in ZSTD_urm_auto, when finalizing the compression parameters, the library
+ * will decide at runtime whether to use the row-based matchfinder based on support for SIMD
+ * instructions as well as the windowLog.
+ * 
+ * Set to ZSTD_urm_disableRowMatchFinder to never use row-based matchfinder.
+ * Set to ZSTD_urm_enableRowMatchFinder to force usage of row-based matchfinder.
+ */
+#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14
+
 /*! ZSTD_CCtx_getParameter() :
  *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
  *  and store it into int* value.
diff --git a/programs/benchzstd.c b/programs/benchzstd.c
index cd7902fbe..c40f0a24c 100644
--- a/programs/benchzstd.c
+++ b/programs/benchzstd.c
@@ -137,7 +137,8 @@ BMK_advancedParams_t BMK_initAdvancedParams(void) {
         0, /* ldmHashLog */
         0, /* ldmBuckSizeLog */
         0,  /* ldmHashRateLog */
-        ZSTD_lcm_auto /* literalCompressionMode */
+        ZSTD_lcm_auto, /* literalCompressionMode */
+        0 /* useRowMatchFinder */
     };
     return res;
 }
@@ -175,6 +176,7 @@ BMK_initCCtx(ZSTD_CCtx* ctx,
         CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_nbWorkers, adv->nbWorkers));
     }
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_compressionLevel, cLevel));
+    CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_useRowMatchFinder, adv->useRowMatchFinder));
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_enableLongDistanceMatching, adv->ldmFlag));
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmMinMatch, adv->ldmMinMatch));
     CHECK_Z(ZSTD_CCtx_setParameter(ctx, ZSTD_c_ldmHashLog, adv->ldmHashLog));
diff --git a/programs/benchzstd.h b/programs/benchzstd.h
index bdeecae7a..9b40dcc29 100644
--- a/programs/benchzstd.h
+++ b/programs/benchzstd.h
@@ -117,6 +117,7 @@ typedef struct {
     int ldmBucketSizeLog;
     int ldmHashRateLog;
     ZSTD_literalCompressionMode_e literalCompressionMode;
+    int useRowMatchFinder;  /* use row-based matchfinder if possible */
 } BMK_advancedParams_t;
 
 /* returns default parameters used by nonAdvanced functions */
diff --git a/programs/fileio.c b/programs/fileio.c
index 66b86f710..fab7918d7 100644
--- a/programs/fileio.c
+++ b/programs/fileio.c
@@ -298,6 +298,7 @@ struct FIO_prefs_s {
     int blockSize;
     int overlapLog;
     U32 adaptiveMode;
+    U32 useRowMatchFinder;
     int rsyncable;
     int minAdaptLevel;
     int maxAdaptLevel;
@@ -468,6 +469,10 @@ void FIO_setAdaptiveMode(FIO_prefs_t* const prefs, unsigned adapt) {
     prefs->adaptiveMode = adapt;
 }
 
+void FIO_setUseRowMatchFinder(FIO_prefs_t* const prefs, int useRowMatchFinder) {
+    prefs->useRowMatchFinder = useRowMatchFinder;
+}
+
 void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable) {
     if ((rsyncable>0) && (prefs->nbWorkers==0))
         EXM_THROW(1, "Rsyncable mode is not compatible with single thread mode \n");
@@ -986,6 +991,7 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs,
     if (prefs->ldmHashRateLog != FIO_LDM_PARAM_NOTSET) {
         CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashRateLog, prefs->ldmHashRateLog) );
     }
+    CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_useRowMatchFinder, prefs->useRowMatchFinder));
     /* compression parameters */
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_windowLog, (int)comprParams.windowLog) );
     CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_chainLog, (int)comprParams.chainLog) );
diff --git a/programs/fileio.h b/programs/fileio.h
index 252df2f26..1f5db4b52 100644
--- a/programs/fileio.h
+++ b/programs/fileio.h
@@ -77,6 +77,7 @@ void FIO_overwriteMode(FIO_prefs_t* const prefs);
 void FIO_setAdaptiveMode(FIO_prefs_t* const prefs, unsigned adapt);
 void FIO_setAdaptMin(FIO_prefs_t* const prefs, int minCLevel);
 void FIO_setAdaptMax(FIO_prefs_t* const prefs, int maxCLevel);
+void FIO_setUseRowMatchFinder(FIO_prefs_t* const prefs, int useRowMatchFinder);
 void FIO_setBlockSize(FIO_prefs_t* const prefs, int blockSize);
 void FIO_setChecksumFlag(FIO_prefs_t* const prefs, int checksumFlag);
 void FIO_setDictIDFlag(FIO_prefs_t* const prefs, int dictIDFlag);
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index cda2d9578..d9d2c701e 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -205,6 +205,7 @@ static void usage_advanced(const char* programName)
     DISPLAYOUT( "--long[=#]: enable long distance matching with given window log (default: %u) \n", g_defaultMaxWindowLog);
     DISPLAYOUT( "--fast[=#]: switch to very fast compression levels (default: %u) \n", 1);
     DISPLAYOUT( "--adapt : dynamically adapt compression level to I/O conditions \n");
+    DISPLAYOUT( "--[no-]row-match-finder : force enable/disable usage of fast row-based matchfinder for greedy, lazy, and lazy2 strategies \n");
 # ifdef ZSTD_MULTITHREAD
     DISPLAYOUT( " -T#    : spawns # compression threads (default: 1, 0==# cores) \n");
     DISPLAYOUT( " -B#    : select size of each job (default: 0==automatic) \n");
@@ -730,6 +731,7 @@ int main(int const argCount, const char* argv[])
         main_pause = 0,
         nbWorkers = 0,
         adapt = 0,
+        useRowMatchFinder = 0,
         adaptMin = MINCLEVEL,
         adaptMax = MAXCLEVEL,
         rsyncable = 0,
@@ -857,6 +859,8 @@ int main(int const argCount, const char* argv[])
                 if (!strcmp(argument, "--content-size")) { contentSize = 1; continue; }
                 if (!strcmp(argument, "--no-content-size")) { contentSize = 0; continue; }
                 if (!strcmp(argument, "--adapt")) { adapt = 1; continue; }
+                if (!strcmp(argument, "--no-row-match-finder")) { useRowMatchFinder = 1; continue; }
+                if (!strcmp(argument, "--row-match-finder")) { useRowMatchFinder = 2; continue; }
                 if (longCommandWArg(&argument, "--adapt=")) { adapt = 1; if (!parseAdaptParameters(argument, &adaptMin, &adaptMax)) { badusage(programName); CLEAN_RETURN(1); } continue; }
                 if (!strcmp(argument, "--single-thread")) { nbWorkers = 0; singleThread = 1; continue; }
                 if (!strcmp(argument, "--format=zstd")) { suffix = ZSTD_EXTENSION; FIO_setCompressionType(prefs, FIO_zstdCompression); continue; }
@@ -1196,6 +1200,7 @@ int main(int const argCount, const char* argv[])
         benchParams.ldmFlag = ldmFlag;
         benchParams.ldmMinMatch = (int)g_ldmMinMatch;
         benchParams.ldmHashLog = (int)g_ldmHashLog;
+        benchParams.useRowMatchFinder = useRowMatchFinder;
         if (g_ldmBucketSizeLog != LDM_PARAM_DEFAULT) {
             benchParams.ldmBucketSizeLog = (int)g_ldmBucketSizeLog;
         }
@@ -1348,6 +1353,7 @@ int main(int const argCount, const char* argv[])
         if (g_ldmBucketSizeLog != LDM_PARAM_DEFAULT) FIO_setLdmBucketSizeLog(prefs, (int)g_ldmBucketSizeLog);
         if (g_ldmHashRateLog != LDM_PARAM_DEFAULT) FIO_setLdmHashRateLog(prefs, (int)g_ldmHashRateLog);
         FIO_setAdaptiveMode(prefs, (unsigned)adapt);
+        FIO_setUseRowMatchFinder(prefs, useRowMatchFinder);
         FIO_setAdaptMin(prefs, adaptMin);
         FIO_setAdaptMax(prefs, adaptMax);
         FIO_setRsyncable(prefs, rsyncable);
@@ -1387,7 +1393,7 @@ int main(int const argCount, const char* argv[])
         else
             operationResult = FIO_compressMultipleFilenames(fCtx, prefs, filenames->fileNames, outMirroredDirName, outDirName, outFileName, suffix, dictFileName, cLevel, compressionParams);
 #else
-        (void)contentSize; (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; (void)ZSTD_strategyMap; /* not used when ZSTD_NOCOMPRESS set */
+        (void)contentSize; (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; (void)ZSTD_strategyMap; (void)useRowMatchFinder; /* not used when ZSTD_NOCOMPRESS set */
         DISPLAY("Compression not supported \n");
 #endif
     } else {  /* decompression or test */
diff --git a/tests/fuzz/.gitignore b/tests/fuzz/.gitignore
index 9bd280c08..8ef3a3efd 100644
--- a/tests/fuzz/.gitignore
+++ b/tests/fuzz/.gitignore
@@ -19,6 +19,7 @@ sequence_compression_api
 fuzz-*.log
 rt_lib_*
 d_lib_*
+crash-*
 
 # misc
 trace
diff --git a/tests/fuzz/zstd_helpers.c b/tests/fuzz/zstd_helpers.c
index 0607a0745..fcbe3361d 100644
--- a/tests/fuzz/zstd_helpers.c
+++ b/tests/fuzz/zstd_helpers.c
@@ -91,6 +91,7 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, FUZZ_dataProducer
     /* Set misc parameters */
     setRand(cctx, ZSTD_c_nbWorkers, 0, 2, producer);
     setRand(cctx, ZSTD_c_rsyncable, 0, 1, producer);
+    setRand(cctx, ZSTD_c_useRowMatchFinder, 0, 2, producer);
     setRand(cctx, ZSTD_c_forceMaxWindow, 0, 1, producer);
     setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, producer);
     setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, producer);
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index d58e6a568..1b1042393 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -1757,13 +1757,14 @@ static int basicUnitTests(U32 const seed, double compressibility)
             size_t const contentSize = 9 KB;
             const void* const dict = (const char*)CNBuffer;
             const void* const contentStart = (const char*)dict + flatdictSize;
+            /* These upper bounds are generally within a few bytes of the compressed size */
             size_t const target_nodict_cSize[22+1] = { 3840, 3770, 3870, 3830, 3770,
                                                        3770, 3770, 3770, 3750, 3750,
                                                        3742, 3670, 3670, 3660, 3660,
                                                        3660, 3660, 3660, 3660, 3660,
                                                        3660, 3660, 3660 };
             size_t const target_wdict_cSize[22+1] =  { 2830, 2890, 2890, 2820, 2940,
-                                                       2950, 2950, 2921, 2900, 2891,
+                                                       2950, 2950, 2925, 2900, 2891,
                                                        2910, 2910, 2910, 2770, 2760,
                                                        2750, 2750, 2750, 2750, 2750,
                                                        2750, 2750, 2750 };
@@ -1800,6 +1801,22 @@ static int basicUnitTests(U32 const seed, double compressibility)
                 DISPLAYLEVEL(4, "level %i with dictionary : max expected %u >= reached %u \n",
                                 l, (unsigned)target_wdict_cSize[l], (unsigned)wdict_cSize);
             }
+            /* Dict compression with DMS */
+            for ( l=1 ; l <= maxLevel; l++) {
+                size_t wdict_cSize;
+                CHECK_Z( ZSTD_CCtx_loadDictionary(ctxOrig, dict, flatdictSize) );
+                CHECK_Z( ZSTD_CCtx_setParameter(ctxOrig, ZSTD_c_compressionLevel, l) );
+                CHECK_Z( ZSTD_CCtx_setParameter(ctxOrig, ZSTD_c_enableDedicatedDictSearch, 0) );
+                CHECK_Z( ZSTD_CCtx_setParameter(ctxOrig, ZSTD_c_forceAttachDict, ZSTD_dictForceAttach) );
+                wdict_cSize = ZSTD_compress2(ctxOrig, compressedBuffer, compressedBufferSize, contentStart, contentSize);
+                if (wdict_cSize > target_wdict_cSize[l]) {
+                    DISPLAYLEVEL(1, "error : compression with dictionary and compress2 at level %i worse than expected (%u > %u) \n",
+                                    l, (unsigned)wdict_cSize, (unsigned)target_wdict_cSize[l]);
+                    goto _output_error;
+                }
+                DISPLAYLEVEL(4, "level %i with dictionary and compress2 : max expected %u >= reached %u \n",
+                                l, (unsigned)target_wdict_cSize[l], (unsigned)wdict_cSize);
+            }
 
             DISPLAYLEVEL(4, "compression efficiency tests OK \n");
         }