"Short cache" optimization for level 1-4 DMS (+5-30% compression speed) (#3152)

author Elliot Gorokhovsky <embg@fb.com>

Tue, 21 Jun 2022 21:27:19 +0000 (14:27 -0700)

committer GitHub <noreply@github.com>

Tue, 21 Jun 2022 21:27:19 +0000 (17:27 -0400)
author Elliot Gorokhovsky <embg@fb.com>
Tue, 21 Jun 2022 21:27:19 +0000 (14:27 -0700)
committer GitHub <noreply@github.com>
Tue, 21 Jun 2022 21:27:19 +0000 (17:27 -0400)
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c

index ec2a2fe270488a77d8e3fb2f1353361ac232e723..46a9dbe6690e6dd8cd47d9f6fa58d0d6f17c8c03 100644 (file)
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -275,6 +275,12 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
      return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
  }
  
+/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged
+ * (strategy is ZSTD_fast or ZSTD_dfast), else 0. If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
+static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
+    return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;   /* only the strategy field is inspected */
+}
+
  static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
          ZSTD_compressionParameters cParams)
  {
@@ -1367,6 +1373,13 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
      if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
          cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* minimum wlog required for valid frame header */
  
+    if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
+        U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
+        if (cPar.hashLog > maxShortCacheHashLog) {
+            cPar.hashLog = maxShortCacheHashLog;
+        }
+    }
+
      return cPar;
  }
  
@@ -2096,6 +2109,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
      return 0;
  }
  
+static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,   /* tableSize: number of U32 entries copied from src to dst */
+                                        ZSTD_compressionParameters const* cParams) {   /* cParams: only used to decide whether src entries carry tags */
+    if (ZSTD_CDictIndicesAreTagged(cParams)){
+        /* Remove tags from the CDict table if they are present, so that dst receives plain indices.
+         * See docs on "short cache" in zstd_compress_internal.h for context. */
+        size_t i;
+        for (i = 0; i < tableSize; i++) {
+            U32 const taggedIndex = src[i];
+            U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;   /* drop the low tag bits, keep the index stored above them */
+            dst[i] = index;
+        }
+    } else {
+        ZSTD_memcpy(dst, src, tableSize * sizeof(U32));   /* untagged table: entries are copied unchanged */
+    }
+}
+
  static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                              const ZSTD_CDict* cdict,
                              ZSTD_CCtx_params params,
@@ -2131,14 +2160,15 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                                                              : 0;
          size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
  
-        ZSTD_memcpy(cctx->blockState.matchState.hashTable,
-               cdict->matchState.hashTable,
-               hSize * sizeof(U32));
+        ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
+                                cdict->matchState.hashTable,
+                                hSize, cdict_cParams);
+
          /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
          if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
-            ZSTD_memcpy(cctx->blockState.matchState.chainTable,
-               cdict->matchState.chainTable,
-               chainSize * sizeof(U32));
+            ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
+                                    cdict->matchState.chainTable,
+                                    chainSize, cdict_cParams);
          }
          /* copy tag table */
          if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
@@ -4205,7 +4235,8 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
                                           ZSTD_cwksp* ws,
                                           ZSTD_CCtx_params const* params,
                                           const void* src, size_t srcSize,
-                                         ZSTD_dictTableLoadMethod_e dtlm)
+                                         ZSTD_dictTableLoadMethod_e dtlm,
+                                         ZSTD_tableFillPurpose_e tfp)
  {
      const BYTE* ip = (const BYTE*) src;
      const BYTE* const iend = ip + srcSize;
@@ -4214,22 +4245,37 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
      /* Assert that the ms params match the params we're being given */
      ZSTD_assertEqualCParams(params->cParams, ms->cParams);
  
-    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+    {   /* Ensure large dictionaries can't cause index overflow */
+
          /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
           * Dictionaries right at the edge will immediately trigger overflow
           * correction, but I don't want to insert extra constraints here.
           */
-        U32 const maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
-        /* We must have cleared our windows when our source is this large. */
-        assert(ZSTD_window_isEmpty(ms->window));
-        if (loadLdmDict)
-            assert(ZSTD_window_isEmpty(ls->window));
+        U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
+
+        int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
+        if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
+            /* Some dictionary matchfinders in zstd use "short cache",
+             * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
+             * CDict hashtable entry as a tag rather than as part of an index.
+             * When short cache is used, we need to truncate the dictionary
+             * so that its indices don't overlap with the tag. */
+            U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
+            maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
+            assert(!loadLdmDict);
+        }
+
          /* If the dictionary is too large, only load the suffix of the dictionary. */
          if (srcSize > maxDictSize) {
              ip = iend - maxDictSize;
              src = ip;
              srcSize = maxDictSize;
-        }
+    }   }
+
+    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+        /* We must have cleared our windows when our source is this large. */
+        assert(ZSTD_window_isEmpty(ms->window));
+        if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window));
      }
  
      DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
@@ -4252,10 +4298,10 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
      switch(params->cParams.strategy)
      {
      case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, dtlm);
+        ZSTD_fillHashTable(ms, iend, dtlm, tfp);
          break;
      case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, dtlm);
+        ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
          break;
  
      case ZSTD_greedy:
@@ -4421,6 +4467,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
                                        ZSTD_CCtx_params const* params,
                                        const void* dict, size_t dictSize,
                                        ZSTD_dictTableLoadMethod_e dtlm,
+                                      ZSTD_tableFillPurpose_e tfp,
                                        void* workspace)
  {
      const BYTE* dictPtr = (const BYTE*)dict;
@@ -4439,7 +4486,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
      {
          size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
          FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
-            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
+            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), "");
      }
      return dictID;
  }
@@ -4455,6 +4502,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
                           const void* dict, size_t dictSize,
                                 ZSTD_dictContentType_e dictContentType,
                                 ZSTD_dictTableLoadMethod_e dtlm,
+                               ZSTD_tableFillPurpose_e tfp,
                                 void* workspace)
  {
      DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
@@ -4467,13 +4515,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
  
      /* dict restricted modes */
      if (dictContentType == ZSTD_dct_rawContent)
-        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
+        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);
  
      if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
          if (dictContentType == ZSTD_dct_auto) {
              DEBUGLOG(4, "raw content dictionary detected");
              return ZSTD_loadDictionaryContent(
-                ms, ls, ws, params, dict, dictSize, dtlm);
+                ms, ls, ws, params, dict, dictSize, dtlm, tfp);
          }
          RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
          assert(0);   /* impossible */
@@ -4481,7 +4529,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
  
      /* dict as full zstd dictionary */
      return ZSTD_loadZstdDictionary(
-        bs, ms, ws, params, dict, dictSize, dtlm, workspace);
+        bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
  }
  
  #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
@@ -4524,11 +4572,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                          cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                          &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
                          cdict->dictContentSize, cdict->dictContentType, dtlm,
-                        cctx->entropyWorkspace)
+                        ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
                : ZSTD_compress_insertDictionary(
                          cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                          &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
-                        dictContentType, dtlm, cctx->entropyWorkspace);
+                        dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
          FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
          assert(dictID <= UINT_MAX);
          cctx->dictID = (U32)dictID;
@@ -4832,7 +4880,7 @@ static size_t ZSTD_initCDict_internal(
          {   size_t const dictID = ZSTD_compress_insertDictionary(
                      &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
                      &params, cdict->dictContent, cdict->dictContentSize,
-                    dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
+                    dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
              FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
              assert(dictID <= (size_t)(U32)-1);
              cdict->dictID = (U32)dictID;
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h

index bbae53303e6cbf88d7455079cd8cf99b0285cb59..870bcc8be90eb2e9a94cbdac6295784375876be5 100644 (file)
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -434,6 +434,7 @@ struct ZSTD_CCtx_s {
  };
  
  typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
+typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;
  
  typedef enum {
      ZSTD_noDict = 0,
@@ -745,32 +746,36 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
   *  Hashes
   ***************************************/
  static const U32 prime3bytes = 506832829U;
-static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
+static U32    ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes)  >> (32-h) ; } /* NOTE(review): the assert bounds h from above only; h == 0 would shift by the full width (assuming U32 is 32 bits) - confirm callers never pass 0 */
  MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
  
  static const U32 prime4bytes = 2654435761U;
-static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
+static U32    ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; } /* result is shifted right by (32-h); NOTE(review): h == 0 is not excluded by the assert */
  static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
  
  static const U64 prime5bytes = 889523592379ULL;
-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
+static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; } /* u is shifted left by (64-40) before the multiply; NOTE(review): h == 0 is not excluded by the assert */
  static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
  
  static const U64 prime6bytes = 227718039650203ULL;
-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; } /* u is shifted left by (64-48) before the multiply; NOTE(review): h == 0 is not excluded by the assert */
  static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
  
  static const U64 prime7bytes = 58295818150454627ULL;
-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
+static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; } /* u is shifted left by (64-56) before the multiply; NOTE(review): h == 0 is not excluded by the assert */
  static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
  
  static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; } /* u is multiplied without a prior shift; NOTE(review): h == 0 is not excluded by the assert */
  static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
  
  MEM_STATIC FORCE_INLINE_ATTR
  size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
  {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
      switch(mls)
      {
      default:
@@ -1264,6 +1269,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
  
  #endif
  
+/* Short Cache */
+
+/* Normally, zstd matchfinders follow this flow:
+ *     1. Compute hash at ip
+ *     2. Load index from hashTable[hash]
+ *     3. Check if *ip == *(base + index)
+ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
+ *
+ * Short cache is an optimization which allows us to avoid step 3 most of the time
+ * when the data doesn't actually match. With short cache, the flow becomes:
+ *     1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
+ *     2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
+ *     3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
+ *
+ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
+ * dictMatchState matchfinders.
+ */
+#define ZSTD_SHORT_CACHE_TAG_BITS 8
+#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)
+
+/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
+ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]; index must fit in 32 - ZSTD_SHORT_CACHE_TAG_BITS bits. */
+MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
+    size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;   /* bits above the tag select the table slot */
+    U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);   /* low ZSTD_SHORT_CACHE_TAG_BITS bits are the tag */
+    assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0);   /* otherwise the left shift below would discard high index bits */
+    hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;   /* entry layout: index in the high bits, tag in the low bits */
+}
+
+/* Helper function for short cache matchfinders.
+ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then returns 1 if the tags match and 0 otherwise. */
+MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
+    U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK;   /* masked value fits in U32; the size_t -> U32 narrowing is implicit here */
+    U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK;   /* bits above the mask (hash or index) are ignored */
+    return tag1 == tag2;
+}
  
  #if defined (__cplusplus)
  }
diff --git a/lib/compress/zstd_double_fast.c b/lib/compress/zstd_double_fast.c

index d8412ef0b7e653115921cc712d85aa974e2bbc1a..6697ba0a91ce7cab5803b26a2ad6ce69446deee4 100644 (file)
--- a/lib/compress/zstd_double_fast.c
+++ b/lib/compress/zstd_double_fast.c
@@ -11,8 +11,43 @@
  #include "zstd_compress_internal.h"
  #include "zstd_double_fast.h"
  
+static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,   /* writes tagged entries into ms->hashTable and ms->chainTable */
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;   /* one hash value carries both the table slot and the tag */
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;   /* chainTable storage is used as the small hash table */
+    U32  const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;   /* NOTE(review): the createCDict clamp in ZSTD_adjustCParams_internal bounds hashLog only; chainLog + tag bits may exceed the hBits <= 32 assert in ZSTD_hashPtr - confirm */
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;   /* stops HASH_READ_SIZE bytes before end */
+    const U32 fastHashFillStep = 3;
  
-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty (a raw entry of 0). All writes go through ZSTD_writeTaggedIndex.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);   /* small hash uses mls as its length argument */
+            size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);   /* large hash always uses 8 */
+            if (i == 0) {   /* the small table only receives the first position of each step */
+                ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
+            }
+            if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {   /* slot lookup drops the tag bits first */
+                ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
+            }
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+    }   }
+}
+
+static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
                                void const* end, ZSTD_dictTableLoadMethod_e dtlm)
  {
      const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -43,7 +78,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
              /* Only load extra positions for ZSTD_dtlm_full */
              if (dtlm == ZSTD_dtlm_fast)
                  break;
-    }   }
+        }   }
+}
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,   /* public entry point: dispatches on tfp to one of the two static fillers */
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)   /* ms, end and dtlm are forwarded unchanged */
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);   /* entries are written through ZSTD_writeTaggedIndex */
+    } else {   /* any other tfp value takes the CCtx path */
+        ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
+    }
  }
  
  
@@ -289,8 +336,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
      const BYTE* const dictStart    = dictBase + dictStartIndex;
      const BYTE* const dictEnd      = dms->window.nextSrc;
      const U32 dictIndexDelta       = prefixLowestIndex - (U32)(dictEnd - dictBase);
-    const U32 dictHBitsL           = dictCParams->hashLog;
-    const U32 dictHBitsS           = dictCParams->chainLog;
+    const U32 dictHBitsL           = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const U32 dictHBitsS           = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
      const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
  
      DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
@@ -312,8 +359,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
          U32 offset;
          size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
          size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
-        size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
-        size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
+        U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
+        int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
          U32 const curr = (U32)(ip-base);
          U32 const matchIndexL = hashLong[h2];
          U32 matchIndexS = hashSmall[h];
@@ -343,9 +394,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                  while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
                  goto _match_found;
              }
-        } else {
+        } else if (dictTagsMatchL) {
              /* check dictMatchState long match */
-            U32 const dictMatchIndexL = dictHashLong[dictHL];
+            U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
              const BYTE* dictMatchL = dictBase + dictMatchIndexL;
              assert(dictMatchL < dictEnd);
  
@@ -361,9 +412,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
              if (MEM_read32(match) == MEM_read32(ip)) {
                  goto _search_next_long;
              }
-        } else {
+        } else if (dictTagsMatchS) {
              /* check dictMatchState short match */
-            U32 const dictMatchIndexS = dictHashSmall[dictHS];
+            U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
              match = dictBase + dictMatchIndexS;
              matchIndexS = dictMatchIndexS + dictIndexDelta;
  
@@ -378,10 +429,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
          continue;
  
  _search_next_long:
-
          {   size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
-            size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+            size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
              U32 const matchIndexL3 = hashLong[hl3];
+            U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
              const BYTE* matchL3 = base + matchIndexL3;
              hashLong[hl3] = curr + 1;
  
@@ -394,9 +446,9 @@ _search_next_long:
                      while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
                      goto _match_found;
                  }
-            } else {
+            } else if (dictTagsMatchL3) {
                  /* check dict long +1 match */
-                U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
+                U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
                  const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
                  assert(dictMatchL3 < dictEnd);
                  if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
diff --git a/lib/compress/zstd_double_fast.h b/lib/compress/zstd_double_fast.h

index e16b7b03a32439e936128988e5a6e0ffdd7475c0..6d8ee8c651fa92db324690a9b084185661e7c96c 100644 (file)
--- a/lib/compress/zstd_double_fast.h
+++ b/lib/compress/zstd_double_fast.h
@@ -19,7 +19,8 @@ extern "C" {
  #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
  
  void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                              ZSTD_tableFillPurpose_e tfp);
  size_t ZSTD_compressBlock_doubleFast(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize);
diff --git a/lib/compress/zstd_fast.c b/lib/compress/zstd_fast.c

index b0ec972c68258a512cdec2b5a6594f14c3e28bf4..de7336907e395d951c125ab9810c6862c82ed85d 100644 (file)
--- a/lib/compress/zstd_fast.c
+++ b/lib/compress/zstd_fast.c
@@ -11,8 +11,42 @@
  #include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
  #include "zstd_fast.h"
  
+static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,   /* writes tagged entries into ms->hashTable */
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;   /* one hash value carries both the table slot and the tag */
+    U32  const mls = cParams->minMatch;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;   /* stops HASH_READ_SIZE bytes before end */
+    const U32 fastHashFillStep = 3;
  
-void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_full);
+
+    /* Always insert every fastHashFillStep position into the hash table.
+     * Insert the other positions if their hash entry is empty (a raw entry of 0).
+     */
+    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
+            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }
+
+        if (dtlm == ZSTD_dtlm_fast) continue;   /* not taken while the assert above is enabled and holds */
+        /* Only load extra positions for ZSTD_dtlm_full */
+        {   U32 p;
+            for (p = 1; p < fastHashFillStep; ++p) {
+                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
+                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled; slot lookup drops the tag bits first */
+                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
+                }   }   }   }
+}
+
+static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
                          const void* const end,
                          ZSTD_dictTableLoadMethod_e dtlm)
  {
@@ -25,6 +59,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
      const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
      const U32 fastHashFillStep = 3;
  
+    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_fast);
+
      /* Always insert every fastHashFillStep position into the hash table.
       * Insert the other positions if their hash entry is empty.
       */
@@ -42,6 +80,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
      }   }   }   }
  }
  
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,   /* public entry point: dispatches on tfp to one of the two static fillers */
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)   /* ms, end and dtlm are forwarded unchanged */
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillHashTableForCDict(ms, end, dtlm);   /* entries are written through ZSTD_writeTaggedIndex; asserts dtlm == ZSTD_dtlm_full */
+    } else {   /* any other tfp value takes the CCtx path */
+        ZSTD_fillHashTableForCCtx(ms, end, dtlm);   /* asserts dtlm == ZSTD_dtlm_fast */
+    }
+}
+
  
  /**
   * If you squint hard enough (and ignore repcodes), the search operation at any
@@ -435,7 +485,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
      const BYTE* const dictEnd      = dms->window.nextSrc;
      const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
      const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
-    const U32 dictHLog             = dictCParams->hashLog;
+    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
  
      /* if a dictionary is still attached, it necessarily means that
       * it is within window size. So we just check it. */
@@ -463,8 +513,11 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
      while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
          size_t mLength;
          size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
-        const size_t dictHash0 = ZSTD_hashPtr(ip0, dictHLog, mls);
-        U32 dictMatchIndex = dictHashTable[dictHash0];
+
+        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
+        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
+
          U32 matchIndex = hashTable[hash0];
          U32 curr = (U32)(ip0 - base);
          size_t step = stepSize;
@@ -479,7 +532,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
                                     dictBase + (repIndex - dictIndexDelta) :
                                     base + repIndex;
              const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
-            const size_t dictHash1 = ZSTD_hashPtr(ip1, dictHLog, mls);
+            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
              hashTable[hash0] = curr;   /* update hash table */
  
              if (((U32) ((prefixStartIndex - 1) - repIndex) >=
@@ -490,26 +543,33 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
                  ip0++;
                  ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
                  break;
-            } else if (matchIndex <= prefixStartIndex) {
-                /* We only look for a dict match if the normal matchIndex is invalid */
+            }
+
+            if (dictTagsMatch) {
+                /* Found a possible dict match */
+                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
                  const BYTE* dictMatch = dictBase + dictMatchIndex;
                  if (dictMatchIndex > dictStartIndex &&
                      MEM_read32(dictMatch) == MEM_read32(ip0)) {
-                    /* found a dict match */
-                    U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
-                    mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
-                    while (((ip0 > anchor) & (dictMatch > dictStart))
-                           && (ip0[-1] == dictMatch[-1])) {
-                        ip0--;
-                        dictMatch--;
-                        mLength++;
-                    } /* catch up */
-                    offset_2 = offset_1;
-                    offset_1 = offset;
-                    ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
-                    break;
+                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
+                    if (matchIndex <= prefixStartIndex) {
+                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
+                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
+                        while (((ip0 > anchor) & (dictMatch > dictStart))
+                            && (ip0[-1] == dictMatch[-1])) {
+                            ip0--;
+                            dictMatch--;
+                            mLength++;
+                        } /* catch up */
+                        offset_2 = offset_1;
+                        offset_1 = offset;
+                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                        break;
+                    }
                  }
-            } else if (MEM_read32(match) == MEM_read32(ip0)) {
+            }
+
+            if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
                  /* found a regular match */
                  U32 const offset = (U32) (ip0 - match);
                  mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
@@ -526,7 +586,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
              }
  
              /* Prepare for next iteration */
-            dictMatchIndex = dictHashTable[dictHash1];
+            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
              matchIndex = hashTable[hash1];
  
              if (ip1 >= nextStep) {
diff --git a/lib/compress/zstd_fast.h b/lib/compress/zstd_fast.h

index 0d4a0c1090ffa9f4508ff531d95c4da6637631db..3bfeb2c5f83cba4c1a17afbddd0fca3f452de7cd 100644 (file)
--- a/lib/compress/zstd_fast.h
+++ b/lib/compress/zstd_fast.h
@@ -19,7 +19,8 @@ extern "C" {
  #include "zstd_compress_internal.h"
  
  void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp);
  size_t ZSTD_compressBlock_fast(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize);
diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c

index e1d2f7410144bbe0c3ef8d6088f820dec9686823..c14c62454f4250204211f81f0a6389e06db4667c 100644 (file)
--- a/lib/compress/zstd_ldm.c
+++ b/lib/compress/zstd_ldm.c
@@ -242,11 +242,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
      switch(ms->cParams.strategy)
      {
      case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
          break;
  
      case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
          break;
  
      case ZSTD_greedy:
author	Elliot Gorokhovsky <embg@fb.com>
	Tue, 21 Jun 2022 21:27:19 +0000 (14:27 -0700)
committer	GitHub <noreply@github.com>
	Tue, 21 Jun 2022 21:27:19 +0000 (17:27 -0400)
lib/compress/zstd_compress.c		patch \| blob \| blame \| history
lib/compress/zstd_compress_internal.h		patch \| blob \| blame \| history
lib/compress/zstd_double_fast.c		patch \| blob \| blame \| history
lib/compress/zstd_double_fast.h		patch \| blob \| blame \| history
lib/compress/zstd_fast.c		patch \| blob \| blame \| history
lib/compress/zstd_fast.h		patch \| blob \| blame \| history
lib/compress/zstd_ldm.c		patch \| blob \| blame \| history