[LDM] Speed optimization on repetitive data

author Nick Terrell <terrelln@fb.com>

Mon, 3 May 2021 21:32:15 +0000 (14:32 -0700)

committer Nick Terrell <terrelln@fb.com>

Tue, 4 May 2021 17:57:42 +0000 (10:57 -0700)
author Nick Terrell <terrelln@fb.com>
Mon, 3 May 2021 21:32:15 +0000 (14:32 -0700)
committer Nick Terrell <terrelln@fb.com>
Tue, 4 May 2021 17:57:42 +0000 (10:57 -0700)
diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c

index 7dea97aa825ee2be048eeafbe18e05773c7372c4..f072b2628bff3066b4a1ca3f4bb8cf3f6ed75308 100644 (file)
--- a/lib/compress/zstd_ldm.c
+++ b/lib/compress/zstd_ldm.c
@@ -57,6 +57,33 @@ static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const*
      }
  }
  
+/** ZSTD_ldm_gear_reset()
+ * Feeds [data, data + minMatchLength) into the rolling hash without
+ * registering any splits, then stores the result back in state->rolling.
+ * This effectively resets the hash state. Used when skipping over data,
+ * either at the beginning of a block, or skipping sections. */
+static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state,
+                                BYTE const* data, size_t minMatchLength)
+{
+    U64 hash = state->rolling;
+    size_t n = 0;
+#define GEAR_ITER_ONCE() do {                                  \
+        hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \
+        n += 1;                                                \
+    } while (0)
+    while (n + 3 < minMatchLength) {  /* main loop, unrolled 4x */
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+        GEAR_ITER_ONCE();
+    }
+    while (n < minMatchLength) {      /* remaining 0-3 bytes */
+        GEAR_ITER_ONCE();
+    }
+#undef GEAR_ITER_ONCE
+    state->rolling = hash;  /* commit the result; otherwise this is a no-op */
+}
+
  /** ZSTD_ldm_gear_feed():
   *
   * Registers in the splits array all the split points found in the first
@@ -255,7 +282,7 @@ void ZSTD_ldm_fillHashTable(
      while (ip < iend) {
          size_t hashed;
          unsigned n;
-        
+
          numSplits = 0;
          hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits);
  
@@ -327,16 +354,8 @@ static size_t ZSTD_ldm_generateSequences_internal(
  
      /* Initialize the rolling hash state with the first minMatchLength bytes */
      ZSTD_ldm_gear_init(&hashState, params);
-    {
-        size_t n = 0;
-
-        while (n < minMatchLength) {
-            numSplits = 0;
-            n += ZSTD_ldm_gear_feed(&hashState, ip + n, minMatchLength - n,
-                                    splits, &numSplits);
-        }
-        ip += minMatchLength;
-    }
+    ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength);
+    ip += minMatchLength;
  
      while (ip < ilimit) {
          size_t hashed;
@@ -361,6 +380,7 @@ static size_t ZSTD_ldm_generateSequences_internal(
          for (n = 0; n < numSplits; n++) {
              size_t forwardMatchLength = 0, backwardMatchLength = 0,
                     bestMatchLength = 0, mLength;
+            U32 offset;
              BYTE const* const split = candidates[n].split;
              U32 const checksum = candidates[n].checksum;
              U32 const hash = candidates[n].hash;
@@ -428,9 +448,9 @@ static size_t ZSTD_ldm_generateSequences_internal(
              }
  
              /* Match found */
+            offset = (U32)(split - base) - bestEntry->offset;
              mLength = forwardMatchLength + backwardMatchLength;
              {
-                U32 const offset = (U32)(split - base) - bestEntry->offset;
                  rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;
  
                  /* Out of sequence storage */
@@ -447,6 +467,21 @@ static size_t ZSTD_ldm_generateSequences_internal(
              ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params);
  
              anchor = split + forwardMatchLength;
+
+            /* If we find a match that ends after the data that we've hashed
+             * then we have a repeating, overlapping, pattern. E.g. all zeros.
+             * If one repetition of the pattern matches our `stopMask` then all
+             * repetitions will. We don't need to insert them all into our table,
+             * only the first one. So skip over overlapping matches.
+             * This is a major speed boost (20x) for compressing a single byte
+             * repeated, when that byte ends up in the table.
+             */
+            if (anchor > ip + hashed) {
+                ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength);
+                /* Continue the outer loop at anchor (ip + hashed == anchor). */
+                ip = anchor - hashed;
+                break;
+            }
          }
  
          ip += hashed;
author	Nick Terrell <terrelln@fb.com>
	Mon, 3 May 2021 21:32:15 +0000 (14:32 -0700)
committer	Nick Terrell <terrelln@fb.com>
	Tue, 4 May 2021 17:57:42 +0000 (10:57 -0700)