[zdict] Remove ZDICT_CONTENTSIZE_MIN restriction for ZDICT_finalizeDictionary

author Nick Terrell <terrelln@fb.com>

Wed, 1 Dec 2021 00:51:16 +0000 (16:51 -0800)

committer Nick Terrell <terrelln@fb.com>

Wed, 1 Dec 2021 02:02:26 +0000 (18:02 -0800)
author Nick Terrell <terrelln@fb.com>
Wed, 1 Dec 2021 00:51:16 +0000 (16:51 -0800)
committer Nick Terrell <terrelln@fb.com>
Wed, 1 Dec 2021 02:02:26 +0000 (18:02 -0800)
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c

index d93b202e7a50740bf77492db9299507db70fa0b5..8b8b381edd99e94251495e6e86b2b27c306ea604 100644 (file)
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -915,6 +915,17 @@ _cleanup:
  }
  
  
+/**
+ * @returns the maximum repcode value
+ */
+static U32 ZDICT_maxRep(U32 const reps[ZSTD_REP_NUM])
+{
+    U32 maxRep = reps[0];
+    int r;
+    for (r = 1; r < ZSTD_REP_NUM; ++r)
+        maxRep = MAX(maxRep, reps[r]);
+    return maxRep;
+}
  
  size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
                            const void* customDictContent, size_t dictContentSize,
@@ -926,11 +937,13 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
      BYTE header[HBUFFSIZE];
      int const compressionLevel = (params.compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : params.compressionLevel;
      U32 const notificationLevel = params.notificationLevel;
+    /* The final dictionary content must be at least as large as the largest repcode */
+    size_t const minContentSize = (size_t)ZDICT_maxRep(repStartValue);
+    size_t paddingSize;
  
      /* check conditions */
      DEBUGLOG(4, "ZDICT_finalizeDictionary");
      if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
-    if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
      if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
  
      /* dictionary header */
@@ -954,12 +967,43 @@ size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
          hSize += eSize;
      }
  
-    /* copy elements in final buffer ; note : src and dst buffer can overlap */
-    if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
-    {   size_t const dictSize = hSize + dictContentSize;
-        char* dictEnd = (char*)dictBuffer + dictSize;
-        memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
-        memcpy(dictBuffer, header, hSize);
+    /* Shrink the content size if it doesn't fit in the buffer */
+    if (hSize + dictContentSize > dictBufferCapacity) {
+        dictContentSize = dictBufferCapacity - hSize;
+    }
+
+    /* Pad the dictionary content with zeros if it is too small */
+    if (dictContentSize < minContentSize) {
+        RETURN_ERROR_IF(hSize + minContentSize > dictBufferCapacity, dstSize_tooSmall,
+                        "dictBufferCapacity too small to fit max repcode");
+        paddingSize = minContentSize - dictContentSize;
+    } else {
+        paddingSize = 0;
+    }
+
+    {
+        size_t const dictSize = hSize + paddingSize + dictContentSize;
+
+        /* The dictionary consists of the header, optional padding, and the content.
+         * The padding comes before the content because the "best" position in the
+         * dictionary is the last byte.
+         */
+        BYTE* const outDictHeader = (BYTE*)dictBuffer;
+        BYTE* const outDictPadding = outDictHeader + hSize;
+        BYTE* const outDictContent = outDictPadding + paddingSize;
+
+        assert(dictSize <= dictBufferCapacity);
+        assert(outDictContent + dictContentSize == (BYTE*)dictBuffer + dictSize);
+
+        /* First copy the customDictContent into its final location.
+         * `customDictContent` and `dictBuffer` may overlap, so we must
+         * do this before any other writes into the output buffer.
+         * Then copy the header & padding into the output buffer.
+         */
+        memmove(outDictContent, customDictContent, dictContentSize);
+        memcpy(outDictHeader, header, hSize);
+        memset(outDictPadding, 0, paddingSize);
+
          return dictSize;
      }
  }
diff --git a/lib/zdict.h b/lib/zdict.h

index ac98a169bfa7c933d5109feb4b2331dc9f21898b..f1e139a40ddb000f4dcc72f53b64e6a206304f79 100644 (file)
--- a/lib/zdict.h
+++ b/lib/zdict.h
@@ -237,7 +237,6 @@ typedef struct {
   * is presumed that the most profitable content is at the end of the dictionary,
   * since that is the cheapest to reference.
   *
- * `dictContentSize` must be >= ZDICT_CONTENTSIZE_MIN bytes.
  * `maxDictSize` must be >= max(dictContentSize, ZDICT_DICTSIZE_MIN).
   *
   * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
@@ -272,8 +271,9 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
   * Use them only in association with static linking.
   * ==================================================================================== */
  
-#define ZDICT_CONTENTSIZE_MIN 128
  #define ZDICT_DICTSIZE_MIN    256
+/* Deprecated: Remove in v1.6.0 */
+#define ZDICT_CONTENTSIZE_MIN 128
  
  /*! ZDICT_cover_params_t:
   *  k and d are the only required parameters.
diff --git a/tests/playTests.sh b/tests/playTests.sh

index 1edca7c3c72efe42c18c4e066f87eb66cc8aa596..3c773ed5b5ee247eabd92068a53d15d28cb5a04c 100755 (executable)
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -1018,12 +1018,8 @@ zstd -o tmpDict --train "$TESTDIR"/*.c "$PRGDIR"/*.c
  test -f tmpDict
  zstd --train "$TESTDIR"/*.c "$PRGDIR"/*.c
  test -f dictionary
-println "- Test dictionary training fails"
-echo "000000000000000000000000000000000" > tmpz
-zstd --train tmpz tmpz tmpz tmpz tmpz tmpz tmpz tmpz tmpz && die "Dictionary training should fail : source is all zeros"
  if [ -n "$hasMT" ]
  then
-  zstd --train -T0 tmpz tmpz tmpz tmpz tmpz tmpz tmpz tmpz tmpz && die "Dictionary training should fail : source is all zeros"
    println "- Create dictionary with multithreading enabled"
    zstd --train -T0 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpDict
  fi
author	Nick Terrell <terrelln@fb.com>
	Wed, 1 Dec 2021 00:51:16 +0000 (16:51 -0800)
committer	Nick Terrell <terrelln@fb.com>
	Wed, 1 Dec 2021 02:02:26 +0000 (18:02 -0800)
lib/dictBuilder/zdict.c		patch \| blob \| blame \| history
lib/zdict.h		patch \| blob \| blame \| history
tests/playTests.sh		patch \| blob \| blame \| history