Adding shrinking flag for cover and fastcover (#1656)

author Tyler-Tran <39778355+Tyler-Tran@users.noreply.github.com>

Thu, 27 Jun 2019 23:26:57 +0000 (16:26 -0700)

committer Nick Terrell <terrelln@fb.com>

Thu, 27 Jun 2019 23:26:57 +0000 (16:26 -0700)
author Tyler-Tran <39778355+Tyler-Tran@users.noreply.github.com>
Thu, 27 Jun 2019 23:26:57 +0000 (16:26 -0700)
committer Nick Terrell <terrelln@fb.com>
Thu, 27 Jun 2019 23:26:57 +0000 (16:26 -0700)
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c

index 961e1cb9d231898138f9370c6b2ceabd642d19a9..621996759b6aeb67ecc3d101a6ffa5fb13e28caf 100644 (file)
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -889,9 +889,11 @@ void COVER_best_start(COVER_best_t *best) {
   * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
   * If this dictionary is the best so far save it and its parameters.
   */
-void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
-                              ZDICT_cover_params_t parameters, void *dict,
-                              size_t dictSize) {
+void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
+                              COVER_dictSelection_t selection) {
+  void* dict = selection.dictContent;
+  size_t compressedSize = selection.totalCompressedSize;
+  size_t dictSize = selection.dictSize;
    if (!best) {
      return;
    }
@@ -917,6 +919,9 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
          }
        }
        /* Save the dictionary, parameters, and size */
+      if (!dict) {
+        return;
+      }
        memcpy(best->dict, dict, dictSize);
        best->dictSize = dictSize;
        best->parameters = parameters;
@@ -929,6 +934,111 @@ void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
    }
  }
  
+COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
+    COVER_dictSelection_t selection = { NULL, 0, error };
+    return selection;
+}
+
+unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
+  return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
+}
+
+void COVER_dictSelectionFree(COVER_dictSelection_t selection){
+  free(selection.dictContent);
+}
+
+COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
+        size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
+        size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
+
+  size_t largestDict = 0;
+  size_t largestCompressed = 0;
+  BYTE* customDictContentEnd = customDictContent + dictContentSize;
+
+  BYTE * largestDictbuffer = (BYTE *)malloc(dictContentSize);
+  BYTE * candidateDictBuffer = (BYTE *)malloc(dictContentSize);
+  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
+
+  if (!largestDictbuffer || !candidateDictBuffer) {
+    free(largestDictbuffer);
+    free(candidateDictBuffer);
+    return COVER_dictSelectionError(dictContentSize);
+  }
+
+  /* Initial dictionary size and compressed size */
+  memcpy(largestDictbuffer, customDictContent, dictContentSize);
+  dictContentSize = ZDICT_finalizeDictionary(
+    largestDictbuffer, dictContentSize, customDictContent, dictContentSize,
+    samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
+
+  if (ZDICT_isError(dictContentSize)) {
+    free(largestDictbuffer);
+    free(candidateDictBuffer);
+    return COVER_dictSelectionError(dictContentSize);
+  }
+
+  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
+                                                       samplesBuffer, offsets,
+                                                       nbCheckSamples, nbSamples,
+                                                       largestDictbuffer, dictContentSize);
+
+  if (ZSTD_isError(totalCompressedSize)) {
+    free(largestDictbuffer);
+    free(candidateDictBuffer);
+    return COVER_dictSelectionError(totalCompressedSize);
+  }
+
+  if (params.shrinkDict == 0) {
+    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
+    free(candidateDictBuffer);
+    return selection;
+  }
+
+  largestDict = dictContentSize;
+  largestCompressed = totalCompressedSize;
+  dictContentSize = ZDICT_DICTSIZE_MIN;
+
+  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
+  while (dictContentSize < largestDict) {
+    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
+    dictContentSize = ZDICT_finalizeDictionary(
+      candidateDictBuffer, dictContentSize, customDictContentEnd - dictContentSize, dictContentSize,
+      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
+
+    if (ZDICT_isError(dictContentSize)) {
+      free(largestDictbuffer);
+      free(candidateDictBuffer);
+      return COVER_dictSelectionError(dictContentSize);
+
+    }
+
+    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
+                                                         samplesBuffer, offsets,
+                                                         nbCheckSamples, nbSamples,
+                                                         candidateDictBuffer, dictContentSize);
+
+    if (ZSTD_isError(totalCompressedSize)) {
+      free(largestDictbuffer);
+      free(candidateDictBuffer);
+      return COVER_dictSelectionError(totalCompressedSize);
+    }
+
+    if (totalCompressedSize <= largestCompressed * regressionTolerance) {
+      COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
+      free(largestDictbuffer);
+      return selection;
+    }
+    dictContentSize *= 2;
+  }
+  dictContentSize = largestDict;
+  totalCompressedSize = largestCompressed;
+  {
+    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
+    free(candidateDictBuffer);
+    return selection;
+  }
+}
+
  /**
   * Parameters for COVER_tryParameters().
   */
@@ -954,6 +1064,7 @@ static void COVER_tryParameters(void *opaque) {
    /* Allocate space for hash table, dict, and freqs */
    COVER_map_t activeDmers;
    BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
    U32 *freqs = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
    if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
      DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
@@ -969,29 +1080,21 @@ static void COVER_tryParameters(void *opaque) {
    {
      const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                                dictBufferCapacity, parameters);
-    dictBufferCapacity = ZDICT_finalizeDictionary(
-        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples,
-        parameters.zParams);
-    if (ZDICT_isError(dictBufferCapacity)) {
-      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+    selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
+        totalCompressedSize);
+
+    if (COVER_dictSelectionIsError(selection)) {
+      DISPLAYLEVEL(1, "Failed to select dictionary\n");
        goto _cleanup;
      }
    }
-  /* Check total compressed size */
-  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
-                                                       ctx->samples, ctx->offsets,
-                                                       ctx->nbTrainSamples, ctx->nbSamples,
-                                                       dict, dictBufferCapacity);
-
  _cleanup:
-  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
-                    dictBufferCapacity);
+  free(dict);
+  COVER_best_finish(data->best, parameters, selection);
    free(data);
    COVER_map_destroy(&activeDmers);
-  if (dict) {
-    free(dict);
-  }
+  COVER_dictSelectionFree(selection);
    if (freqs) {
      free(freqs);
    }
@@ -1013,6 +1116,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
    const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
    const unsigned kIterations =
        (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
+  const unsigned shrinkDict = 0;
    /* Local variables */
    const int displayLevel = parameters->zParams.notificationLevel;
    unsigned iteration = 1;
@@ -1091,6 +1195,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
        data->parameters.d = d;
        data->parameters.splitPoint = splitPoint;
        data->parameters.steps = kSteps;
+      data->parameters.shrinkDict = shrinkDict;
        data->parameters.zParams.notificationLevel = g_displayLevel;
        /* Check the parameters */
        if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
diff --git a/lib/dictBuilder/cover.h b/lib/dictBuilder/cover.h

index efb46807c7851064583658d482e610ffb96c1780..d9e0636a65981c614397dd4e044e4fcfc949ac8b 100644 (file)
--- a/lib/dictBuilder/cover.h
+++ b/lib/dictBuilder/cover.h
@@ -46,6 +46,15 @@ typedef struct {
    U32 size;
  } COVER_epoch_info_t;
  
+/**
+ * Struct used for the dictionary selection function.
+ */
+typedef struct COVER_dictSelection {
+  BYTE* dictContent;
+  size_t dictSize;
+  size_t totalCompressedSize;
+} COVER_dictSelection_t;
+
  /**
   * Computes the number of epochs and the size of each epoch.
   * We will make sure that each epoch gets at least 10 * k bytes.
@@ -107,6 +116,32 @@ void COVER_best_start(COVER_best_t *best);
   * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
   * If this dictionary is the best so far save it and its parameters.
   */
-void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
-                       ZDICT_cover_params_t parameters, void *dict,
-                       size_t dictSize);
+void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
+                       COVER_dictSelection_t selection);
+/**
+ * Error function for COVER_selectDict function. Checks if the return
+ * value is an error.
+ */
+unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection);
+
+ /**
+  * Error function for COVER_selectDict function. Returns a struct where
+  * return.totalCompressedSize is a ZSTD error.
+  */
+COVER_dictSelection_t COVER_dictSelectionError(size_t error);
+
+/**
+ * Always call after selectDict is called to free up used memory from
+ * newly created dictionary.
+ */
+void COVER_dictSelectionFree(COVER_dictSelection_t selection);
+
+/**
+ * Called to finalize the dictionary and select one based on whether or not
+ * the shrink-dict flag was enabled. If enabled the dictionary used is the
+ * smallest dictionary within a specified regression of the compressed size
+ * from the largest dictionary.
+ */
+ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent,
+                       size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
+                       size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
diff --git a/lib/dictBuilder/fastcover.c b/lib/dictBuilder/fastcover.c

index 40131e693632245925bcfd883c7db73906c1892d..941bb5a26ae66fcb8d1620a7e5723341a492e23e 100644 (file)
--- a/lib/dictBuilder/fastcover.c
+++ b/lib/dictBuilder/fastcover.c
@@ -435,7 +435,6 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
    return tail;
  }
  
-
  /**
   * Parameters for FASTCOVER_tryParameters().
   */
@@ -464,6 +463,7 @@ static void FASTCOVER_tryParameters(void *opaque)
    U16* segmentFreqs = (U16 *)calloc(((U64)1 << ctx->f), sizeof(U16));
    /* Allocate space for hash table, dict, and freqs */
    BYTE *const dict = (BYTE * const)malloc(dictBufferCapacity);
+  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
    U32 *freqs = (U32*) malloc(((U64)1 << ctx->f) * sizeof(U32));
    if (!segmentFreqs || !dict || !freqs) {
      DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
@@ -473,27 +473,24 @@ static void FASTCOVER_tryParameters(void *opaque)
    memcpy(freqs, ctx->freqs, ((U64)1 << ctx->f) * sizeof(U32));
    /* Build the dictionary */
    { const size_t tail = FASTCOVER_buildDictionary(ctx, freqs, dict, dictBufferCapacity,
-                                                  parameters, segmentFreqs);
+                                                    parameters, segmentFreqs);
+
      const unsigned nbFinalizeSamples = (unsigned)(ctx->nbTrainSamples * ctx->accelParams.finalize / 100);
-    dictBufferCapacity = ZDICT_finalizeDictionary(
-        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, nbFinalizeSamples, parameters.zParams);
-    if (ZDICT_isError(dictBufferCapacity)) {
-      DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
+    selection = COVER_selectDict(dict + tail, dictBufferCapacity - tail,
+         ctx->samples, ctx->samplesSizes, nbFinalizeSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
+         totalCompressedSize);
+
+    if (COVER_dictSelectionIsError(selection)) {
+      DISPLAYLEVEL(1, "Failed to select dictionary\n");
        goto _cleanup;
      }
    }
-  /* Check total compressed size */
-  totalCompressedSize = COVER_checkTotalCompressedSize(parameters, ctx->samplesSizes,
-                                                       ctx->samples, ctx->offsets,
-                                                       ctx->nbTrainSamples, ctx->nbSamples,
-                                                       dict, dictBufferCapacity);
  _cleanup:
-  COVER_best_finish(data->best, totalCompressedSize, parameters, dict,
-                    dictBufferCapacity);
+  free(dict);
+  COVER_best_finish(data->best, parameters, selection);
    free(data);
    free(segmentFreqs);
-  free(dict);
+  COVER_dictSelectionFree(selection);
    free(freqs);
  }
  
@@ -508,6 +505,7 @@ FASTCOVER_convertToCoverParams(ZDICT_fastCover_params_t fastCoverParams,
      coverParams->nbThreads = fastCoverParams.nbThreads;
      coverParams->splitPoint = fastCoverParams.splitPoint;
      coverParams->zParams = fastCoverParams.zParams;
+    coverParams->shrinkDict = fastCoverParams.shrinkDict;
  }
  
  
@@ -524,6 +522,7 @@ FASTCOVER_convertToFastCoverParams(ZDICT_cover_params_t coverParams,
      fastCoverParams->f = f;
      fastCoverParams->accel = accel;
      fastCoverParams->zParams = coverParams.zParams;
+    fastCoverParams->shrinkDict = coverParams.shrinkDict;
  }
  
  
@@ -619,6 +618,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
          (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
      const unsigned f = parameters->f == 0 ? DEFAULT_F : parameters->f;
      const unsigned accel = parameters->accel == 0 ? DEFAULT_ACCEL : parameters->accel;
+    const unsigned shrinkDict = 0;
      /* Local variables */
      const int displayLevel = parameters->zParams.notificationLevel;
      unsigned iteration = 1;
@@ -703,6 +703,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
          data->parameters.d = d;
          data->parameters.splitPoint = splitPoint;
          data->parameters.steps = kSteps;
+        data->parameters.shrinkDict = shrinkDict;
          data->parameters.zParams.notificationLevel = g_displayLevel;
          /* Check the parameters */
          if (!FASTCOVER_checkParameters(data->parameters, dictBufferCapacity,
diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h

index e22973173cb8996305560d9c5ae1c65d9d5953d7..37978ecdfb818be65a11f34c90a66483da7af700 100644 (file)
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@@ -94,6 +94,8 @@ typedef struct {
      unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
      unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
      double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 when all samples are used for both training and testing */
+    unsigned shrinkDict;         /* Try candidate dictionaries of increasing size, starting from the minimum size, and select the smallest one whose compressed size is at most shrinkDictMaxRegression% worse than that of the largest dictionary. 0 means no shrinking and 1 means shrinking */
+    unsigned shrinkDictMaxRegression; /* Maximum allowed regression, in percent: a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
      ZDICT_params_t zParams;
  } ZDICT_cover_params_t;
  
@@ -105,6 +107,9 @@ typedef struct {
      unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
      double splitPoint;           /* Percentage of samples used for training: Only used for optimization : the first nbSamples * splitPoint samples will be used to training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 when all samples are used for both training and testing */
      unsigned accel;              /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
+    unsigned shrinkDict;         /* Try candidate dictionaries of increasing size, starting from the minimum size, and select the smallest one whose compressed size is at most shrinkDictMaxRegression% worse than that of the largest dictionary. 0 means no shrinking and 1 means shrinking */
+    unsigned shrinkDictMaxRegression; /* Maximum allowed regression, in percent: a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
+
      ZDICT_params_t zParams;
  } ZDICT_fastCover_params_t;
  
diff --git a/programs/zstdcli.c b/programs/zstdcli.c

index c0dd925cefa921108c238090a08eeb2b6ebe5fab..e862d398fb3817fd2fd5667846061b75e1ac8be2 100644 (file)
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -179,8 +179,8 @@ static int usage_advanced(const char* programName)
      DISPLAY( "\n");
      DISPLAY( "Dictionary builder : \n");
      DISPLAY( "--train ## : create a dictionary from a training set of files \n");
-    DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#] : use the cover algorithm with optional args\n");
-    DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#] : use the fast cover algorithm with optional args\n");
+    DISPLAY( "--train-cover[=k=#,d=#,steps=#,split=#,shrink[=#]] : use the cover algorithm with optional args\n");
+    DISPLAY( "--train-fastcover[=k=#,d=#,f=#,steps=#,split=#,accel=#,shrink[=#]] : use the fast cover algorithm with optional args\n");
      DISPLAY( "--train-legacy[=s=#] : use the legacy algorithm with selectivity (default: %u)\n", g_defaultSelectivityLevel);
      DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
      DISPLAY( "--maxdict=# : limit dictionary to specified size (default: %u) \n", g_defaultMaxDictSize);
@@ -299,6 +299,7 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
   * @return 1 means that cover parameters were correct
   * @return 0 in case of malformed parameters
   */
+static const unsigned kDefaultRegression = 1;
  static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
  {
      memset(params, 0, sizeof(*params));
@@ -311,10 +312,23 @@ static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t
            params->splitPoint = (double)splitPercentage / 100.0;
            if (stringPtr[0]==',') { stringPtr++; continue; } else break;
          }
+        if (longCommandWArg(&stringPtr, "shrink")) {
+          params->shrinkDictMaxRegression = kDefaultRegression;
+          params->shrinkDict = 1;
+          if (stringPtr[0]=='=') {
+            stringPtr++;
+            params->shrinkDictMaxRegression = readU32FromChar(&stringPtr);
+          }
+          if (stringPtr[0]==',') {
+            stringPtr++;
+            continue;
+          }
+          else break;
+        }
          return 0;
      }
      if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100));
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nsteps=%u\nsplit=%u\nshrink%u\n", params->k, params->d, params->steps, (unsigned)(params->splitPoint * 100), params->shrinkDictMaxRegression);
      return 1;
  }
  
@@ -338,10 +352,23 @@ static unsigned parseFastCoverParameters(const char* stringPtr, ZDICT_fastCover_
            params->splitPoint = (double)splitPercentage / 100.0;
            if (stringPtr[0]==',') { stringPtr++; continue; } else break;
          }
+        if (longCommandWArg(&stringPtr, "shrink")) {
+          params->shrinkDictMaxRegression = kDefaultRegression;
+          params->shrinkDict = 1;
+          if (stringPtr[0]=='=') {
+            stringPtr++;
+            params->shrinkDictMaxRegression = readU32FromChar(&stringPtr);
+          }
+          if (stringPtr[0]==',') {
+            stringPtr++;
+            continue;
+          }
+          else break;
+        }
          return 0;
      }
      if (stringPtr[0] != 0) return 0;
-    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel);
+    DISPLAYLEVEL(4, "cover: k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\nshrink=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint * 100), params->accel, params->shrinkDictMaxRegression);
      return 1;
  }
  
@@ -367,6 +394,8 @@ static ZDICT_cover_params_t defaultCoverParams(void)
      params.d = 8;
      params.steps = 4;
      params.splitPoint = 1.0;
+    params.shrinkDict = 0;
+    params.shrinkDictMaxRegression = kDefaultRegression;
      return params;
  }
  
@@ -379,6 +408,8 @@ static ZDICT_fastCover_params_t defaultFastCoverParams(void)
      params.steps = 4;
      params.splitPoint = 0.75; /* different from default splitPoint of cover */
      params.accel = DEFAULT_ACCEL;
+    params.shrinkDict = 0;
+    params.shrinkDictMaxRegression = kDefaultRegression;
      return params;
  }
  #endif
diff --git a/tests/fuzzer.c b/tests/fuzzer.c

index 2b6bfff6a4c241f94b78a7b9e46f2a9b81743d84..f42de9ed55c3727eae55fb2278711751cbb5e7dc 100644 (file)
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -1104,6 +1104,22 @@ static int basicUnitTests(U32 seed, double compressibility)
          }
          DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
  
+        DISPLAYLEVEL(3, "test%3i : COVER dictBuilder with shrinkDict: ", testNb++);
+        { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
+        {   ZDICT_cover_params_t coverParams;
+            memset(&coverParams, 0, sizeof(coverParams));
+            coverParams.steps = 8;
+            coverParams.nbThreads = 4;
+            coverParams.shrinkDict = 1;
+            coverParams.shrinkDictMaxRegression = 1;
+            dictSize = ZDICT_optimizeTrainFromBuffer_cover(
+                dictBuffer, dictBufferCapacity,
+                CNBuffer, samplesSizes, nbSamples/8,  /* less samples for faster tests */
+                &coverParams);
+            if (ZDICT_isError(dictSize)) goto _output_error;
+        }
+        DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
+
          DISPLAYLEVEL(3, "test%3i : Multithreaded FASTCOVER dictBuilder : ", testNb++);
          { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
          {   ZDICT_fastCover_params_t fastCoverParams;
@@ -1118,6 +1134,22 @@ static int basicUnitTests(U32 seed, double compressibility)
          }
          DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
  
+        DISPLAYLEVEL(3, "test%3i : FASTCOVER dictBuilder with shrinkDict: ", testNb++);
+        { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
+        {   ZDICT_fastCover_params_t fastCoverParams;
+            memset(&fastCoverParams, 0, sizeof(fastCoverParams));
+            fastCoverParams.steps = 8;
+            fastCoverParams.nbThreads = 4;
+            fastCoverParams.shrinkDict = 1;
+            fastCoverParams.shrinkDictMaxRegression = 1;
+            dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(
+                dictBuffer, dictBufferCapacity,
+                CNBuffer, samplesSizes, nbSamples,
+                &fastCoverParams);
+            if (ZDICT_isError(dictSize)) goto _output_error;
+        }
+        DISPLAYLEVEL(3, "OK, created dictionary of size %u \n", (unsigned)dictSize);
+
          DISPLAYLEVEL(3, "test%3i : check dictID : ", testNb++);
          dictID = ZDICT_getDictID(dictBuffer, dictSize);
          if (dictID==0) goto _output_error;
diff --git a/tests/playTests.sh b/tests/playTests.sh

index c4d68998e56cff748ff0c6a5931fa4519ed6c555..2b8843f9784839c56d950d521d30df27a9c540e3 100755 (executable)
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -499,6 +499,10 @@ $ZSTD --train-fastcover=k=56,d=8 && die "Create dictionary without input file"
  println "- Create dictionary with short dictID"
  $ZSTD --train-fastcover=k=46,d=8,f=15,split=80 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpDict1
  cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
+println "- Create dictionaries with shrink-dict flag enabled"
+$ZSTD --train-fastcover=steps=256,shrink "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict
+$ZSTD --train-fastcover=steps=256,shrink=1 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict1
+$ZSTD --train-fastcover=steps=256,shrink=5 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpShrinkDict2
  println "- Create dictionary with size limit"
  $ZSTD --train-fastcover=steps=8 "$TESTDIR"/*.c "$PRGDIR"/*.c -o tmpDict2 --maxdict=4K
  println "- Compare size of dictionary from 90% training samples with 80% training samples"
@@ -989,6 +993,10 @@ $ZSTD --train-cover=k=56,d=8 && die "Create dictionary without input file (shoul
  println "- Create second (different) dictionary"
  $ZSTD --train-cover=k=56,d=8 "$TESTDIR"/*.c "$PRGDIR"/*.c "$PRGDIR"/*.h -o tmpDictC
  $ZSTD -d tmp.zst -D tmpDictC -fo result && die "wrong dictionary not detected!"
+println "- Create dictionary using shrink-dict flag"
+$ZSTD --train-cover=steps=256,shrink "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict
+$ZSTD --train-cover=steps=256,shrink=1 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict1
+$ZSTD --train-cover=steps=256,shrink=5 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpShrinkDict2
  println "- Create dictionary with short dictID"
  $ZSTD --train-cover=k=46,d=8,split=80 "$TESTDIR"/*.c "$PRGDIR"/*.c --dictID=1 -o tmpDict1
  cmp tmpDict tmpDict1 && die "dictionaries should have different ID !"
author	Tyler-Tran <39778355+Tyler-Tran@users.noreply.github.com>
	Thu, 27 Jun 2019 23:26:57 +0000 (16:26 -0700)
committer	Nick Terrell <terrelln@fb.com>
	Thu, 27 Jun 2019 23:26:57 +0000 (16:26 -0700)
lib/dictBuilder/cover.c		patch \| blob \| blame \| history
lib/dictBuilder/cover.h		patch \| blob \| blame \| history
lib/dictBuilder/fastcover.c		patch \| blob \| blame \| history
lib/dictBuilder/zdict.h		patch \| blob \| blame \| history
programs/zstdcli.c		patch \| blob \| blame \| history
tests/fuzzer.c		patch \| blob \| blame \| history
tests/playTests.sh		patch \| blob \| blame \| history