[zdict] Make COVER the default algorithm

author Nick Terrell <terrelln@fb.com>

Tue, 27 Jun 2017 04:07:14 +0000 (21:07 -0700)

committer Nick Terrell <terrelln@fb.com>

Tue, 27 Jun 2017 04:09:22 +0000 (21:09 -0700)
author Nick Terrell <terrelln@fb.com>
Tue, 27 Jun 2017 04:07:14 +0000 (21:07 -0700)
committer Nick Terrell <terrelln@fb.com>
Tue, 27 Jun 2017 04:09:22 +0000 (21:09 -0700)
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c

index 1863c8f34542e3577c7727a6c3419d096ab51265..06c1b9fadb7ae00a672b0b76fec602dfb4bc02e3 100644 (file)
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -398,7 +398,8 @@ typedef struct {
   */
  static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                             COVER_map_t *activeDmers, U32 begin,
-                                           U32 end, COVER_params_t parameters) {
+                                           U32 end,
+                                           ZDICT_cover_params_t parameters) {
    /* Constants */
    const U32 k = parameters.k;
    const U32 d = parameters.d;
@@ -478,7 +479,7 @@ static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
   * Check the validity of the parameters.
   * Returns non-zero if the parameters are valid and 0 otherwise.
   */
-static int COVER_checkParameters(COVER_params_t parameters) {
+static int COVER_checkParameters(ZDICT_cover_params_t parameters) {
    /* k and d are required parameters */
    if (parameters.d == 0 || parameters.k == 0) {
      return 0;
@@ -600,7 +601,7 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
  static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                      COVER_map_t *activeDmers, void *dictBuffer,
                                      size_t dictBufferCapacity,
-                                    COVER_params_t parameters) {
+                                    ZDICT_cover_params_t parameters) {
    BYTE *const dict = (BYTE *)dictBuffer;
    size_t tail = dictBufferCapacity;
    /* Divide the data up into epochs of equal size.
@@ -639,22 +640,10 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
    return tail;
  }
  
-/**
- * Translate from COVER_params_t to ZDICT_params_t required for finalizing the
- * dictionary.
- */
-static ZDICT_params_t COVER_translateParams(COVER_params_t parameters) {
-  ZDICT_params_t zdictParams;
-  memset(&zdictParams, 0, sizeof(zdictParams));
-  zdictParams.notificationLevel = 1;
-  zdictParams.dictID = parameters.dictID;
-  zdictParams.compressionLevel = parameters.compressionLevel;
-  return zdictParams;
-}
-
-ZDICTLIB_API size_t COVER_trainFromBuffer(
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
      void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
-    const size_t *samplesSizes, unsigned nbSamples, COVER_params_t parameters) {
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters) {
    BYTE *const dict = (BYTE *)dictBuffer;
    COVER_ctx_t ctx;
    COVER_map_t activeDmers;
@@ -673,7 +662,7 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
      return ERROR(dstSize_tooSmall);
    }
    /* Initialize global data */
-  g_displayLevel = parameters.notificationLevel;
+  g_displayLevel = parameters.zParams.notificationLevel;
    /* Initialize context and activeDmers */
    if (!COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                        parameters.d)) {
@@ -690,10 +679,9 @@ ZDICTLIB_API size_t COVER_trainFromBuffer(
      const size_t tail =
          COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                                dictBufferCapacity, parameters);
-    ZDICT_params_t zdictParams = COVER_translateParams(parameters);
      const size_t dictionarySize = ZDICT_finalizeDictionary(
          dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        samplesBuffer, samplesSizes, nbSamples, zdictParams);
+        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
      if (!ZSTD_isError(dictionarySize)) {
        DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                     (U32)dictionarySize);
@@ -718,7 +706,7 @@ typedef struct COVER_best_s {
    size_t liveJobs;
    void *dict;
    size_t dictSize;
-  COVER_params_t parameters;
+  ZDICT_cover_params_t parameters;
    size_t compressedSize;
  } COVER_best_t;
  
@@ -786,7 +774,7 @@ static void COVER_best_start(COVER_best_t *best) {
   * If this dictionary is the best so far save it and its parameters.
   */
  static void COVER_best_finish(COVER_best_t *best, size_t compressedSize,
-                              COVER_params_t parameters, void *dict,
+                              ZDICT_cover_params_t parameters, void *dict,
                                size_t dictSize) {
    if (!best) {
      return;
@@ -830,7 +818,7 @@ typedef struct COVER_tryParameters_data_s {
    const COVER_ctx_t *ctx;
    COVER_best_t *best;
    size_t dictBufferCapacity;
-  COVER_params_t parameters;
+  ZDICT_cover_params_t parameters;
  } COVER_tryParameters_data_t;
  
  /**
@@ -842,7 +830,7 @@ static void COVER_tryParameters(void *opaque) {
    /* Save parameters as local variables */
    COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t *)opaque;
    const COVER_ctx_t *const ctx = data->ctx;
-  const COVER_params_t parameters = data->parameters;
+  const ZDICT_cover_params_t parameters = data->parameters;
    size_t dictBufferCapacity = data->dictBufferCapacity;
    size_t totalCompressedSize = ERROR(GENERIC);
    /* Allocate space for hash table, dict, and freqs */
@@ -863,10 +851,10 @@ static void COVER_tryParameters(void *opaque) {
    {
      const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                                dictBufferCapacity, parameters);
-    const ZDICT_params_t zdictParams = COVER_translateParams(parameters);
      dictBufferCapacity = ZDICT_finalizeDictionary(
          dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
-        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples, zdictParams);
+        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbSamples,
+        parameters.zParams);
      if (ZDICT_isError(dictBufferCapacity)) {
        DISPLAYLEVEL(1, "Failed to finalize dictionary\n");
        goto _cleanup;
@@ -892,8 +880,8 @@ static void COVER_tryParameters(void *opaque) {
      }
      /* Create the cctx and cdict */
      cctx = ZSTD_createCCtx();
-    cdict =
-        ZSTD_createCDict(dict, dictBufferCapacity, parameters.compressionLevel);
+    cdict = ZSTD_createCDict(dict, dictBufferCapacity,
+                             parameters.zParams.compressionLevel);
      if (!dst || !cctx || !cdict) {
        goto _compressCleanup;
      }
@@ -930,12 +918,10 @@ _cleanup:
    }
  }
  
-ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
-                                                  size_t dictBufferCapacity,
-                                                  const void *samplesBuffer,
-                                                  const size_t *samplesSizes,
-                                                  unsigned nbSamples,
-                                                  COVER_params_t *parameters) {
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t *parameters) {
    /* constants */
    const unsigned nbThreads = parameters->nbThreads;
    const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
@@ -947,7 +933,7 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
    const unsigned kIterations =
        (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
    /* Local variables */
-  const int displayLevel = parameters->notificationLevel;
+  const int displayLevel = parameters->zParams.notificationLevel;
    unsigned iteration = 1;
    unsigned d;
    unsigned k;
@@ -976,7 +962,7 @@ ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void *dictBuffer,
    /* Initialization */
    COVER_best_init(&best);
    /* Turn down global display level to clean up display at level 2 and below */
-  g_displayLevel = parameters->notificationLevel - 1;
+  g_displayLevel = parameters->zParams.notificationLevel - 1;
    /* Loop through d first because each new value needs a new context */
    LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
                      kIterations);
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c

index 943ddde0faf3cb30ea65e28302227d6a90189a9b..8bc6a0191ecd611a0740894c03d1b1e391e51adc 100644 (file)
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -487,7 +487,7 @@ static U32 ZDICT_dictSize(const dictItem* dictList)
  }
  
  
-static size_t ZDICT_trainBuffer(dictItem* dictList, U32 dictListSize,
+static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
                              const void* const buffer, size_t bufferSize,   /* buffer must end with noisy guard band */
                              const size_t* fileSizes, unsigned nbFiles,
                              U32 minRatio, U32 notificationLevel)
@@ -634,17 +634,6 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
      }   }   }
  }
  
-/*
-static size_t ZDICT_maxSampleSize(const size_t* fileSizes, unsigned nbFiles)
-{
-    unsigned u;
-    size_t max=0;
-    for (u=0; u<nbFiles; u++)
-        if (max < fileSizes[u]) max = fileSizes[u];
-    return max;
-}
-*/
-
  static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
  {
      size_t total=0;
@@ -930,14 +919,14 @@ size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictCo
  }
  
  
-/*! ZDICT_trainFromBuffer_unsafe() :
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
  *   Warning : `samplesBuffer` must be followed by noisy guard band.
  *   @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
  */
-size_t ZDICT_trainFromBuffer_unsafe(
+size_t ZDICT_trainFromBuffer_unsafe_legacy(
                              void* dictBuffer, size_t maxDictSize,
                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                            ZDICT_params_t params)
+                            ZDICT_legacy_params_t params)
  {
      U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/16));
      dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
@@ -946,7 +935,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
      size_t const targetDictSize = maxDictSize;
      size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
      size_t dictSize = 0;
-    U32 const notificationLevel = params.notificationLevel;
+    U32 const notificationLevel = params.zParams.notificationLevel;
  
      /* checks */
      if (!dictList) return ERROR(memory_allocation);
@@ -957,13 +946,13 @@ size_t ZDICT_trainFromBuffer_unsafe(
      ZDICT_initDictItem(dictList);
  
      /* build dictionary */
-    ZDICT_trainBuffer(dictList, dictListSize,
-                    samplesBuffer, samplesBuffSize,
-                    samplesSizes, nbSamples,
-                    minRep, notificationLevel);
+    ZDICT_trainBuffer_legacy(dictList, dictListSize,
+                       samplesBuffer, samplesBuffSize,
+                       samplesSizes, nbSamples,
+                       minRep, notificationLevel);
  
      /* display best matches */
-    if (params.notificationLevel>= 3) {
+    if (params.zParams.notificationLevel>= 3) {
          U32 const nb = MIN(25, dictList[0].pos);
          U32 const dictContentSize = ZDICT_dictSize(dictList);
          U32 u;
@@ -1026,7 +1015,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
  
          dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
                                                               samplesBuffer, samplesSizes, nbSamples,
-                                                             params);
+                                                             params.zParams);
      }
  
      /* clean up */
@@ -1037,9 +1026,9 @@ size_t ZDICT_trainFromBuffer_unsafe(
  
  /* issue : samplesBuffer need to be followed by a noisy guard band.
  *  work around : duplicate the buffer, and add the noise */
-size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
-                                      const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                      ZDICT_params_t params)
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                              ZDICT_legacy_params_t params)
  {
      size_t result;
      void* newBuff;
@@ -1052,10 +1041,9 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
      memcpy(newBuff, samplesBuffer, sBuffSize);
      ZDICT_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH);   /* guard band, for end of buffer condition */
  
-    result = ZDICT_trainFromBuffer_unsafe(
-                                        dictBuffer, dictBufferCapacity,
-                                        newBuff, samplesSizes, nbSamples,
-                                        params);
+    result =
+        ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
+                                            samplesSizes, nbSamples, params);
      free(newBuff);
      return result;
  }
@@ -1064,11 +1052,13 @@ size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacit
  size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                               const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
  {
-    ZDICT_params_t params;
+    ZDICT_cover_params_t params;
      memset(&params, 0, sizeof(params));
-    return ZDICT_trainFromBuffer_advanced(dictBuffer, dictBufferCapacity,
-                                          samplesBuffer, samplesSizes, nbSamples,
-                                          params);
+    params.d = 8;
+    params.steps = 4;
+    return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
+                                               samplesBuffer, samplesSizes,
+                                               nbSamples, &params);
  }
  
  size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
diff --git a/lib/dictBuilder/zdict.h b/lib/dictBuilder/zdict.h

index 5ef2a3f5b3aa4262370ba8f0d7a062c8002e158f..7bfbb351a1ddf4f1784305e6d63234d74a9d9539 100644 (file)
--- a/lib/dictBuilder/zdict.h
+++ b/lib/dictBuilder/zdict.h
@@ -36,18 +36,20 @@ extern "C" {
  #endif
  
  
-/*! ZDICT_trainFromBuffer() :
-    Train a dictionary from an array of samples.
-    Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
-    supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
-    The resulting dictionary will be saved into `dictBuffer`.
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code, which can be tested with ZDICT_isError().
-    Tips : In general, a reasonable dictionary has a size of ~ 100 KB.
-           It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
-           In general, it's recommended to provide a few thousands samples, but this can vary a lot.
-           It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
-*/
+/*! ZDICT_trainFromBuffer():
+ * Train a dictionary from an array of samples.
+ * Uses ZDICT_optimizeTrainFromBuffer_cover() single-threaded, with d=8 and steps=4.
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ * The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte.
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousand samples, but this can vary a lot.
+ *        It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
+ */
  ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
                         const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples);
  
@@ -69,94 +71,78 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
   * ==================================================================================== */
  
  typedef struct {
-    unsigned selectivityLevel;   /* 0 means default; larger => select more => larger dictionary */
      int      compressionLevel;   /* 0 means default; target a specific zstd compression level */
      unsigned notificationLevel;  /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
      unsigned dictID;             /* 0 means auto mode (32-bits random value); other : force dictID value */
-    unsigned reserved[2];        /* reserved space for future parameters */
  } ZDICT_params_t;
  
-
-/*! ZDICT_trainFromBuffer_advanced() :
-    Same as ZDICT_trainFromBuffer() with control over more parameters.
-    `parameters` is optional and can be provided with values set to 0 to mean "default".
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferSize`),
-              or an error code, which can be tested by ZDICT_isError().
-    note : ZDICT_trainFromBuffer_advanced() will send notifications into stderr if instructed to, using notificationLevel>0.
-*/
-ZDICTLIB_API size_t ZDICT_trainFromBuffer_advanced(void* dictBuffer, size_t dictBufferCapacity,
-                                const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                ZDICT_params_t parameters);
-
-/*! COVER_params_t :
-    For all values 0 means default.
-    k and d are the only required parameters.
-*/
+/*! ZDICT_cover_params_t:
+ *  For all values 0 means default.
+ *  k and d are the only required parameters.
+ */
  typedef struct {
      unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
      unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
      unsigned steps;              /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */
-
      unsigned nbThreads;          /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
-    unsigned notificationLevel;  /* Write to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
-    unsigned dictID;             /* 0 means auto mode (32-bits random value); other : force dictID value */
-    int      compressionLevel;   /* 0 means default; target a specific zstd compression level */
-} COVER_params_t;
-
-
-/*! COVER_trainFromBuffer() :
-    Train a dictionary from an array of samples using the COVER algorithm.
-    Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
-    supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
-    The resulting dictionary will be saved into `dictBuffer`.
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code, which can be tested with ZDICT_isError().
-    Note : COVER_trainFromBuffer() requires about 9 bytes of memory for each input byte.
-    Tips : In general, a reasonable dictionary has a size of ~ 100 KB.
-           It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
-           In general, it's recommended to provide a few thousands samples, but this can vary a lot.
-           It's recommended that total size of all samples be about ~x100 times the target size of dictionary.
-*/
-ZDICTLIB_API size_t COVER_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                              COVER_params_t parameters);
-
-/*! COVER_optimizeTrainFromBuffer() :
-    The same requirements as above hold for all the parameters except `parameters`.
-    This function tries many parameter combinations and picks the best parameters.
-    `*parameters` is filled with the best parameters found, and the dictionary
-    constructed with those parameters is stored in `dictBuffer`.
-
-    All of the parameters d, k, steps are optional.
-    If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
-    if steps is zero it defaults to its default value.
-    If k is non-zero then we don't check multiple values of k, otherwise we check steps values in [16, 2048].
-
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code, which can be tested with ZDICT_isError().
-              On success `*parameters` contains the parameters selected.
-    Note : COVER_optimizeTrainFromBuffer() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
-*/
-ZDICTLIB_API size_t COVER_optimizeTrainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
-                                     const void* samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
-                                     COVER_params_t *parameters);
-
-/*! ZDICT_finalizeDictionary() :
-
-    Given a custom content as a basis for dictionary, and a set of samples,
-    finalize dictionary by adding headers and statistics.
-
-    Samples must be stored concatenated in a flat buffer `samplesBuffer`,
-    supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
-
-    dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
-    maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
-
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
-              or an error code, which can be tested by ZDICT_isError().
-    note : ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
-    note 2 : dictBuffer and dictContent can overlap
-*/
+    ZDICT_params_t zParams;
+} ZDICT_cover_params_t;
+
+
+/*! ZDICT_trainFromBuffer_cover():
+ * Train a dictionary from an array of samples using the COVER algorithm.
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ * The resulting dictionary will be saved into `dictBuffer`.
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousand samples, but this can vary a lot.
+ *        It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t parameters);
+
+/*! ZDICT_optimizeTrainFromBuffer_cover():
+ * The same requirements as above hold for all the parameters except `parameters`.
+ * This function tries many parameter combinations and picks the best parameters.
+ * `*parameters` is filled with the best parameters found, and the dictionary
+ * constructed with those parameters is stored in `dictBuffer`.
+ *
+ * All of the parameters d, k, steps are optional.
+ * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}.
+ * If steps is zero it defaults to its default value.
+ * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [16, 2048].
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ *           On success `*parameters` contains the parameters selected.
+ * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte, plus another 5 bytes of memory for each input byte for each thread.
+ */
+ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples,
+    ZDICT_cover_params_t *parameters);
+
+/*! ZDICT_finalizeDictionary():
+ * Given a custom content as a basis for dictionary, and a set of samples,
+ * finalize dictionary by adding headers and statistics.
+ *
+ * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
+ *
+ * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
+ * maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
+ *
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
+ *           or an error code, which can be tested by ZDICT_isError().
+ * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
+ * Note 2: dictBuffer and dictContent can overlap
+ */
  #define ZDICT_CONTENTSIZE_MIN 128
  #define ZDICT_DICTSIZE_MIN    256
  ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
@@ -164,7 +150,28 @@ ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBuffer
                                  const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
                                  ZDICT_params_t parameters);
  
-
+typedef struct {
+    unsigned selectivityLevel;   /* 0 means default; larger => select more => larger dictionary */
+    ZDICT_params_t zParams;
+} ZDICT_legacy_params_t;
+
+/*! ZDICT_trainFromBuffer_legacy():
+ * Train a dictionary from an array of samples.
+ * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ * The resulting dictionary will be saved into `dictBuffer`.
+ * `parameters` is optional and can be provided with values set to 0 to mean "default".
+ * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
+ *        It's obviously possible to target smaller or larger ones, just by specifying different `dictBufferCapacity`.
+ *        In general, it's recommended to provide a few thousand samples, but this can vary a lot.
+ *        It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
+ * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples, ZDICT_legacy_params_t parameters);
  
  /* Deprecation warnings */
  /* It is generally possible to disable deprecation warnings from compiler,
diff --git a/programs/dibio.c b/programs/dibio.c

index aac36425cf752de16155c7766e519c1b3a7da51e..31cde5c95db11f11ac491e3bb8d4c816c311b6ab 100644 (file)
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -216,21 +216,21 @@ static U64 DiB_getTotalCappedFileSize(const char** fileNamesTable, unsigned nbFi
  }
  
  
-/*! ZDICT_trainFromBuffer_unsafe() :
+/*! ZDICT_trainFromBuffer_unsafe_legacy() :
      Strictly Internal use only !!
-    Same as ZDICT_trainFromBuffer_advanced(), but does not control `samplesBuffer`.
+    Same as ZDICT_trainFromBuffer_legacy(), but does not control `samplesBuffer`.
      `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
      @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
                or an error code.
  */
-size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
-                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                              ZDICT_params_t parameters);
+size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                           ZDICT_legacy_params_t parameters);
  
  
  int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                         const char** fileNamesTable, unsigned nbFiles,
-                       ZDICT_params_t *params, COVER_params_t *coverParams,
+                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
                         int optimizeCover)
  {
      void* const dictBuffer = malloc(maxDictSize);
@@ -243,8 +243,8 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
      int result = 0;
  
      /* Checks */
-    if (params) g_displayLevel = params->notificationLevel;
-    else if (coverParams) g_displayLevel = coverParams->notificationLevel;
+    if (params) g_displayLevel = params->zParams.notificationLevel;
+    else if (coverParams) g_displayLevel = coverParams->zParams.notificationLevel;
      else EXM_THROW(13, "Neither dictionary algorith selected");   /* should not happen */
      if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
      if (g_tooLargeSamples) {
@@ -273,20 +273,20 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
          size_t dictSize;
          if (params) {
              DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
-            dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
-                                                    srcBuffer, fileSizes, nbFiles,
-                                                    *params);
+            dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize,
+                                                           srcBuffer, fileSizes, nbFiles,
+                                                           *params);
          } else if (optimizeCover) {
-            dictSize = COVER_optimizeTrainFromBuffer(
-                dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
-                coverParams);
+            dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
+                                                           srcBuffer, fileSizes, nbFiles,
+                                                           coverParams);
              if (!ZDICT_isError(dictSize)) {
-              DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
+                DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\n", coverParams->k, coverParams->d, coverParams->steps);
              }
          } else {
-            dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,
-                                             srcBuffer, fileSizes, nbFiles,
-                                             *coverParams);
+            dictSize =
+                ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
+                                            fileSizes, nbFiles, *coverParams);
          }
          if (ZDICT_isError(dictSize)) {
              DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
diff --git a/programs/dibio.h b/programs/dibio.h

index e61d0042c850729d07227dff04018ebc30bfa310..84f7d580283d4c0e1f04f611bcbf34a3d99feb49 100644 (file)
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -32,7 +32,7 @@
  */
  int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                         const char** fileNamesTable, unsigned nbFiles,
-                       ZDICT_params_t *params, COVER_params_t *coverParams,
+                       ZDICT_legacy_params_t *params, ZDICT_cover_params_t *coverParams,
                         int optimizeCover);
  
  #endif
diff --git a/programs/zstdcli.c b/programs/zstdcli.c

index 57943863f7c83c5ee7146d3dbed90bc8a6858a73..34826a6f64655b4963e6be727d0ed007694d595d 100644 (file)
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -246,7 +246,7 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
   * @return 1 means that cover parameters were correct
   * @return 0 in case of malformed parameters
   */
-static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t* params)
+static unsigned parseCoverParameters(const char* stringPtr, ZDICT_cover_params_t* params)
  {
      memset(params, 0, sizeof(*params));
      for (; ;) {
@@ -275,9 +275,9 @@ static unsigned parseLegacyParameters(const char* stringPtr, unsigned* selectivi
      return 1;
  }
  
-static COVER_params_t defaultCoverParams(void)
+static ZDICT_cover_params_t defaultCoverParams(void)
  {
-    COVER_params_t params;
+    ZDICT_cover_params_t params;
      memset(&params, 0, sizeof(params));
      params.d = 8;
      params.steps = 4;
@@ -356,7 +356,7 @@ int main(int argCount, const char* argv[])
      unsigned fileNamesNb;
  #endif
  #ifndef ZSTD_NODICT
-    COVER_params_t coverParams = defaultCoverParams();
+    ZDICT_cover_params_t coverParams = defaultCoverParams();
      int cover = 1;
  #endif
  
@@ -695,20 +695,20 @@ int main(int argCount, const char* argv[])
      /* Check if dictionary builder is selected */
      if (operation==zom_train) {
  #ifndef ZSTD_NODICT
+        ZDICT_params_t zParams;
+        zParams.compressionLevel = dictCLevel;
+        zParams.notificationLevel = g_displayLevel;
+        zParams.dictID = dictID;
          if (cover) {
              int const optimize = !coverParams.k || !coverParams.d;
              coverParams.nbThreads = nbThreads;
-            coverParams.compressionLevel = dictCLevel;
-            coverParams.notificationLevel = g_displayLevel;
-            coverParams.dictID = dictID;
+            coverParams.zParams = zParams;
              operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, optimize);
          } else {
-            ZDICT_params_t dictParams;
+            ZDICT_legacy_params_t dictParams;
              memset(&dictParams, 0, sizeof(dictParams));
-            dictParams.compressionLevel = dictCLevel;
              dictParams.selectivityLevel = dictSelect;
-            dictParams.notificationLevel = g_displayLevel;
-            dictParams.dictID = dictID;
+            dictParams.zParams = zParams;
              operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
          }
  #endif
diff --git a/tests/fuzzer.c b/tests/fuzzer.c

index a3a56d9d84bbe65be275184efd9d5c5acbe837fb..86f0555d58107be46e6ccc23a18d79fa28a7ce17 100644 (file)
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -639,7 +639,7 @@ static int basicUnitTests(U32 seed, double compressibility)
          size_t const sampleUnitSize = 8 KB;
          U32 const nbSamples = (U32)(totalSampleSize / sampleUnitSize);
          size_t* const samplesSizes = (size_t*) malloc(nbSamples * sizeof(size_t));
-        COVER_params_t params;
+        ZDICT_cover_params_t params;
          U32 dictID;
  
          if (dictBuffer==NULL || samplesSizes==NULL) {
@@ -648,14 +648,14 @@ static int basicUnitTests(U32 seed, double compressibility)
              goto _output_error;
          }
  
-        DISPLAYLEVEL(4, "test%3i : COVER_trainFromBuffer : ", testNb++);
+        DISPLAYLEVEL(4, "test%3i : ZDICT_trainFromBuffer_cover : ", testNb++);
          { U32 u; for (u=0; u<nbSamples; u++) samplesSizes[u] = sampleUnitSize; }
          memset(&params, 0, sizeof(params));
          params.d = 1 + (FUZ_rand(&seed) % 16);
          params.k = params.d + (FUZ_rand(&seed) % 256);
-        dictSize = COVER_trainFromBuffer(dictBuffer, dictSize,
-                                         CNBuffer, samplesSizes, nbSamples,
-                                         params);
+        dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, dictSize,
+                                               CNBuffer, samplesSizes, nbSamples,
+                                               params);
          if (ZDICT_isError(dictSize)) goto _output_error;
          DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)dictSize);
  
@@ -664,12 +664,12 @@ static int basicUnitTests(U32 seed, double compressibility)
          if (dictID==0) goto _output_error;
          DISPLAYLEVEL(4, "OK : %u \n", dictID);
  
-        DISPLAYLEVEL(4, "test%3i : COVER_optimizeTrainFromBuffer : ", testNb++);
+        DISPLAYLEVEL(4, "test%3i : ZDICT_optimizeTrainFromBuffer_cover : ", testNb++);
          memset(&params, 0, sizeof(params));
          params.steps = 4;
-        optDictSize = COVER_optimizeTrainFromBuffer(dictBuffer, optDictSize,
-                                                    CNBuffer, samplesSizes, nbSamples / 4,
-                                                    &params);
+        optDictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, optDictSize,
+                                                          CNBuffer, samplesSizes,
+                                                          nbSamples / 4, &params);
          if (ZDICT_isError(optDictSize)) goto _output_error;
          DISPLAYLEVEL(4, "OK, created dictionary of size %u \n", (U32)optDictSize);
  
diff --git a/tests/symbols.c b/tests/symbols.c

index 5139a6548ef1a027a64107edbf1559cc1b8981b7..8920187f37f76b5de0640f4fa961398948574373 100644 (file)
--- a/tests/symbols.c
+++ b/tests/symbols.c
@@ -131,7 +131,10 @@ static const void *symbols[] = {
    &ZDICT_isError,
    &ZDICT_getErrorName,
  /* zdict.h: advanced functions */
-  &ZDICT_trainFromBuffer_advanced,
+  &ZDICT_trainFromBuffer_cover,
+  &ZDICT_optimizeTrainFromBuffer_cover,
+  &ZDICT_finalizeDictionary,
+  &ZDICT_trainFromBuffer_legacy,
    &ZDICT_addEntropyTablesFromBuffer,
    NULL,
  };
author	Nick Terrell <terrelln@fb.com>
	Tue, 27 Jun 2017 04:07:14 +0000 (21:07 -0700)
committer	Nick Terrell <terrelln@fb.com>
	Tue, 27 Jun 2017 04:09:22 +0000 (21:09 -0700)
lib/dictBuilder/cover.c		patch \| blob \| blame \| history
lib/dictBuilder/zdict.c		patch \| blob \| blame \| history
lib/dictBuilder/zdict.h		patch \| blob \| blame \| history
programs/dibio.c		patch \| blob \| blame \| history
programs/dibio.h		patch \| blob \| blame \| history
programs/zstdcli.c		patch \| blob \| blame \| history
tests/fuzzer.c		patch \| blob \| blame \| history
tests/symbols.c		patch \| blob \| blame \| history