]> git.ipfire.org Git - thirdparty/zstd.git/commitdiff
Add non-optimize FASTCOVER (#1260)
authorJennifer Liu <jenniferliu2018@u.northwestern.edu>
Wed, 1 Aug 2018 18:06:16 +0000 (11:06 -0700)
committerNick Terrell <nickrterrell@gmail.com>
Wed, 1 Aug 2018 18:06:16 +0000 (11:06 -0700)
* Add non-optimize FASTCOVER

* Minor fix

* Pass param as value instead of pointer

contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
contrib/experimental_dict_builders/fastCover/README.md
contrib/experimental_dict_builders/fastCover/fastCover.c
contrib/experimental_dict_builders/fastCover/fastCover.h
contrib/experimental_dict_builders/fastCover/main.c
contrib/experimental_dict_builders/fastCover/test.sh

index a1831197335ef2d14e384f8f5d41f492d8a85c8e..559776e2b089bcc172617444e96750ea83bc1ea4 100644 (file)
@@ -18,109 +18,109 @@ make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
 - Fourth column is chosen d and fifth column is chosen k
 
 github:
-NODICT       0.000005       2.999642        
-RANDOM       0.036114       8.791189        
-LEGACY       1.111024       8.173529        
-COVER       57.856477       10.652243        8          1298
-COVER       5.769965       10.652243        8          1298
-FAST15       9.965877       10.555630        8          1874
-FAST15       0.140285       10.555630        8          1874
-FAST16       10.337194       10.701698        8          1106
-FAST16       0.114887       10.701698        8          1106
-FAST17       10.207121       10.650652        8          1106
-FAST17       0.135424       10.650652        8          1106
-FAST18       11.463120       10.499142        8          1826
-FAST18       0.154287       10.499142        8          1826
-FAST19       12.143020       10.527140        8          1826
-FAST19       0.158889       10.527140        8          1826
-FAST20       12.510857       10.494710        8          1826
-FAST20       0.171334       10.494710        8          1826
-FAST21       13.201432       10.503488        8          1778
-FAST21       0.192867       10.503488        8          1778
-FAST22       13.754560       10.509284        8          1826
-FAST22       0.206276       10.509284        8          1826
-FAST23       14.708633       10.509284        8          1826
-FAST23       0.221751       10.509284        8          1826
-FAST24       15.134848       10.512369        8          1826
-FAST24       0.234242       10.512369        8          1826
+NODICT       0.000025       2.999642        
+RANDOM       0.030101       8.791189        
+LEGACY       0.913108       8.173529        
+COVER       59.234160       10.652243        8          1298
+COVER       6.258459       10.652243        8          1298
+FAST15       9.959246       10.555630        8          1874
+FAST15       0.077719       10.555630        8          1874
+FAST16       10.028343       10.701698        8          1106
+FAST16       0.078117       10.701698        8          1106
+FAST17       10.567355       10.650652        8          1106
+FAST17       0.124833       10.650652        8          1106
+FAST18       11.795287       10.499142        8          1826
+FAST18       0.086992       10.499142        8          1826
+FAST19       13.132451       10.527140        8          1826
+FAST19       0.134716       10.527140        8          1826
+FAST20       14.366314       10.494710        8          1826
+FAST20       0.128844       10.494710        8          1826
+FAST21       14.941238       10.503488        8          1778
+FAST21       0.134975       10.503488        8          1778
+FAST22       15.146226       10.509284        8          1826
+FAST22       0.146918       10.509284        8          1826
+FAST23       16.260552       10.509284        8          1826
+FAST23       0.158494       10.509284        8          1826
+FAST24       16.806037       10.512369        8          1826
+FAST24       0.190464       10.512369        8          1826
 
 hg-commands:
-NODICT       0.000004       2.425291        
-RANDOM       0.055073       3.490331        
-LEGACY       0.927414       3.911682        
-COVER       72.749028       4.132653        8          386
-COVER       3.391066       4.132653        8          386
-FAST15       10.910989       3.920720        6          1106
-FAST15       0.130480       3.920720        6          1106
-FAST16       10.565224       4.033306        8          674
-FAST16       0.146228       4.033306        8          674
-FAST17       11.394137       4.064132        8          1490
-FAST17       0.175567       4.064132        8          1490
-FAST18       11.040248       4.086714        8          290
-FAST18       0.132692       4.086714        8          290
-FAST19       11.335856       4.097947        8          578
-FAST19       0.181441       4.097947        8          578
-FAST20       14.166272       4.102851        8          434
-FAST20       0.203632       4.102851        8          434
-FAST21       15.848896       4.105350        8          530
-FAST21       0.269518       4.105350        8          530
-FAST22       15.570995       4.104100        8          530
-FAST22       0.238512       4.104100        8          530
-FAST23       17.437566       4.098110        8          914
-FAST23       0.270788       4.098110        8          914
-FAST24       18.836604       4.117367        8          722
-FAST24       0.323618       4.117367        8          722
+NODICT       0.000026       2.425291        
+RANDOM       0.046270       3.490331        
+LEGACY       0.847904       3.911682        
+COVER       71.691804       4.132653        8          386
+COVER       3.187085       4.132653        8          386
+FAST15       11.593687       3.920720        6          1106
+FAST15       0.082431       3.920720        6          1106
+FAST16       11.775958       4.033306        8          674
+FAST16       0.092587       4.033306        8          674
+FAST17       11.965064       4.064132        8          1490
+FAST17       0.106382       4.064132        8          1490
+FAST18       11.438197       4.086714        8          290
+FAST18       0.097293       4.086714        8          290
+FAST19       12.292512       4.097947        8          578
+FAST19       0.104406       4.097947        8          578
+FAST20       13.857857       4.102851        8          434
+FAST20       0.139467       4.102851        8          434
+FAST21       14.599613       4.105350        8          530
+FAST21       0.189416       4.105350        8          530
+FAST22       15.966109       4.104100        8          530
+FAST22       0.183817       4.104100        8          530
+FAST23       18.033645       4.098110        8          914
+FAST23       0.246641       4.098110        8          914
+FAST24       22.992891       4.117367        8          722
+FAST24       0.285994       4.117367        8          722
 
 hg-changelog:
-NODICT       0.000006       1.377613        
-RANDOM       0.253393       2.097487        
-LEGACY       2.410568       2.058907        
-COVER       203.550681       2.189685        8          98
-COVER       7.381697       2.189685        8          98
-FAST15       45.960609       2.130794        6          386
-FAST15       0.512057       2.130794        6          386
-FAST16       44.594817       2.144845        8          194
-FAST16       0.601258       2.144845        8          194
-FAST17       45.852992       2.156099        8          242
-FAST17       0.500844       2.156099        8          242
-FAST18       46.624930       2.172439        6          98
-FAST18       0.680501       2.172439        6          98
-FAST19       47.754905       2.180321        6          98
-FAST19       0.606180       2.180321        6          98
-FAST20       56.733632       2.187431        6          98
-FAST20       0.710149       2.187431        6          98
-FAST21       59.723173       2.184185        6          146
-FAST21       0.875562       2.184185        6          146
-FAST22       66.570788       2.182830        6          98
-FAST22       1.061013       2.182830        6          98
-FAST23       73.817645       2.186399        8          98
-FAST23       0.838496       2.186399        8          98
-FAST24       78.059933       2.185608        6          98
-FAST24       0.843158       2.185608        6          98
+NODICT       0.000007       1.377613        
+RANDOM       0.297345       2.097487        
+LEGACY       2.633992       2.058907        
+COVER       219.179786       2.189685        8          98
+COVER       6.620852       2.189685        8          98
+FAST15       47.635082       2.130794        6          386
+FAST15       0.321297       2.130794        6          386
+FAST16       43.837676       2.144845        8          194
+FAST16       0.312640       2.144845        8          194
+FAST17       49.349017       2.156099        8          242
+FAST17       0.348459       2.156099        8          242
+FAST18       51.153784       2.172439        6          98
+FAST18       0.353106       2.172439        6          98
+FAST19       52.627045       2.180321        6          98
+FAST19       0.390612       2.180321        6          98
+FAST20       63.748782       2.187431        6          98
+FAST20       0.489544       2.187431        6          98
+FAST21       68.709198       2.184185        6          146
+FAST21       0.530852       2.184185        6          146
+FAST22       68.491639       2.182830        6          98
+FAST22       0.645699       2.182830        6          98
+FAST23       72.558688       2.186399        8          98
+FAST23       0.593539       2.186399        8          98
+FAST24       76.137195       2.185608        6          98
+FAST24       0.680132       2.185608        6          98
 
 hg-manifest:
-NODICT       0.000005       1.866385        
-RANDOM       0.735840       2.309436        
-LEGACY       9.322081       2.506977        
-COVER       885.961515       2.582528        8          434
-COVER       32.678552       2.582528        8          434
-FAST15       114.414413       2.392920        6          1826
-FAST15       1.412690       2.392920        6          1826
-FAST16       113.869718       2.480762        6          1922
-FAST16       1.539424       2.480762        6          1922
-FAST17       113.333636       2.548285        6          1682
-FAST17       1.473196       2.548285        6          1682
-FAST18       111.717871       2.567634        6          386
-FAST18       1.421200       2.567634        6          386
-FAST19       112.428344       2.581653        8          338
-FAST19       1.412185       2.581653        8          338
-FAST20       128.897480       2.586881        8          194
-FAST20       1.586570       2.586881        8          194
-FAST21       168.465684       2.590051        6          242
-FAST21       2.190732       2.590051        6          242
-FAST22       202.320435       2.591376        6          194
-FAST22       2.667877       2.591376        6          194
-FAST23       228.952201       2.591131        8          434
-FAST23       3.315501       2.591131        8          434
-FAST24       327.320020       2.591548        6          290
-FAST24       5.048348       2.591548        6          290
+NODICT       0.000026       1.866385        
+RANDOM       0.784554       2.309436        
+LEGACY       10.193714       2.506977        
+COVER       988.206583       2.582528        8          434
+COVER       39.726199       2.582528        8          434
+FAST15       168.388819       2.392920        6          1826
+FAST15       1.272178       2.392920        6          1826
+FAST16       161.822607       2.480762        6          1922
+FAST16       1.164908       2.480762        6          1922
+FAST17       157.688544       2.548285        6          1682
+FAST17       1.222439       2.548285        6          1682
+FAST18       154.529585       2.567634        6          386
+FAST18       1.217596       2.567634        6          386
+FAST19       160.244979       2.581653        8          338
+FAST19       1.282450       2.581653        8          338
+FAST20       191.503297       2.586881        8          194
+FAST20       2.009748       2.586881        8          194
+FAST21       226.389709       2.590051        6          242
+FAST21       2.494543       2.590051        6          242
+FAST22       217.859055       2.591376        6          194
+FAST22       2.295693       2.591376        6          194
+FAST23       236.819791       2.591131        8          434
+FAST23       2.744711       2.591131        8          434
+FAST24       269.187800       2.591548        6          290
+FAST24       2.923671       2.591548        6          290
index 75008a087804a6af51fccc3da604d37a013d0899..d92e8d5cb3ec1905b578c5d51258a4c5d2a79c2e 100644 (file)
@@ -91,14 +91,26 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
           dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer,
                                                info->samplesSizes, info->nbSamples, *randomParams);
         }else if(coverParams) {
-          dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
-                                                info->samplesSizes, info->nbSamples, coverParams);
+          /* Run the optimize version if either k or d is not provided */
+          if (!coverParams->d || !coverParams->k){
+            dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                  info->samplesSizes, info->nbSamples, coverParams);
+          } else {
+            dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                  info->samplesSizes, info->nbSamples, *coverParams);
+          }
         } else if(legacyParams) {
           dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer,
                                                info->samplesSizes, info->nbSamples, *legacyParams);
         } else if(fastParams) {
-          dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
-                                                info->samplesSizes, info->nbSamples, fastParams);
+          /* Run the optimize version if either k or d is not provided */
+          if (!fastParams->d || !fastParams->k) {
+            dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                  info->samplesSizes, info->nbSamples, fastParams);
+          } else {
+            dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                                  info->samplesSizes, info->nbSamples, *fastParams);
+          }
         } else {
           dictSize = 0;
         }
@@ -403,7 +415,6 @@ int main(int argCount, const char* argv[])
       goto _cleanup;
     }
 
-
     /* for fastCover (with k and d provided) */
     const int fastResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL, &fastParam);
     DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", fastParam.k, fastParam.d, fastParam.f, fastParam.steps, (unsigned)(fastParam.splitPoint * 100));
@@ -411,7 +422,6 @@ int main(int argCount, const char* argv[])
       result = 1;
       goto _cleanup;
     }
-
   }
 
 
index 66e00ee0445d55b2b8e9adbdae3eacd7e0c0b693..ad377743f2a719d0b3e6abe6367e8d5a12f0b607 100644 (file)
@@ -16,8 +16,8 @@ make test
 
 
 ###Usage:
-To build a random dictionary with the provided arguments: make ARG= followed by arguments
-
+To build a FASTCOVER dictionary with the provided arguments, run `make ARG=` followed by the arguments.
+If k or d is not provided, the optimizing version of FASTCOVER (which selects the parameters itself) is run.
 
 ### Examples:
 make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520"
index cf71075ab64401fff38c694e3dc49d9d52139f84..84d841b107db8eaa4e72f7afa4958a5be0bf3a89 100644 (file)
@@ -629,6 +629,55 @@ _cleanup:
   }
 }
 
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
+    void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+    const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters) {
+    BYTE* const dict = (BYTE*)dictBuffer; /* byte-typed alias of dictBuffer, used for the `dict + tail` arithmetic below */
+    FASTCOVER_ctx_t ctx;
+    parameters.splitPoint = 1.0; /* overrides the by-value copy only; the caller's struct is not modified */
+    /* Initialize global data: the display level comes from the caller's zParams */
+    g_displayLevel = parameters.zParams.notificationLevel;
+    /* Checks: each failure below logs at level 1 and returns an error code before any context is created */
+    if (!FASTCOVER_checkParameters(parameters, dictBufferCapacity)) {
+      DISPLAYLEVEL(1, "FASTCOVER parameters incorrect\n");
+      return ERROR(GENERIC);
+    }
+    if (nbSamples == 0) {
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
+      return ERROR(GENERIC);
+    }
+    if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
+      DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
+                   ZDICT_DICTSIZE_MIN);
+      return ERROR(dstSize_tooSmall);
+    }
+    /* Initialize context from d, f and the splitPoint forced to 1.0 above */
+    if (!FASTCOVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
+                            parameters.d, parameters.splitPoint, parameters.f)) {
+      DISPLAYLEVEL(1, "Failed to initialize context\n");
+      return ERROR(GENERIC);
+    }
+    /* Build the dictionary, then finalize it in place inside dictBuffer */
+    DISPLAYLEVEL(2, "Building dictionary\n");
+    {
+      const size_t tail = FASTCOVER_buildDictionary(&ctx, ctx.freqs, dictBuffer,
+                                                dictBufferCapacity, parameters); /* tail is used below as the offset of the built content within dictBuffer */
+
+      const size_t dictionarySize = ZDICT_finalizeDictionary(
+          dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
+          samplesBuffer, samplesSizes, (unsigned)ctx.nbTrainSamples,
+          parameters.zParams);
+      if (!ZSTD_isError(dictionarySize)) {
+          DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
+                      (U32)dictionarySize);
+      }
+      FASTCOVER_ctx_destroy(&ctx); /* runs whether or not finalization succeeded */
+      return dictionarySize; /* may be an error code; only the success case is logged above */
+    }
+}
+
+
+
 ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
     const size_t *samplesSizes, unsigned nbSamples,
@@ -657,15 +706,15 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
 
     /* Checks */
     if (splitPoint <= 0 || splitPoint > 1) {
-      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
       return ERROR(GENERIC);
     }
     if (kMinK < kMaxD || kMaxK < kMinK) {
-      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
+      LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect k\n");
       return ERROR(GENERIC);
     }
     if (nbSamples == 0) {
-      DISPLAYLEVEL(1, "fast must have at least one input file\n");
+      DISPLAYLEVEL(1, "FASTCOVER must have at least one input file\n");
       return ERROR(GENERIC);
     }
     if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
index eca04baabd8f14dcb24039e33366cd1f5d714d14..958e9f4239308e77568bc9c57a5f6f260e338790 100644 (file)
@@ -12,9 +12,6 @@
 #include "zdict.h"
 
 
-
-
-
 typedef struct {
     unsigned k;                  /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
     unsigned d;                  /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
@@ -26,7 +23,6 @@ typedef struct {
 } ZDICT_fastCover_params_t;
 
 
-
 /*! ZDICT_optimizeTrainFromBuffer_fastCover():
  *  Train a dictionary from an array of samples using a modified version of the COVER algorithm.
  *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
@@ -41,7 +37,21 @@ typedef struct {
  *           or an error code, which can be tested with ZDICT_isError().
  *           On success `*parameters` contains the parameters selected.
  */
-ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
+ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(
+     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
+     const size_t *samplesSizes, unsigned nbSamples,
+     ZDICT_fastCover_params_t *parameters);
+
+
+/*! ZDICT_trainFromBuffer_fastCover():
+ *  Train a dictionary from an array of samples using a modified version of the COVER algorithm.
+ *  Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
+ *  supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
+ *  The resulting dictionary will be saved into `dictBuffer`.
+ *  d, k, and f are required; `splitPoint` is ignored (the function always sets it to 1.0).
+ *  @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
+ *           or an error code, which can be tested with ZDICT_isError().
+ */
+ZDICTLIB_API size_t ZDICT_trainFromBuffer_fastCover(
     void *dictBuffer, size_t dictBufferCapacity, const void *samplesBuffer,
-    const size_t *samplesSizes, unsigned nbSamples,
-    ZDICT_fastCover_params_t *parameters);
+    const size_t *samplesSizes, unsigned nbSamples, ZDICT_fastCover_params_t parameters);
index f286b0506798df9bac37a4d07cccca1cdae6134a..df7d91812e29574201ae284796770cb1809c2b75 100644 (file)
@@ -64,8 +64,14 @@ int FASTCOVER_trainFromFiles(const char* dictFileName, sampleInfo *info,
         EXM_THROW(12, "not enough memory for trainFromFiles");   /* should not happen */
 
     {   size_t dictSize;
-        dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
-                                             info->samplesSizes, info->nbSamples, params);
+        /* Run the optimize version if either k or d is not provided */
+        if (!params->d || !params->k) {
+          dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                               info->samplesSizes, info->nbSamples, params);
+        } else {
+          dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, info->srcBuffer,
+                                               info->samplesSizes, info->nbSamples, *params);
+        }
         DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\n", params->k, params->d, params->f, params->steps, (unsigned)(params->splitPoint*100));
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
@@ -92,8 +98,8 @@ int main(int argCount, const char* argv[])
   int operationResult = 0;
 
   /* Initialize arguments to default values */
-  unsigned k = 200;
-  unsigned d = 8;
+  unsigned k = 0;
+  unsigned d = 0;
   unsigned f = 23;
   unsigned steps = 32;
   unsigned nbThreads = 1;
index 91d4f4923e8b02876d6b4d8f7a6a0ea89440cb2a..f86915b59fc5bbeb1267cfd47e247dc0584cfb01 100644 (file)
@@ -1,8 +1,8 @@
-echo "Building fastCover dictionary with in=../../lib/common k=200 f=20 out=dict1"
-./main in=../../../lib/common k=200 f=20 out=dict1
+echo "Building fastCover dictionary with in=../../lib/common f=20 out=dict1"
+./main in=../../../lib/common f=20 out=dict1
 zstd -be3 -D dict1 -r ../../../lib/common -q
-echo "Building fastCover dictionary with in=../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000"
-./main in=../../../lib/common k=500 f=24 out=dict2 dictID=100 maxdict=140000
+echo "Building fastCover dictionary with in=../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000"
+./main in=../../../lib/common k=500 d=6 f=24 out=dict2 dictID=100 maxdict=140000
 zstd -be3 -D dict2 -r ../../../lib/common -q
 echo "Building fastCover dictionary with 2 sample sources"
 ./main in=../../../lib/common in=../../../lib/compress out=dict3