]> git.ipfire.org Git - thirdparty/zstd.git/commitdiff
Refactoring and benchmark without dictionary
authorJennifer Liu <jenniferliu620@fb.com>
Sat, 21 Jul 2018 00:03:47 +0000 (17:03 -0700)
committerJennifer Liu <jenniferliu620@fb.com>
Sat, 21 Jul 2018 00:03:47 +0000 (17:03 -0700)
15 files changed:
contrib/benchmarkDictBuilder/README.md [deleted file]
contrib/benchmarkDictBuilder/dictBuilder.h [deleted file]
contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile [moved from contrib/benchmarkDictBuilder/Makefile with 76% similarity]
contrib/experimental_dict_builders/benchmarkDictBuilder/README.md [new file with mode: 0644]
contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c [moved from contrib/benchmarkDictBuilder/benchmark.c with 53% similarity]
contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h [new file with mode: 0644]
contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh [moved from contrib/benchmarkDictBuilder/test.sh with 54% similarity]
contrib/experimental_dict_builders/randomDictBuilder/Makefile [moved from contrib/randomDictBuilder/Makefile with 79% similarity]
contrib/experimental_dict_builders/randomDictBuilder/README.md [moved from contrib/randomDictBuilder/README.md with 85% similarity]
contrib/experimental_dict_builders/randomDictBuilder/io.c [moved from contrib/randomDictBuilder/io.c with 89% similarity]
contrib/experimental_dict_builders/randomDictBuilder/io.h [moved from contrib/randomDictBuilder/io.h with 78% similarity]
contrib/experimental_dict_builders/randomDictBuilder/main.c [moved from contrib/randomDictBuilder/main.c with 79% similarity]
contrib/experimental_dict_builders/randomDictBuilder/random.c [moved from contrib/randomDictBuilder/random.c with 100% similarity]
contrib/experimental_dict_builders/randomDictBuilder/random.h [moved from contrib/randomDictBuilder/random.h with 100% similarity]
contrib/experimental_dict_builders/randomDictBuilder/test.sh [moved from contrib/randomDictBuilder/test.sh with 52% similarity]

diff --git a/contrib/benchmarkDictBuilder/README.md b/contrib/benchmarkDictBuilder/README.md
deleted file mode 100644 (file)
index b680a53..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-Benchmarking Dictionary Builder
-
-### Permitted Argument:
-Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
-
-###Running Test:
-make test
-
-###Usage:
-Benchmark given input files: make ARG= followed by permitted arguments
-
-### Examples:
-make ARG="in=../../lib/dictBuilder in=../../lib/compress"
-
-###Benchmarking Result:
-
-github:
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| random        | 0.182254      |  8.786957          |
-| cover         | 34.821007     |  10.430999         |
-| legacy        | 1.125494      |  8.989482          |
-
-hg-commands
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| random        | 0.089231      |  3.489515          |
-| cover         | 32.342462     |  4.030274          |
-| legacy        | 1.066594      |  3.911896          |
-
-hg-manifest
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| random        | 1.095083      |  2.309485          |
-| cover         | 517.999132    |  2.575331          |
-| legacy        | 10.789509     |  2.506775          |
-
-hg-changelog
-| Algorithm     | Speed(sec)    | Compression Ratio  |
-| ------------- |:-------------:| ------------------:|
-| random        | 0.639630      |  2.096785          |
-| cover         | 121.398023    |  2.175706          |
-| legacy        | 3.050893      |  2.058273          |
diff --git a/contrib/benchmarkDictBuilder/dictBuilder.h b/contrib/benchmarkDictBuilder/dictBuilder.h
deleted file mode 100644 (file)
index a2dae57..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-/*! ZDICT_trainFromBuffer_unsafe_legacy() :
-    Strictly Internal use only !!
-    Same as ZDICT_trainFromBuffer_legacy(), but does not control `samplesBuffer`.
-    `samplesBuffer` must be followed by noisy guard band to avoid out-of-buffer reads.
-    @return : size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
-              or an error code.
-*/
-size_t ZDICT_trainFromBuffer_unsafe_legacy(void* dictBuffer, size_t dictBufferCapacity,
-                                           const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
-                                           ZDICT_legacy_params_t parameters);
similarity index 76%
rename from contrib/benchmarkDictBuilder/Makefile
rename to contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile
index d36d96d527176ab86312ef896f64e945c3aa690b..72ce04f2a56bd2f4854b0290322b1c1084f53ce1 100644 (file)
@@ -2,7 +2,7 @@ ARG :=
 
 CC ?= gcc
 CFLAGS ?= -O3
-INCLUDES := -I ../randomDictBuilder -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
 
 RANDOM_FILE := ../randomDictBuilder/random.c
 IO_FILE := ../randomDictBuilder/io.c
@@ -34,11 +34,11 @@ io.o: $(IO_FILE)
        $(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE)
 
 libzstd.a:
-       $(MAKE) -C ../../lib libzstd.a
-       mv ../../lib/libzstd.a .
+       $(MAKE) -C ../../../lib libzstd.a
+       mv ../../../lib/libzstd.a .
 
 .PHONY: clean
 clean:
        rm -f *.o benchmark libzstd.a
-       $(MAKE) -C ../../lib clean
+       $(MAKE) -C ../../../lib clean
        echo "Cleaning is completed"
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md
new file mode 100644 (file)
index 0000000..de783a0
--- /dev/null
@@ -0,0 +1,47 @@
+Benchmarking Dictionary Builder
+
+### Permitted Argument:
+Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in="
+
+### Running Test:
+make test
+
+### Usage:
+Benchmark given input files: make ARG= followed by permitted arguments
+
+### Examples:
+make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
+
+### Benchmarking Result:
+
+github:
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000004      |  2.999642          |
+| random        | 0.180238      |  8.786957          |
+| cover         | 33.891987     |  10.430999         |
+| legacy        | 1.077569      |  8.989482          |
+
+hg-commands
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000006      |  2.425291          |
+| random        | 0.088735      |  3.489515          |
+| cover         | 35.447300     |  4.030274          |
+| legacy        | 1.048509      |  3.911896          |
+
+hg-manifest
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000005      |  1.866385          |
+| random        | 1.148231      |  2.309485          |
+| cover         | 509.685257    |  2.575331          |
+| legacy        | 10.705866     |  2.506775          |
+
+hg-changelog
+| Algorithm     | Speed(sec)    | Compression Ratio  |
+| ------------- |:-------------:| ------------------:|
+| nodict        | 0.000005      |  1.377613          |
+| random        | 0.706434      |  2.096785          |
+| cover         | 122.815783    |  2.175706          |
+| legacy        | 3.010318      |  2.058273          |
similarity index 53%
rename from contrib/benchmarkDictBuilder/benchmark.c
rename to contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c
index aabd96a085194a7997bed820963380a3a7bbb073..890afb8b464edf39efece0ec8b904ac7ea16ba5c 100644 (file)
@@ -44,12 +44,14 @@ static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
     exit(error);                                                          \
 }
 
+
 /*-*************************************
 *  Constants
 ***************************************/
 static const unsigned g_defaultMaxDictSize = 110 KB;
-#define MEMMULT 11
-#define NOISELENGTH 32
+#define DEFAULT_CLEVEL 3
+#define DEFAULT_DISPLAYLEVEL 2
+
 
 /*-*************************************
 *  Struct
@@ -60,57 +62,6 @@ typedef struct {
 } dictInfo;
 
 
-/*-*************************************
-*  Commandline related functions
-***************************************/
-static unsigned readU32FromChar(const char** stringPtr){
-    const char errorMsg[] = "error: numeric value too large";
-    unsigned result = 0;
-    while ((**stringPtr >='0') && (**stringPtr <='9')) {
-        unsigned const max = (((unsigned)(-1)) / 10) - 1;
-        if (result > max) exit(1);
-        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
-    }
-    if ((**stringPtr=='K') || (**stringPtr=='M')) {
-        unsigned const maxK = ((unsigned)(-1)) >> 10;
-        if (result > maxK) exit(1);
-        result <<= 10;
-        if (**stringPtr=='M') {
-            if (result > maxK) exit(1);
-            result <<= 10;
-        }
-        (*stringPtr)++;  /* skip `K` or `M` */
-        if (**stringPtr=='i') (*stringPtr)++;
-        if (**stringPtr=='B') (*stringPtr)++;
-    }
-    return result;
-}
-
-/** longCommandWArg() :
- *  check if *stringPtr is the same as longCommand.
- *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
- * @return 0 and doesn't modify *stringPtr otherwise.
- */
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
-    size_t const comSize = strlen(longCommand);
-    int const result = !strncmp(*stringPtr, longCommand, comSize);
-    if (result) *stringPtr += comSize;
-    return result;
-}
-
-static void fillNoise(void* buffer, size_t length)
-{
-    unsigned const prime1 = 2654435761U;
-    unsigned const prime2 = 2246822519U;
-    unsigned acc = prime1;
-    size_t p=0;;
-
-    for (p=0; p<length; p++) {
-        acc *= prime2;
-        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
-    }
-}
-
 /*-*************************************
 * Dictionary related operations
 ***************************************/
@@ -122,9 +73,9 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
                   ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams,
                   ZDICT_legacy_params_t *legacyParams) {
     unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel :
-                        coverParams ? coverParams->zParams.notificationLevel :
-                        legacyParams ? legacyParams->zParams.notificationLevel :
-                        0;   /* should never happen */
+                                  coverParams ? coverParams->zParams.notificationLevel :
+                                  legacyParams ? legacyParams->zParams.notificationLevel :
+                                  DEFAULT_DISPLAYLEVEL;   /* no dict */
     void* const dictBuffer = malloc(maxDictSize);
 
     dictInfo* dInfo;
@@ -140,21 +91,15 @@ dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize,
         }else if(coverParams) {
           dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer,
                                                 info->samplesSizes, info->nbSamples, coverParams);
-        } else {
-          size_t totalSize= 0;
-          for (int i = 0; i < info->nbSamples; i++) {
-            totalSize += info->samplesSizes[i];
-          }
-          size_t const maxMem = findMaxMem(totalSize * MEMMULT) / MEMMULT;
-          size_t loadedSize = (size_t) MIN ((unsigned long long)maxMem, totalSize);
-          fillNoise((char*)(info->srcBuffer) + loadedSize, NOISELENGTH);
-          dictSize = ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, maxDictSize, info->srcBuffer,
+        } else if(legacyParams) {
+          dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer,
                                                info->samplesSizes, info->nbSamples, *legacyParams);
+        } else {
+          dictSize = 0;
         }
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
             free(dictBuffer);
-            freeSampleInfo(info);
             return dInfo;
         }
         dInfo = (dictInfo *)malloc(sizeof(dictInfo));
@@ -173,6 +118,7 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
   /* Local variables */
   size_t totalCompressedSize = 0;
   size_t totalOriginalSize = 0;
+  unsigned hasDict = dInfo->dictSize > 0 ? 1 : 0;
   double cRatio;
   size_t dstCapacity;
   int i;
@@ -193,15 +139,6 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
     dst = malloc(dstCapacity);
   }
 
-  /* Create the cctx and cdict */
-  cctx = ZSTD_createCCtx();
-  cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel);
-
-  if(!cctx || !cdict || !dst) {
-    cRatio = -1;
-    goto _cleanup;
-  }
-
   /* Calculate offset for each sample */
   offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t));
   offsets[0] = 0;
@@ -209,13 +146,35 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
     offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1];
   }
 
+  /* Create the cctx */
+  cctx = ZSTD_createCCtx();
+  if(!cctx || !dst) {
+    cRatio = -1;
+    goto _nodictCleanup;
+  }
+
+  /* Create CDict if there's a dictionary stored on buffer */
+  if (hasDict) {
+    cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel);
+    if(!cdict) {
+      cRatio = -1;
+      goto _dictCleanup;
+    }
+  }
+
   /* Compress each sample and sum their sizes*/
   const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer;
   for (i = 0; i < srcInfo->nbSamples; i++) {
-    const size_t compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict);
+    size_t compressedSize;
+    if(hasDict) {
+      compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict);
+    } else {
+      compressedSize = ZSTD_compressCCtx(cctx, dst, dstCapacity,samples + offsets[i], srcInfo->samplesSizes[i], compressionLevel);
+    }
     if (ZSTD_isError(compressedSize)) {
       cRatio = -1;
-      goto _cleanup;
+      if(hasDict) goto _dictCleanup;
+      else goto _nodictCleanup;
     }
     totalCompressedSize += compressedSize;
   }
@@ -230,15 +189,14 @@ double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLev
   DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize);
   cRatio = (double)totalOriginalSize/(double)totalCompressedSize;
 
-_cleanup:
-  if(dst) {
-    free(dst);
-  }
-  if(offsets) {
-    free(offsets);
-  }
-  ZSTD_freeCCtx(cctx);
+_dictCleanup:
   ZSTD_freeCDict(cdict);
+
+_nodictCleanup:
+  free(dst);
+  free(offsets);
+  ZSTD_freeCCtx(cctx);
+
   return cRatio;
 }
 
@@ -257,102 +215,48 @@ void freeDictInfo(dictInfo* info) {
 /*-********************************************************
   *  Benchmarking functions
 **********************************************************/
-/** benchmarkRandom() :
- *  Measure how long random dictionary builder takes and compression ratio with the random dictionary
- *  @return 0 if benchmark successfully, 1 otherwise
- */
-int benchmarkRandom(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam) {
-  const int displayLevel = randomParam->zParams.notificationLevel;
-  int result = 0;
-  clock_t t;
-  t = clock();
-  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, NULL, NULL);
-  t = clock() - t;
-  double time_taken = ((double)t)/CLOCKS_PER_SEC;
-  if (!dInfo) {
-    DISPLAYLEVEL(1, "RANDOM does not train successfully\n");
-    result = 1;
-    goto _cleanup;
-  }
-  DISPLAYLEVEL(2, "RANDOM took %f seconds to execute \n", time_taken);
-
-  double cRatio = compressWithDict(srcInfo, dInfo, randomParam->zParams.compressionLevel, displayLevel);
-  if (cRatio < 0) {
-    DISPLAYLEVEL(1, "Compressing with RANDOM dictionary does not work\n");
-    result = 1;
-    goto _cleanup;
-  }
-  DISPLAYLEVEL(2, "Compression ratio with random dictionary is %f\n", cRatio);
-
-
-_cleanup:
-  freeDictInfo(dInfo);
-  return result;
-}
-
-/** benchmarkCover() :
- *  Measure how long random dictionary builder takes and compression ratio with the cover dictionary
+/** benchmarkDictBuilder() :
+ *  Measure how long a dictionary builder takes and compression ratio with the dictionary built
  *  @return 0 if benchmark successfully, 1 otherwise
  */
-int benchmarkCover(sampleInfo *srcInfo, unsigned maxDictSize,
-                ZDICT_cover_params_t *coverParam) {
-  const int displayLevel = coverParam->zParams.notificationLevel;
+int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam,
+                        ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam) {
+  /* Local variables */
+  const unsigned displayLevel = randomParam ? randomParam->zParams.notificationLevel :
+                                coverParam ? coverParam->zParams.notificationLevel :
+                                legacyParam ? legacyParam->zParams.notificationLevel :
+                                DEFAULT_DISPLAYLEVEL;   /* no dict */
+  const char* name = randomParam ? "RANDOM" :
+                    coverParam ? "COVER" :
+                    legacyParam ? "LEGACY" :
+                    "NODICT";    /* no dict */
+  const unsigned cLevel = randomParam ? randomParam->zParams.compressionLevel :
+                          coverParam ? coverParam->zParams.compressionLevel :
+                          legacyParam ? legacyParam->zParams.compressionLevel :
+                          DEFAULT_CLEVEL;   /* no dict */
   int result = 0;
-  clock_t t;
-  t = clock();
-  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, coverParam, NULL);
-  t = clock() - t;
-  double time_taken = ((double)t)/CLOCKS_PER_SEC;
-  if (!dInfo) {
-    DISPLAYLEVEL(1, "COVER does not train successfully\n");
-    result = 1;
-    goto _cleanup;
-  }
-  DISPLAYLEVEL(2, "COVER took %f seconds to execute \n", time_taken);
 
-  double cRatio = compressWithDict(srcInfo, dInfo, coverParam->zParams.compressionLevel, displayLevel);
-  if (cRatio < 0) {
-    DISPLAYLEVEL(1, "Compressing with COVER dictionary does not work\n");
-    result = 1;
-    goto _cleanup;
-  }
-  DISPLAYLEVEL(2, "Compression ratio with cover dictionary is %f\n", cRatio);
-
-_cleanup:
-  freeDictInfo(dInfo);
-  return result;
-}
-
-
-
-/** benchmarkLegacy() :
- *  Measure how long legacy dictionary builder takes and compression ratio with the legacy dictionary
- *  @return 0 if benchmark successfully, 1 otherwise
- */
-int benchmarkLegacy(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_legacy_params_t *legacyParam) {
-  const int displayLevel = legacyParam->zParams.notificationLevel;
-  int result = 0;
-  clock_t t;
-  t = clock();
-  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, NULL, NULL, legacyParam);
-  t = clock() - t;
-  double time_taken = ((double)t)/CLOCKS_PER_SEC;
+  /* Calculate speed */
+  const UTIL_time_t begin = UTIL_getTime();
+  dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam);
+  const U64 timeMicro = UTIL_clockSpanMicro(begin);
+  const double timeSec = timeMicro / (double)SEC_TO_MICRO;
   if (!dInfo) {
-    DISPLAYLEVEL(1, "LEGACY does not train successfully\n");
+    DISPLAYLEVEL(1, "%s does not train successfully\n", name);
     result = 1;
     goto _cleanup;
-
   }
-  DISPLAYLEVEL(2, "LEGACY took %f seconds to execute \n", time_taken);
+  DISPLAYLEVEL(2, "%s took %f seconds to execute \n", name, timeSec);
 
-  double cRatio = compressWithDict(srcInfo, dInfo, legacyParam->zParams.compressionLevel, displayLevel);
+  /* Calculate compression ratio */
+  double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel);
   if (cRatio < 0) {
-    DISPLAYLEVEL(1, "Compressing with LEGACY dictionary does not work\n");
+    DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name);
     result = 1;
     goto _cleanup;
 
   }
-  DISPLAYLEVEL(2, "Compression ratio with legacy dictionary is %f\n", cRatio);
+  DISPLAYLEVEL(2, "Compression ratio with %s dictionary is %f\n", name, cRatio);
 
 _cleanup:
   freeDictInfo(dInfo);
@@ -363,15 +267,16 @@ _cleanup:
 
 int main(int argCount, const char* argv[])
 {
-  int displayLevel = 2;
+  const int displayLevel = DEFAULT_DISPLAYLEVEL;
   const char* programName = argv[0];
   int result = 0;
+
   /* Initialize arguments to default values */
-  unsigned k = 200;
-  unsigned d = 6;
-  unsigned cLevel = 3;
-  unsigned dictID = 0;
-  unsigned maxDictSize = g_defaultMaxDictSize;
+  const unsigned k = 200;
+  const unsigned d = 6;
+  const unsigned cLevel = DEFAULT_CLEVEL;
+  const unsigned dictID = 0;
+  const unsigned maxDictSize = g_defaultMaxDictSize;
 
   /* Initialize table to store input files */
   const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*));
@@ -379,7 +284,7 @@ int main(int argCount, const char* argv[])
 
   char* fileNamesBuf = NULL;
   unsigned fileNamesNb = filenameIdx;
-  int followLinks = 0;
+  const int followLinks = 0;
   const char** extendedFileList = NULL;
 
   /* Parse arguments */
@@ -394,7 +299,6 @@ int main(int argCount, const char* argv[])
     return 1;
   }
 
-
   /* Get the list of all files recursively (because followLinks==0)*/
   extendedFileList = UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf,
                                         &fileNamesNb, followLinks);
@@ -406,6 +310,7 @@ int main(int argCount, const char* argv[])
     filenameIdx = fileNamesNb;
   }
 
+  /* get sampleInfo */
   size_t blockSize = 0;
   sampleInfo* srcInfo= getSampleInfo(filenameTable,
                     filenameIdx, blockSize, maxDictSize, displayLevel);
@@ -416,38 +321,53 @@ int main(int argCount, const char* argv[])
   zParams.notificationLevel = displayLevel;
   zParams.dictID = dictID;
 
+  /* with no dict */
+  {
+    const int noDictResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, NULL);
+    if(noDictResult) {
+      result = 1;
+      goto _cleanup;
+    }
+  }
+
   /* for random */
-  ZDICT_random_params_t randomParam;
-  randomParam.zParams = zParams;
-  randomParam.k = k;
-  int randomResult = benchmarkRandom(srcInfo, maxDictSize, &randomParam);
-  if(randomResult) {
-    result = 1;
-    goto _cleanup;
+  {
+    ZDICT_random_params_t randomParam;
+    randomParam.zParams = zParams;
+    randomParam.k = k;
+    const int randomResult = benchmarkDictBuilder(srcInfo, maxDictSize, &randomParam, NULL, NULL);
+    if(randomResult) {
+      result = 1;
+      goto _cleanup;
+    }
   }
 
   /* for cover */
-  ZDICT_cover_params_t coverParam;
-  memset(&coverParam, 0, sizeof(coverParam));
-  coverParam.zParams = zParams;
-  coverParam.splitPoint = 1.0;
-  coverParam.d = d;
-  coverParam.steps = 40;
-  coverParam.nbThreads = 1;
-  int coverOptResult = benchmarkCover(srcInfo, maxDictSize, &coverParam);
-  if(coverOptResult) {
-    result = 1;
-    goto _cleanup;
+  {
+    ZDICT_cover_params_t coverParam;
+    memset(&coverParam, 0, sizeof(coverParam));
+    coverParam.zParams = zParams;
+    coverParam.splitPoint = 1.0;
+    coverParam.d = d;
+    coverParam.steps = 40;
+    coverParam.nbThreads = 1;
+    const int coverOptResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, &coverParam, NULL);
+    if(coverOptResult) {
+      result = 1;
+      goto _cleanup;
+    }
   }
 
   /* for legacy */
-  ZDICT_legacy_params_t legacyParam;
-  legacyParam.zParams = zParams;
-  legacyParam.selectivityLevel = 9;
-  int legacyResult = benchmarkLegacy(srcInfo, maxDictSize, &legacyParam);
-  if(legacyResult) {
-    result = 1;
-    goto _cleanup;
+  {
+    ZDICT_legacy_params_t legacyParam;
+    legacyParam.zParams = zParams;
+    legacyParam.selectivityLevel = 9;
+    const int legacyResult = benchmarkDictBuilder(srcInfo, maxDictSize, NULL, NULL, &legacyParam);
+    if(legacyResult) {
+      result = 1;
+      goto _cleanup;
+    }
   }
 
   /* Free allocated memory */
diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h b/contrib/experimental_dict_builders/benchmarkDictBuilder/dictBuilder.h
new file mode 100644 (file)
index 0000000..781ec8c
--- /dev/null
@@ -0,0 +1,6 @@
+/* ZDICT_trainFromBuffer_legacy() :
+ * issue : samplesBuffer needs to be followed by a noisy guard band.
+ * workaround : duplicate the buffer, and add the noise */
+size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
+                                    const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
+                                    ZDICT_legacy_params_t params);
similarity index 54%
rename from contrib/benchmarkDictBuilder/test.sh
rename to contrib/experimental_dict_builders/benchmarkDictBuilder/test.sh
index 6354784e445e89daddcc3f565b0978e3c3429a55..5eaf5930a3c630651b34a7a7c144dd6e8c721616 100644 (file)
@@ -1,2 +1,2 @@
 echo "Benchmark with in=../../lib/common"
-./benchmark in=../../lib/common
+./benchmark in=../../../lib/common
similarity index 79%
rename from contrib/randomDictBuilder/Makefile
rename to contrib/experimental_dict_builders/randomDictBuilder/Makefile
index 5f9240bf6199b690fc1cbb50e74f85cb559f9d28..bbd40e47c312630591923ab31beeed630f582e7c 100644 (file)
@@ -2,9 +2,9 @@ ARG :=
 
 CC ?= gcc
 CFLAGS ?= -O3
-INCLUDES := -I ../../programs -I ../../lib/common -I ../../lib -I ../../lib/dictBuilder
+INCLUDES := -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder
 
-TEST_INPUT := ../../lib
+TEST_INPUT := ../../../lib
 TEST_OUTPUT := randomDict
 
 all: main run clean
@@ -30,8 +30,8 @@ io.o: io.c
        $(CC) $(CFLAGS) $(INCLUDES) -c io.c
 
 libzstd.a:
-       $(MAKE) -C ../../lib libzstd.a
-       mv ../../lib/libzstd.a .
+       $(MAKE) -C ../../../lib libzstd.a
+       mv ../../../lib/libzstd.a .
 
 .PHONY: testrun
 testrun: main
@@ -48,5 +48,5 @@ testshell: test.sh
 .PHONY: clean
 clean:
        rm -f *.o main libzstd.a
-       $(MAKE) -C ../../lib clean
+       $(MAKE) -C ../../../lib clean
        echo "Cleaning is completed"
similarity index 85%
rename from contrib/randomDictBuilder/README.md
rename to contrib/experimental_dict_builders/randomDictBuilder/README.md
index 0e70d3dccc7aa5777572f17d60512b0cad5996e4..da12a4280541cd477fc10b6962463f7bbf840281 100644 (file)
@@ -16,5 +16,5 @@ To build a random dictionary with the provided arguments: make ARG= followed by
 
 
 ### Examples:
-make ARG="in=../../lib/dictBuilder out=dict100 dictID=520"
-make ARG="in=../../lib/dictBuilder in=../../lib/compress"
+make ARG="in=../../../lib/dictBuilder out=dict100 dictID=520"
+make ARG="in=../../../lib/dictBuilder in=../../../lib/compress"
similarity index 89%
rename from contrib/randomDictBuilder/io.c
rename to contrib/experimental_dict_builders/randomDictBuilder/io.c
index 1217b574747d8945a18fb566ed06163c70b3e470..bfe39eaed6b1edefae5ff9ea11ae4b42b7716383 100644 (file)
@@ -53,6 +53,39 @@ static const size_t g_maxMemory = (sizeof(size_t) == 4) ?
 #define NOISELENGTH 32
 
 
+/*-*************************************
+*  Commandline related functions
+***************************************/
+unsigned readU32FromChar(const char** stringPtr){
+    const char errorMsg[] = "error: numeric value too large";
+    unsigned result = 0;
+    while ((**stringPtr >='0') && (**stringPtr <='9')) {
+        unsigned const max = (((unsigned)(-1)) / 10) - 1;
+        if (result > max) exit(1);
+        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
+    }
+    if ((**stringPtr=='K') || (**stringPtr=='M')) {
+        unsigned const maxK = ((unsigned)(-1)) >> 10;
+        if (result > maxK) exit(1);
+        result <<= 10;
+        if (**stringPtr=='M') {
+            if (result > maxK) exit(1);
+            result <<= 10;
+        }
+        (*stringPtr)++;  /* skip `K` or `M` */
+        if (**stringPtr=='i') (*stringPtr)++;
+        if (**stringPtr=='B') (*stringPtr)++;
+    }
+    return result;
+}
+
+unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
+    size_t const comSize = strlen(longCommand);
+    int const result = !strncmp(*stringPtr, longCommand, comSize);
+    if (result) *stringPtr += comSize;
+    return result;
+}
+
 
 /* ********************************************************
 *  File related operations
similarity index 78%
rename from contrib/randomDictBuilder/io.h
rename to contrib/experimental_dict_builders/randomDictBuilder/io.h
index e2f454c2631bfb32da60594b005d8b38835872e5..0ee24604eed2c05c99c430ee9c9cef8330080c13 100644 (file)
@@ -50,5 +50,11 @@ void freeSampleInfo(sampleInfo *info);
 void saveDict(const char* dictFileName, const void* buff, size_t buffSize);
 
 
+unsigned readU32FromChar(const char** stringPtr);
 
-size_t findMaxMem(unsigned long long requiredMem);
+/** longCommandWArg() :
+ *  check if *stringPtr is the same as longCommand.
+ *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
+ * @return 0 and doesn't modify *stringPtr otherwise.
+ */
+unsigned longCommandWArg(const char** stringPtr, const char* longCommand);
similarity index 79%
rename from contrib/randomDictBuilder/main.c
rename to contrib/experimental_dict_builders/randomDictBuilder/main.c
index 4751a9e1c8e5f10ea6c3deac1ed064dc0efa74bb..3f3a6ca70187e41fe83bc4bf7265b73c38e02f1f 100644 (file)
@@ -52,46 +52,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB;
 
 
 
-/*-*************************************
-*  Commandline related functions
-***************************************/
-static unsigned readU32FromChar(const char** stringPtr){
-    const char errorMsg[] = "error: numeric value too large";
-    unsigned result = 0;
-    while ((**stringPtr >='0') && (**stringPtr <='9')) {
-        unsigned const max = (((unsigned)(-1)) / 10) - 1;
-        if (result > max) exit(1);
-        result *= 10, result += **stringPtr - '0', (*stringPtr)++ ;
-    }
-    if ((**stringPtr=='K') || (**stringPtr=='M')) {
-        unsigned const maxK = ((unsigned)(-1)) >> 10;
-        if (result > maxK) exit(1);
-        result <<= 10;
-        if (**stringPtr=='M') {
-            if (result > maxK) exit(1);
-            result <<= 10;
-        }
-        (*stringPtr)++;  /* skip `K` or `M` */
-        if (**stringPtr=='i') (*stringPtr)++;
-        if (**stringPtr=='B') (*stringPtr)++;
-    }
-    return result;
-}
-
-/** longCommandWArg() :
- *  check if *stringPtr is the same as longCommand.
- *  If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand.
- * @return 0 and doesn't modify *stringPtr otherwise.
- */
-static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){
-    size_t const comSize = strlen(longCommand);
-    int const result = !strncmp(*stringPtr, longCommand, comSize);
-    if (result) *stringPtr += comSize;
-    return result;
-}
-
-
-
 /*-*************************************
 *  RANDOM
 ***************************************/
similarity index 52%
rename from contrib/randomDictBuilder/test.sh
rename to contrib/experimental_dict_builders/randomDictBuilder/test.sh
index 497820f8822cc6b24512c9936445474d478e6de4..1eb732e52a0931fb75ed2516b686c424479001b6 100644 (file)
@@ -1,12 +1,12 @@
 echo "Building random dictionary with in=../../lib/common k=200 out=dict1"
-./main in=../../lib/common k=200 out=dict1
-zstd -be3 -D dict1 -r ../../lib/common -q
+./main in=../../../lib/common k=200 out=dict1
+zstd -be3 -D dict1 -r ../../../lib/common -q
 echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000"
-./main in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
-zstd -be3 -D dict2 -r ../../lib/common -q
+./main in=../../../lib/common k=500 out=dict2 dictID=100 maxdict=140000
+zstd -be3 -D dict2 -r ../../../lib/common -q
 echo "Building random dictionary with 2 sample sources"
-./main in=../../lib/common in=../../lib/compress out=dict3
-zstd -be3 -D dict3 -r ../../lib/common -q
+./main in=../../../lib/common in=../../../lib/compress out=dict3
+zstd -be3 -D dict3 -r ../../../lib/common -q
 echo "Removing dict1 dict2 dict3"
 rm -f dict1 dict2 dict3