largeNbDicts can compress and compare dict vs noDict

author Yann Collet <cyan@fb.com>

Tue, 28 Aug 2018 00:08:44 +0000 (17:08 -0700)

committer Yann Collet <cyan@fb.com>

Tue, 28 Aug 2018 00:08:44 +0000 (17:08 -0700)
author Yann Collet <cyan@fb.com>
Tue, 28 Aug 2018 00:08:44 +0000 (17:08 -0700)
committer Yann Collet <cyan@fb.com>
Tue, 28 Aug 2018 00:08:44 +0000 (17:08 -0700)
diff --git a/contrib/largeNbDicts/Makefile b/contrib/largeNbDicts/Makefile

index 082f0102a34f82c2eaf8914ea333e1ed1887a875..026d76f125b451ab82defb12b1d04be7f4577772 100644 (file)
--- a/contrib/largeNbDicts/Makefile
+++ b/contrib/largeNbDicts/Makefile
@@ -21,6 +21,8 @@ CFLAGS  += $(DEBUGFLAGS) $(MOREFLAGS)
  
  default: largeNbDicts
  
+all : largeNbDicts
+
  largeNbDicts: LDFLAGS += -lzstd
  largeNbDicts: largeNbDicts.c
         $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
diff --git a/contrib/largeNbDicts/largeNbDicts b/contrib/largeNbDicts/largeNbDicts

index 40416f050fa594cb085444ebb1ca433629d59318..c057a2b78aa551a2de831b6e304f8747a6ea3d0f 100755 (executable)

Binary files a/contrib/largeNbDicts/largeNbDicts and b/contrib/largeNbDicts/largeNbDicts differ
diff --git a/contrib/largeNbDicts/largeNbDicts.c b/contrib/largeNbDicts/largeNbDicts.c

index 749d9660d7ed9aedea8f0ddf0904f516b2dd1c8c..536e45ffeff090e0c153caba269f5926191d2317 100644 (file)
--- a/contrib/largeNbDicts/largeNbDicts.c
+++ b/contrib/largeNbDicts/largeNbDicts.c
@@ -159,6 +159,33 @@ buffer_collection_t splitBuffer(buffer_t srcBuffer, size_t blockSize)
  }
  
  
+/*---  dictionary creation  ---*/
+
+/* createDictionary() :
+ * dictionary : optional file name; when NULL, a dictionary is trained from the blocks.
+ * @return : buffer holding the dictionary; ptr==NULL if allocation or training failed. */
+buffer_t createDictionary(const char* dictionary,
+                        const void* srcBuffer, size_t* srcBlockSizes, unsigned nbBlocks)
+{
+    if (dictionary) {
+        DISPLAYLEVEL(3, "loading dictionary %s \n", dictionary);
+        return createBuffer_fromFile(dictionary);
+    }
+    DISPLAYLEVEL(3, "creating dictionary, of target size %u bytes \n", DICTSIZE);
+    buffer_t result;
+    result.ptr = NULL; result.capacity = 0; result.size = 0;   /* failure value */
+    void* const dictBuffer = malloc(DICTSIZE);
+    if (dictBuffer == NULL) return result;
+
+    size_t const dictSize = ZDICT_trainFromBuffer(dictBuffer, DICTSIZE,
+                                srcBuffer, srcBlockSizes, nbBlocks);
+    /* an error code must never be published as a dictionary size */
+    if (ZDICT_isError(dictSize)) { free(dictBuffer); return result; }
+
+    result.ptr = dictBuffer; result.capacity = DICTSIZE; result.size = dictSize;
+    return result;
+}
+
  
  /*---  ddict_collection_t  ---*/
  
@@ -181,6 +208,7 @@ static void freeDDictCollection(ddict_collection_t ddictc)
  static ddict_collection_t createDDictCollection(const void* dictBuffer, size_t dictSize, size_t nbDDict)
  {
      ZSTD_DDict** const ddicts = malloc(nbDDict * sizeof(ZSTD_DDict*));
+    assert(ddicts != NULL);
      if (ddicts==NULL) return kNullDDictCollection;
      for (size_t dictNb=0; dictNb < nbDDict; dictNb++) {
          ddicts[dictNb] = ZSTD_createDDict(dictBuffer, dictSize);
@@ -193,29 +221,64 @@ static ddict_collection_t createDDictCollection(const void* dictBuffer, size_t d
  }
  
  
+/* ---   Compression  --- */
  
-/*---  Benchmark  --- */
+/* compressBlocks() :
+ * compresses every src block into its matching dst block,
+ * with cdict when provided, else without dictionary at cLevel.
+ * @return : total compressed size of all blocks, or 0 if error. */
+static size_t compressBlocks(buffer_collection_t dstBlockBuffers, buffer_collection_t srcBlockBuffers, ZSTD_CDict* cdict, int cLevel)
+{
+    size_t const nbBlocks = srcBlockBuffers.nbBuffers;
+    assert(dstBlockBuffers.nbBuffers == srcBlockBuffers.nbBuffers);
+
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    if (cctx == NULL) return 0;
+
+    size_t totalCSize = 0;
+    for (size_t blockNb=0; blockNb < nbBlocks; blockNb++) {
+        size_t cBlockSize;
+        if (cdict == NULL) {
+            cBlockSize = ZSTD_compressCCtx(cctx,
+                            dstBlockBuffers.buffers[blockNb], dstBlockBuffers.capacities[blockNb],
+                            srcBlockBuffers.buffers[blockNb], srcBlockBuffers.capacities[blockNb],
+                            cLevel);
+        } else {
+            cBlockSize = ZSTD_compress_usingCDict(cctx,
+                            dstBlockBuffers.buffers[blockNb], dstBlockBuffers.capacities[blockNb],
+                            srcBlockBuffers.buffers[blockNb], srcBlockBuffers.capacities[blockNb],
+                            cdict);
+        }
+        if (ZSTD_isError(cBlockSize)) { totalCSize = 0; break; }   /* an error code is not a size */
+        totalCSize += cBlockSize;
+    }
+    ZSTD_freeCCtx(cctx);   /* release the context on every path */
+    return totalCSize;
+}
+
+
+/* ---  Benchmark  --- */
  
  
  /* bench() :
+ * fileName : file to load for benchmarking purpose
+ * dictionary : optional (can be NULL), file to load as dictionary,
+ *              if none provided : will be calculated on the fly by the program.
   * @return : 0 is success, 1+ otherwise */
-int bench(const char* fileName)
+int bench(const char* fileName, const char* dictionary)
  {
      int result = 0;
  
      DISPLAYLEVEL(3, "loading %s... \n", fileName);
      buffer_t const srcBuffer = createBuffer_fromFile(fileName);
-    if (srcBuffer.ptr == NULL) {
-        DISPLAYLEVEL(1," error reading file %s \n", fileName);
-        return 1;
-    }
+    assert(srcBuffer.ptr != NULL);
      DISPLAYLEVEL(3, "created src buffer of size %.1f MB \n",
                      (double)(srcBuffer.size) / (1 MB));
  
      buffer_collection_t const srcBlockBuffers = splitBuffer(srcBuffer, BLOCKSIZE);
      assert(srcBlockBuffers.buffers != NULL);
      unsigned const nbBlocks = (unsigned)srcBlockBuffers.nbBuffers;
-    DISPLAYLEVEL(3, "splitting input into %u blocks of max size %u bytes \n",
+    DISPLAYLEVEL(3, "split input into %u blocks of max size %u bytes \n",
                      nbBlocks, BLOCKSIZE);
  
      size_t const dstBlockSize = ZSTD_compressBound(BLOCKSIZE);
@@ -230,36 +293,44 @@ int bench(const char* fileName)
      buffer_collection_t const dstBlockBuffers = splitBuffer(dstBuffer, dstBlockSize);
      assert(dstBlockBuffers.buffers != NULL);
  
-    DISPLAYLEVEL(3, "creating dictionary, of target size %u bytes \n", DICTSIZE);
-    void* const dictBuffer = malloc(DICTSIZE);
-    if (dictBuffer == NULL) { result = 1; goto _cleanup; }
-
-    size_t const dictSize = ZDICT_trainFromBuffer(dictBuffer, DICTSIZE,
-                                                srcBuffer.ptr,
-                                                srcBlockBuffers.capacities,
-                                                nbBlocks);
-    if (ZSTD_isError(dictSize)) {
-        DISPLAYLEVEL(1, "error creating dictionary \n");
-        result = 1;
-        goto _cleanup;
-    }
+    /* dictionary determination */
+    buffer_t const dictBuffer = createDictionary(dictionary,
+                                srcBuffer.ptr,
+                                srcBlockBuffers.capacities, nbBlocks);
+    assert(dictBuffer.ptr != NULL);
+
+    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuffer.ptr, dictBuffer.size, COMP_LEVEL);
+    assert(cdict != NULL);
+
+    size_t const cTotalSizeNoDict = compressBlocks(dstBlockBuffers, srcBlockBuffers, NULL, COMP_LEVEL);
+    assert(cTotalSizeNoDict != 0);
+    DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f  (%u bytes) \n",
+                    COMP_LEVEL,
+                    (double)srcBuffer.size / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
+
+    size_t const cTotalSize = compressBlocks(dstBlockBuffers, srcBlockBuffers, cdict, COMP_LEVEL);
+    assert(cTotalSize != 0);
+    DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f  (%u bytes) \n",
+                    (unsigned)dictBuffer.size,
+                    (double)srcBuffer.size / cTotalSize, (unsigned)cTotalSize);
  
-    size_t const dictMem = ZSTD_estimateDDictSize(dictSize, ZSTD_dlm_byCopy);
+    size_t const dictMem = ZSTD_estimateDDictSize(dictBuffer.size, ZSTD_dlm_byCopy);
      size_t const allDictMem = dictMem * nbBlocks;
      DISPLAYLEVEL(3, "generating %u dictionaries, using %.1f MB of memory \n",
                      nbBlocks, (double)allDictMem / (1 MB));
  
-    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuffer, dictSize, COMP_LEVEL);
-    do {
-        ddict_collection_t const dictionaries = createDDictCollection(dictBuffer, dictSize, nbBlocks);
-        assert(dictionaries.ddicts != NULL);
+    ddict_collection_t const dictionaries = createDDictCollection(dictBuffer.ptr, dictBuffer.size, nbBlocks);
+    assert(dictionaries.ddicts != NULL);
  
-        freeDDictCollection(dictionaries);
-    } while(0);
-    ZSTD_freeCDict(cdict);
  
-_cleanup:
-    free(dictBuffer);
+
+    //result = benchMem(srcBlockBuffers, dstBlockBuffers, dictionaries);;
+
+
+
+    freeDDictCollection(dictionaries);
+    ZSTD_freeCDict(cdict);
+    freeBuffer(dictBuffer);
      freeCollection(dstBlockBuffers);
      freeBuffer(dstBuffer);
      freeCollection(srcBlockBuffers);
@@ -276,7 +347,7 @@ _cleanup:
  int bad_usage(const char* exeName)
  {
      DISPLAY (" bad usage : \n");
-    DISPLAY (" %s filename \n", exeName);
+    DISPLAY (" %s filename [-D dictionary] \n", exeName);
      return 1;
  }
  
@@ -284,6 +355,15 @@ int main (int argc, const char** argv)
  {
      const char* const exeName = argv[0];
  
-    if (argc != 2) return bad_usage(exeName);
-    return bench(argv[1]);
+    if (argc < 2) return bad_usage(exeName);
+    const char* const fileName = argv[1];
+
+    const char* dictionary = NULL;
+    if (argc > 2) {
+        if (argc != 4) return bad_usage(exeName);
+        if (strcmp(argv[2], "-D")) return bad_usage(exeName);
+        dictionary = argv[3];
+    }
+
+    return bench(fileName, dictionary);
  }
author	Yann Collet <cyan@fb.com>
	Tue, 28 Aug 2018 00:08:44 +0000 (17:08 -0700)
committer	Yann Collet <cyan@fb.com>
	Tue, 28 Aug 2018 00:08:44 +0000 (17:08 -0700)
contrib/largeNbDicts/Makefile		patch \| blob \| blame \| history
contrib/largeNbDicts/largeNbDicts		patch \| blob \| blame \| history
contrib/largeNbDicts/largeNbDicts.c		patch \| blob \| blame \| history