added tutorial warning messages for dictBuilder

author Yann Collet <yann.collet.73@gmail.com>

Wed, 27 Jul 2016 10:35:29 +0000 (12:35 +0200)

committer Yann Collet <yann.collet.73@gmail.com>

Wed, 27 Jul 2016 10:43:09 +0000 (12:43 +0200)
author Yann Collet <yann.collet.73@gmail.com>
Wed, 27 Jul 2016 10:35:29 +0000 (12:35 +0200)
committer Yann Collet <yann.collet.73@gmail.com>
Wed, 27 Jul 2016 10:43:09 +0000 (12:43 +0200)
diff --git a/.gitignore b/.gitignore

index e7c9a5687fb93adda502e3e0ab8ddfa47f31c5c2..0c458153c566cc87370b936a072bc482e7ebcdc7 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,7 @@ projects/cmake/
  
  # Test artefacts
  tmp*
+dictionary
  
  # tmp files
  *.swp
diff --git a/NEWS b/NEWS

index 7ffa4023f179507d014a09c514b5643c3b1e7d74..d01a331314aab6e0447fa75cdf40af50202d057b 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,6 @@
  v0.8.0
  New : updated compresson format
+Improved : better speed on clang and gcc -O2, thanks to Eric Biggers
  Fixed : legacy mode with ZSTD_HEAPMODE=0, by Christopher Bergqvist
  Fixed : premature end of frame when zero-sized raw block, reported by Eric Biggers
  Fixed : checksum correctly checked in single-pass mode
diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c

index f151855565986c79bf0807859d28145bb25f3530..75a9b1e339adb01f04563df039bd690b35ee7666 100644 (file)
--- a/lib/dictBuilder/zdict.c
+++ b/lib/dictBuilder/zdict.c
@@ -924,7 +924,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
                              const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
                              ZDICT_params_t params)
  {
-    U32 const dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
+    U32 const dictListSize = MAX(MAX(DICTLISTSIZE, nbSamples), (U32)(maxDictSize/16));
      dictItem* const dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
      unsigned selectivity = params.selectivityLevel;
      size_t const targetDictSize = maxDictSize;
@@ -957,17 +957,25 @@ size_t ZDICT_trainFromBuffer_unsafe(
              DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
              DISPLAYLEVEL(3, "list %u best segments \n", nb);
              for (u=1; u<=nb; u++) {
-                U32 p = dictList[u].pos;
-                U32 l = dictList[u].length;
-                U32 d = MIN(40, l);
+                U32 pos = dictList[u].pos;
+                U32 length = dictList[u].length;
+                U32 printedLength = MIN(40, length);
                  DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
-                             u, l, p, dictList[u].savings);
-                ZDICT_printHex(3, (const char*)samplesBuffer+p, d);
+                             u, length, pos, dictList[u].savings);
+                ZDICT_printHex(3, (const char*)samplesBuffer+pos, printedLength);
                  DISPLAYLEVEL(3, "| \n");
      }   }   }
  
      /* create dictionary */
      {   U32 dictContentSize = ZDICT_dictSize(dictList);
+        U64 const totalSamplesSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
+        if (dictContentSize < targetDictSize/2) {
+            DISPLAYLEVEL(2, "!  warning : created dictionary significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
+            DISPLAYLEVEL(2, "!  consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+1);
+            DISPLAYLEVEL(2, "!  note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
+            if (totalSamplesSize < 10 * targetDictSize)
+                DISPLAYLEVEL(2, "!  consider also increasing the number of samples (total size : %u MB)\n", (U32)(totalSamplesSize>>20));
+        }
  
          /* build dict content */
          {   U32 u;
diff --git a/programs/dibio.c b/programs/dibio.c

index a61ea9cc6a4c1f61827fc11f3fcd84120eb94380..cb864ec1d1153c7142168951c96611123b4d5e98 100644 (file)
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -202,9 +202,16 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
  
      /* Checks */
      if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
+    g_displayLevel = params.notificationLevel;
+    if (nbFiles < 5) {
+        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing \n");
+        DISPLAYLEVEL(2, "!  Please provide one file per sample \n");
+        DISPLAYLEVEL(2, "!  Avoid concatenating multiple samples into a single file \n");
+        DISPLAYLEVEL(2, "!  otherwise, dictBuilder will be unable to find the beginning of each sample \n");
+        DISPLAYLEVEL(2, "!  resulting in distorted statistics \n");
+    }
  
      /* init */
-    g_displayLevel = params.notificationLevel;
      if (benchedSize < totalSizeToLoad)
          DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
author	Yann Collet <yann.collet.73@gmail.com>
	Wed, 27 Jul 2016 10:35:29 +0000 (12:35 +0200)
committer	Yann Collet <yann.collet.73@gmail.com>
	Wed, 27 Jul 2016 10:43:09 +0000 (12:43 +0200)
.gitignore		patch | blob | blame | history
NEWS		patch | blob | blame | history
lib/dictBuilder/zdict.c		patch | blob | blame | history
programs/dibio.c		patch | blob | blame | history