]> git.ipfire.org Git - thirdparty/zstd.git/commitdiff
dictBuilder fails to create dictionary on certain input 626/head
author Yann Collet <cyan@fb.com>
Thu, 23 Mar 2017 23:24:02 +0000 (16:24 -0700)
committer Yann Collet <cyan@fb.com>
Thu, 23 Mar 2017 23:24:02 +0000 (16:24 -0700)
The failure is now properly reported with an error code (see zstd_errors.h)
and a non-zero CLI return code.

lib/common/error_private.c
lib/common/zstd_errors.h
lib/dictBuilder/zdict.c
programs/zstdcli.c
tests/playTests.sh

index a0fa1724aee8b823ff8395aa86cfb811024c85d1..44ae2010418449b5cd45bf1909a981d7535b18f8 100644 (file)
@@ -37,6 +37,7 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
     case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
     case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+    case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
     case PREFIX(maxCode):
     default: return notErrorCode;
     }
index 949dbd0fffac9b5827eae5300efa7114c9b186ec..3d579d969363035efb2b65c45d330adc0b420a28 100644 (file)
@@ -57,6 +57,7 @@ typedef enum {
   ZSTD_error_maxSymbolValue_tooSmall,
   ZSTD_error_dictionary_corrupted,
   ZSTD_error_dictionary_wrong,
+  ZSTD_error_dictionaryCreation_failed,
   ZSTD_error_maxCode
 } ZSTD_ErrorCode;
 
index c84195791e66c81f7e1ba280b8141e4ac800444b..842167db6acc8d753397921b2d2a0b5a7d59d9e4 100644 (file)
@@ -62,8 +62,9 @@
 #define MINRATIO 4
 static const int g_compressionLevel_default = 6;
 static const U32 g_selectivity_default = 9;
-static const size_t g_provision_entropySize = 200;
+static const size_t g_provision_entropySize = 192;
 static const size_t g_min_fast_dictContent = 192;
+static const size_t g_dictContentSize_min = 32;
 
 
 /*-*************************************
@@ -929,8 +930,8 @@ size_t ZDICT_trainFromBuffer_unsafe(
 
     /* checks */
     if (!dictList) return ERROR(memory_allocation);
-    if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }
-    if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return 0; }   /* not enough source to create dictionary */
+    if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) { free(dictList); return ERROR(dstSize_tooSmall); }   /* requested dictionary size is too small */
+    if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* not enough source to create dictionary */
 
     /* init */
     ZDICT_initDictItem(dictList);
@@ -963,6 +964,7 @@ size_t ZDICT_trainFromBuffer_unsafe(
 
     /* create dictionary */
     {   U32 dictContentSize = ZDICT_dictSize(dictList);
+        if (dictContentSize < g_dictContentSize_min) { free(dictList); return ERROR(dictionaryCreation_failed); }   /* dictionary content too small */
         if (dictContentSize < targetDictSize/3) {
             DISPLAYLEVEL(2, "!  warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
             if (minRep > MINRATIO) {
index 4d7fbb357e9fe41a18eb9222542f2ff7fd6439f4..281301bd1cd2c912be1e090cb169c97aafe15bd3 100644 (file)
@@ -629,7 +629,7 @@ int main(int argCount, const char* argv[])
             coverParams.compressionLevel = dictCLevel;
             coverParams.notificationLevel = g_displayLevel;
             coverParams.dictID = dictID;
-            DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
         } else {
             ZDICT_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
@@ -637,7 +637,7 @@ int main(int argCount, const char* argv[])
             dictParams.selectivityLevel = dictSelect;
             dictParams.notificationLevel = g_displayLevel;
             dictParams.dictID = dictID;
-            DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
         }
 #endif
         goto _end;
index e98e0f44e6314b9bb750620e39702c240505f9ca..897a901554cf733d0beaf7bf15f370ed78e7b32c 100755 (executable)
@@ -281,6 +281,11 @@ case "$UNAME" in
   *) $MD5SUM -c tmph1 ;;
 esac
 rm -rf dirTestDict
+$ECHO "- dictionary builder on bogus input"
+$ECHO "Hello World" > tmp
+$ZSTD --train -q tmp && die "Dictionary training should fail : not enough input source"
+./datagen -P0 -g10M > tmp
+$ZSTD --train -q tmp && die "Dictionary training should fail : source is pure noise"
 rm tmp*