From: Yann Collet Date: Sun, 18 Dec 2016 10:58:23 +0000 (+0100) Subject: Fix : size estimation when some samples are very large X-Git-Tag: v1.1.3^2~49 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1496c3dc47fc0cf849fb82d7e6b13a1c19fdd34a;p=thirdparty%2Fzstd.git Fix : size estimation when some samples are very large --- diff --git a/lib/dictBuilder/zdict.c b/lib/dictBuilder/zdict.c index b539c09fa..921e37886 100644 --- a/lib/dictBuilder/zdict.c +++ b/lib/dictBuilder/zdict.c @@ -569,7 +569,7 @@ static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params, if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(1, "warning : ZSTD_copyCCtx failed \n"); return; } } cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_ABSOLUTEMAX, src, srcSize); - if (ZSTD_isError(cSize)) { DISPLAYLEVEL(1, "warning : could not compress sample size %u \n", (U32)srcSize); return; } + if (ZSTD_isError(cSize)) { DISPLAYLEVEL(3, "warning : could not compress sample size %u \n", (U32)srcSize); return; } if (cSize) { /* if == 0; block is not compressible */ const seqStore_t* seqStorePtr = ZSTD_getSeqStore(esr.zc); diff --git a/programs/dibio.c b/programs/dibio.c index 6fce405b9..fa4241b14 100644 --- a/programs/dibio.c +++ b/programs/dibio.c @@ -31,7 +31,8 @@ #define MB *(1 <<20) #define GB *(1U<<30) -#define MEMMULT 11 +#define SAMPLESIZE_MAX (128 KB) +#define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */ static const size_t maxMemory = (sizeof(size_t) == 4) ? 
(2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t)); #define NOISELENGTH 32 @@ -97,7 +98,7 @@ static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr, for (n=0; n<nbFiles; n++) { const char* const fileName = fileNamesTable[n]; unsigned long long const fs64 = UTIL_getFileSize(fileName); - size_t const fileSize = (size_t) MIN(fs64, 128 KB); + size_t const fileSize = (size_t) MIN(fs64, SAMPLESIZE_MAX); if (fileSize > *bufferSizePtr-pos) break; { FILE* const f = fopen(fileName, "rb"); if (f==NULL) EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileName, strerror(errno)); @@ -163,6 +164,21 @@ static void DiB_saveDict(const char* dictFileName, } +static int g_tooLargeSamples = 0; +static U64 DiB_getTotalCappedFileSize(const char** fileNamesTable, unsigned nbFiles) +{ + U64 total = 0; + unsigned n; + for (n=0; n<nbFiles; n++) { + U64 const fileSize = UTIL_getFileSize(fileNamesTable[n]); + U64 const cappedFileSize = MIN(fileSize, SAMPLESIZE_MAX); + total += cappedFileSize; + g_tooLargeSamples |= (fileSize > 2*SAMPLESIZE_MAX); + } + return total; +} + + /*! ZDICT_trainFromBuffer_unsafe() : Strictly Internal use only !! Same as ZDICT_trainFromBuffer_advanced(), but does not control `samplesBuffer`. @@ -181,7 +197,7 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, { void* const dictBuffer = malloc(maxDictSize); size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t)); - unsigned long long const totalSizeToLoad = UTIL_getTotalFileSize(fileNamesTable, nbFiles); + unsigned long long const totalSizeToLoad = DiB_getTotalCappedFileSize(fileNamesTable, nbFiles); size_t const maxMem = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT; size_t benchedSize = (size_t) MIN ((unsigned long long)maxMem, totalSizeToLoad); void* const srcBuffer = malloc(benchedSize+NOISELENGTH); @@ -190,7 +206,12 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize, /* Checks */ if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */ g_displayLevel = params.notificationLevel; - if (nbFiles < 5) { + if (g_tooLargeSamples) { + DISPLAYLEVEL(2, "! Warning : some samples are very large \n"); + DISPLAYLEVEL(2, "! Note that dictionary is only useful for small files or beginning of large files. \n"); + DISPLAYLEVEL(2, "! 
As a consequence, only the first %u bytes of each file are loaded \n", SAMPLESIZE_MAX); + } + if ((nbFiles < 5) || (totalSizeToLoad < 9 * (unsigned long long)maxDictSize)) { DISPLAYLEVEL(2, "! Warning : nb of samples too low for proper processing ! \n"); DISPLAYLEVEL(2, "! Please provide _one file per sample_. \n"); DISPLAYLEVEL(2, "! Do not concatenate samples together into a single file, \n");