From: Yann Collet Date: Fri, 29 Jan 2016 01:45:26 +0000 (+0100) Subject: added fast sampling mode X-Git-Tag: v0.5.0~1^2~3^2~19 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f5229e0cd8a9b7f1121c3c848e301b0354ce3dad;p=thirdparty%2Fzstd.git added fast sampling mode --- diff --git a/dictBuilder/dibcli.c b/dictBuilder/dibcli.c index 434053482..121620943 100644 --- a/dictBuilder/dibcli.c +++ b/dictBuilder/dibcli.c @@ -20,43 +20,19 @@ You can contact the author at : - zstd source repository : https://github.com/Cyan4973/zstd - - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c */ - -/************************************** -* Compiler Options -**************************************/ -#define _CRT_SECURE_NO_WARNINGS /* Visual : removes warning from strcpy */ -#define _POSIX_SOURCE 1 /* triggers fileno() within on unix */ - - -/************************************** +/*-************************************ * Includes **************************************/ -#include /* fprintf, getchar */ #include /* exit, calloc, free */ #include /* strcmp, strlen */ +#include /* fprintf, getchar */ #include "dictBuilder.h" -/************************************** -* OS-specific Includes -**************************************/ -#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__) -# include /* _O_BINARY */ -# include /* _setmode, _isatty */ -# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY) -# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream)) -#else -# include /* isatty */ -# define SET_BINARY_MODE(file) -# define IS_CONSOLE(stdStream) isatty(fileno(stdStream)) -#endif - - -/************************************** +/*-************************************ * Constants **************************************/ #define PROGRAM_DESCRIPTION "Dictionary builder" @@ -72,21 +48,22 @@ #define MB *(1 <<20) #define GB *(1U<<30) +static const unsigned compressionLevelDefault = 5; static const 
unsigned selectionLevelDefault = 9; /* determined experimentally */ static const unsigned maxDictSizeDefault = 110 KB; static const char* dictFileNameDefault = "dictionary"; -/************************************** +/*-************************************ * Display Macros **************************************/ -#define DISPLAY(...) fprintf(displayOut, __VA_ARGS__) -#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } -static FILE* displayOut; -static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information +#define DISPLAY(...) fprintf(g_displayOut, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); } +static FILE* g_displayOut; +static unsigned g_displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information -/************************************** +/*-************************************ * Exceptions **************************************/ #define DEBUG 0 @@ -101,7 +78,7 @@ static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + re } -/************************************** +/*-************************************ * Command Line **************************************/ static int usage(const char* programName) @@ -110,8 +87,8 @@ static int usage(const char* programName) DISPLAY( " %s [arg] [filenames]\n", programName); DISPLAY( "\n"); DISPLAY( "Arguments :\n"); - DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault); DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault); + DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault); DISPLAY( " -h/-H : display help/long help and exit\n"); return 0; } @@ -122,8 +99,10 @@ static int usage_advanced(const char* programName) usage(programName); DISPLAY( "\n"); DISPLAY( "Advanced 
arguments :\n"); - DISPLAY( " -# : selection level # (default :%u)\n", selectionLevelDefault); DISPLAY( " -V : display Version number and exit\n"); + DISPLAY( "--fast : fast sampling mode\n"); + DISPLAY( " -L# : target compression level (default: %u)\n", compressionLevelDefault); + DISPLAY( " -S# : dictionary selectivity level # (default: %u)\n", selectionLevelDefault); DISPLAY( " -v : verbose mode\n"); DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n"); return 0; @@ -132,7 +111,7 @@ static int usage_advanced(const char* programName) static int badusage(const char* programName) { DISPLAYLEVEL(1, "Incorrect parameters\n"); - if (displayLevel >= 1) usage(programName); + if (g_displayLevel >= 1) usage(programName); return 1; } @@ -153,6 +132,7 @@ int main(int argCount, const char** argv) operationResult=0, nextArgumentIsMaxDict=0, nextArgumentIsDictFileName=0; + unsigned cLevel = compressionLevelDefault; unsigned maxDictSize = maxDictSizeDefault; unsigned selectionLevel = selectionLevelDefault; const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */ @@ -161,7 +141,7 @@ int main(int argCount, const char** argv) const char* dictFileName = dictFileNameDefault; /* init */ - displayOut = stderr; /* unfortunately, cannot be set at declaration */ + g_displayOut = stderr; /* unfortunately, cannot be set at declaration */ if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n"); /* Pick out program name from path. 
Don't rely on stdlib because of conflicting behavior */ for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } } @@ -190,40 +170,44 @@ int main(int argCount, const char** argv) } /* long commands (--long-word) */ - if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; } - if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); } - if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; } - if (!strcmp(argument, "--quiet")) { displayLevel--; continue; } + if (!strcmp(argument, "--version")) { g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; } + if (!strcmp(argument, "--help")) { g_displayOut=stdout; return usage_advanced(programName); } + if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; } + if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; } if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; } + if (!strcmp(argument, "--fast")) { selectionLevel=0; cLevel=1; continue; } /* Decode commands (note : aggregated commands are allowed) */ if (argument[0]=='-') { argument++; while (argument[0]!=0) { - /* selection Level */ - if ((*argument>='0') && (*argument<='9')) { - selectionLevel = 0; - while ((*argument >= '0') && (*argument <= '9')) { - selectionLevel *= 10; - selectionLevel += *argument - '0'; - argument++; - } - continue; - } - switch(argument[0]) { /* Display help */ - case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */ + case 'V': g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */ case 'H': - case 'h': displayOut=stdout; return usage_advanced(programName); + case 'h': g_displayOut=stdout; return usage_advanced(programName); + + /* Selection level */ + case 'S': argument++; + selectionLevel = 0; + while ((*argument >= '0') && (*argument <= '9')) + selectionLevel *= 
10, selectionLevel += *argument++ - '0'; + break; + + /* Compression level */ + case 'L': argument++; + cLevel = 0; + while ((*argument >= '0') && (*argument <= '9')) + cLevel *= 10, cLevel += *argument++ - '0'; + break; /* Verbose mode */ - case 'v': displayLevel++; if (displayLevel<3) displayLevel=3; argument++; break; + case 'v': g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; argument++; break; /* Quiet mode */ - case 'q': displayLevel--; argument++; break; + case 'q': g_displayLevel--; argument++; break; /* dictionary name */ case 'o': nextArgumentIsDictFileName=1; argument++; break; @@ -247,8 +231,8 @@ int main(int argCount, const char** argv) if (filenameIdx==0) return badusage(programName); /* building ... */ - DiB_setNotificationLevel(displayLevel); - operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, filenameTable, filenameIdx); + DiB_setNotificationLevel(g_displayLevel); + operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, cLevel, filenameTable, filenameIdx); if (main_pause) waitEnter(); free((void*)filenameTable); diff --git a/dictBuilder/dictBuilder.c b/dictBuilder/dictBuilder.c index 8bb100573..4abae9fd4 100644 --- a/dictBuilder/dictBuilder.c +++ b/dictBuilder/dictBuilder.c @@ -51,6 +51,7 @@ #include /* clock */ #include "mem.h" /* read */ +#include "error_private.h" #include "divsufsort.h" #include "dictBuilder.h" #include "zstd_compress.c" @@ -85,6 +86,7 @@ static const size_t maxMemory = (sizeof(size_t)==4) ? 
(2 GB - 64 MB) : (size_t #define PRIME2 2246822519U #define MINRATIO 4 +static const U32 g_compressionLevel_default = 5; /*-************************************* @@ -714,6 +716,7 @@ static void DiB_countEStats(EStats_ress_t esr, #define OFFCODE_MAX 18 static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize, + unsigned compressionLevel, const void* srcBuffer, size_t* fileSizes, unsigned nbFiles, const void* dictBuffer, size_t dictBufferSize) { @@ -740,7 +743,8 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize, esr.zc = ZSTD_createCCtx(); esr.workPlace = malloc(BLOCKSIZE); if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory"); - params = ZSTD_getParams(5, dictBufferSize + 15 KB); + if (compressionLevel==0) compressionLevel=g_compressionLevel_default; + params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB); params.strategy = ZSTD_greedy; ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params); @@ -827,8 +831,49 @@ static void DiB_saveDict(const char* dictFileName, } -int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned shiftRatio, - const char** fileNamesTable, unsigned nbFiles) +#define DIB_FASTSEGMENTSIZE 64 +/*! DiB_fastSampling (based on an idea by Giuseppe Ottaviano) + Fill @dictBuffer with stripes of size DIB_FASTSEGMENTSIZE from @samplesBuffer + up to @dictSize. + Filling starts from the end of @dictBuffer, down to maximum possible. + if @dictSize is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at beginning of @dictBuffer won't be used. 
+ @return : amount of data written into @dictBuffer + or an error Code (if @dictSize or @samplesSize too small) +*/ +static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize, + const void* samplesBuffer, size_t samplesSize) +{ + char* dstPtr = (char*)dictBuffer + dictSize; + const char* srcPtr = (const char*)samplesBuffer; + size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE; + size_t segNb, interSize; + + if (nbSegments <= 2) return ERROR(srcSize_wrong); + if (samplesSize < dictSize) return ERROR(srcSize_wrong); + + /* first and last segments are part of dictionary, in case they contain interesting header/footer */ + dstPtr -= DIB_FASTSEGMENTSIZE; + memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE); + dstPtr -= DIB_FASTSEGMENTSIZE; + memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE); + + /* regularly copy a segment */ + interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1); + srcPtr += DIB_FASTSEGMENTSIZE; + for (segNb=2; segNb < nbSegments; segNb++) { + srcPtr += interSize; + dstPtr -= DIB_FASTSEGMENTSIZE; + memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE); + srcPtr += DIB_FASTSEGMENTSIZE; + } + + return nbSegments * DIB_FASTSEGMENTSIZE; +} + + +int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, + unsigned shiftRatio, unsigned compressionLevel, + const char** fileNamesTable, unsigned nbFiles) { void* srcBuffer; size_t benchedSize; @@ -852,42 +897,44 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned /* Load input buffer */ DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles); - DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* for end of buffer condition */ - - /* Train */ - snprintf (mfName, sizeof(mfName), " %u files", nbFiles); - if (nbFiles > 1) displayName = mfName; - else displayName = fileNamesTable[0]; - - DiB_trainBuffer(dictList, dictListSize, - srcBuffer, benchedSize, - displayName, - fileSizes, nbFiles, maxDictSize, - 
shiftRatio); - - /* display best matches */ - if (g_displayLevel>= 3) { - const U32 nb = 25; - U32 u; - U32 dictContentSize = DiB_dictSize(dictList); - DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); - DISPLAYLEVEL(3, "list %u best segments \n", nb); - for (u=1; u<=nb; u++) { - U32 p = dictList[u].pos; - U32 l = dictList[u].length; - U32 d = MIN(40, l); - DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", - u, l, p, dictList[u].savings); - DiB_printHex(3, (char*)srcBuffer+p, d); - DISPLAYLEVEL(3, "| \n"); - } } + DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */ + + if (shiftRatio>0) + { + /* analyze samples */ + snprintf (mfName, sizeof(mfName), " %u files", nbFiles); + if (nbFiles > 1) displayName = mfName; + else displayName = fileNamesTable[0]; + + DiB_trainBuffer(dictList, dictListSize, + srcBuffer, benchedSize, + displayName, + fileSizes, nbFiles, maxDictSize, + shiftRatio); + + /* display best matches */ + if (g_displayLevel>= 3) { + const U32 nb = 25; + U32 u; + U32 dictContentSize = DiB_dictSize(dictList); + DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize); + DISPLAYLEVEL(3, "list %u best segments \n", nb); + for (u=1; u<=nb; u++) { + U32 p = dictList[u].pos; + U32 l = dictList[u].length; + U32 d = MIN(40, l); + DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |", + u, l, p, dictList[u].savings); + DiB_printHex(3, (char*)srcBuffer+p, d); + DISPLAYLEVEL(3, "| \n"); + } } } /* create dictionary */ { void* dictContent; U32 dictContentSize = DiB_dictSize(dictList); void* dictHeader; - size_t dictHeaderSize, hSize; + size_t dictHeaderSize, hSize, addedContentLength; BYTE* ptr; U32 u; @@ -895,18 +942,27 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned #define EBSIZE (2 KB) dictHeaderSize = EBSIZE; dictHeader = malloc(dictHeaderSize); - dictContent = 
malloc(dictContentSize); + dictContent = malloc(maxDictSize); if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory"); /* build dict content */ - ptr = (BYTE*)dictContent + dictContentSize; - + ptr = (BYTE*)dictContent + maxDictSize; for (u=1; upos; u++) { U32 l = dictList[u].length; ptr -= l; memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l); } + /* fast dict content mode */ + if (shiftRatio==0) { + addedContentLength = ptr-(BYTE*)dictContent; + DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */ + DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10)); + addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize); + if (!ERR_isError(addedContentLength)) + ptr -= addedContentLength, dictContentSize += addedContentLength; + } + /* dictionary header */ MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC); hSize = 4; @@ -915,14 +971,15 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned /* entropic tables */ DISPLAYLEVEL(2, "statistics ... \n"); hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize, - srcBuffer, fileSizes, nbFiles, - dictContent, dictContentSize); + compressionLevel, + srcBuffer, fileSizes, nbFiles, + ptr, dictContentSize); /* save dict */ { size_t dictSize = hSize + dictContentSize; DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName); - DiB_saveDict(dictFileName, dictHeader, hSize, dictContent, dictContentSize); + DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize); //DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only } /* clean */ diff --git a/dictBuilder/dictBuilder.h b/dictBuilder/dictBuilder.h index 74fa2b0c0..3c52c7daa 100644 --- a/dictBuilder/dictBuilder.h +++ b/dictBuilder/dictBuilder.h @@ -24,7 +24,7 @@ */ /* This library is designed for a single-threaded console application. 
-* It abruptly exits (exit() function) when it encounters an error condition. */ +* It calls exit() and prints to stderr when it encounters an error condition. */ /*-************************************* * Version @@ -37,14 +37,17 @@ unsigned DiB_versionNumber (void); /*-************************************* -* Main functions +* Public functions ***************************************/ /*! DiB_trainDictionary Train a dictionary from a set of files provided by @fileNamesTable - Resulting dictionary is written in file @dictFileName - @result : 0 if fine + Resulting dictionary is written in file @dictFileName. + @selectivityLevel changes the criteria for insertion into the dictionary (more => bigger selection => larger dictionary) + @compressionLevel can be used to target a specific compression level of zstd. 0 means "default". + @result : 0 == ok */ -int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned selectivityLevel, +int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, + unsigned selectivityLevel, unsigned compressionLevel, const char** fileNamesTable, unsigned nbFiles);