You can contact the author at :
- zstd source repository : https://github.com/Cyan4973/zstd
- - ztsd public forum : https://groups.google.com/forum/#!forum/lz4c
*/
-
-/**************************************
-* Compiler Options
-**************************************/
-#define _CRT_SECURE_NO_WARNINGS /* Visual : removes warning from strcpy */
-#define _POSIX_SOURCE 1 /* triggers fileno() within <stdio.h> on unix */
-
-
-/**************************************
+/*-************************************
* Includes
**************************************/
-#include <stdio.h> /* fprintf, getchar */
#include <stdlib.h> /* exit, calloc, free */
#include <string.h> /* strcmp, strlen */
+#include <stdio.h> /* fprintf, getchar */
#include "dictBuilder.h"
-/**************************************
-* OS-specific Includes
-**************************************/
-#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(_WIN32) || defined(__CYGWIN__)
-# include <fcntl.h> /* _O_BINARY */
-# include <io.h> /* _setmode, _isatty */
-# define SET_BINARY_MODE(file) _setmode(_fileno(file), _O_BINARY)
-# define IS_CONSOLE(stdStream) _isatty(_fileno(stdStream))
-#else
-# include <unistd.h> /* isatty */
-# define SET_BINARY_MODE(file)
-# define IS_CONSOLE(stdStream) isatty(fileno(stdStream))
-#endif
-
-
-/**************************************
+/*-************************************
* Constants
**************************************/
#define PROGRAM_DESCRIPTION "Dictionary builder"
#define MB *(1 <<20) /* postfix size multiplier : `4 MB` expands to `4 *(1 <<20)` */
#define GB *(1U<<30) /* same idea, with an unsigned base value */
+static const unsigned compressionLevelDefault = 5; /* initial value of cLevel; overridden by -L# or --fast */
static const unsigned selectionLevelDefault = 9; /* determined experimentally */
static const unsigned maxDictSizeDefault = 110 KB; /* KB : presumably a postfix multiplier like MB, defined outside this excerpt - TODO confirm */
static const char* dictFileNameDefault = "dictionary"; /* output file name used when -o is not provided */
-/**************************************
+/*-************************************
* Display Macros
**************************************/
-#define DISPLAY(...) fprintf(displayOut, __VA_ARGS__)
-#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
-static FILE* displayOut;
-static unsigned displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
+#define DISPLAY(...) fprintf(g_displayOut, __VA_ARGS__)
+#define DISPLAYLEVEL(l, ...) if (g_displayLevel>=l) { DISPLAY(__VA_ARGS__); }
+static FILE* g_displayOut;
+static unsigned g_displayLevel = 2; // 0 : no display // 1: errors // 2 : + result + interaction + warnings ; // 3 : + progression; // 4 : + information
-/**************************************
+/*-************************************
* Exceptions
**************************************/
#define DEBUG 0
}
-/**************************************
+/*-************************************
* Command Line
**************************************/
static int usage(const char* programName)
DISPLAY( " %s [arg] [filenames]\n", programName);
DISPLAY( "\n");
DISPLAY( "Arguments :\n");
- DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
DISPLAY( " -o : name of dictionary file (default: %s) \n", dictFileNameDefault);
+ DISPLAY( "--maxdict : limit dictionary to specified size (default : %u) \n", maxDictSizeDefault);
DISPLAY( " -h/-H : display help/long help and exit\n");
return 0;
}
usage(programName);
DISPLAY( "\n");
DISPLAY( "Advanced arguments :\n");
- DISPLAY( " -# : selection level # (default :%u)\n", selectionLevelDefault);
DISPLAY( " -V : display Version number and exit\n");
+ DISPLAY( "--fast : fast sampling mode\n");
+ DISPLAY( " -L# : target compression level (default: %u)\n", compressionLevelDefault);
+ DISPLAY( " -S# : dictionary selectivity level # (default: %u)\n", selectionLevelDefault);
DISPLAY( " -v : verbose mode\n");
DISPLAY( " -q : suppress warnings; specify twice to suppress errors too\n");
return 0;
/*! badusage() :
    Reports a command-line error : prints "Incorrect parameters",
    then the short usage text, both only when the display level is >= 1.
    @return : always 1, so that the caller can directly return it as a failure status. */
static int badusage(const char* programName)
{
DISPLAYLEVEL(1, "Incorrect parameters\n");
- if (displayLevel >= 1) usage(programName);
+ if (g_displayLevel >= 1) usage(programName);
return 1;
}
operationResult=0,
nextArgumentIsMaxDict=0,
nextArgumentIsDictFileName=0;
+ unsigned cLevel = compressionLevelDefault;
unsigned maxDictSize = maxDictSizeDefault;
unsigned selectionLevel = selectionLevelDefault;
const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); /* argCount >= 1 */
const char* dictFileName = dictFileNameDefault;
/* init */
- displayOut = stderr; /* unfortunately, cannot be set at declaration */
+ g_displayOut = stderr; /* unfortunately, cannot be set at declaration */
if (filenameTable==NULL) EXM_THROW(1, "not enough memory\n");
/* Pick out program name from path. Don't rely on stdlib because of conflicting behavior */
for (i = (int)strlen(programName); i > 0; i--) { if ((programName[i] == '/') || (programName[i] == '\\')) { i++; break; } }
}
/* long commands (--long-word) */
- if (!strcmp(argument, "--version")) { displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
- if (!strcmp(argument, "--help")) { displayOut=stdout; return usage_advanced(programName); }
- if (!strcmp(argument, "--verbose")) { displayLevel=4; continue; }
- if (!strcmp(argument, "--quiet")) { displayLevel--; continue; }
+ if (!strcmp(argument, "--version")) { g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; }
+ if (!strcmp(argument, "--help")) { g_displayOut=stdout; return usage_advanced(programName); }
+ if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; }
+ if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
+ if (!strcmp(argument, "--fast")) { selectionLevel=0; cLevel=1; continue; }
/* Decode commands (note : aggregated commands are allowed) */
if (argument[0]=='-') {
argument++;
while (argument[0]!=0) {
- /* selection Level */
- if ((*argument>='0') && (*argument<='9')) {
- selectionLevel = 0;
- while ((*argument >= '0') && (*argument <= '9')) {
- selectionLevel *= 10;
- selectionLevel += *argument - '0';
- argument++;
- }
- continue;
- }
-
switch(argument[0])
{
/* Display help */
- case 'V': displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
+ case 'V': g_displayOut=stdout; DISPLAY(WELCOME_MESSAGE); return 0; /* Version Only */
case 'H':
- case 'h': displayOut=stdout; return usage_advanced(programName);
+ case 'h': g_displayOut=stdout; return usage_advanced(programName);
+
+ /* Selection level */
+ case 'S': argument++;
+ selectionLevel = 0;
+ while ((*argument >= '0') && (*argument <= '9'))
+ selectionLevel *= 10, selectionLevel += *argument++ - '0';
+ break;
+
+ /* Compression level */
+ case 'L': argument++;
+ cLevel = 0;
+ while ((*argument >= '0') && (*argument <= '9'))
+ cLevel *= 10, cLevel += *argument++ - '0';
+ break;
/* Verbose mode */
- case 'v': displayLevel++; if (displayLevel<3) displayLevel=3; argument++; break;
+ case 'v': g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; argument++; break;
/* Quiet mode */
- case 'q': displayLevel--; argument++; break;
+ case 'q': g_displayLevel--; argument++; break;
/* dictionary name */
case 'o': nextArgumentIsDictFileName=1; argument++; break;
if (filenameIdx==0) return badusage(programName);
/* building ... */
- DiB_setNotificationLevel(displayLevel);
- operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, filenameTable, filenameIdx);
+ DiB_setNotificationLevel(g_displayLevel);
+ operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, cLevel, filenameTable, filenameIdx);
if (main_pause) waitEnter();
free((void*)filenameTable);
#include <time.h> /* clock */
#include "mem.h" /* read */
+#include "error_private.h"
#include "divsufsort.h"
#include "dictBuilder.h"
#include "zstd_compress.c"
#define PRIME2 2246822519U
#define MINRATIO 4
+static const U32 g_compressionLevel_default = 5;
/*-*************************************
#define OFFCODE_MAX 18
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
+ unsigned compressionLevel,
const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
const void* dictBuffer, size_t dictBufferSize)
{
esr.zc = ZSTD_createCCtx();
esr.workPlace = malloc(BLOCKSIZE);
if (!esr.ref || !esr.zc || !esr.workPlace) EXM_THROW(30, "Not enough memory");
- params = ZSTD_getParams(5, dictBufferSize + 15 KB);
+ if (compressionLevel==0) compressionLevel=g_compressionLevel_default;
+ params = ZSTD_getParams(compressionLevel, dictBufferSize + 15 KB);
params.strategy = ZSTD_greedy;
ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params);
}
-int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize, unsigned shiftRatio,
- const char** fileNamesTable, unsigned nbFiles)
+#define DIB_FASTSEGMENTSIZE 64
+/*! DiB_fastSampling (based on an idea by Giuseppe Ottaviano)
+ Fill @dictBuffer with stripes of size DIB_FASTSEGMENTSIZE taken from @samplesBuffer,
+ up to @dictSize.
+ Filling starts from the end of @dictBuffer, and proceeds downwards, one segment at a time.
+ If @dictSize is not a multiple of DIB_FASTSEGMENTSIZE, some bytes at the beginning of @dictBuffer won't be used.
+ The first and the last segment of @samplesBuffer are always selected;
+ the remaining segments are picked at regular intervals in between.
+ @return : amount of data written into @dictBuffer (a multiple of DIB_FASTSEGMENTSIZE),
+ or an error code, when @dictSize holds 2 segments or fewer, or when @samplesSize < @dictSize.
+*/
+static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
+ const void* samplesBuffer, size_t samplesSize)
+{
+ char* dstPtr = (char*)dictBuffer + dictSize; /* write cursor : starts at end of destination, moves backwards */
+ const char* srcPtr = (const char*)samplesBuffer; /* read cursor : moves forward through the samples */
+ size_t nbSegments = dictSize / DIB_FASTSEGMENTSIZE; /* whole segments only : any remainder of dictSize is left unused */
+ size_t segNb, interSize;
+
+ if (nbSegments <= 2) return ERROR(srcSize_wrong); /* need more than the 2 mandatory (first + last) segments */
+ if (samplesSize < dictSize) return ERROR(srcSize_wrong); /* not enough sample data to fill the requested size */
+
+ /* first and last segments are part of dictionary, in case they contain interesting header/footer */
+ dstPtr -= DIB_FASTSEGMENTSIZE;
+ memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
+ dstPtr -= DIB_FASTSEGMENTSIZE;
+ memcpy(dstPtr, srcPtr+samplesSize-DIB_FASTSEGMENTSIZE, DIB_FASTSEGMENTSIZE);
+
+ /* regularly copy a segment */
+ interSize = (samplesSize - nbSegments*DIB_FASTSEGMENTSIZE) / (nbSegments-1); /* nb of sample bytes skipped before each remaining segment */
+ srcPtr += DIB_FASTSEGMENTSIZE; /* step over the first segment, already copied */
+ for (segNb=2; segNb < nbSegments; segNb++) { /* segments 0 and 1 were handled above */
+ srcPtr += interSize;
+ dstPtr -= DIB_FASTSEGMENTSIZE;
+ memcpy(dstPtr, srcPtr, DIB_FASTSEGMENTSIZE);
+ srcPtr += DIB_FASTSEGMENTSIZE;
+ }
+
+ return nbSegments * DIB_FASTSEGMENTSIZE;
+}
+
+
+int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
+ unsigned shiftRatio, unsigned compressionLevel,
+ const char** fileNamesTable, unsigned nbFiles)
{
void* srcBuffer;
size_t benchedSize;
/* Load input buffer */
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
- DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* for end of buffer condition */
-
- /* Train */
- snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
- if (nbFiles > 1) displayName = mfName;
- else displayName = fileNamesTable[0];
-
- DiB_trainBuffer(dictList, dictListSize,
- srcBuffer, benchedSize,
- displayName,
- fileSizes, nbFiles, maxDictSize,
- shiftRatio);
-
- /* display best matches */
- if (g_displayLevel>= 3) {
- const U32 nb = 25;
- U32 u;
- U32 dictContentSize = DiB_dictSize(dictList);
- DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
- DISPLAYLEVEL(3, "list %u best segments \n", nb);
- for (u=1; u<=nb; u++) {
- U32 p = dictList[u].pos;
- U32 l = dictList[u].length;
- U32 d = MIN(40, l);
- DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
- u, l, p, dictList[u].savings);
- DiB_printHex(3, (char*)srcBuffer+p, d);
- DISPLAYLEVEL(3, "| \n");
- } }
+ DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
+
+ if (shiftRatio>0)
+ {
+ /* analyze samples */
+ snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
+ if (nbFiles > 1) displayName = mfName;
+ else displayName = fileNamesTable[0];
+
+ DiB_trainBuffer(dictList, dictListSize,
+ srcBuffer, benchedSize,
+ displayName,
+ fileSizes, nbFiles, maxDictSize,
+ shiftRatio);
+
+ /* display best matches */
+ if (g_displayLevel>= 3) {
+ const U32 nb = 25;
+ U32 u;
+ U32 dictContentSize = DiB_dictSize(dictList);
+ DISPLAYLEVEL(3, "\n %u segments found, of total size %u \n", dictList[0].pos, dictContentSize);
+ DISPLAYLEVEL(3, "list %u best segments \n", nb);
+ for (u=1; u<=nb; u++) {
+ U32 p = dictList[u].pos;
+ U32 l = dictList[u].length;
+ U32 d = MIN(40, l);
+ DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
+ u, l, p, dictList[u].savings);
+ DiB_printHex(3, (char*)srcBuffer+p, d);
+ DISPLAYLEVEL(3, "| \n");
+ } } }
/* create dictionary */
{
void* dictContent;
U32 dictContentSize = DiB_dictSize(dictList);
void* dictHeader;
- size_t dictHeaderSize, hSize;
+ size_t dictHeaderSize, hSize, addedContentLength;
BYTE* ptr;
U32 u;
#define EBSIZE (2 KB)
dictHeaderSize = EBSIZE;
dictHeader = malloc(dictHeaderSize);
- dictContent = malloc(dictContentSize);
+ dictContent = malloc(maxDictSize);
if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
/* build dict content */
- ptr = (BYTE*)dictContent + dictContentSize;
-
+ ptr = (BYTE*)dictContent + maxDictSize;
for (u=1; u<dictList->pos; u++) {
U32 l = dictList[u].length;
ptr -= l;
memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
}
+ /* fast dict content mode */
+ if (shiftRatio==0) {
+ addedContentLength = ptr-(BYTE*)dictContent;
+ DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
+ DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10));
+ addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize);
+ if (!ERR_isError(addedContentLength))
+ ptr -= addedContentLength, dictContentSize += addedContentLength;
+ }
+
/* dictionary header */
MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
hSize = 4;
/* entropic tables */
DISPLAYLEVEL(2, "statistics ... \n");
hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
- srcBuffer, fileSizes, nbFiles,
- dictContent, dictContentSize);
+ compressionLevel,
+ srcBuffer, fileSizes, nbFiles,
+ ptr, dictContentSize);
/* save dict */
{
size_t dictSize = hSize + dictContentSize;
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
- DiB_saveDict(dictFileName, dictHeader, hSize, dictContent, dictContentSize);
+ DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize);
//DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
}
/* clean */