From: Yann Collet Date: Tue, 6 Sep 2016 07:44:59 +0000 (+0200) Subject: improved compression on small files when using same parameters X-Git-Tag: v1.1.0~61 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a7737f6a60dfdcd890fa912a197554d59981dcc4;p=thirdparty%2Fzstd.git improved compression on small files when using same parameters --- diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 47cf64150..7e30f5a05 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -225,81 +225,114 @@ size_t ZSTD_estimateCCtxSize(ZSTD_compressionParameters cParams) return sizeof(ZSTD_CCtx) + neededSpace; } + +static U32 ZSTD_equivalentParams(ZSTD_parameters param1, ZSTD_parameters param2) +{ + return (param1.cParams.hashLog == param2.cParams.hashLog) + & (param1.cParams.chainLog == param2.cParams.chainLog) + & (param1.cParams.strategy == param2.cParams.strategy); +} + +/*! ZSTD_continueCCtx() : + reuse CCtx without reset (note : requires no dictionary) */ +static size_t ZSTD_continueCCtx(ZSTD_CCtx* cctx, ZSTD_parameters params, U64 frameContentSize) +{ + U32 const end = (U32)(cctx->nextSrc - cctx->base); + cctx->params = params; + cctx->frameContentSize = frameContentSize; + cctx->lowLimit = end; + cctx->dictLimit = end; + cctx->nextToUpdate = end+1; + cctx->stage = ZSTDcs_init; + cctx->dictID = 0; + cctx->loadedDictEnd = 0; + { int i; for (i=0; irep[i] = repStartValue[i]; } + cctx->seqStore.litLengthSum = 0; /* force reset stats */ + return 0; +} + +typedef enum { ZSTDcrp_continue, ZSTDcrp_noMemset, ZSTDcrp_fullReset } ZSTD_compResetPolicy_e; + /*! ZSTD_resetCCtx_advanced() : - note : 'params' is expected to be validated */ + note : 'params' must be validated */ static size_t ZSTD_resetCCtx_advanced (ZSTD_CCtx* zc, ZSTD_parameters params, U64 frameContentSize, - U32 reset) -{ /* note : params considered validated here */ - size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << params.cParams.windowLog); - U32 const divider = (params.cParams.searchLength==3) ? 3 : 4; - size_t const maxNbSeq = blockSize / divider; - size_t const tokenSpace = blockSize + 11*maxNbSeq; - size_t const chainSize = (params.cParams.strategy == ZSTD_fast) ? 0 : (1 << params.cParams.chainLog); - size_t const hSize = ((size_t)1) << params.cParams.hashLog; - U32 const hashLog3 = (params.cParams.searchLength>3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, params.cParams.windowLog); - size_t const h3Size = ((size_t)1) << hashLog3; - size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); - void* ptr; - - /* Check if workSpace is large enough, alloc a new one if needed */ - { size_t const optSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<workSpaceSize < neededSpace) { - ZSTD_free(zc->workSpace, zc->customMem); - zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem); - if (zc->workSpace == NULL) return ERROR(memory_allocation); - zc->workSpaceSize = neededSpace; - } } + ZSTD_compResetPolicy_e crp) +{ + if (crp == ZSTDcrp_continue) /* still some issues */ + if (ZSTD_equivalentParams(params, zc->params)) + return ZSTD_continueCCtx(zc, params, frameContentSize); + + { size_t const blockSize = MIN(ZSTD_BLOCKSIZE_ABSOLUTEMAX, (size_t)1 << params.cParams.windowLog); + U32 const divider = (params.cParams.searchLength==3) ? 3 : 4; + size_t const maxNbSeq = blockSize / divider; + size_t const tokenSpace = blockSize + 11*maxNbSeq; + size_t const chainSize = (params.cParams.strategy == ZSTD_fast) ? 0 : (1 << params.cParams.chainLog); + size_t const hSize = ((size_t)1) << params.cParams.hashLog; + U32 const hashLog3 = (params.cParams.searchLength>3) ? 0 : MIN(ZSTD_HASHLOG3_MAX, params.cParams.windowLog); + size_t const h3Size = ((size_t)1) << hashLog3; + size_t const tableSpace = (chainSize + hSize + h3Size) * sizeof(U32); + void* ptr; + + /* Check if workSpace is large enough, alloc a new one if needed */ + { size_t const optSpace = ((MaxML+1) + (MaxLL+1) + (MaxOff+1) + (1<workSpaceSize < neededSpace) { + ZSTD_free(zc->workSpace, zc->customMem); + zc->workSpace = ZSTD_malloc(neededSpace, zc->customMem); + if (zc->workSpace == NULL) return ERROR(memory_allocation); + zc->workSpaceSize = neededSpace; + } } - if (reset) memset(zc->workSpace, 0, tableSpace); /* reset tables only */ - XXH64_reset(&zc->xxhState, 0); - zc->hashLog3 = hashLog3; - zc->hashTable = (U32*)(zc->workSpace); - zc->chainTable = zc->hashTable + hSize; - zc->hashTable3 = zc->chainTable + chainSize; - ptr = zc->hashTable3 + h3Size; - zc->hufTable = (HUF_CElt*)ptr; - zc->flagStaticTables = 0; - ptr = ((U32*)ptr) + 256; /* note : HUF_CElt* is incomplete type, size is simulated using U32 */ - - zc->nextToUpdate = 1; - zc->nextSrc = NULL; - zc->base = NULL; - zc->dictBase = NULL; - zc->dictLimit = 0; - zc->lowLimit = 0; - zc->params = params; - zc->blockSize = blockSize; - zc->frameContentSize = frameContentSize; - { int i; for (i=0; irep[i] = repStartValue[i]; } - - if (params.cParams.strategy == ZSTD_btopt) { - zc->seqStore.litFreq = (U32*)ptr; - zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1<seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (MaxLL+1); - zc->seqStore.offCodeFreq = zc->seqStore.matchLengthFreq + (MaxML+1); - ptr = zc->seqStore.offCodeFreq + (MaxOff+1); - zc->seqStore.matchTable = (ZSTD_match_t*)ptr; - ptr = zc->seqStore.matchTable + ZSTD_OPT_NUM+1; - zc->seqStore.priceTable = (ZSTD_optimal_t*)ptr; - ptr = zc->seqStore.priceTable + ZSTD_OPT_NUM+1; - zc->seqStore.litLengthSum = 0; - } - zc->seqStore.sequencesStart = (seqDef*)ptr; - ptr = zc->seqStore.sequencesStart + maxNbSeq; - zc->seqStore.llCode = (BYTE*) ptr; - zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq; - zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq; - zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq; + if (crp!=ZSTDcrp_noMemset) memset(zc->workSpace, 0, tableSpace); /* reset tables only */ + XXH64_reset(&zc->xxhState, 0); + zc->hashLog3 = hashLog3; + zc->hashTable = (U32*)(zc->workSpace); + zc->chainTable = zc->hashTable + hSize; + zc->hashTable3 = zc->chainTable + chainSize; + ptr = zc->hashTable3 + h3Size; + zc->hufTable = (HUF_CElt*)ptr; + zc->flagStaticTables = 0; + ptr = ((U32*)ptr) + 256; /* note : HUF_CElt* is incomplete type, size is simulated using U32 */ + + zc->nextToUpdate = 1; + zc->nextSrc = NULL; + zc->base = NULL; + zc->dictBase = NULL; + zc->dictLimit = 0; + zc->lowLimit = 0; + zc->params = params; + zc->blockSize = blockSize; + zc->frameContentSize = frameContentSize; + { int i; for (i=0; irep[i] = repStartValue[i]; } + + if (params.cParams.strategy == ZSTD_btopt) { + zc->seqStore.litFreq = (U32*)ptr; + zc->seqStore.litLengthFreq = zc->seqStore.litFreq + (1<seqStore.matchLengthFreq = zc->seqStore.litLengthFreq + (MaxLL+1); + zc->seqStore.offCodeFreq = zc->seqStore.matchLengthFreq + (MaxML+1); + ptr = zc->seqStore.offCodeFreq + (MaxOff+1); + zc->seqStore.matchTable = (ZSTD_match_t*)ptr; + ptr = zc->seqStore.matchTable + ZSTD_OPT_NUM+1; + zc->seqStore.priceTable = (ZSTD_optimal_t*)ptr; + ptr = zc->seqStore.priceTable + ZSTD_OPT_NUM+1; + zc->seqStore.litLengthSum = 0; + } + zc->seqStore.sequencesStart = (seqDef*)ptr; + ptr = zc->seqStore.sequencesStart + maxNbSeq; + zc->seqStore.llCode = (BYTE*) ptr; + zc->seqStore.mlCode = zc->seqStore.llCode + maxNbSeq; + zc->seqStore.ofCode = zc->seqStore.mlCode + maxNbSeq; + zc->seqStore.litStart = zc->seqStore.ofCode + maxNbSeq; - zc->stage = ZSTDcs_init; - zc->dictID = 0; - zc->loadedDictEnd = 0; + zc->stage = ZSTDcs_init; + zc->dictID = 0; + zc->loadedDictEnd = 0; - return 0; + return 0; + } } @@ -312,7 +345,7 @@ size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx) if (srcCCtx->stage!=ZSTDcs_init) return ERROR(stage_wrong); memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); - ZSTD_resetCCtx_advanced(dstCCtx, srcCCtx->params, srcCCtx->frameContentSize, 0); + ZSTD_resetCCtx_advanced(dstCCtx, srcCCtx->params, srcCCtx->frameContentSize, ZSTDcrp_noMemset); dstCCtx->params.fParams.contentSizeFlag = 0; /* content size different from the one set during srcCCtx init */ /* copy tables */ @@ -2529,14 +2562,15 @@ static size_t ZSTD_compress_insertDictionary(ZSTD_CCtx* zc, const void* dict, si /*! ZSTD_compressBegin_internal() : * @return : 0, or an error code */ -static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* zc, +static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, U64 pledgedSrcSize) { - size_t const resetError = ZSTD_resetCCtx_advanced(zc, params, pledgedSrcSize, 1); + ZSTD_compResetPolicy_e const crp = dictSize ? ZSTDcrp_fullReset : ZSTDcrp_continue; + size_t const resetError = ZSTD_resetCCtx_advanced(cctx, params, pledgedSrcSize, crp); if (ZSTD_isError(resetError)) return resetError; - return ZSTD_compress_insertDictionary(zc, dict, dictSize); + return ZSTD_compress_insertDictionary(cctx, dict, dictSize); } @@ -2547,8 +2581,8 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, ZSTD_parameters params, unsigned long long pledgedSrcSize) { /* compression parameters verification and optimization */ - { size_t const errorCode = ZSTD_checkCParams_advanced(params.cParams, pledgedSrcSize); - if (ZSTD_isError(errorCode)) return errorCode; } + size_t const errorCode = ZSTD_checkCParams_advanced(params.cParams, pledgedSrcSize); + if (ZSTD_isError(errorCode)) return errorCode; return ZSTD_compressBegin_internal(cctx, dict, dictSize, params, pledgedSrcSize); } diff --git a/tests/fuzzer.c b/tests/fuzzer.c index b8b7f0c93..323854b32 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -556,7 +556,7 @@ static int fuzzerTests(U32 seed, U32 nbTests, unsigned startTest, U32 const maxD /* compression tests */ { unsigned const cLevel = (FUZ_rand(&lseed) % (ZSTD_maxCLevel() - (FUZ_highbit32((U32)sampleSize)/3))) + 1; cSize = ZSTD_compressCCtx(ctx, cBuffer, cBufferSize, sampleBuffer, sampleSize, cLevel); - CHECK(ZSTD_isError(cSize), "ZSTD_compressCCtx failed"); + CHECK(ZSTD_isError(cSize), "ZSTD_compressCCtx failed : %s", ZSTD_getErrorName(cSize)); /* compression failure test : too small dest buffer */ if (cSize > 3) {