From a90779397ad6372429188253eb00ed4c39f34912 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Wed, 26 Aug 2020 17:52:54 -0700 Subject: [PATCH] [lib] Reduce zstd stack usage by 1KB --- lib/common/fse.h | 3 ++- lib/compress/fse_compress.c | 8 +++++--- lib/compress/huf_compress.c | 2 +- lib/compress/zstd_compress.c | 16 ++++++++++------ lib/compress/zstd_compress_internal.h | 5 ++++- lib/compress/zstd_compress_sequences.c | 1 + lib/compress/zstd_compress_superblock.c | 4 ++-- tests/decodecorpus.c | 6 +++--- 8 files changed, 28 insertions(+), 17 deletions(-) diff --git a/lib/common/fse.h b/lib/common/fse.h index 0abf8550f..83a07847a 100644 --- a/lib/common/fse.h +++ b/lib/common/fse.h @@ -335,8 +335,9 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); /* FSE_buildCTable_wksp() : * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`). - * `wkspSize` must be >= `(1<= `FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog)`. */ +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(unsigned) * (maxSymbolValue + 2) + (1ull << tableLog)) size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); #define FSE_BUILD_DTABLE_WKSP_SIZE(maxTableLog, maxSymbolValue) (sizeof(short) * (maxSymbolValue + 1) + (1ULL << maxTableLog) + 8) diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c index 21fabad9e..a2622c5e3 100644 --- a/lib/compress/fse_compress.c +++ b/lib/compress/fse_compress.c @@ -74,13 +74,15 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ; FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); U32 const step = FSE_TABLESTEP(tableSize); - U32 cumul[FSE_MAX_SYMBOL_VALUE+2]; - FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace; + U32* cumul = (U32*)workSpace; + FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSymbolValue + 2)); + U32 highThreshold = tableSize-1; + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte aligned */ + if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge); /* CTable header */ - if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge); tableU16[-2] = (U16) tableLog; tableU16[-1] = (U16) maxSymbolValue; assert(tableLog < 16); /* required for threshold strategy to work */ diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index 25d7fbfb8..17e242e40 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -69,7 +69,7 @@ static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weight U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER; FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)]; - BYTE scratchBuffer[1<staticSize = workspaceSize; /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */ - if (!ZSTD_cwksp_check_available(&cctx->workspace, HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + if (!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); - cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, HUF_WORKSPACE_SIZE); + cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE); cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); return cctx; } @@ -1139,7 +1139,7 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); - size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE); + size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, /* forCCtx */ 1); @@ -1502,7 +1502,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); - zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, HUF_WORKSPACE_SIZE); + zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE); RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); } } @@ -1976,7 +1976,7 @@ ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, { const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; ZSTD_strategy const strategy = cctxParams->cParams.strategy; - unsigned count[MaxSeq+1]; + unsigned* count = (unsigned*)entropyWorkspace; FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; @@ -1992,8 +1992,12 @@ ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, BYTE* seqHead; BYTE* lastNCount = NULL; + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + DEBUGLOG(5, "ZSTD_compressSequences_internal (nbSeq=%zu)", nbSeq); ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); /* Compress literals */ { const BYTE* const literals = seqStorePtr->litStart; @@ -2448,7 +2452,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, &zc->appliedParams, dst, dstCapacity, srcSize, - zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, zc->bmi2); if (frame && diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 92ee3f31c..4760f6b08 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -232,6 +232,9 @@ struct ZSTD_CCtx_params_s { ZSTD_customMem customMem; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ +#define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +#define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE) + struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ @@ -258,7 +261,7 @@ struct ZSTD_CCtx_s { size_t maxNbLdmSequences; rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ ZSTD_blockState_t blockState; - U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ /* streaming */ char* inBuff; diff --git a/lib/compress/zstd_compress_sequences.c b/lib/compress/zstd_compress_sequences.c index 2a450e537..be30c08c6 100644 --- a/lib/compress/zstd_compress_sequences.c +++ b/lib/compress/zstd_compress_sequences.c @@ -266,6 +266,7 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity, nbSeq_1--; } assert(nbSeq_1 > 1); + assert(entropyWorkspaceSize >= FSE_BUILD_CTABLE_WORKSPACE_SIZE(MaxSeq, MaxFSELog)); FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), ""); { size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); diff --git a/lib/compress/zstd_compress_superblock.c b/lib/compress/zstd_compress_superblock.c index 25da3d6ea..144c120d2 100644 --- a/lib/compress/zstd_compress_superblock.c +++ b/lib/compress/zstd_compress_superblock.c @@ -835,7 +835,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, &entropyMetadata, - zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); return ZSTD_compressSubBlock_multi(&zc->seqStore, zc->blockState.prevCBlock, @@ -845,5 +845,5 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, dst, dstCapacity, src, srcSize, zc->bmi2, lastBlock, - zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */); + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */); } diff --git a/tests/decodecorpus.c b/tests/decodecorpus.c index 76a789891..96d562be7 100644 --- a/tests/decodecorpus.c +++ b/tests/decodecorpus.c @@ -815,7 +815,7 @@ static size_t writeSequences(U32* seed, frame_t* frame, seqStore_t* seqStorePtr, BYTE* const oend = (BYTE*)frame->dataEnd; BYTE* op = (BYTE*)frame->data; BYTE* seqHead; - BYTE scratchBuffer[1<