From 6a9c525903ca65aad61b50381b5781c1187143d1 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Thu, 22 Dec 2022 11:30:15 -0800
Subject: [PATCH] spec update : require minimum nb of literals for 4-streams mode

Reported by @shulib :
the specification for 4-streams mode doesn't work
when the amount of literals to compress is 5 bytes.
Extending the analysis, it also doesn't work for sizes 1 or 2.

This patch updates the specification and the implementation
to require a minimum of 6 literals to trigger or accept the 4-streams mode.

The impact is expected to be a no-op :
the 4-streams mode is never triggered for such a small quantity of literals anyway,
since it would be wasteful (it costs ~7.3 bytes more than the single-stream mode).
An informal lower limit is set at ~256 bytes,
so the technical minimum is very far from this limit.
This is just meant for completeness of the specification.
---
 doc/zstd_compression_format.md         | 24 ++++++++++++----
 lib/common/error_private.c             |  1 +
 lib/common/huf.h                       |  6 ++--
 lib/common/zstd_internal.h             |  1 +
 lib/compress/zstd_compress_literals.c  |  3 ++
 lib/decompress/huf_decompress.c        | 40 +++++++++++++++-----------
 lib/decompress/zstd_decompress_block.c |  8 +++++-
 lib/zstd_errors.h                      |  1 +
 8 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/doc/zstd_compression_format.md b/doc/zstd_compression_format.md
index 042dbe89a..e40677a1d 100644
--- a/doc/zstd_compression_format.md
+++ b/doc/zstd_compression_format.md
@@ -435,7 +435,7 @@ They can be decoded first, and then copied during [Sequence Execution],
 or they can be decoded on the flow during [Sequence Execution].
 
 Literals can be stored uncompressed or compressed using Huffman prefix codes.
-When compressed, an optional tree description can be present,
+When compressed, a tree description may optionally be present,
 followed by 1 or 4 streams.
 
 | `Literals_Section_Header` | [`Huffman_Tree_Description`] | [jumpTable] | Stream1 | [Stream2] | [Stream3] | [Stream4] |
@@ -510,7 +510,7 @@ Its value is : `Size_Format = (Literals_Section_Header[0]>>2) & 3`
   `Regenerated_Size = (Literals_Section_Header[0]>>4) + (Literals_Section_Header[1]<<4) + (Literals_Section_Header[2]<<12)`
   Only Stream1 is present for these cases.
 
-Note : it's allowed to represent a short value (for example `13`)
+Note : it's allowed to represent a short value (for example `27`)
 using a long format, even if it's less efficient.
 
 __`Size_Format` for `Compressed_Literals_Block` and `Treeless_Literals_Block`__ :
@@ -521,19 +521,33 @@ __`Size_Format` for `Compressed_Literals_Block` and `Treeless_Literals_Block`__
   Both `Regenerated_Size` and `Compressed_Size` use 10 bits (0-1023).
   `Literals_Section_Header` uses 3 bytes.
 - `Size_Format` == 01 : 4 streams.
-  Both `Regenerated_Size` and `Compressed_Size` use 10 bits (0-1023).
+  Both `Regenerated_Size` and `Compressed_Size` use 10 bits (6-1023).
   `Literals_Section_Header` uses 3 bytes.
 - `Size_Format` == 10 : 4 streams.
-  Both `Regenerated_Size` and `Compressed_Size` use 14 bits (0-16383).
+  Both `Regenerated_Size` and `Compressed_Size` use 14 bits (6-16383).
   `Literals_Section_Header` uses 4 bytes.
 - `Size_Format` == 11 : 4 streams.
-  Both `Regenerated_Size` and `Compressed_Size` use 18 bits (0-262143).
+  Both `Regenerated_Size` and `Compressed_Size` use 18 bits (6-262143).
   `Literals_Section_Header` uses 5 bytes.
 
 Both `Compressed_Size` and `Regenerated_Size` fields follow __little-endian__ convention.
 Note: `Compressed_Size` __includes__ the size of the Huffman Tree description
 _when_ it is present.
 
+4 streams is superior to 1 stream in decompression speed,
+by exploiting instruction level parallelism.
+But it's also more expensive,
+costing on average ~7.3 bytes more than the 1 stream mode, mostly from the jump table.
+
+In general, use the 4 streams mode when there is a larger quantity of literals to decode,
+to favor higher decompression speeds.
+Beyond 1KB, the 4 streams mode is compulsory anyway.
+
+Note that a minimum of 6 bytes is required for the 4 streams mode.
+That's a technical minimum, but it's not recommended to employ the 4 streams mode
+for such a small quantity, as that would be wasteful.
+A more practical lower bound would be around ~256 bytes.
+
 #### Raw Literals Block
 The data in Stream1 is `Regenerated_Size` bytes long,
 it contains the raw literals data to be used during [Sequence Execution].
diff --git a/lib/common/error_private.c b/lib/common/error_private.c
index baf9b0b0c..fe73c5edd 100644
--- a/lib/common/error_private.c
+++ b/lib/common/error_private.c
@@ -29,6 +29,7 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
     case PREFIX(corruption_detected): return "Data corruption detected";
     case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
     case PREFIX(parameter_unsupported): return "Unsupported parameter";
     case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
     case PREFIX(init_missing): return "Context should be init first";
diff --git a/lib/common/huf.h b/lib/common/huf.h
index 679493bfa..f7316f9b2 100644
--- a/lib/common/huf.h
+++ b/lib/common/huf.h
@@ -88,8 +88,10 @@ HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
                                      unsigned maxSymbolValue, unsigned tableLog);
 
 /** HUF_compress4X_wksp() :
- *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
- *  `workspace` must be at least as large as HUF_WORKSPACE_SIZE */
+ *  Same as HUF_compress2(), but uses externally allocated @workSpace.
+ *  @workSpace's size, aka @wkspSize, must be >= HUF_WORKSPACE_SIZE
+ *  @srcSize must be >= 6
+ */
 #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
 #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
 HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index c9c97b7e4..48558873d 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -94,6 +94,7 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
 
 #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
 #define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)  /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6
 
 typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
 
diff --git a/lib/compress/zstd_compress_literals.c b/lib/compress/zstd_compress_literals.c
index e97c485ab..972d40e71 100644
--- a/lib/compress/zstd_compress_literals.c
+++ b/lib/compress/zstd_compress_literals.c
@@ -164,16 +164,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
     switch(lhSize)
     {
     case 3: /* 2 - 2 - 10 - 10 */
+        if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
             MEM_writeLE24(ostart, lhc);
             break;
         }
     case 4: /* 2 - 2 - 14 - 14 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
             MEM_writeLE32(ostart, lhc);
             break;
         }
     case 5: /* 2 - 2 - 18 - 18 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
             MEM_writeLE32(ostart, lhc);
             ostart[4] = (BYTE)(cLitSize >> 10);
diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c
index b3ba5926b..e5c35dae9 100644
--- a/lib/decompress/huf_decompress.c
+++ b/lib/decompress/huf_decompress.c
@@ -289,10 +289,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decodi
 static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
     U64 D4;
     if (MEM_isLittleEndian()) {
-        D4 = (symbol << 8) + nbBits;
+        D4 = (U64)((symbol << 8) + nbBits);
     } else {
-        D4 = symbol + (nbBits << 8);
+        D4 = (U64)(symbol + (nbBits << 8));
     }
+    assert(D4 < (1U << 16));
     D4 *= 0x0001000100010001ULL;
     return D4;
 }
@@ -383,9 +384,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      * rankStart[0] is not filled because there are no entries in the table for
      * weight 0.
      */
-    {
-        int n;
-        int nextRankStart = 0;
+    {   int n;
+        U32 nextRankStart = 0;
         int const unroll = 4;
         int const nLimit = (int)nbSymbols - unroll + 1;
         for (n=0; n<(int)tableLog+1; n++) {
@@ -412,10 +412,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      * We can switch based on the length to a different inner loop which is
      * optimized for that particular case.
      */
-    {
-        U32 w;
-        int symbol=wksp->rankVal[0];
-        int rankStart=0;
+    {   U32 w;
+        int symbol = wksp->rankVal[0];
+        int rankStart = 0;
         for (w=1; w<tableLog+1; ++w) {
             int const symbolCount = wksp->rankVal[w];
             int const length = (1 << w) >> 1;
@@ -525,7 +524,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
     while (p < pEnd)
         HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
 
-    return pEnd-pStart;
+    return (size_t)(pEnd-pStart);
 }
 
 FORCE_INLINE_TEMPLATE size_t
@@ -551,6 +550,10 @@ HUF_decompress1X1_usingDTable_internal_body(
     return dstSize;
 }
 
+/* HUF_decompress4X1_usingDTable_internal_body():
+ * Conditions :
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X1_usingDTable_internal_body(
     void* dst,  size_t dstSize,
@@ -594,6 +597,7 @@ HUF_decompress4X1_usingDTable_internal_body(
 
         if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
         if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        if (dstSize < 6) return ERROR(corruption_detected);          /* stream 4-split doesn't work */
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -679,8 +683,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     const BYTE* const iend = (const BYTE*)cSrc + 6;
     BYTE* const oend = (BYTE*)dst + dstSize;
     HUF_DecompressAsmArgs args;
-    {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+    {   size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
         FORWARD_IF_ERROR(ret, "Failed to init asm args");
         if (ret != 0)
             return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
@@ -700,8 +703,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     (void)iend;
 
     /* finish bit streams one by one. */
-    {
-        size_t const segmentSize = (dstSize+3) / 4;
+    {   size_t const segmentSize = (dstSize+3) / 4;
         BYTE* segmentEnd = (BYTE*)dst;
         int i;
         for (i = 0; i < 4; ++i) {
@@ -1246,6 +1248,11 @@ HUF_decompress1X2_usingDTable_internal_body(
     /* decoded size */
     return dstSize;
 }
 
+/* HUF_decompress4X2_usingDTable_internal_body():
+ * Conditions:
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X2_usingDTable_internal_body(
     void* dst,  size_t dstSize,
@@ -1286,8 +1293,9 @@ HUF_decompress4X2_usingDTable_internal_body(
         DTableDesc const dtd = HUF_getDTableDesc(DTable);
         U32 const dtLog = dtd.tableLog;
 
-        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
+        if (opStart4 > oend) return ERROR(corruption_detected);     /* overflow */
+        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index 94728a165..b2351e8f1 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -166,6 +166,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
             }
             RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
             RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+            if (!singleStream)
+                RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+                    "Not enough literals (%zu) for the 4-streams mode (min %u)",
+                    litSize, MIN_LITERALS_FOR_4_STREAMS);
             RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
             RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
             ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
@@ -181,6 +185,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     dctx->litBuffer, litSize, istart+lhSize, litCSize,
                     dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
             } else {
+                assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
                 hufSuccess = HUF_decompress4X_usingDTable_bmi2(
                     dctx->litBuffer, litSize, istart+lhSize, litCSize,
                     dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
@@ -509,7 +514,8 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
                 for (i = 8; i < n; i += 8) {
                     MEM_write64(spread + pos + i, sv);
                 }
-                pos += n;
+                assert(n>=0);
+                pos += (size_t)n;
             }
         }
         /* Now we spread those positions across the table.
diff --git a/lib/zstd_errors.h b/lib/zstd_errors.h
index 57a50d024..880e8e4f6 100644
--- a/lib/zstd_errors.h
+++ b/lib/zstd_errors.h
@@ -70,6 +70,7 @@ typedef enum {
   ZSTD_error_frameParameter_windowTooLarge = 16,
   ZSTD_error_corruption_detected = 20,
   ZSTD_error_checksum_wrong = 22,
+  ZSTD_error_literals_headerWrong = 24,
   ZSTD_error_dictionary_corrupted = 30,
   ZSTD_error_dictionary_wrong = 32,
   ZSTD_error_dictionaryCreation_failed = 34,
-- 
2.47.2
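
For reference, below is a minimal sketch (not part of the patch) of how a decoder could parse the 3-byte `Literals_Section_Header` layouts (`Size_Format` 00 and 01 of a `Compressed_Literals_Block`) and enforce the new 6-literal minimum for the 4-streams mode. The bit layout follows the spec excerpt and the `lhc` packing in `ZSTD_compressLiterals()` above; the helper name and return convention are hypothetical, not zstd API.

    #include <stdint.h>

    /* Hypothetical illustration of the updated rule, for the 3-byte header
     * formats only (Size_Format 00 = 1 stream, 01 = 4 streams).
     * Layout, little-endian over 3 bytes :
     *   2 bits Literals_Block_Type, 2 bits Size_Format,
     *   10 bits Regenerated_Size, 10 bits Compressed_Size.
     * Returns 0 on success, -1 if the header violates the specification. */
    static int parse_literals_header_3B(const uint8_t hdr[3],
                                        uint32_t* regeneratedSize,
                                        uint32_t* compressedSize,
                                        int* fourStreams)
    {
        uint32_t const lhc = (uint32_t)hdr[0]
                           | ((uint32_t)hdr[1] << 8)
                           | ((uint32_t)hdr[2] << 16);
        uint32_t const sizeFormat = (lhc >> 2) & 3;

        if (sizeFormat > 1) return -1;      /* Size_Format 10/11 use 4- or 5-byte headers */
        *fourStreams     = (sizeFormat == 1);
        *regeneratedSize = (lhc >> 4)  & 0x3FF;
        *compressedSize  = (lhc >> 14) & 0x3FF;

        /* New rule : 4-streams mode requires at least 6 regenerated literals,
         * otherwise the 4-way split of the output cannot be performed. */
        if (*fourStreams && *regeneratedSize < 6) return -1;
        return 0;
    }

This mirrors the decoder-side check added in `ZSTD_decodeLiteralsBlock()` : the rejection happens at header-parsing time, before any stream is initialized, which is why the patch introduces a dedicated `literals_headerWrong` error rather than reusing `corruption_detected`.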