From: Yann Collet Date: Sat, 23 Jul 2016 14:31:49 +0000 (+0200) Subject: unified encoding types X-Git-Tag: v0.8.0^2~53 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f8e7b5363fd3a702e1c260f42eb7112e084d3405;p=thirdparty%2Fzstd.git unified encoding types --- diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index f51def2b8..8e50da347 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -88,13 +88,13 @@ static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 }; #define ZSTD_BLOCKHEADERSIZE 3 /* C standard doesn't allow `static const` variable to be init using another `static const` variable */ static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE; -typedef enum { bt_raw, bt_rle, bt_compressed, bt_end } blockType_t; +typedef enum { bt_raw, bt_rle, bt_compressed, bt_end } blockType_e; #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ #define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ #define HufLog 12 -typedef enum { lbt_huffman, lbt_repeat, lbt_raw, lbt_rle } litBlockType_t; +typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; #define LONGNBSEQ 0x7F00 @@ -111,11 +111,6 @@ typedef enum { lbt_huffman, lbt_repeat, lbt_raw, lbt_rle } litBlockType_t; #define LLFSELog 9 #define OffFSELog 8 -#define FSE_ENCODING_RAW 0 -#define FSE_ENCODING_RLE 1 -#define FSE_ENCODING_STATIC 2 -#define FSE_ENCODING_DYNAMIC 3 - static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9,10,11,12, 13,14,15,16 }; diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 14bcfa76b..84e756b88 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -444,14 +444,14 @@ static size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void switch(flSize) { case 1: /* 2 - 1 - 5 */ - ostart[0] = 
(BYTE)((U32)lbt_raw + (srcSize<<3)); + ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3)); break; case 2: /* 2 - 2 - 12 */ - MEM_writeLE16(ostart, (U16)((U32)lbt_raw + (1<<2) + (srcSize<<4))); + MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4))); break; default: /*note : should not be necessary : flSize is within {1,2,3} */ case 3: /* 2 - 2 - 20 */ - MEM_writeLE32(ostart, (U32)((U32)lbt_raw + (3<<2) + (srcSize<<4))); + MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4))); break; } @@ -469,14 +469,14 @@ static size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, cons switch(flSize) { case 1: /* 2 - 1 - 5 */ - ostart[0] = (BYTE)((U32)lbt_rle + (srcSize<<3)); + ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3)); break; case 2: /* 2 - 2 - 12 */ - MEM_writeLE16(ostart, (U16)((U32)lbt_rle + (1<<2) + (srcSize<<4))); + MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4))); break; default: /*note : should not be necessary : flSize is necessarily within {1,2,3} */ case 3: /* 2 - 2 - 20 */ - MEM_writeLE32(ostart, (U32)((U32)lbt_rle + (3<<2) + (srcSize<<4))); + MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4))); break; } @@ -495,7 +495,7 @@ static size_t ZSTD_compressLiterals (ZSTD_CCtx* zc, size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); BYTE* const ostart = (BYTE*)dst; U32 singleStream = srcSize < 256; - litBlockType_t hType = lbt_huffman; + symbolEncodingType_e hType = set_compressed; size_t cLitSize; @@ -507,7 +507,7 @@ static size_t ZSTD_compressLiterals (ZSTD_CCtx* zc, if (dstCapacity < lhSize+1) return ERROR(dstSize_tooSmall); /* not enough space for compression */ if (zc->flagStaticTables && (lhSize==3)) { - hType = lbt_repeat; + hType = set_repeat; singleStream = 1; cLitSize = HUF_compress1X_usingCTable(ostart+lhSize, dstCapacity-lhSize, src, srcSize, zc->hufTable); } else { @@ -652,12 +652,12 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, if ((mostFrequent == nbSeq) && 
(nbSeq > 2)) { *op++ = llCodeTable[0]; FSE_buildCTable_rle(CTable_LitLength, (BYTE)max); - LLtype = FSE_ENCODING_RLE; + LLtype = set_rle; } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { - LLtype = FSE_ENCODING_STATIC; + LLtype = set_repeat; } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (LL_defaultNormLog-1)))) { FSE_buildCTable(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog); - LLtype = FSE_ENCODING_RAW; + LLtype = set_basic; } else { size_t nbSeq_1 = nbSeq; const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max); @@ -667,7 +667,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, if (FSE_isError(NCountSize)) return ERROR(GENERIC); op += NCountSize; } FSE_buildCTable(CTable_LitLength, norm, max, tableLog); - LLtype = FSE_ENCODING_DYNAMIC; + LLtype = set_compressed; } } /* CTable for Offsets */ @@ -676,12 +676,12 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, if ((mostFrequent == nbSeq) && (nbSeq > 2)) { *op++ = ofCodeTable[0]; FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max); - Offtype = FSE_ENCODING_RLE; + Offtype = set_rle; } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { - Offtype = FSE_ENCODING_STATIC; + Offtype = set_repeat; } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (OF_defaultNormLog-1)))) { FSE_buildCTable(CTable_OffsetBits, OF_defaultNorm, MaxOff, OF_defaultNormLog); - Offtype = FSE_ENCODING_RAW; + Offtype = set_basic; } else { size_t nbSeq_1 = nbSeq; const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max); @@ -691,7 +691,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, if (FSE_isError(NCountSize)) return ERROR(GENERIC); op += NCountSize; } FSE_buildCTable(CTable_OffsetBits, norm, max, tableLog); - Offtype = FSE_ENCODING_DYNAMIC; + Offtype = set_compressed; } } /* CTable for MatchLengths */ @@ -700,12 +700,12 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, if ((mostFrequent == nbSeq) && (nbSeq > 2)) { *op++ = *mlCodeTable; 
FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max); - MLtype = FSE_ENCODING_RLE; + MLtype = set_rle; } else if ((zc->flagStaticTables) && (nbSeq < MAX_SEQ_FOR_STATIC_FSE)) { - MLtype = FSE_ENCODING_STATIC; + MLtype = set_repeat; } else if ((nbSeq < MIN_SEQ_FOR_DYNAMIC_FSE) || (mostFrequent < (nbSeq >> (ML_defaultNormLog-1)))) { FSE_buildCTable(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog); - MLtype = FSE_ENCODING_RAW; + MLtype = set_basic; } else { size_t nbSeq_1 = nbSeq; const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max); @@ -715,7 +715,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* zc, if (FSE_isError(NCountSize)) return ERROR(GENERIC); op += NCountSize; } FSE_buildCTable(CTable_MatchLength, norm, max, tableLog); - MLtype = FSE_ENCODING_DYNAMIC; + MLtype = set_compressed; } } *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index fb499f26c..71dbc92a2 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -120,7 +120,7 @@ struct ZSTD_DCtx_s size_t expected; U32 rep[3]; ZSTD_frameParams fParams; - blockType_t bType; /* used in ZSTD_decompressContinue(), to transfer blockType between header decoding and block decoding stages */ + blockType_e bType; /* used in ZSTD_decompressContinue(), to transfer blockType between header decoding and block decoding stages */ ZSTD_dStage stage; U32 litEntropy; U32 fseEntropy; @@ -427,7 +427,7 @@ static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t sr typedef struct { - blockType_t blockType; + blockType_e blockType; U32 origSize; } blockProperties_t; @@ -438,7 +438,7 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, blockProperties_t* bp if (srcSize < ZSTD_blockHeaderSize) return ERROR(srcSize_wrong); { U32 const cBlockHeader = MEM_readLE24(src); U32 const cSize = cBlockHeader >> 2; - bpPtr->blockType = (blockType_t)(cBlockHeader & 3); + 
bpPtr->blockType = (blockType_e)(cBlockHeader & 3); bpPtr->origSize = cSize; /* only useful for RLE */ if (bpPtr->blockType == bt_end) return 0; if (bpPtr->blockType == bt_rle) return 1; @@ -463,14 +463,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, if (srcSize < MIN_CBLOCK_SIZE) return ERROR(corruption_detected); { const BYTE* const istart = (const BYTE*) src; - litBlockType_t const litBlockType = (litBlockType_t)(istart[0] & 3); + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); - switch(litBlockType) + switch(litEncType) { - case lbt_repeat: + case set_repeat: if (dctx->litEntropy==0) return ERROR(dictionary_corrupted); /* fall-through */ - case lbt_huffman: + case set_compressed: if (srcSize < 5) return ERROR(corruption_detected); /* srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3 */ { size_t lhSize, litSize, litCSize; U32 singleStream=0; @@ -504,7 +504,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, if (litSize > ZSTD_BLOCKSIZE_ABSOLUTEMAX) return ERROR(corruption_detected); if (litCSize + lhSize > srcSize) return ERROR(corruption_detected); - if (HUF_isError((litBlockType==lbt_repeat) ? + if (HUF_isError((litEncType==set_repeat) ? ( singleStream ? 
HUF_decompress1X_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTable) : HUF_decompress4X_usingDTable(dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->hufTable) ) : @@ -520,7 +520,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, return litCSize + lhSize; } - case lbt_raw: + case set_basic: { size_t litSize, lhSize; U32 const lhlCode = ((istart[0]) >> 2) & 3; switch(lhlCode) @@ -554,7 +554,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, return lhSize+litSize; } - case lbt_rle: + case set_rle: { U32 const lhlCode = ((istart[0]) >> 2) & 3; size_t litSize, lhSize; switch(lhlCode) @@ -592,25 +592,25 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, @return : nb bytes read from src, or an error code if it fails, testable with ZSTD_isError() */ -FORCE_INLINE size_t ZSTD_buildSeqTable(FSE_DTable* DTable, U32 type, U32 max, U32 maxLog, +FORCE_INLINE size_t ZSTD_buildSeqTable(FSE_DTable* DTable, symbolEncodingType_e type, U32 max, U32 maxLog, const void* src, size_t srcSize, const S16* defaultNorm, U32 defaultLog, U32 flagRepeatTable) { switch(type) { - case FSE_ENCODING_RLE : + case set_rle : if (!srcSize) return ERROR(srcSize_wrong); if ( (*(const BYTE*)src) > max) return ERROR(corruption_detected); FSE_buildDTable_rle(DTable, *(const BYTE*)src); /* if *src > max, data is corrupted */ return 1; - case FSE_ENCODING_RAW : + case set_basic : FSE_buildDTable(DTable, defaultNorm, max, defaultLog); return 0; - case FSE_ENCODING_STATIC: + case set_repeat: if (!flagRepeatTable) return ERROR(corruption_detected); return 0; default : /* impossible */ - case FSE_ENCODING_DYNAMIC : + case set_compressed : { U32 tableLog; S16 norm[MaxSeq+1]; size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); @@ -646,9 +646,9 @@ size_t ZSTD_decodeSeqHeaders(int* nbSeqPtr, } /* FSE table descriptors */ - { U32 const LLtype = *ip >> 6; - U32 const OFtype = (*ip >> 4) & 3; - U32 const MLtype = (*ip >> 2) & 3; + { symbolEncodingType_e 
const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); ip++; /* check */ diff --git a/programs/fullbench.c b/programs/fullbench.c index 01e8f59ee..53041db77 100644 --- a/programs/fullbench.c +++ b/programs/fullbench.c @@ -31,8 +31,9 @@ #include /* clock_t, clock, CLOCKS_PER_SEC */ #include "mem.h" +#include "zstd_internal.h" /* ZSTD_blockHeaderSize, blockType_e, KB, MB */ #define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressBegin, ZSTD_compressContinue, etc. */ -#include "zstd.h" /* ZSTD_VERSION_STRING */ +#include "zstd.h" /* ZSTD_VERSION_STRING */ #define FSE_STATIC_LINKING_ONLY /* FSE_DTABLE_SIZE_U32 */ #include "fse.h" #include "zbuff.h" @@ -46,10 +47,6 @@ #define AUTHOR "Yann Collet" #define WELCOME_MESSAGE "*** %s %s %i-bits, by %s (%s) ***\n", PROGRAM_DESCRIPTION, ZSTD_VERSION_STRING, (int)(sizeof(void*)*8), AUTHOR, __DATE__ - -#define KB *(1<<10) -#define MB *(1<<20) - #define NBLOOPS 6 #define TIMELOOP_S 2 @@ -110,9 +107,8 @@ static size_t BMK_findMaxMem(U64 requiredMem) /*_******************************************************* * Benchmark wrappers *********************************************************/ -typedef enum { bt_compressed, bt_raw, bt_rle, bt_end } blockType_t; typedef struct { - blockType_t blockType; + blockType_e blockType; U32 unusedBits; U32 origSize; } blockProperties_t; @@ -214,8 +210,8 @@ size_t local_ZSTD_decompressContinue(void* dst, size_t dstCapacity, void* buff2, static size_t benchMem(const void* src, size_t srcSize, U32 benchNb) { BYTE* dstBuff; - size_t dstBuffSize; - BYTE* buff2; + size_t const dstBuffSize = ZSTD_compressBound(srcSize); + void* buff2; const char* benchName; size_t (*benchFunction)(void* dst, size_t dstSize, void* verifBuff, const void* src, size_t srcSize); double bestTime = 100000000.; @@ -252,9 +248,8 @@ static size_t benchMem(const void* src, size_t srcSize, U32 
benchNb) } /* Allocation */ - dstBuffSize = ZSTD_compressBound(srcSize); dstBuff = (BYTE*)malloc(dstBuffSize); - buff2 = (BYTE*)malloc(dstBuffSize); + buff2 = malloc(dstBuffSize); if ((!dstBuff) || (!buff2)) { DISPLAY("\nError: not enough memory!\n"); free(dstBuff); free(buff2); @@ -287,7 +282,7 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb) DISPLAY("ZSTD_decodeLiteralsBlock : impossible to test on this sample (not compressible)\n"); goto _cleanOut; } - skippedSize = frameHeaderSize + 3 /* ZSTD_blockHeaderSize */; + skippedSize = frameHeaderSize + ZSTD_blockHeaderSize; memcpy(buff2, dstBuff+skippedSize, g_cSize-skippedSize); srcSize = srcSize > 128 KB ? 128 KB : srcSize; /* speed relative to block */ break; @@ -309,9 +304,9 @@ static size_t benchMem(const void* src, size_t srcSize, U32 benchNb) DISPLAY("ZSTD_decodeSeqHeaders : impossible to test on this sample (not compressible)\n"); goto _cleanOut; } - iend = ip + 3 /* ZSTD_blockHeaderSize */ + cBlockSize; /* End of first block */ - ip += 3 /* ZSTD_blockHeaderSize */; /* skip block header */ - ip += ZSTD_decodeLiteralsBlock(g_zdc, ip, iend-ip); /* skip literal segment */ + iend = ip + ZSTD_blockHeaderSize + cBlockSize; /* End of first block */ + ip += ZSTD_blockHeaderSize; /* skip block header */ + ip += ZSTD_decodeLiteralsBlock(g_zdc, ip, iend-ip); /* skip literal segment */ g_cSize = iend-ip; memcpy(buff2, ip, g_cSize); /* copy rest of block (it starts by SeqHeader) */ srcSize = srcSize > 128 KB ? 128 KB : srcSize; /* speed relative to block */ diff --git a/zstd_compression_format.md b/zstd_compression_format.md index 7112fa152..8f72b1e63 100644 --- a/zstd_compression_format.md +++ b/zstd_compression_format.md @@ -437,19 +437,19 @@ Header is in charge of describing how literals are packed. It's a byte-aligned variable-size bitfield, ranging from 1 to 5 bytes, using little-endian convention. 
-| BlockType | sizes format | regenerated size | [compressed size] | -| --------- | ------------ | ---------------- | ----------------- | -| 2 bits | 1 - 2 bits | 5 - 20 bits | 0 - 18 bits | +| EncodingType | sizes format | regenerated size | [compressed size] | +| ------------ | ------------ | ---------------- | ----------------- | +| 2 bits | 1 - 2 bits | 5 - 20 bits | 0 - 18 bits | In this representation, bits on the left are smallest bits. -__Block Type__ : +__Encoding Type__ : This field uses 2 lowest bits of first byte, describing 4 different block types : -| Value | 0 | 1 | 2 | 3 | -| ---------- | ---------- | ------ | --- | ------- | -| Block Type | Compressed | Repeat | Raw | RLE | +| Value | 0 | 1 | 2 | 3 | +| ---------- | --- | --- | ---------- | ----------- | +| Encoding Type | Raw | RLE | Compressed | RepeatStats | - Compressed : This is a standard huffman-compressed block, starting with a huffman tree description. @@ -764,14 +764,14 @@ Literal Lengths, Offsets and Match Lengths respectively. They follow the same enumeration : -| Value | 0 | 1 | 2 | 3 | -| ---------------- | ------ | --- | ------ | --- | -| Compression Mode | predef | RLE | Repeat | FSE | +| Value | 0 | 1 | 2 | 3 | +| ---------------- | ------ | --- | ---------- | ------ | +| Compression Mode | predef | RLE | Compressed | Repeat | - "predef" : uses a pre-defined distribution table. - "RLE" : it's a single code, repeated `nbSeqs` times. - "Repeat" : re-use distribution table from previous compressed block. -- "FSE" : standard FSE compression. +- "Compressed" : standard FSE compression. A distribution table will be present. It will be described in [next part](#distribution-tables).