From: Nick Terrell Date: Thu, 15 Feb 2018 03:20:32 +0000 (-0800) Subject: [compress] Support BMI2 X-Git-Tag: v1.3.4~1^2~54^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b58f01537ea49d2f87c7b9e6273611736f13925e;p=thirdparty%2Fzstd.git [compress] Support BMI2 --- diff --git a/lib/common/huf.h b/lib/common/huf.h index 3b96ae8b7..cadee595d 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -223,7 +223,7 @@ typedef enum { * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. * If preferRepeat then the old table will always be used if valid. */ -size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ +size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. @@ -279,7 +279,7 @@ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, si * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. * If preferRepeat then the old table will always be used if valid. */ -size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ +size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index cfc5a98bb..846fd4c62 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -46,6 +46,7 @@ #include /* memcpy, memset */ #include /* printf (debug) */ #include "bitstream.h" +#include "compiler.h" #define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */ #include "fse.h" /* header compression */ #define HUF_STATIC_LINKING_ONLY @@ -433,117 +434,69 @@ static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, uns return !bad; } -static void HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) -{ - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); -} - size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } -#define HUF_FLUSHBITS(s) BIT_flushBits(s) -#define HUF_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) +#define FUNCTION(fn) fn##_default +#define TARGET +#include "huf_compress_impl.h" +#undef TARGET +#undef FUNCTION -#define HUF_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) +#if DYNAMIC_BMI2 -size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +#define FUNCTION(fn) fn##_bmi2 +#define TARGET TARGET_ATTRIBUTE("bmi2") +#include "huf_compress_impl.h" +#undef TARGET +#undef FUNCTION + +#endif + +static size_t HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) { - const BYTE* ip = (const BYTE*) src; - BYTE* const ostart = (BYTE*)dst; - BYTE* const oend = ostart + dstSize; - BYTE* op = ostart; - size_t n; - BIT_CStream_t bitC; - - /* init */ - if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr = BIT_initCStream(&bitC, op, oend-op); - if (HUF_isError(initErr)) return 0; } - - n = srcSize & ~3; /* join to mod 4 */ - switch (srcSize & 3) - { - case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); - HUF_FLUSHBITS_2(&bitC); - /* fall-through */ - case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); - HUF_FLUSHBITS_1(&bitC); - /* fall-through */ - case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); - HUF_FLUSHBITS(&bitC); - /* fall-through */ - case 0 : /* fall-through */ - default: break; +#if DYNAMIC_BMI2 + if (bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); } +#endif + (void)bmi2; + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +} - for (; n>0; n-=4) { /* note : n&3==0 at this stage */ - HUF_encodeSymbol(&bitC, ip[n- 1], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 2], CTable); - HUF_FLUSHBITS_2(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 3], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 4], CTable); - HUF_FLUSHBITS(&bitC); +static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return HUF_compress4X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); } - - return BIT_closeCStream(&bitC); +#endif + (void)bmi2; + return HUF_compress4X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); } +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { - size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ - const BYTE* ip = (const BYTE*) src; - const BYTE* const iend = ip + srcSize; - BYTE* const ostart = (BYTE*) dst; - BYTE* const oend = ostart + dstSize; - BYTE* op = ostart; - - if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */ - if (srcSize < 12) return 0; /* no saving possible : too small input */ - op += 6; /* jumpTable */ - - { CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) ); - if (cSize==0) return 0; - MEM_writeLE16(ostart, (U16)cSize); - op += cSize; - } - - ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) ); - if (cSize==0) return 0; - MEM_writeLE16(ostart+2, (U16)cSize); - op += cSize; - } - - ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, segmentSize, CTable) ); - if (cSize==0) return 0; - MEM_writeLE16(ostart+4, (U16)cSize); - op += cSize; - } - - ip += segmentSize; - { CHECK_V_F(cSize, HUF_compress1X_usingCTable(op, oend-op, ip, iend-ip, CTable) ); - if (cSize==0) return 0; - op += cSize; - } - - return op-ostart; + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); } - static size_t HUF_compressCTable_internal( BYTE* const ostart, BYTE* op, BYTE* const oend, const void* src, size_t srcSize, - unsigned singleStream, const HUF_CElt* CTable) + unsigned singleStream, const HUF_CElt* CTable, const int bmi2) { size_t const cSize = singleStream ? - HUF_compress1X_usingCTable(op, oend - op, src, srcSize, CTable) : - HUF_compress4X_usingCTable(op, oend - op, src, srcSize, CTable); + HUF_compress1X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2) : + HUF_compress4X_usingCTable_internal(op, oend - op, src, srcSize, CTable, bmi2); if (HUF_isError(cSize)) { return cSize; } if (cSize==0) { return 0; } /* uncompressible */ op += cSize; @@ -560,7 +513,8 @@ static size_t HUF_compress_internal ( unsigned maxSymbolValue, unsigned huffLog, unsigned singleStream, void* workSpace, size_t wkspSize, - HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat) + HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, + const int bmi2) { BYTE* const ostart = (BYTE*)dst; BYTE* const oend = ostart + dstSize; @@ -589,7 +543,7 @@ static size_t HUF_compress_internal ( /* Heuristic : If we don't need to check the validity of the old table use the old table for small inputs */ if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { - return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable, bmi2); } /* Scan input and build symbol stats */ @@ -604,7 +558,7 @@ static size_t HUF_compress_internal ( } /* Heuristic : use existing table for small inputs */ if (preferRepeat && repeat && *repeat != HUF_repeat_none) { - return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable, bmi2); } /* Build Huffman Tree */ @@ -622,7 +576,7 @@ static size_t HUF_compress_internal ( size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, count, maxSymbolValue); size_t const newSize = HUF_estimateCompressedSize(CTable, count, maxSymbolValue); if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { - return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable); + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, oldHufTable, bmi2); } } /* Use the new table */ @@ -631,7 +585,7 @@ static size_t HUF_compress_internal ( if (repeat) { *repeat = HUF_repeat_none; } if (oldHufTable) { memcpy(oldHufTable, CTable, CTableSize); } /* Save the new table */ } - return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, CTable); + return HUF_compressCTable_internal(ostart, op, oend, src, srcSize, singleStream, CTable, bmi2); } @@ -640,16 +594,16 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize) { - return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, NULL, NULL, 0); + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, NULL, NULL, 0, /* bmi2 */ 0); } size_t HUF_compress1X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) { - return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, hufTable, repeat, preferRepeat); + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 1 /* single stream */, workSpace, wkspSize, hufTable, repeat, preferRepeat, bmi2); } size_t HUF_compress1X (void* dst, size_t dstSize, @@ -665,16 +619,16 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize) { - return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, NULL, NULL, 0); + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, NULL, NULL, 0, /* bmi2 */ 0); } size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) { - return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, hufTable, repeat, preferRepeat); + return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, 0 /* 4 streams */, workSpace, wkspSize, hufTable, repeat, preferRepeat, bmi2); } size_t HUF_compress2 (void* dst, size_t dstSize, diff --git a/lib/compress/huf_compress_impl.h b/lib/compress/huf_compress_impl.h new file mode 100644 index 000000000..fa045399f --- /dev/null +++ b/lib/compress/huf_compress_impl.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef FUNCTION +# error "FUNCTION(name) must be defined" +#endif + +#ifndef TARGET +# error "TARGET must be defined" +#endif + + +static void FUNCTION(HUF_encodeSymbol)(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) +{ + BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); +} + +#define HUF_FLUSHBITS(s) BIT_flushBits(s) + +#define HUF_FLUSHBITS_1(stream) \ + if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) + +#define HUF_FLUSHBITS_2(stream) \ + if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) + +static TARGET +size_t FUNCTION(HUF_compress1X_usingCTable_internal)(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + size_t n; + BIT_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ + { size_t const initErr = BIT_initCStream(&bitC, op, oend-op); + if (HUF_isError(initErr)) return 0; } + + n = srcSize & ~3; /* join to mod 4 */ + switch (srcSize & 3) + { + case 3 : FUNCTION(HUF_encodeSymbol)(&bitC, ip[n+ 2], CTable); + HUF_FLUSHBITS_2(&bitC); + /* fall-through */ + case 2 : FUNCTION(HUF_encodeSymbol)(&bitC, ip[n+ 1], CTable); + HUF_FLUSHBITS_1(&bitC); + /* fall-through */ + case 1 : FUNCTION(HUF_encodeSymbol)(&bitC, ip[n+ 0], CTable); + HUF_FLUSHBITS(&bitC); + /* fall-through */ + case 0 : /* fall-through */ + default: break; + } + + for (; n>0; n-=4) { /* note : n&3==0 at this stage */ + FUNCTION(HUF_encodeSymbol)(&bitC, ip[n- 1], CTable); + HUF_FLUSHBITS_1(&bitC); + FUNCTION(HUF_encodeSymbol)(&bitC, ip[n- 2], CTable); + HUF_FLUSHBITS_2(&bitC); + FUNCTION(HUF_encodeSymbol)(&bitC, ip[n- 3], CTable); + HUF_FLUSHBITS_1(&bitC); + FUNCTION(HUF_encodeSymbol)(&bitC, ip[n- 4], CTable); + HUF_FLUSHBITS(&bitC); + } + + return BIT_closeCStream(&bitC); +} + + +static TARGET +size_t FUNCTION(HUF_compress4X_usingCTable_internal)(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */ + if (srcSize < 12) return 0; /* no saving possible : too small input */ + op += 6; /* jumpTable */ + + { CHECK_V_F(cSize, FUNCTION(HUF_compress1X_usingCTable_internal)(op, oend-op, ip, segmentSize, CTable) ); + if (cSize==0) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { CHECK_V_F(cSize, FUNCTION(HUF_compress1X_usingCTable_internal)(op, oend-op, ip, segmentSize, CTable) ); + if (cSize==0) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { CHECK_V_F(cSize, FUNCTION(HUF_compress1X_usingCTable_internal)(op, oend-op, ip, segmentSize, CTable) ); + if (cSize==0) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { CHECK_V_F(cSize, FUNCTION(HUF_compress1X_usingCTable_internal)(op, oend-op, ip, iend-ip, CTable) ); + if (cSize==0) return 0; + op += cSize; + } + + return op-ostart; +} diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 8c6393ed5..80b1d3662 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -21,6 +21,7 @@ * Dependencies ***************************************/ #include /* memset */ +#include "cpu.h" #include "mem.h" #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "fse.h" @@ -73,6 +74,7 @@ ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) cctx->customMem = customMem; cctx->requestedParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; cctx->requestedParams.fParams.contentSizeFlag = 1; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); return cctx; } } @@ -96,6 +98,7 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void *workspace, size_t workspaceSize) void* const ptr = cctx->blockState.nextCBlock + 1; cctx->entropyWorkspace = (U32*)ptr; } + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); return cctx; } @@ -1401,7 +1404,7 @@ static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy, ZSTD_strategy strategy, void* dst, size_t dstCapacity, const void* src, size_t srcSize, - U32* workspace) + U32* workspace, const int bmi2) { size_t const minGain = ZSTD_minGain(srcSize); size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); @@ -1426,9 +1429,9 @@ static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy, int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; cLitSize = singleStream ? HUF_compress1X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, - workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat) + workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2) : HUF_compress4X_repeat(ostart+lhSize, dstCapacity-lhSize, src, srcSize, 255, 11, - workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat); + workspace, HUF_WORKSPACE_SIZE, (HUF_CElt*)nextEntropy->hufCTable, &repeat, preferRepeat, bmi2); if (repeat != HUF_repeat_none) { /* reused the existing table */ hType = set_repeat; @@ -1583,99 +1586,52 @@ size_t ZSTD_buildCTable(void* dst, size_t dstCapacity, } } -MEM_STATIC +#define FUNCTION(fn) fn##_default +#define TARGET +#include "zstd_compress_impl.h" +#undef TARGET +#undef FUNCTION + +#if DYNAMIC_BMI2 + +#define FUNCTION(fn) fn##_bmi2 +#define TARGET TARGET_ATTRIBUTE("bmi2") +#include "zstd_compress_impl.h" +#undef TARGET +#undef FUNCTION + +#endif + size_t ZSTD_encodeSequences( void* dst, size_t dstCapacity, FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, - seqDef const* sequences, size_t nbSeq, int longOffsets) -{ - BIT_CStream_t blockStream; - FSE_CState_t stateMatchLength; - FSE_CState_t stateOffsetBits; - FSE_CState_t stateLitLength; - - CHECK_E(BIT_initCStream(&blockStream, dst, dstCapacity), dstSize_tooSmall); /* not enough space remaining */ - - /* first symbols */ - FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); - FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); - FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); - BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); - if (MEM_32bits()) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); - if (MEM_32bits()) BIT_flushBits(&blockStream); - if (longOffsets) { - U32 const ofBits = ofCodeTable[nbSeq-1]; - int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); - if (extraBits) { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits); - BIT_flushBits(&blockStream); - } - BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, - ofBits - extraBits); - } else { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); - } - BIT_flushBits(&blockStream); - - { size_t n; - for (n=nbSeq-2 ; n= 64-7-(LLFSELog+MLFSELog+OffFSELog))) - BIT_flushBits(&blockStream); /* (7)*/ - BIT_addBits(&blockStream, sequences[n].litLength, llBits); - if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); - if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream); - if (longOffsets) { - int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); - if (extraBits) { - BIT_addBits(&blockStream, sequences[n].offset, extraBits); - BIT_flushBits(&blockStream); /* (7)*/ - } - BIT_addBits(&blockStream, sequences[n].offset >> extraBits, - ofBits - extraBits); /* 31 */ - } else { - BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ - } - BIT_flushBits(&blockStream); /* (7)*/ - } } - - DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog); - FSE_flushCState(&blockStream, &stateMatchLength); - DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog); - FSE_flushCState(&blockStream, &stateOffsetBits); - DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog); - FSE_flushCState(&blockStream, &stateLitLength); - - { size_t const streamSize = BIT_closeCStream(&blockStream); - if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */ - return streamSize; + seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { + return ZSTD_encodeSequences_bmi2(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); } +#endif + (void)bmi2; + return ZSTD_encodeSequences_default(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); } MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, ZSTD_entropyCTables_t const* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, ZSTD_compressionParameters const* cParams, - void* dst, size_t dstCapacity, U32* workspace) + void* dst, size_t dstCapacity, U32* workspace, + const int bmi2) { const int longOffsets = cParams->windowLog > STREAM_ACCUMULATOR_MIN; U32 count[MaxSeq+1]; @@ -1700,7 +1656,7 @@ MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, size_t const litSize = seqStorePtr->lit - literals; size_t const cSize = ZSTD_compressLiterals(prevEntropy, nextEntropy, cParams->strategy, op, dstCapacity, literals, litSize, - workspace); + workspace, bmi2); if (ZSTD_isError(cSize)) return cSize; assert(cSize <= dstCapacity); @@ -1780,7 +1736,7 @@ MEM_STATIC size_t ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, CTable_OffsetBits, ofCodeTable, CTable_LitLength, llCodeTable, sequences, nbSeq, - longOffsets); + longOffsets, bmi2); if (ZSTD_isError(bitstreamSize)) return bitstreamSize; op += bitstreamSize; } @@ -1793,11 +1749,11 @@ MEM_STATIC size_t ZSTD_compressSequences(seqStore_t* seqStorePtr, ZSTD_entropyCTables_t* nextEntropy, ZSTD_compressionParameters const* cParams, void* dst, size_t dstCapacity, - size_t srcSize, U32* workspace) + size_t srcSize, U32* workspace, int bmi2) { size_t const cSize = ZSTD_compressSequences_internal( seqStorePtr, prevEntropy, nextEntropy, cParams, dst, dstCapacity, - workspace); + workspace, bmi2); /* If the srcSize <= dstCapacity, then there is enough space to write a * raw uncompressed block. Since we ran out of space, the block must not * be compressible, so fall back to a raw uncompressed block. @@ -1898,7 +1854,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, } } /* encode sequences and literals */ - { size_t const cSize = ZSTD_compressSequences(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams.cParams, dst, dstCapacity, srcSize, zc->entropyWorkspace); + { size_t const cSize = ZSTD_compressSequences(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams.cParams, dst, dstCapacity, srcSize, zc->entropyWorkspace, zc->bmi2); if (ZSTD_isError(cSize) || cSize == 0) return cSize; /* confirm repcodes and entropy tables */ { ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; diff --git a/lib/compress/zstd_compress_impl.h b/lib/compress/zstd_compress_impl.h new file mode 100644 index 000000000..e6a30b1d0 --- /dev/null +++ b/lib/compress/zstd_compress_impl.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef FUNCTION +# error "FUNCTION(name) must be defined" +#endif + +#ifndef TARGET +# error "TARGET must be defined" +#endif + + +MEM_STATIC TARGET +size_t FUNCTION(ZSTD_encodeSequences)( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets) +{ + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; + FSE_CState_t stateOffsetBits; + FSE_CState_t stateLitLength; + + CHECK_E(BIT_initCStream(&blockStream, dst, dstCapacity), dstSize_tooSmall); /* not enough space remaining */ + + /* first symbols */ + FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]); + FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]); + FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]); + BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]); + if (MEM_32bits()) BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]); + if (MEM_32bits()) BIT_flushBits(&blockStream); + if (longOffsets) { + U32 const ofBits = ofCodeTable[nbSeq-1]; + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits); + BIT_flushBits(&blockStream); + } + BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, + ofBits - extraBits); + } else { + BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]); + } + BIT_flushBits(&blockStream); + + { size_t n; + for (n=nbSeq-2 ; n= 64-7-(LLFSELog+MLFSELog+OffFSELog))) + BIT_flushBits(&blockStream); /* (7)*/ + BIT_addBits(&blockStream, sequences[n].litLength, llBits); + if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream); + BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream); + if (longOffsets) { + int const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1); + if (extraBits) { + BIT_addBits(&blockStream, sequences[n].offset, extraBits); + BIT_flushBits(&blockStream); /* (7)*/ + } + BIT_addBits(&blockStream, sequences[n].offset >> extraBits, + ofBits - extraBits); /* 31 */ + } else { + BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */ + } + BIT_flushBits(&blockStream); /* (7)*/ + } } + + DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog); + FSE_flushCState(&blockStream, &stateMatchLength); + DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog); + FSE_flushCState(&blockStream, &stateOffsetBits); + DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog); + FSE_flushCState(&blockStream, &stateLitLength); + + { size_t const streamSize = BIT_closeCStream(&blockStream); + if (streamSize==0) return ERROR(dstSize_tooSmall); /* not enough space */ + return streamSize; + } +} diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 9da2cbfeb..ba4d79c66 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -170,6 +170,7 @@ struct ZSTD_CCtx_params_s { struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ + int bmi2; ZSTD_CCtx_params requestedParams; ZSTD_CCtx_params appliedParams; U32 dictID;