From: Yann Collet Date: Sun, 29 Dec 2024 10:13:57 +0000 (-0800) Subject: initial implementation (incomplete) X-Git-Tag: v1.5.7^2~36^2~17 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=886720442f712b6e94c13075edaec1f224c1ae1a;p=thirdparty%2Fzstd.git initial implementation (incomplete) needs to take care of long lengths > 65535 --- diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 04b6bb9f1..d91fae619 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -15,7 +15,9 @@ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ #include "../common/mem.h" #include "../common/error_private.h" +#include "compiler.h" #include "hist.h" /* HIST_countFast_wksp */ +#include "zstd_internal.h" #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ #include "../common/fse.h" #include "../common/huf.h" @@ -7103,15 +7105,226 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, return cSize; } + +#if defined(__AVX2__) + +#include /* AVX2 intrinsics */ + +/* + * Convert 2 sequences per iteration, using AVX2 intrinsics: + * - offset -> offBase = offset + 2 + * - litLength -> (U16) litLength + * - matchLength -> (U16)(matchLength - 3) + * - rep is ignored + * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). + * + * At the end, instead of extracting two __m128i, + * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, + * then store the lower 16 bytes in one go. + */ +void convertSequences_noRepcodes( + SeqDef* dstSeqs, + const ZSTD_Sequence* inSeqs, + size_t nbSequences) +{ + /* + * addition: + * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) + */ + const __m256i addition = _mm256_setr_epi32( + ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ + ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ + ); + + /* + * shuffle mask for byte-level rearrangement in each 128-bit half: + * + * Input layout (after addition) per 128-bit half: + * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] + * We only need: + * offBase (4 bytes) = offset+2 + * litLength (2 bytes) = low 2 bytes of litLength + * mlBase (2 bytes) = low 2 bytes of (matchLength) + * => Bytes [0..3, 4..5, 8..9], zero the rest. + */ + const __m256i mask = _mm256_setr_epi8( + /* For the lower 128 bits => sequence i */ + 0, 1, 2, 3, /* offset+2 */ + 4, 5, /* litLength (16 bits) */ + 8, 9, /* matchLength (16 bits) */ + (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, (char)0x80, + + /* For the upper 128 bits => sequence i+1 */ + 16,17,18,19, /* offset+2 */ + 20,21, /* litLength */ + 24,25, /* matchLength */ + (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, (char)0x80 + ); + + /* + * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). + * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. + * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1. + */ +#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ + + size_t i = 0; + /* Process 2 sequences per loop iteration */ + for (; i + 1 < nbSequences; i += 2) { + /* 1) Load 2 ZSTD_Sequence (32 bytes) */ + __m256i vin = _mm256_loadu_si256((__m256i const*)&inSeqs[i]); + + /* 2) Add {2, 0, -3, 0} in each 128-bit half */ + __m256i vadd = _mm256_add_epi32(vin, addition); + + /* 3) Shuffle bytes so each half gives us the 8 bytes we need */ + __m256i vshf = _mm256_shuffle_epi8(vadd, mask); + /* + * Now: + * Lane0 = seq0's 8 bytes + * Lane1 = 0 + * Lane2 = seq1's 8 bytes + * Lane3 = 0 + */ + + /* 4) Permute 64-bit lanes => move Lane2 down into Lane1. */ + __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); + /* + * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. + * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. + */ + + /* 5) Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ + _mm_storeu_si128((__m128i *)&dstSeqs[i], _mm256_castsi256_si128(vperm)); + /* + * This writes out 16 bytes total: + * - offset 0..7 => seq0 (offBase, litLength, mlBase) + * - offset 8..15 => seq1 (offBase, litLength, mlBase) + */ + } + + /* Handle leftover if nbSequences is odd */ + if (i < nbSequences) { + /* Fallback: process last sequence */ + assert(i == nbSequences - 1); + dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); + /* note: doesn't work if one length is > 65535 */ + dstSeqs[i].litLength = (U16)inSeqs[i].litLength; + dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + } +} + +#elif defined(__SSSE3__) + +#include /* SSSE3 intrinsics: _mm_shuffle_epi8 */ +#include /* SSE2 intrinsics: _mm_add_epi32, etc. */ + +/* + * Convert sequences with SSE. + * - offset -> offBase = offset + 2 + * - litLength (32-bit) -> (U16) litLength + * - matchLength (32-bit) -> (U16)(matchLength - 3) + * - rep is discarded. + * + * We shuffle so that only the first 8 bytes in the final 128-bit + * register are used. We still store 16 bytes (low 8 are good, high 8 are "don't care"). + */ +static void convertSequences_noRepcodes(SeqDef* dstSeqs, + const ZSTD_Sequence* inSeqs, + size_t nbSequences) +{ + /* + addition = { offset+2, litLength+0, matchLength-3, rep+0 } + setr means the first argument is placed in the lowest 32 bits, + second in next-lower 32 bits, etc. + */ + const __m128i addition = _mm_setr_epi32(2, 0, -3, 0); + + /* + Shuffle mask: we reorder bytes after the addition. + + Input layout in 128-bit register (after addition): + Bytes: [ 0..3 | 4..7 | 8..11 | 12..15 ] + Fields: offset+2 litLength matchLength rep + + We want in output: + Bytes: [ 0..3 | 4..5 | 6..7 | 8..15 ignore ] + Fields: offset+2 (U16)litLength (U16)(matchLength) + + _mm_shuffle_epi8 picks bytes from the source. A byte of 0x80 means “zero out”. + So we want: + out[0] = in[0], out[1] = in[1], out[2] = in[2], out[3] = in[3], // offset+2 (4 bytes) + out[4] = in[4], out[5] = in[5], // (U16) litLength + out[6] = in[8], out[7] = in[9], // (U16) matchLength + out[8..15] = 0x80 => won't matter if we only care about first 8 bytes + */ + const __m128i mask = _mm_setr_epi8( + 0, 1, 2, 3, /* offset (4 bytes) */ + 4, 5, /* litLength (2 bytes) */ + 8, 9, /* matchLength (2 bytes) */ + (char)0x80, (char)0x80, (char)0x80, (char)0x80, + (char)0x80, (char)0x80, (char)0x80, (char)0x80 + ); + size_t i; + + for (i = 0; i + 1 < nbSequences; i += 2) { + /*-------------------------*/ + /* Process inSeqs[i] */ + /*-------------------------*/ + __m128i vin0 = _mm_loadu_si128((const __m128i *)(const void*)&inSeqs[i]); + __m128i vadd0 = _mm_add_epi32(vin0, addition); + __m128i vshf0 = _mm_shuffle_epi8(vadd0, mask); + _mm_storel_epi64((__m128i *)(void*)&dstSeqs[i], vshf0); + + /*-------------------------*/ + /* Process inSeqs[i + 1] */ + /*-------------------------*/ + __m128i vin1 = _mm_loadu_si128((__m128i const *)(const void*)&inSeqs[i + 1]); + __m128i vadd1 = _mm_add_epi32(vin1, addition); + __m128i vshf1 = _mm_shuffle_epi8(vadd1, mask); + _mm_storel_epi64((__m128i *)(void*)&dstSeqs[i + 1], vshf1); + } + + /* Handle leftover if nbSequences is odd */ + if (i < nbSequences) { + /* Fallback: process last sequence */ + assert(i == nbSequences - 1); + dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); + /* note: doesn't work if one length is > 65535 */ + dstSeqs[i].litLength = (U16)inSeqs[i].litLength; + dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); + } + +} + +#else /* no SSE */ + +FORCE_INLINE_TEMPLATE void convertSequences_noRepcodes(SeqDef* dstSeqs, + const ZSTD_Sequence* const inSeqs, size_t nbSequences) +{ + size_t n; + for (n=0; n 65535 */ + dstSeqs[n].litLength = (U16)inSeqs[n].litLength; + dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); + } +} + +#endif + /* + * Precondition: Sequences must end on an explicit Block Delimiter * @return: 0 on success, or an error code. * Note: Sequence validation functionality has been disabled (removed). * This is helpful to generate a lean main pipeline, improving performance. * It may be re-inserted later. */ -size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, - const ZSTD_Sequence* const inSeqs, size_t nbSequences, - int repcodeResolution) +static size_t ZSTD_convertBlockSequences_internal(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences, + int repcodeResolution) { Repcodes_t updatedRepcodes; size_t seqNb = 0; @@ -7129,21 +7342,26 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, assert(inSeqs[nbSequences-1].offset == 0); /* Convert Sequences from public format to internal format */ - for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { - U32 const litLength = inSeqs[seqNb].litLength; - U32 const matchLength = inSeqs[seqNb].matchLength; - U32 offBase; + if (!repcodeResolution) { + convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences); + cctx->seqStore.sequences += nbSequences; + } else { + for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { + U32 const litLength = inSeqs[seqNb].litLength; + U32 const matchLength = inSeqs[seqNb].matchLength; + U32 offBase; - if (!repcodeResolution) { - offBase = OFFSET_TO_OFFBASE(inSeqs[seqNb].offset); - } else { - U32 const ll0 = (litLength == 0); - offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); - ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); - } + if (!repcodeResolution) { + offBase = OFFSET_TO_OFFBASE(inSeqs[seqNb].offset); + } else { + U32 const ll0 = (litLength == 0); + offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } - DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); - ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); + DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); + } } /* If we skipped repcode search while parsing, we need to update repcodes now */ @@ -7172,6 +7390,20 @@ size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, return 0; } +static size_t ZSTD_convertBlockSequences_noRepcode(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences) +{ + return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); +} + +size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, + const ZSTD_Sequence* const inSeqs, size_t nbSequences, + int repcodeResolution) +{ + (void)repcodeResolution; + return ZSTD_convertBlockSequences_internal(cctx, inSeqs, nbSequences, 0); +} + typedef struct { size_t nbSequences; size_t blockSize; diff --git a/tests/Makefile b/tests/Makefile index 982181de8..abb0b2b1d 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -148,7 +148,7 @@ fullbench32: CPPFLAGS += -m32 $(FULLBENCHS) : CPPFLAGS += $(MULTITHREAD_CPP) -Wno-deprecated-declarations $(FULLBENCHS) : LDFLAGS += $(MULTITHREAD_LD) $(FULLBENCHS) : DEBUGFLAGS = -DNDEBUG # turn off assert() for speed measurements -$(FULLBENCHS) : DEBUGLEVEL ?= 0 # turn off assert() for speed measurements +$(FULLBENCHS) : DEBUGLEVEL = 0 # turn off assert() for speed measurements $(FULLBENCHS) : $(ZSTD_FILES) $(FULLBENCHS) : $(PRGDIR)/datagen.c $(PRGDIR)/lorem.c $(PRGDIR)/util.c $(PRGDIR)/timefn.c $(PRGDIR)/benchfn.c fullbench.c $(LINK.c) $^ -o $@$(EXT)