From: Yann Collet Date: Thu, 8 Nov 2018 20:47:46 +0000 (-0800) Subject: improve long-range decoder speed X-Git-Tag: v1.3.8~52^2~2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9126da5b5c74dac824b9cd9467923c36d8792fb6;p=thirdparty%2Fzstd.git improve long-range decoder speed on enwik9 at level 22 (which is almost a worst case scenario), speed improves by +7% on my laptop (415 -> 445 MB/s) --- diff --git a/lib/common/compiler.h b/lib/common/compiler.h index 07f875e4d..cc830b2b4 100644 --- a/lib/common/compiler.h +++ b/lib/common/compiler.h @@ -89,23 +89,21 @@ #endif /* prefetch - * can be disabled, by declaring NO_PREFETCH macro - * All prefetch invocations use a single default locality 2, - * generating instruction prefetcht1, - * which, according to Intel, means "load data into L2 cache". - * This is a good enough "middle ground" for the time being, - * though in theory, it would be better to specialize locality depending on data being prefetched. - * Tests could not determine any sensible difference based on locality value. 
*/ + * can be disabled, by declaring NO_PREFETCH build macro */ #if defined(NO_PREFETCH) +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ # define PREFETCH(ptr) (void)(ptr) /* disabled */ #else # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */ # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ -# define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) +# define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1) # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) -# define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) +# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) # else -# define PREFETCH(ptr) (void)(ptr) /* disabled */ +# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +# define PREFETCH(ptr) (void)(ptr) /* disabled */ # endif #endif /* NO_PREFETCH */ diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 68298772e..ef424877b 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -1054,7 +1054,7 @@ ZSTD_decompressSequencesLong_body( seq_t const sequence = ZSTD_decodeSequenceLong(&seqState, isLongOffset); size_t const oneSeqSize = ZSTD_execSequenceLong(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - PREFETCH(sequence.match); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including 
invalid ones */ sequences[seqNb & STORED_SEQS_MASK] = sequence; op += oneSeqSize; } diff --git a/programs/bench.c b/programs/bench.c index caa803572..2bbaa9d0a 100644 --- a/programs/bench.c +++ b/programs/bench.c @@ -601,7 +601,11 @@ BMK_benchMemAdvancedNoAlloc( cPtr += cCapacities[nbBlocks]; resPtr += thisBlockSize; remaining -= thisBlockSize; - if (BMK_decodeOnly) { assert(nbBlocks==0); cSizes[nbBlocks] = thisBlockSize; } + if (BMK_decodeOnly) { + assert(nbBlocks==0); + cSizes[nbBlocks] = thisBlockSize; + benchResult.cSize = thisBlockSize; + } } } }