From: Nick Terrell
Date: Fri, 20 Sep 2019 07:52:15 +0000 (-0700)
Subject: Widen ZSTD_wildcopy to 32 bytes
X-Git-Tag: v1.4.4~1^2~42^2~7
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cdad7fa512e03557669533ca5143ec138885f1bf;p=thirdparty%2Fzstd.git

Widen ZSTD_wildcopy to 32 bytes
---

diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index 007b03df7..522c1fda2 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -197,7 +197,7 @@ static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); }
 static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); }
 #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
 
-#define WILDCOPY_OVERLENGTH 16
+#define WILDCOPY_OVERLENGTH 32
 #define WILDCOPY_VECLEN 16
 
 typedef enum {
@@ -237,11 +237,11 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
          * On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%.
          */
         COPY16(op, ip);
-        if (op >= oend) return;
         COPY16(op, ip);
         if (op >= oend) return;
         do {
             COPY16(op, ip);
+            COPY16(op, ip);
         }
         while (op < oend);
     }
@@ -257,7 +257,7 @@ MEM_STATIC void ZSTD_wildcopy8(void* dst, const void* src, ptrdiff_t length)
     BYTE* op = (BYTE*)dst;
     BYTE* const oend = (BYTE*)op + length;
     do {
-        COPY8(op, ip)
+        COPY8(op, ip);
     } while (op < oend);
 }
 
diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c
index e799a5c74..27ce137f3 100644
--- a/lib/decompress/zstd_decompress_block.c
+++ b/lib/decompress/zstd_decompress_block.c
@@ -724,12 +724,14 @@ size_t ZSTD_execSequence(BYTE* op,
     assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
 
     /* Copy Literals:
-     * Split out litLength <= 16 since it is nearly always true. +1% on gcc-9.
+     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+     * We likely don't need the full 32-byte wildcopy.
      */
-    if (sequence.litLength <= 16)
-        ZSTD_copy16(op, *litPtr);
-    else
-        ZSTD_wildcopy(op, (*litPtr), sequence.litLength, ZSTD_no_overlap);
+    assert(WILDCOPY_OVERLENGTH >= 16);
+    ZSTD_copy16(op, (*litPtr));
+    if (sequence.litLength > 16) {
+        ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
+    }
     op = oLitEnd;
     *litPtr = iLitEnd;   /* update for next sequence */
 
@@ -755,18 +757,18 @@ size_t ZSTD_execSequence(BYTE* op,
     assert(match >= prefixStart);
     assert(sequence.matchLength >= 1);
 
-    /* Nearly all offsets are >= 16 bytes, which means we can use wildcopy
+    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
      * without overlap checking.
      */
-    if (sequence.offset >= 16) {
-        /* Split out matchLength <= 16 since it is nearly always true. +1% on gcc-9. */
-        if (sequence.matchLength <= 16)
-            ZSTD_copy16(op, match);
-        else
-            ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+    if (sequence.offset >= WILDCOPY_VECLEN) {
+        /* Split out matchLength <= 32 since it is nearly always true. +1% on gcc-9.
+         * We copy 32 bytes here since matches are generally longer than literals.
+         * In silesia, for example ~10% of matches are longer than 16 bytes.
+         */
+        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
         return sequenceLength;
     }
-    assert(sequence.offset < 16);
+    assert(sequence.offset < WILDCOPY_VECLEN);
 
     /* Copy 8 bytes and spread the offset to be >= 8. */
     ZSTD_overlapCopy8(&op, &match, sequence.offset);
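
For readers who want the new copy pattern at a glance, below is a minimal standalone sketch of the no-overlap fast path that ZSTD_wildcopy follows after this patch. It is illustrative only, not zstd source: the names copy16, wildcopy32_sketch and SKETCH_OVERLENGTH are invented for the example, and it assumes the caller provides at least 32 bytes (the new WILDCOPY_OVERLENGTH) of readable and writable slack past src+length and dst+length, and that the buffers do not overlap.

#include <stddef.h>
#include <string.h>

#define SKETCH_OVERLENGTH 32   /* mirrors the new WILDCOPY_OVERLENGTH */

static void copy16(unsigned char* d, const unsigned char* s) { memcpy(d, s, 16); }

/* Copy `length` bytes from src to dst in 32-byte strides. May read and write
 * up to SKETCH_OVERLENGTH bytes past the nominal buffer ends, so both buffers
 * need that much slack, and they must not overlap. */
static void wildcopy32_sketch(void* dst, const void* src, ptrdiff_t length)
{
    const unsigned char* ip = (const unsigned char*)src;
    unsigned char* op = (unsigned char*)dst;
    unsigned char* const oend = op + length;

    /* Unconditionally copy the first 32 bytes: short copies (the common case)
     * finish here without entering the loop. */
    copy16(op, ip); op += 16; ip += 16;
    copy16(op, ip); op += 16; ip += 16;
    if (op >= oend) return;

    /* Longer copies proceed 32 bytes per iteration. */
    do {
        copy16(op, ip); op += 16; ip += 16;
        copy16(op, ip); op += 16; ip += 16;
    } while (op < oend);
}

A call such as wildcopy32_sketch(out, in, n), with out and in each sized n + 32, shows the effect of the patch: the initial 32-byte copy needs no length check, and each loop iteration moves 32 bytes, which is exactly what removing the first `if (op >= oend) return;` and adding a second COPY16 inside the loop accomplish in ZSTD_wildcopy.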