From: Ma Lin Date: Thu, 28 Oct 2021 10:53:12 +0000 (+0800) Subject: ZSTD_copy16() uses SSE2 instructions X-Git-Tag: v1.5.1~1^2~61^2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b10357ce65fa7fc908a06713c4f23f69dfc7ba8a;p=thirdparty%2Fzstd.git ZSTD_copy16() uses SSE2 instructions This speeds up decompression in MSVC builds. --- diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h index aeafac2fa..1dfa120e5 100644 --- a/lib/common/zstd_internal.h +++ b/lib/common/zstd_internal.h @@ -176,11 +176,17 @@ static void ZSTD_copy8(void* dst, const void* src) { ZSTD_memcpy(dst, src, 8); #endif } - #define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; } + +/* Need to use memmove here since the literal buffer can now be located within + the dst buffer. In circumstances where the op "catches up" to where the + literal buffer is, there can be partial overlaps in this call on the final + copy if the literal is being shifted by less than 16 bytes. */ static void ZSTD_copy16(void* dst, const void* src) { #if defined(ZSTD_ARCH_ARM_NEON) vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src)); +#elif defined(ZSTD_ARCH_X86_SSE2) + _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src)); #else ZSTD_memmove(dst, src, 16); #endif