From 1e9d2006ae2601df6c3ec36464e1622921424a91 Mon Sep 17 00:00:00 2001
From: Arpad Panyik
Date: Fri, 20 Jun 2025 14:48:33 +0000
Subject: [PATCH] AArch64: Use better block copy8

The vector copy is only necessary for 16-byte blocks on AArch64.

Decompression uplifts on a Neoverse V2 system, using Zstd-1.5.8
compiled with "-O3 -march=armv8.2-a+sve2":

                 Clang-19  Clang-20  GCC-14   GCC-15
1#silesia.tar:   +0.316%   +0.865%   +0.025%  +0.096%
2#silesia.tar:   +0.689%   +1.374%   +0.027%  +0.065%
3#silesia.tar:   +0.811%   +1.654%   +0.034%  +0.033%
4#silesia.tar:   +0.912%   +1.755%   +0.027%  +0.042%
5#silesia.tar:   +0.995%   +1.826%   +0.062%  +0.094%
6#silesia.tar:   +0.976%   +1.777%   +0.065%  +0.104%
7#silesia.tar:   +0.910%   +1.738%   +0.077%  +0.110%
---
 lib/common/zstd_internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/common/zstd_internal.h b/lib/common/zstd_internal.h
index c1647689b..791b6485d 100644
--- a/lib/common/zstd_internal.h
+++ b/lib/common/zstd_internal.h
@@ -168,7 +168,7 @@ static UNUSED_ATTR const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
 *  Shared functions to include for inlining
 *********************************************/
 static void ZSTD_copy8(void* dst, const void* src) {
-#if defined(ZSTD_ARCH_ARM_NEON)
+#if defined(ZSTD_ARCH_ARM_NEON) && !defined(__aarch64__)
     vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
 #else
     ZSTD_memcpy(dst, src, 8);
-- 
2.47.2