From: Mika Lindqvist Date: Fri, 18 Jun 2021 21:10:44 +0000 (+0300) Subject: [chunkset_neon] Use vdupq_n_u64. X-Git-Tag: 2.1.0-beta1~559 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=31aed006c39118721929112f62805712183fc730;p=thirdparty%2Fzlib-ng.git [chunkset_neon] Use vdupq_n_u64. * Using vdupq_n_u64 duplicates the unsigned 64-bit integer to two consecutive aligned memory locations on the stack so the compiler can use wider load instructions. All different-sized general-purpose registers overlay on ARM/AArch64, so any vector cast is a no-op in assembly. --- diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index b1fcb241d..e0ad3e04e 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -37,7 +37,9 @@ static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - *chunk = vcombine_u8(vld1_u8(from), vld1_u8(from)); + uint64_t tmp; + memcpy(&tmp, from, 8); + *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp)); } #define CHUNKSIZE chunksize_neon