* Using vdupq_n_u64 duplicates the unsigned 64-bit integer into two consecutive, aligned memory locations on the stack, so the compiler can use wider load instructions.
All the different-sized general-purpose registers overlay one another on ARM/AArch64, so any vector cast is a no-op in assembly.
}
/* Fill *chunk with the 8 bytes found at `from`, repeated twice.
 *
 * This span is a patch hunk: the line prefixed with '-' is the removed
 * implementation and the lines prefixed with '+' are its replacement.
 *
 *   removed: loads the same 8 bytes twice with vld1_u8 and joins the two
 *            halves with vcombine_u8.
 *   added:   copies the 8 bytes into a local uint64_t with memcpy,
 *            duplicates that value with vdupq_n_u64, and reinterprets the
 *            result as a byte vector with vreinterpretq_u8_u64.
 *
 * from  - source of the 8-byte pattern; exactly 8 bytes are read.
 * chunk - destination that receives the duplicated pattern.
 *
 * NOTE(review): chunk_t is not defined in this view; it is presumably
 * uint8x16_t (the result type of both expressions) - confirm.
 */
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
- *chunk = vcombine_u8(vld1_u8(from), vld1_u8(from));
+ uint64_t tmp;
+ memcpy(&tmp, from, 8);
+ *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp));
}
/* Map the generic name CHUNKSIZE onto chunksize_neon.
 * NOTE(review): chunksize_neon is defined outside this view; presumably a
 * shared template included later expands CHUNKSIZE - confirm. */
#define CHUNKSIZE chunksize_neon