Technically, we weren't doing this the way the C standard allows: the old
casts dereference pointers with potentially unsuitable alignment, which is
undefined behavior. The zmemcpy calls compile down to NOPs at pretty much
every optimization level above 0, and they give us defined behavior under
the sanitizer, leaving the optimized load from an arbitrarily aligned
address in the compiler's hands instead of ours.
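For context, a minimal sketch of what the zmemcpy_* helpers are assumed to
look like: fixed-size memcpy wrappers (the exact names and signatures in the
tree may differ). The fixed-size memcpy is what an optimizing compiler folds
into a single unaligned load, which is why the new code costs nothing at -O1
and above:

#include <string.h>

/* Assumed helpers: each one copies a known, constant number of bytes,
 * so the compiler replaces the memcpy with one plain load/store pair
 * regardless of the source pointer's alignment. */
static inline void zmemcpy_2(void *dest, const void *src) {
    memcpy(dest, src, 2);
}

static inline void zmemcpy_4(void *dest, const void *src) {
    memcpy(dest, src, 4);
}

static inline void zmemcpy_8(void *dest, const void *src) {
    memcpy(dest, src, 8);
}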
}
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm256_set1_epi16(*(int16_t *)from);
+ int16_t tmp;
+ zmemcpy_2(&tmp, from);
+ *chunk = _mm256_set1_epi16(tmp);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm256_set1_epi32(*(int32_t *)from);
+ int32_t tmp;
+ zmemcpy_4(&tmp, from);
+ *chunk = _mm256_set1_epi32(tmp);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm256_set1_epi64x(*(int64_t *)from);
+ int64_t tmp;
+ zmemcpy_8(&tmp, from);
+ *chunk = _mm256_set1_epi64x(tmp);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
}
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm_set1_epi16(*(int16_t *)from);
+ int16_t tmp;
+ zmemcpy_2(&tmp, from);
+ *chunk = _mm_set1_epi16(tmp);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm_set1_epi32(*(int32_t *)from);
+ int32_t tmp;
+ zmemcpy_4(&tmp, from);
+ *chunk = _mm_set1_epi32(tmp);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm_set1_epi64x(*(int64_t *)from);
+ int64_t tmp;
+ zmemcpy_8(&tmp, from);
+ *chunk = _mm_set1_epi64x(tmp);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {