From: Nathan Moinvaziri Date: Wed, 29 Jun 2022 15:57:11 +0000 (-0700) Subject: Don't use unaligned access for memcpy instructions due to GCC 11 assuming it is align... X-Git-Tag: 2.1.0-beta1~186 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e22195e5bcb10851f96e0b56e396696b152e81af;p=thirdparty%2Fzlib-ng.git Don't use unaligned access for memcpy instructions due to GCC 11 assuming it is aligned in certain instances. --- diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c index 29065f77c..3b8d2c001 100644 --- a/arch/arm/chunkset_neon.c +++ b/arch/arm/chunkset_neon.c @@ -38,19 +38,19 @@ static const lut_rem_pair perm_idx_lut[13] = { static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { uint16_t tmp; - zmemcpy_2(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp)); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { uint32_t tmp; - zmemcpy_4(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp)); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { uint64_t tmp; - zmemcpy_8(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp)); } @@ -76,7 +76,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t /* See note in chunkset_sse41.c for why this is ok */ __msan_unpoison(buf + dist, 16 - dist); #endif - + /* This version of table is only available on aarch64 */ #if defined(_M_ARM64) || defined(__aarch64__) uint8x16_t ret_vec = vld1q_u8(buf); diff --git a/arch/power/chunkset_power8.c b/arch/power/chunkset_power8.c index 47e548526..83928308d 100644 --- a/arch/power/chunkset_power8.c +++ b/arch/power/chunkset_power8.c @@ -16,19 +16,19 @@ typedef vector unsigned char chunk_t; static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { uint16_t tmp; - zmemcpy_2(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = (vector unsigned char)vec_splats(tmp); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { uint32_t tmp; - zmemcpy_4(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = (vector unsigned char)vec_splats(tmp); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { uint64_t tmp; - zmemcpy_8(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = (vector unsigned char)vec_splats(tmp); } diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c index 024b37c30..c4a4d9b05 100644 --- a/arch/x86/chunkset_avx.c +++ b/arch/x86/chunkset_avx.c @@ -52,19 +52,19 @@ static const lut_rem_pair perm_idx_lut[29] = { static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { int16_t tmp; - zmemcpy_2(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm256_set1_epi16(tmp); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { int32_t tmp; - zmemcpy_4(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm256_set1_epi32(tmp); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { int64_t tmp; - zmemcpy_8(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm256_set1_epi64x(tmp); } diff --git a/arch/x86/chunkset_sse2.c b/arch/x86/chunkset_sse2.c index 8e3166f70..eddf5d989 100644 --- a/arch/x86/chunkset_sse2.c +++ b/arch/x86/chunkset_sse2.c @@ -17,19 +17,19 @@ typedef __m128i chunk_t; static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { int16_t tmp; - zmemcpy_2(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm_set1_epi16(tmp); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { int32_t tmp; - zmemcpy_4(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm_set1_epi32(tmp); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { int64_t tmp; - zmemcpy_8(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm_set1_epi64x(tmp); } diff --git a/arch/x86/chunkset_sse41.c b/arch/x86/chunkset_sse41.c index 42b44d051..c148db092 100644 --- a/arch/x86/chunkset_sse41.c +++ b/arch/x86/chunkset_sse41.c @@ -41,19 +41,19 @@ static const lut_rem_pair perm_idx_lut[13] = { static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { int16_t tmp; - zmemcpy_2(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm_set1_epi16(tmp); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { int32_t tmp; - zmemcpy_4(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm_set1_epi32(tmp); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { int64_t tmp; - zmemcpy_8(&tmp, from); + memcpy(&tmp, from, sizeof(tmp)); *chunk = _mm_set1_epi64x(tmp); } @@ -69,7 +69,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; __m128i perm_vec, ret_vec; #ifdef Z_MEMORY_SANITIZER - /* Important to note: + /* Important to note: * This is _not_ to subvert the memory sanitizer but to instead unpoison some * bytes we willingly and purposefully load uninitialized that we swizzle over * in a vector register, anyway. If what we assume is wrong about what is used, diff --git a/chunkset.c b/chunkset.c index ca35929f3..169e41123 100644 --- a/chunkset.c +++ b/chunkset.c @@ -13,20 +13,20 @@ typedef uint64_t chunk_t; static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { uint8_t *dest = (uint8_t *)chunk; - zmemcpy_4(dest, from); - zmemcpy_4(dest+4, from); + memcpy(dest, from, sizeof(uint32_t)); + memcpy(dest+4, from, sizeof(uint32_t)); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - zmemcpy_8(chunk, from); + memcpy(chunk, from, sizeof(uint64_t)); } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { - zmemcpy_8(chunk, (uint8_t *)s); + memcpy(chunk, (uint8_t *)s, sizeof(uint64_t)); } static inline void storechunk(uint8_t *out, chunk_t *chunk) { - zmemcpy_8(out, chunk); + memcpy(out, chunk, sizeof(uint64_t)); } #define CHUNKSIZE chunksize_c diff --git a/compare256.c b/compare256.c index 3c05969f9..b11ac24ee 100644 --- a/compare256.c +++ b/compare256.c @@ -101,8 +101,8 @@ static inline uint32_t compare256_unaligned_32_static(const uint8_t *src0, const do { uint32_t sv, mv, diff; - zmemcpy_4(&sv, src0); - zmemcpy_4(&mv, src1); + memcpy(&sv, src0, sizeof(sv)); + memcpy(&mv, src1, sizeof(mv)); diff = sv ^ mv; if (diff) { @@ -141,8 +141,8 @@ static inline uint32_t compare256_unaligned_64_static(const uint8_t *src0, const do { uint64_t sv, mv, diff; - zmemcpy_8(&sv, src0); - zmemcpy_8(&mv, src1); + memcpy(&sv, src0, sizeof(sv)); + memcpy(&mv, src1, sizeof(mv)); diff = sv ^ mv; if (diff) { diff --git a/deflate.h b/deflate.h index f8920df59..ccb246a81 100644 --- a/deflate.h +++ b/deflate.h @@ -306,7 +306,7 @@ static inline void put_short(deflate_state *s, uint16_t w) { #if BYTE_ORDER == BIG_ENDIAN w = ZSWAP16(w); #endif - zmemcpy_2(&s->pending_buf[s->pending], &w); + memcpy(&s->pending_buf[s->pending], &w, sizeof(w)); s->pending += 2; } @@ -318,7 +318,7 @@ static inline void put_short_msb(deflate_state *s, uint16_t w) { #if BYTE_ORDER == LITTLE_ENDIAN w = ZSWAP16(w); #endif - zmemcpy_2(&s->pending_buf[s->pending], &w); + memcpy(&s->pending_buf[s->pending], &w, sizeof(w)); s->pending += 2; } @@ -330,7 +330,7 @@ static inline void put_uint32(deflate_state *s, uint32_t dw) { #if BYTE_ORDER == BIG_ENDIAN dw = ZSWAP32(dw); #endif - zmemcpy_4(&s->pending_buf[s->pending], &dw); + memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw)); s->pending += 4; } @@ -342,7 +342,7 @@ static inline void put_uint32_msb(deflate_state *s, uint32_t dw) { #if BYTE_ORDER == LITTLE_ENDIAN dw = ZSWAP32(dw); #endif - zmemcpy_4(&s->pending_buf[s->pending], &dw); + memcpy(&s->pending_buf[s->pending], &dw, sizeof(dw)); s->pending += 4; } @@ -354,7 +354,7 @@ static inline void put_uint64(deflate_state *s, uint64_t lld) { #if BYTE_ORDER == BIG_ENDIAN lld = ZSWAP64(lld); #endif - zmemcpy_8(&s->pending_buf[s->pending], &lld); + memcpy(&s->pending_buf[s->pending], &lld, sizeof(lld)); s->pending += 8; } diff --git a/inffast.c b/inffast.c index 36923317d..bfb1c8313 100644 --- a/inffast.c +++ b/inffast.c @@ -14,7 +14,7 @@ /* Load 64 bits from IN and place the bytes at offset BITS in the result. */ static inline uint64_t load_64_bits(const unsigned char *in, unsigned bits) { uint64_t chunk; - zmemcpy_8(&chunk, in); + memcpy(&chunk, in, sizeof(chunk)); #if BYTE_ORDER == LITTLE_ENDIAN return chunk << bits; diff --git a/inflate_p.h b/inflate_p.h index 20f6b1a8c..7122d7ce6 100644 --- a/inflate_p.h +++ b/inflate_p.h @@ -176,21 +176,21 @@ static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, uint64_t len, } if (tocopy >= 8) { - zmemcpy_8(out, from); + memcpy(out, from, 8); out += 8; from += 8; tocopy -= 8; } if (tocopy >= 4) { - zmemcpy_4(out, from); + memcpy(out, from, 4); out += 4; from += 4; tocopy -= 4; } if (tocopy >= 2) { - zmemcpy_2(out, from); + memcpy(out, from, 2); out += 2; from += 2; tocopy -= 2; diff --git a/insert_string_tpl.h b/insert_string_tpl.h index 643a5e0e3..4acd67fd6 100644 --- a/insert_string_tpl.h +++ b/insert_string_tpl.h @@ -31,7 +31,7 @@ #ifndef HASH_CALC_READ # if BYTE_ORDER == LITTLE_ENDIAN # define HASH_CALC_READ \ - zmemcpy_4(&val, strstart); + memcpy(&val, strstart, sizeof(val)); # else # define HASH_CALC_READ \ val = ((uint32_t)(strstart[0])); \ diff --git a/match_tpl.h b/match_tpl.h index 3fc71c15a..fbd34e58a 100644 --- a/match_tpl.h +++ b/match_tpl.h @@ -74,11 +74,11 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { #endif #ifdef UNALIGNED64_OK - zmemcpy_8(scan_start, scan); - zmemcpy_8(scan_end, scan+offset); + memcpy(scan_start, scan, sizeof(uint64_t)); + memcpy(scan_end, scan+offset, sizeof(uint64_t)); #elif defined(UNALIGNED_OK) - zmemcpy_4(scan_start, scan); - zmemcpy_4(scan_end, scan+offset); + memcpy(scan_start, scan, sizeof(uint32_t)); + memcpy(scan_end, scan+offset, sizeof(uint32_t)); #else scan_end[0] = *(scan+offset); scan_end[1] = *(scan+offset+1); @@ -201,9 +201,9 @@ Z_INTERNAL uint32_t LONGEST_MATCH(deflate_state *const s, Pos cur_match) { #endif #ifdef UNALIGNED64_OK - zmemcpy_8(scan_end, scan+offset); + memcpy(scan_end, scan+offset, sizeof(uint64_t)); #elif defined(UNALIGNED_OK) - zmemcpy_4(scan_end, scan+offset); + memcpy(scan_end, scan+offset, sizeof(uint32_t)); #else scan_end[0] = *(scan+offset); scan_end[1] = *(scan+offset+1); diff --git a/zbuild.h b/zbuild.h index d82c1c388..10a7fd6b3 100644 --- a/zbuild.h +++ b/zbuild.h @@ -218,31 +218,22 @@ # endif #endif -/* Force compiler to emit unaligned memory accesses if unaligned access is supported +/* Force compiler to emit unaligned memory comparisons if unaligned access is supported on the architecture, otherwise don't assume unaligned access is supported. Older - compilers don't optimize memcpy and memcmp calls to unaligned access instructions - when it is supported on the architecture resulting in significant performance impact. - Newer compilers might optimize memcpy but not all optimize memcmp for all integer types. */ + compilers don't optimize memcmp calls for all integer types to unaligned access instructions + when it is supported on the architecture resulting in significant performance impact. */ #ifdef UNALIGNED_OK -# define zmemcpy_2(dest, src) (*((uint16_t *)(dest)) = *((uint16_t *)(src))) # define zmemcmp_2(str1, str2) (*((uint16_t *)(str1)) != *((uint16_t *)(str2))) -# define zmemcpy_4(dest, src) (*((uint32_t *)(dest)) = *((uint32_t *)(src))) # define zmemcmp_4(str1, str2) (*((uint32_t *)(str1)) != *((uint32_t *)(str2))) # if defined(UNALIGNED64_OK) && (UINTPTR_MAX == UINT64_MAX) -# define zmemcpy_8(dest, src) (*((uint64_t *)(dest)) = *((uint64_t *)(src))) # define zmemcmp_8(str1, str2) (*((uint64_t *)(str1)) != *((uint64_t *)(str2))) # else -# define zmemcpy_8(dest, src) (((uint32_t *)(dest))[0] = ((uint32_t *)(src))[0], \ - ((uint32_t *)(dest))[1] = ((uint32_t *)(src))[1]) # define zmemcmp_8(str1, str2) (((uint32_t *)(str1))[0] != ((uint32_t *)(str2))[0] || \ ((uint32_t *)(str1))[1] != ((uint32_t *)(str2))[1]) # endif #else -# define zmemcpy_2(dest, src) memcpy(dest, src, 2) # define zmemcmp_2(str1, str2) memcmp(str1, str2, 2) -# define zmemcpy_4(dest, src) memcpy(dest, src, 4) # define zmemcmp_4(str1, str2) memcmp(str1, str2, 4) -# define zmemcpy_8(dest, src) memcpy(dest, src, 8) # define zmemcmp_8(str1, str2) memcmp(str1, str2, 8) #endif