From: Nathan Moinvaziri Date: Sun, 27 Mar 2022 20:18:03 +0000 (-0700) Subject: Allow SSE2 and AVX2 functions with -DWITH_UNALIGNED=OFF. Even though they use unalign... X-Git-Tag: 2.1.0-beta1~294 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=81633227813d4b275a2f5a6178b3af3e683db180;p=thirdparty%2Fzlib-ng.git Allow SSE2 and AVX2 functions with -DWITH_UNALIGNED=OFF. Even though they use unaligned loads, they don't result in undefined behavior. --- diff --git a/arch/x86/compare256_avx2.c b/arch/x86/compare256_avx2.c index e25fa93eb..1318a0e33 100644 --- a/arch/x86/compare256_avx2.c +++ b/arch/x86/compare256_avx2.c @@ -14,8 +14,7 @@ # include <nmmintrin.h> #endif -/* AVX2 unaligned intrinsic comparison */ -static inline uint32_t compare256_unaligned_avx2_static(const uint8_t *src0, const uint8_t *src1) { +static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) { uint32_t len = 0; do { @@ -46,18 +45,18 @@ static inline uint32_t compare256_unaligned_avx2_static(const uint8_t *src0, con return 256; } -Z_INTERNAL uint32_t compare256_unaligned_avx2(const uint8_t *src0, const uint8_t *src1) { - return compare256_unaligned_avx2_static(src0, src1); +Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { + return compare256_avx2_static(src0, src1); } -#define LONGEST_MATCH longest_match_unaligned_avx2 -#define COMPARE256 compare256_unaligned_avx2_static +#define LONGEST_MATCH longest_match_avx2 +#define COMPARE256 compare256_avx2_static #include "match_tpl.h" #define LONGEST_MATCH_SLOW -#define LONGEST_MATCH longest_match_slow_unaligned_avx2 -#define COMPARE256 compare256_unaligned_avx2_static +#define LONGEST_MATCH longest_match_slow_avx2 +#define COMPARE256 compare256_avx2_static #include "match_tpl.h" diff --git a/arch/x86/compare256_sse2.c b/arch/x86/compare256_sse2.c index bd5d62cf7..aad4bd240 100644 --- a/arch/x86/compare256_sse2.c +++ b/arch/x86/compare256_sse2.c @@ -11,7 +11,7 @@ #include <emmintrin.h> -static
inline uint32_t compare256_unaligned_sse2_static(const uint8_t *src0, const uint8_t *src1) { +static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) { uint32_t len = 0; int align_offset = ((uintptr_t)src0) & 15; const uint8_t *end0 = src0 + 256; @@ -78,18 +78,18 @@ static inline uint32_t compare256_unaligned_sse2_static(const uint8_t *src0, con return 256; } -Z_INTERNAL uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1) { - return compare256_unaligned_sse2_static(src0, src1); +Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) { + return compare256_sse2_static(src0, src1); } -#define LONGEST_MATCH longest_match_unaligned_sse2 -#define COMPARE256 compare256_unaligned_sse2_static +#define LONGEST_MATCH longest_match_sse2 +#define COMPARE256 compare256_sse2_static #include "match_tpl.h" #define LONGEST_MATCH_SLOW -#define LONGEST_MATCH longest_match_slow_unaligned_sse2 -#define COMPARE256 compare256_unaligned_sse2_static +#define LONGEST_MATCH longest_match_slow_sse2 +#define COMPARE256 compare256_sse2_static #include "match_tpl.h" diff --git a/cpu_features.h b/cpu_features.h index 7cc74a97a..560767217 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -120,12 +120,12 @@ extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1 #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1); #endif +#endif #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1); +extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1); #endif #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t compare256_unaligned_avx2(const uint8_t *src0, const uint8_t *src1); -#endif +extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1); #endif #ifdef DEFLATE_H_ @@ -147,12 +147,12 @@ 
extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match); #endif +#endif #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match); +extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match); #endif #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match); -#endif +extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match); #endif /* longest_match_slow */ @@ -163,12 +163,12 @@ extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_ #ifdef UNALIGNED64_OK extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match); #endif +#endif #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match); +extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match); #endif #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match); -#endif +extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match); #endif /* quick_insert_string */ diff --git a/functable.c b/functable.c index 3e23c54e5..ffb0d3f47 100644 --- a/functable.c +++ b/functable.c @@ -106,17 +106,17 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) { # else functable.longest_match = &longest_match_unaligned_16; # endif -# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) - if (x86_cpu_has_sse2) - functable.longest_match = &longest_match_unaligned_sse2; -# endif -# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) - if (x86_cpu_has_avx2) - functable.longest_match = &longest_match_unaligned_avx2; -# 
endif #else functable.longest_match = &longest_match_c; #endif +#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) + if (x86_cpu_has_sse2) + functable.longest_match = &longest_match_sse2; +#endif +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) + if (x86_cpu_has_avx2) + functable.longest_match = &longest_match_avx2; +#endif return functable.longest_match(s, cur_match); } @@ -131,17 +131,17 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc # else functable.longest_match_slow = &longest_match_slow_unaligned_16; # endif -# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) - if (x86_cpu_has_sse2) - functable.longest_match_slow = &longest_match_slow_unaligned_sse2; -# endif -# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) - if (x86_cpu_has_avx2) - functable.longest_match_slow = &longest_match_slow_unaligned_avx2; -# endif #else functable.longest_match_slow = &longest_match_slow_c; #endif +#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) + if (x86_cpu_has_sse2) + functable.longest_match_slow = &longest_match_slow_sse2; +#endif +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) + if (x86_cpu_has_avx2) + functable.longest_match_slow = &longest_match_slow_avx2; +#endif return functable.longest_match_slow(s, cur_match); } @@ -391,17 +391,17 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) { # else functable.compare256 = &compare256_unaligned_16; # endif -# if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) - if (x86_cpu_has_sse2) - functable.compare256 = &compare256_unaligned_sse2; -# endif -# if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) - if (x86_cpu_has_avx2) - functable.compare256 = &compare256_unaligned_avx2; -# endif #else functable.compare256 = &compare256_c; #endif +#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) + if (x86_cpu_has_sse2) + functable.compare256 = &compare256_sse2; +#endif +#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) + if (x86_cpu_has_avx2) + functable.compare256 = 
&compare256_avx2; +#endif return functable.compare256(src0, src1); } diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc index bdfdd6bf9..e690e8eff 100644 --- a/test/benchmarks/benchmark_compare256.cc +++ b/test/benchmarks/benchmark_compare256.cc @@ -69,10 +69,10 @@ BENCHMARK_COMPARE256(unaligned_32, compare256_unaligned_32, 1); #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1); #endif +#endif #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2); +BENCHMARK_COMPARE256(sse2, compare256_sse2, x86_cpu_has_sse2); #endif #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2); -#endif +BENCHMARK_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2); #endif diff --git a/test/test_compare256.cc b/test/test_compare256.cc index 2c2f3aaa8..c252cfada 100644 --- a/test/test_compare256.cc +++ b/test/test_compare256.cc @@ -65,10 +65,10 @@ TEST_COMPARE256(unaligned_32, compare256_unaligned_32, 1) #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL) TEST_COMPARE256(unaligned_64, compare256_unaligned_64, 1) #endif +#endif #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) -TEST_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2) +TEST_COMPARE256(sse2, compare256_sse2, x86_cpu_has_sse2) #endif #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) -TEST_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2) -#endif +TEST_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2) #endif