git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Allow SSE2 and AVX2 functions with -DWITH_UNALIGNED=OFF. Even though they use unalign...
author Nathan Moinvaziri <nathan@nathanm.com>
Sun, 27 Mar 2022 20:18:03 +0000 (13:18 -0700)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Thu, 31 Mar 2022 14:11:25 +0000 (16:11 +0200)
arch/x86/compare256_avx2.c
arch/x86/compare256_sse2.c
cpu_features.h
functable.c
test/benchmarks/benchmark_compare256.cc
test/test_compare256.cc

index e25fa93eb9b49d1058c21ea457a0741814b80a50..1318a0e333a491e611693ea4dfa9b0819956c927 100644 (file)
@@ -14,8 +14,7 @@
 #  include <nmmintrin.h>
 #endif
 
-/* AVX2 unaligned intrinsic comparison */
-static inline uint32_t compare256_unaligned_avx2_static(const uint8_t *src0, const uint8_t *src1) {
+static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
     uint32_t len = 0;
 
     do {
@@ -46,18 +45,18 @@ static inline uint32_t compare256_unaligned_avx2_static(const uint8_t *src0, con
     return 256;
 }
 
-Z_INTERNAL uint32_t compare256_unaligned_avx2(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_unaligned_avx2_static(src0, src1);
+Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_avx2_static(src0, src1);
 }
 
-#define LONGEST_MATCH       longest_match_unaligned_avx2
-#define COMPARE256          compare256_unaligned_avx2_static
+#define LONGEST_MATCH       longest_match_avx2
+#define COMPARE256          compare256_avx2_static
 
 #include "match_tpl.h"
 
 #define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH       longest_match_slow_unaligned_avx2
-#define COMPARE256          compare256_unaligned_avx2_static
+#define LONGEST_MATCH       longest_match_slow_avx2
+#define COMPARE256          compare256_avx2_static
 
 #include "match_tpl.h"
 
index bd5d62cf7b391e1452cf30aaa3148e227368e2dd..aad4bd240d2021e5c6d7d375ec19bc2ad6177ddb 100644 (file)
@@ -11,7 +11,7 @@
 
 #include <emmintrin.h>
 
-static inline uint32_t compare256_unaligned_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
     uint32_t len = 0;
     int align_offset = ((uintptr_t)src0) & 15;
     const uint8_t *end0 = src0 + 256;
@@ -78,18 +78,18 @@ static inline uint32_t compare256_unaligned_sse2_static(const uint8_t *src0, con
     return 256;
 }
 
-Z_INTERNAL uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1) {
-    return compare256_unaligned_sse2_static(src0, src1);
+Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_sse2_static(src0, src1);
 }
 
-#define LONGEST_MATCH       longest_match_unaligned_sse2
-#define COMPARE256          compare256_unaligned_sse2_static
+#define LONGEST_MATCH       longest_match_sse2
+#define COMPARE256          compare256_sse2_static
 
 #include "match_tpl.h"
 
 #define LONGEST_MATCH_SLOW
-#define LONGEST_MATCH       longest_match_slow_unaligned_sse2
-#define COMPARE256          compare256_unaligned_sse2_static
+#define LONGEST_MATCH       longest_match_slow_sse2
+#define COMPARE256          compare256_sse2_static
 
 #include "match_tpl.h"
 
index 7cc74a97adb631119ba630dc96851cae2d409272..5607672178c9ff3fa50d3a484bd932bfc259a3e5 100644 (file)
@@ -120,12 +120,12 @@ extern uint32_t compare256_unaligned_32(const uint8_t *src0, const uint8_t *src1
 #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
 extern uint32_t compare256_unaligned_64(const uint8_t *src0, const uint8_t *src1);
 #endif
+#endif
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_unaligned_sse2(const uint8_t *src0, const uint8_t *src1);
+extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t compare256_unaligned_avx2(const uint8_t *src0, const uint8_t *src1);
-#endif
+extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
 #endif
 
 #ifdef DEFLATE_H_
@@ -147,12 +147,12 @@ extern uint32_t longest_match_unaligned_32(deflate_state *const s, Pos cur_match
 #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
 extern uint32_t longest_match_unaligned_64(deflate_state *const s, Pos cur_match);
 #endif
+#endif
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_unaligned_sse2(deflate_state *const s, Pos cur_match);
+extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_unaligned_avx2(deflate_state *const s, Pos cur_match);
-#endif
+extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
 #endif
 
 /* longest_match_slow */
@@ -163,12 +163,12 @@ extern uint32_t longest_match_slow_unaligned_32(deflate_state *const s, Pos cur_
 #ifdef UNALIGNED64_OK
 extern uint32_t longest_match_slow_unaligned_64(deflate_state *const s, Pos cur_match);
 #endif
+#endif
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_slow_unaligned_sse2(deflate_state *const s, Pos cur_match);
+extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-extern uint32_t longest_match_slow_unaligned_avx2(deflate_state *const s, Pos cur_match);
-#endif
+extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
 #endif
 
 /* quick_insert_string */
index 3e23c54e59d6cc89a09d81e1ad28a816fef71b4a..ffb0d3f4795a57cb1ce9ecfaebc0b695d0686869 100644 (file)
@@ -106,17 +106,17 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
 #  else
     functable.longest_match = &longest_match_unaligned_16;
 #  endif
-#  if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-    if (x86_cpu_has_sse2)
-        functable.longest_match = &longest_match_unaligned_sse2;
-#  endif
-#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-    if (x86_cpu_has_avx2)
-        functable.longest_match = &longest_match_unaligned_avx2;
-#  endif
 #else
     functable.longest_match = &longest_match_c;
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.longest_match = &longest_match_sse2;
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_avx2)
+        functable.longest_match = &longest_match_avx2;
+#endif
 
     return functable.longest_match(s, cur_match);
 }
@@ -131,17 +131,17 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
 #  else
     functable.longest_match_slow = &longest_match_slow_unaligned_16;
 #  endif
-#  if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-    if (x86_cpu_has_sse2)
-        functable.longest_match_slow = &longest_match_slow_unaligned_sse2;
-#  endif
-#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-    if (x86_cpu_has_avx2)
-        functable.longest_match_slow = &longest_match_slow_unaligned_avx2;
-#  endif
 #else
     functable.longest_match_slow = &longest_match_slow_c;
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.longest_match_slow = &longest_match_slow_sse2;
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_avx2)
+        functable.longest_match_slow = &longest_match_slow_avx2;
+#endif
 
     return functable.longest_match_slow(s, cur_match);
 }
@@ -391,17 +391,17 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) {
 #  else
     functable.compare256 = &compare256_unaligned_16;
 #  endif
-#  if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-    if (x86_cpu_has_sse2)
-        functable.compare256 = &compare256_unaligned_sse2;
-#  endif
-#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-    if (x86_cpu_has_avx2)
-        functable.compare256 = &compare256_unaligned_avx2;
-#  endif
 #else
     functable.compare256 = &compare256_c;
 #endif
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_sse2)
+        functable.compare256 = &compare256_sse2;
+#endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_avx2)
+        functable.compare256 = &compare256_avx2;
+#endif
 
     return functable.compare256(src0, src1);
 }
index bdfdd6bf975fc1a806d78a4ca324113c98fbb858..e690e8effe573f93457be5827cd99f2e5eba186e 100644 (file)
@@ -69,10 +69,10 @@ BENCHMARK_COMPARE256(unaligned_32, compare256_unaligned_32, 1);
 #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
 BENCHMARK_COMPARE256(unaligned_64, compare256_unaligned_64, 1);
 #endif
+#endif
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-BENCHMARK_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2);
+BENCHMARK_COMPARE256(sse2, compare256_sse2, x86_cpu_has_sse2);
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-BENCHMARK_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2);
-#endif
+BENCHMARK_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2);
 #endif
index 2c2f3aaa87aff55f2fe8672830b066425deebbd1..c252cfada77de11e066588edce3f836d9947038f 100644 (file)
@@ -65,10 +65,10 @@ TEST_COMPARE256(unaligned_32, compare256_unaligned_32, 1)
 #if defined(UNALIGNED64_OK) && defined(HAVE_BUILTIN_CTZLL)
 TEST_COMPARE256(unaligned_64, compare256_unaligned_64, 1)
 #endif
+#endif
 #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
-TEST_COMPARE256(unaligned_sse2, compare256_unaligned_sse2, x86_cpu_has_sse2)
+TEST_COMPARE256(sse2, compare256_sse2, x86_cpu_has_sse2)
 #endif
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
-TEST_COMPARE256(unaligned_avx2, compare256_unaligned_avx2, x86_cpu_has_avx2)
-#endif
+TEST_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2)
 #endif