]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Implement neon version of compare256.
authorNathan Moinvaziri <nathan@nathanm.com>
Mon, 18 Apr 2022 01:47:07 +0000 (18:47 -0700)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Fri, 6 May 2022 10:19:35 +0000 (12:19 +0200)
Co-authored-by: Adam Stylinski <kungfujesus06@gmail.com>
CMakeLists.txt
README.md
arch/arm/Makefile.in
arch/arm/compare256_neon.c [new file with mode: 0644]
configure
cpu_features.h
functable.c
test/benchmarks/benchmark_compare256.cc
test/test_compare256.cc
win32/Makefile.a64
win32/Makefile.arm

index f0b08808898ca8bf7a5b7e6095eb1fd0dcffb180..ed438f79a2d7b1f0e65b92fef46081a3fe6c0d16 100644 (file)
@@ -601,8 +601,9 @@ if(WITH_OPTIM)
         if(WITH_NEON)
             check_neon_compiler_flag()
             if(MFPU_NEON_AVAILABLE)
-                add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH)
-                set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c ${ARCHDIR}/slide_hash_neon.c)
+                add_definitions(-DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH)
+                set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c
+                    ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c)
                 list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS})
                 set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}")
                 if(MSVC)
index e02e04b3161fbb58b92cae8116a5b5f70f94e865..40ad1f858df45f26812fae5982cae64934d1379a 100644 (file)
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Features
   * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
   * Hash table implementation using CRC32-C intrinsics on x86 and ARM
   * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
-  * Compare256 implementations using SSE2 & AVX2
+  * Compare256 implementations using SSE2, AVX2, & Neon
   * Inflate chunk copying using SSE2, AVX, Neon & VSX
   * Support for hardware-accelerated deflate using IBM Z DFLTCC
 * Unaligned memory read/writes and large bit buffer improvements
index f47325c2319718ab075c029b13633524f23eedf2..abf6193fc0d01f58625f801435e7f4e4f37650e3 100644 (file)
@@ -20,6 +20,7 @@ all: \
        adler32_neon.o adler32_neon.lo \
        arm_features.o arm_features.lo \
        chunkset_neon.o chunkset_neon.lo \
+       compare256_neon.o compare256_neon.lo \
        crc32_acle.o crc32_acle.lo \
        slide_hash_neon.o slide_hash_neon.lo \
        insert_string_acle.o insert_string_acle.lo
@@ -42,6 +43,12 @@ chunkset_neon.o:
 chunkset_neon.lo:
        $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
 
+compare256_neon.o:
+       $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+compare256_neon.lo:
+       $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
 crc32_acle.o:
        $(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
 
diff --git a/arch/arm/compare256_neon.c b/arch/arm/compare256_neon.c
new file mode 100644 (file)
index 0000000..53a088c
--- /dev/null
@@ -0,0 +1,60 @@
+/* compare256_neon.c - NEON version of compare256
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef _M_ARM64
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+#include "../../zbuild.h"
+
+static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+
+    do {
+        uint8x16_t a, b, cmp;
+        uint64_t lane;
+
+        a = vld1q_u8(src0);
+        b = vld1q_u8(src1);
+
+        cmp = veorq_u8(a, b);
+
+        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
+        if (lane) {
+            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
+            return len + match_byte;
+        }
+        len += 8;
+        lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
+        if (lane) {
+            uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
+            return len + match_byte;
+        }
+        len += 8;
+
+        src0 += 16, src1 += 16;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_neon_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_neon
+#define COMPARE256          compare256_neon_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_neon
+#define COMPARE256          compare256_neon_static
+
+#include "match_tpl.h"
+
+#endif
index a357e1c8aa64aa7c2fc52113dcdeef9953da8162..836e03072b378c8b6b4c07569b4a5f0985a788f9 100755 (executable)
--- a/configure
+++ b/configure
@@ -1659,7 +1659,10 @@ EOF
                     fi
 
                     if test $buildneon -eq 1; then
-                        if test $MFPU_NEON_AVAILABLE -eq 1;then
+                        CFLAGS="${CFLAGS} -DARM_NEON"
+                        SFLAGS="${SFLAGS} -DARM_NEON"
+
+                        if test $MFPU_NEON_AVAILABLE -eq 1; then
                             neonflag="-mfpu=neon"
                         fi
 
@@ -1671,8 +1674,8 @@ EOF
                         CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
                         SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
                     fi
                 fi
             ;;
@@ -1683,6 +1686,9 @@ EOF
                     fi
 
                     if test $buildneon -eq 1; then
+                        CFLAGS="${CFLAGS} -DARM_NEON"
+                        SFLAGS="${SFLAGS} -DARM_NEON"
+
                         if test $MFPU_NEON_AVAILABLE -eq 1;then
                             neonflag="-mfpu=neon"
                         fi
@@ -1695,8 +1701,8 @@ EOF
                         CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
                         SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
                     fi
                 fi
             ;;
@@ -1713,6 +1719,9 @@ EOF
                     fi
 
                     if test $buildneon -eq 1; then
+                        CFLAGS="${CFLAGS} -DARM_NEON"
+                        SFLAGS="${SFLAGS} -DARM_NEON"
+
                         if test $MFPU_NEON_AVAILABLE -eq 1;then
                             neonflag="-mfpu=neon"
                         fi
@@ -1725,8 +1734,8 @@ EOF
                         CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
                         SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
 
-                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o"
-                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo"
+                        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o"
+                        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
                     fi
                 fi
             ;;
@@ -1786,10 +1795,10 @@ EOF
                 if test $native -eq 0; then
                     ARCH="${ARCH}+simd"
                 fi
-                CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
-                SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo"
+                CFLAGS="${CFLAGS} -DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
+                SFLAGS="${SFLAGS} -DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
             fi
         fi
 
index 4dcf8e59088c5ed29f5450fbd7db68018643d04d..504c6a93e7ede0c9b7ae7472566f3a3cfb61bbe1 100644 (file)
@@ -127,6 +127,9 @@ extern uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1);
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
 #endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+#endif
 
 #ifdef DEFLATE_H_
 /* insert_string */
@@ -154,6 +157,9 @@ extern uint32_t longest_match_sse2(deflate_state *const s, Pos cur_match);
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
 #endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
+#endif
 
 /* longest_match_slow */
 extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
@@ -170,6 +176,9 @@ extern uint32_t longest_match_slow_sse2(deflate_state *const s, Pos cur_match);
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
 #endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
+#endif
 
 /* quick_insert_string */
 extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
index 68aef1d338e2059fa33fff8db6cbadd04c449f14..74381e1589015eda11bf44424d6f3cd39d57e42f 100644 (file)
@@ -117,6 +117,10 @@ Z_INTERNAL uint32_t longest_match_stub(deflate_state *const s, Pos cur_match) {
     if (x86_cpu_has_avx2)
         functable.longest_match = &longest_match_avx2;
 #endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+    if (arm_cpu_has_neon)
+        functable.longest_match = &longest_match_neon;
+#endif
 
     return functable.longest_match(s, cur_match);
 }
@@ -142,6 +146,10 @@ Z_INTERNAL uint32_t longest_match_slow_stub(deflate_state *const s, Pos cur_matc
     if (x86_cpu_has_avx2)
         functable.longest_match_slow = &longest_match_slow_avx2;
 #endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+    if (arm_cpu_has_neon)
+        functable.longest_match_slow = &longest_match_slow_neon;
+#endif
 
     return functable.longest_match_slow(s, cur_match);
 }
index cc1ee5c19aecf4f77a32d275e96c4e3f0e792c25..c579d9ac81a3814f4c3ebb5b6b151381b0f35cd8 100644 (file)
@@ -76,3 +76,6 @@ BENCHMARK_COMPARE256(sse2, compare256_sse2, x86_cpu_has_sse2);
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 BENCHMARK_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2);
 #endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+BENCHMARK_COMPARE256(neon, compare256_neon, arm_cpu_has_neon);
+#endif
index c252cfada77de11e066588edce3f836d9947038f..61c6e19bcc41ed6601d4fc4b96103e490e2d2fd8 100644 (file)
@@ -72,3 +72,6 @@ TEST_COMPARE256(sse2, compare256_sse2, x86_cpu_has_sse2)
 #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
 TEST_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2)
 #endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+TEST_COMPARE256(neon, compare256_neon, arm_cpu_has_neon)
+#endif
index 29e66ceed1583066a1f12da9830295d06d40b0cf..b0d7993d54fbb0566a88e740a500897d22e0374d 100644 (file)
@@ -93,12 +93,13 @@ OBJS = $(OBJS) gzlib.obj gzread.obj gzwrite.obj
 WFLAGS = $(WFLAGS) \
        -DARM_ACLE_CRC_HASH \
        -D__ARM_NEON__=1 \
+       -DARM_NEON \
        -DARM_NEON_ADLER32 \
        -DARM_NEON_CHUNKSET \
        -DARM_NEON_SLIDEHASH \
        -DARM_NOCHECK_NEON \
        #
-OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj chunkset_neon.obj slide_hash_neon.obj
+OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj
 
 # targets
 all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \
index a43dc59b1ca8b49ea48ced3942290ffd7253a8c3..14df718d844973da4639576859ac4e1bdd3117c3 100644 (file)
@@ -105,12 +105,13 @@ NEON_ARCH = /arch:VFPv3
 CFLAGS = $(CFLAGS) $(NEON_ARCH)
 WFLAGS = $(WFLAGS) \
        -D__ARM_NEON__=1 \
+       -DARM_NEON \
        -DARM_NEON_ADLER32 \
        -DARM_NEON_CHUNKSET \
        -DARM_NEON_SLIDEHASH \
        -DARM_NOCHECK_NEON \
        #
-OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj slide_hash_neon.obj
+OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj
 !endif
 
 # targets