]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Marginal improvement by pipelining loads on NEON
authorAdam Stylinski <kungfujesus06@gmail.com>
Sun, 23 Jan 2022 16:59:57 +0000 (11:59 -0500)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Tue, 1 Feb 2022 12:31:00 +0000 (13:31 +0100)
The ld1{4 reg} variant saves us instructions
and only adds 3 cycles of latency to load 3
more neon/asimd registers worth of data.

CMakeLists.txt
arch/arm/slide_hash_neon.c
cmake/detect-intrinsics.cmake
configure
fallback_builtins.h
win32/Makefile.a64
win32/Makefile.arm

index f15d1f9cb5db1a0b5de756a44272f83e9fdcb864..ccac51cbf107160b29956d77b057e2093299bb8c 100644 (file)
@@ -628,6 +628,10 @@ if(WITH_OPTIM)
                 endif()
                 add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"")
                 add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"")
+                check_neon_ld4_intrinsics()
+                if(NEON_HAS_LD4)
+                    add_definitions(-DARM_NEON_HASLD4)
+                endif()
             else()
                 set(WITH_NEON OFF)
             endif()
index b90e47021980981efab0fa6c8013cd59cf2cbade..6ff7a0bb60789beff82177d88f3c78802cc87f40 100644 (file)
 #endif
 #include "../../zbuild.h"
 #include "../../deflate.h"
+#include "../../fallback_builtins.h"
 
 /* SIMD version of hash_chain rebase */
 static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
-    Z_REGISTER uint16x8_t v, *p;
+    Z_REGISTER uint16x8_t v;
+    uint16x8x4_t p0, p1;
     Z_REGISTER size_t n;
 
     size_t size = entries*sizeof(table[0]);
@@ -28,18 +30,15 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize
     Assert(sizeof(Pos) == 2, "Wrong Pos size");
     v = vdupq_n_u16(wsize);
 
-    p = (uint16x8_t *)table;
     n = size / (sizeof(uint16x8_t) * 8);
     do {
-        p[0] = vqsubq_u16(p[0], v);
-        p[1] = vqsubq_u16(p[1], v);
-        p[2] = vqsubq_u16(p[2], v);
-        p[3] = vqsubq_u16(p[3], v);
-        p[4] = vqsubq_u16(p[4], v);
-        p[5] = vqsubq_u16(p[5], v);
-        p[6] = vqsubq_u16(p[6], v);
-        p[7] = vqsubq_u16(p[7], v);
-        p += 8;
+        p0 = vld1q_u16_x4(table); 
+        p1 = vld1q_u16_x4(table+32); 
+        vqsubq_u16_x4_x1(p0, p0, v);
+        vqsubq_u16_x4_x1(p1, p1, v);
+        vst1q_u16_x4(table, p0);
+        vst1q_u16_x4(table+32, p1);
+        table += 64;
     } while (--n);
 }
 
index f7d27dc2b1a479147610bf8cd93bd9559b4da529..c4231c26acd7d1cea97ba40636784db906a3cd03 100644 (file)
@@ -151,6 +151,34 @@ macro(check_neon_compiler_flag)
     set(CMAKE_REQUIRED_FLAGS)
 endmacro()
 
+macro(check_neon_ld4_intrinsics)
+    if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+        if(NOT NATIVEFLAG)
+            if("${ARCH}" MATCHES "aarch64")
+                set(NEONFLAG "-march=armv8-a+simd")
+            else()
+                set(NEONFLAG "-mfpu=neon")
+            endif()
+        endif()
+    endif()
+    # Check whether compiler supports loading 4 neon vecs into a register range 
+    set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}")
+    check_c_source_compiles(
+        "#ifdef _M_ARM64
+        #  include <arm64_neon.h>
+        #else
+        #  include <arm_neon.h>
+        #endif
+        int main(void) {
+            int stack_var[16];
+            int32x4x4_t v = vld1q_s32_x4(stack_var);
+            (void)v;
+            return 0;
+        }"
+        NEON_HAS_LD4)
+    set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
 macro(check_pclmulqdq_intrinsics)
     if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
         if(NOT NATIVEFLAG)
index 1f8c64db01c1c07fc1371b6e3d4174b63f39fa74..93b109b7ca45eb72ab13293d7dbe7f5c1612f934 100755 (executable)
--- a/configure
+++ b/configure
@@ -1169,6 +1169,29 @@ EOF
     fi
 }
 
+check_neon_ld4_intrinsics() {
+    cat > $test.c << EOF
+#ifdef _M_ARM64
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+int main(void) {
+    int stack_var[16];
+    int32x4x4_t v = vld1q_s32_x4(stack_var);
+    (void)v;
+    return 0;
+}
+EOF
+    if try $CC -c $CFLAGS -march=native $test.c; then
+        NEON_HAS_LD4=1
+        echo "check whether compiler supports 4 wide register loads ... Yes." | tee -a configure.log
+    else
+        NEON_HAS_LD4=0
+        echo "check whether compiler supports 4 wide register loads ... No." | tee -a configure.log
+    fi
+}
+
 check_pclmulqdq_intrinsics() {
     # Check whether compiler supports PCLMULQDQ intrinsics
     cat > $test.c << EOF
@@ -1658,6 +1681,7 @@ EOF
         if test $without_optimizations -eq 0; then
             check_acle_compiler_flag
             check_neon_compiler_flag
+            check_neon_ld4_intrinsics
         fi
 
         case "${ARCH}" in
@@ -1700,6 +1724,11 @@ EOF
                             neonflag="-mfpu=neon"
                         fi
 
+                        if test $NEON_HAS_LD4 -eq 1; then
+                            CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
+                            SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
+                        fi
+
                         CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
                         SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
 
@@ -1722,6 +1751,11 @@ EOF
                             neonflag="-mfpu=neon"
                         fi
 
+                        if test $NEON_HAS_LD4 -eq 1; then
+                            CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
+                            SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
+                        fi
+
                         CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
                         SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
 
@@ -1750,6 +1784,11 @@ EOF
                             neonflag="-mfpu=neon"
                         fi
 
+                        if test $NEON_HAS_LD4 -eq 1; then
+                            CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
+                            SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
+                        fi
+
                         CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
                         SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
 
@@ -1793,6 +1832,11 @@ EOF
                 fi
             fi
 
+            if test $NEON_HAS_LD4 -eq 1; then
+                CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
+                SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
+            fi
+
             if test $buildacle -eq 1; then
                 if test $native -eq 0; then
                     ARCH="${ARCH}+crc"
index 5aaa2c681eb08c9772b73745f600b7b562960926..ee623a66f278ad40d30cc112f88a78919ec68ded 100644 (file)
@@ -37,10 +37,10 @@ static __forceinline unsigned long long __builtin_ctzll(uint64_t value) {
     return trailing_zero;
 }
 #define HAVE_BUILTIN_CTZLL
-#endif
+#endif // Microsoft AMD64 
 
-#endif
-#endif
+#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
+#endif // _MSC_VER & !clang
 
 /* Unfortunately GCC didn't support these things until version 10 */
 #ifdef __AVX2__
@@ -63,4 +63,43 @@ static inline __m512i _mm512_zextsi128_si512(__m128i a) {
 #endif // gcc version 10 test
 
 #endif // __AVX2__
+
+#ifdef ARM_NEON_SLIDEHASH
+
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+    out.val[0] = vqsubq_u16(a.val[0], b); \
+    out.val[1] = vqsubq_u16(a.val[1], b); \
+    out.val[2] = vqsubq_u16(a.val[2], b); \
+    out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+/* Have to check for hard float ABI on GCC/clang, but not 
+ * on MSVC (we don't compile for the soft float ABI on windows)
+ */
+#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER))
+
+#ifdef _M_ARM64
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) {
+    uint16x8x4_t ret = (uint16x8x4_t) {{
+                          vld1q_u16(a),
+                          vld1q_u16(a+8),
+                          vld1q_u16(a+16),
+                          vld1q_u16(a+24)}};
+    return ret;
+}
+
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+    vst1q_u16(p, a.val[0]);
+    vst1q_u16(p + 8, a.val[1]);
+    vst1q_u16(p + 16, a.val[2]);
+    vst1q_u16(p + 24, a.val[3]);
+}
+#endif // HASLD4 check and hard float
+#endif // ARM_NEON_SLIDEHASH
+
 #endif // include guard FALLBACK_BUILTINS_H 
index e65123360992338561bc5779cdf8dad58e2b4158..1dc0815a3b291e6324dc110d312db04becf40c83 100644 (file)
@@ -25,12 +25,13 @@ RC = rc
 CP = copy /y
 CFLAGS  = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
 WFLAGS  = \
+       -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
        -D_CRT_SECURE_NO_DEPRECATE \
        -D_CRT_NONSTDC_NO_DEPRECATE \
+       -DARM_NEON_HASLD4 \
+       -DARM_FEATURES \
        -DUNALIGNED_OK \
        -DUNALIGNED64_OK \
-       -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
-       -DARM_FEATURES \
        #
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
index 29ce955811b68fd7cb07d2d56456468e1675634f..7628b617fe57c2eb475c492bff1552d878b6251e 100644 (file)
@@ -25,11 +25,12 @@ RC = rc
 CP = copy /y
 CFLAGS  = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
 WFLAGS  = \
+       -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
        -D_CRT_SECURE_NO_DEPRECATE \
        -D_CRT_NONSTDC_NO_DEPRECATE \
-       -DUNALIGNED_OK \
-       -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
        -DARM_FEATURES \
+       -DARM_NEON_HASLD4 \
+       -DUNALIGNED_OK \
        #
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo