endif()
add_feature_info(NEON_ADLER32 1 "Support NEON instructions in adler32, using \"${NEONFLAG}\"")
add_feature_info(NEON_SLIDEHASH 1 "Support NEON instructions in slide_hash, using \"${NEONFLAG}\"")
+ check_neon_ld4_intrinsics()
+ if(NEON_HAS_LD4)
+ add_definitions(-DARM_NEON_HASLD4)
+ endif()
else()
set(WITH_NEON OFF)
endif()
#endif
#include "../../zbuild.h"
#include "../../deflate.h"
+#include "../../fallback_builtins.h"
/* SIMD version of hash_chain rebase */
static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
- Z_REGISTER uint16x8_t v, *p;
+ Z_REGISTER uint16x8_t v;
+ uint16x8x4_t p0, p1;
Z_REGISTER size_t n;
size_t size = entries*sizeof(table[0]);
Assert(sizeof(Pos) == 2, "Wrong Pos size");
v = vdupq_n_u16(wsize);
- p = (uint16x8_t *)table;
n = size / (sizeof(uint16x8_t) * 8);
do {
- p[0] = vqsubq_u16(p[0], v);
- p[1] = vqsubq_u16(p[1], v);
- p[2] = vqsubq_u16(p[2], v);
- p[3] = vqsubq_u16(p[3], v);
- p[4] = vqsubq_u16(p[4], v);
- p[5] = vqsubq_u16(p[5], v);
- p[6] = vqsubq_u16(p[6], v);
- p[7] = vqsubq_u16(p[7], v);
- p += 8;
+ p0 = vld1q_u16_x4(table);
+ p1 = vld1q_u16_x4(table+32);
+ vqsubq_u16_x4_x1(p0, p0, v);
+ vqsubq_u16_x4_x1(p1, p1, v);
+ vst1q_u16_x4(table, p0);
+ vst1q_u16_x4(table+32, p1);
+ table += 64;
} while (--n);
}
set(CMAKE_REQUIRED_FLAGS)
endmacro()
+macro(check_neon_ld4_intrinsics)
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ if(NOT NATIVEFLAG)
+ if("${ARCH}" MATCHES "aarch64")
+ set(NEONFLAG "-march=armv8-a+simd")
+ else()
+ set(NEONFLAG "-mfpu=neon")
+ endif()
+ endif()
+ endif()
+ # Check whether compiler supports loading 4 neon vecs into a register range
+ set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}")
+ check_c_source_compiles(
+ "#ifdef _M_ARM64
+ # include <arm64_neon.h>
+ #else
+ # include <arm_neon.h>
+ #endif
+ int main(void) {
+ int stack_var[16];
+ int32x4x4_t v = vld1q_s32_x4(stack_var);
+ (void)v;
+ return 0;
+ }"
+ NEON_HAS_LD4)
+ set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
macro(check_pclmulqdq_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
if(NOT NATIVEFLAG)
fi
}
+check_neon_ld4_intrinsics() {
+ cat > $test.c << EOF
+#ifdef _M_ARM64
+# include <arm64_neon.h>
+#else
+# include <arm_neon.h>
+#endif
+int main(void) {
+ int stack_var[16];
+ int32x4x4_t v = vld1q_s32_x4(stack_var);
+ (void)v;
+ return 0;
+}
+EOF
+ if try $CC -c $CFLAGS -march=native $test.c; then
+ NEON_HAS_LD4=1
+ echo "check whether compiler supports 4 wide register loads ... Yes." | tee -a configure.log
+ else
+ NEON_HAS_LD4=0
+ echo "check whether compiler supports 4 wide register loads ... No." | tee -a configure.log
+ fi
+}
+
check_pclmulqdq_intrinsics() {
# Check whether compiler supports PCLMULQDQ intrinsics
cat > $test.c << EOF
if test $without_optimizations -eq 0; then
check_acle_compiler_flag
check_neon_compiler_flag
+ check_neon_ld4_intrinsics
fi
case "${ARCH}" in
neonflag="-mfpu=neon"
fi
+ if test $NEON_HAS_LD4 -eq 1; then
+ CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
+ SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
+ fi
+
CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
neonflag="-mfpu=neon"
fi
+ if test $NEON_HAS_LD4 -eq 1; then
+ CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
+ SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
+ fi
+
CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
neonflag="-mfpu=neon"
fi
+ if test $NEON_HAS_LD4 -eq 1; then
+ CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
+ SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
+ fi
+
CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
fi
fi
+ if test $NEON_HAS_LD4 -eq 1; then
+ CFLAGS="${CFLAGS} -DARM_NEON_HASLD4"
+ SFLAGS="${SFLAGS} -DARM_NEON_HASLD4"
+ fi
+
if test $buildacle -eq 1; then
if test $native -eq 0; then
ARCH="${ARCH}+crc"
return trailing_zero;
}
#define HAVE_BUILTIN_CTZLL
-#endif
+#endif // Microsoft AMD64
-#endif
-#endif
+#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
+#endif // _MSC_VER & !clang
/* Unfortunately GCC didn't support these things until version 10 */
#ifdef __AVX2__
#endif // gcc version 10 test
#endif // __AVX2__
+
+#ifdef ARM_NEON_SLIDEHASH
+
+#define vqsubq_u16_x4_x1(out, a, b) do { \
+ out.val[0] = vqsubq_u16(a.val[0], b); \
+ out.val[1] = vqsubq_u16(a.val[1], b); \
+ out.val[2] = vqsubq_u16(a.val[2], b); \
+ out.val[3] = vqsubq_u16(a.val[3], b); \
+} while (0)
+
+/* Have to check for hard float ABI on GCC/clang, but not
+ * on MSVC (we don't compile for the soft float ABI on windows)
+ */
+#if !defined(ARM_NEON_HASLD4) && (defined(__ARM_FP) || defined(_MSC_VER))
+
+#ifdef _M_ARM64
+# include <arm64_neon.h>
+#else
+# include <arm_neon.h>
+#endif
+
+static inline uint16x8x4_t vld1q_u16_x4(uint16_t *a) {
+ uint16x8x4_t ret = (uint16x8x4_t) {{
+ vld1q_u16(a),
+ vld1q_u16(a+8),
+ vld1q_u16(a+16),
+ vld1q_u16(a+24)}};
+ return ret;
+}
+
+static inline void vst1q_u16_x4(uint16_t *p, uint16x8x4_t a) {
+ vst1q_u16(p, a.val[0]);
+ vst1q_u16(p + 8, a.val[1]);
+ vst1q_u16(p + 16, a.val[2]);
+ vst1q_u16(p + 24, a.val[3]);
+}
+#endif // HASLD4 check and hard float
+#endif // ARM_NEON_SLIDEHASH
+
#endif // include guard FALLBACK_BUILTINS_H
CP = copy /y
CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
WFLAGS = \
+ -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
-D_CRT_SECURE_NO_DEPRECATE \
-D_CRT_NONSTDC_NO_DEPRECATE \
+ -DARM_NEON_HASLD4 \
+ -DARM_FEATURES \
-DUNALIGNED_OK \
-DUNALIGNED64_OK \
- -D_ARM64_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
- -DARM_FEATURES \
#
LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
ARFLAGS = -nologo
CP = copy /y
CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
WFLAGS = \
+ -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
-D_CRT_SECURE_NO_DEPRECATE \
-D_CRT_NONSTDC_NO_DEPRECATE \
- -DUNALIGNED_OK \
- -D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1 \
-DARM_FEATURES \
+ -DARM_NEON_HASLD4 \
+ -DUNALIGNED_OK \
#
LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
ARFLAGS = -nologo