if(BASEARCH_ARM_FOUND)
option(WITH_ACLE "Build with ACLE" ON)
option(WITH_NEON "Build with NEON intrinsics" ON)
+ option(WITH_ARMV6 "Build with ARMv6 SIMD" ON)
elseif(BASEARCH_PPC_FOUND)
option(WITH_ALTIVEC "Build with AltiVec (VMX) optimisations for PowerPC" ON)
option(WITH_POWER8 "Build with optimisations for POWER8" ON)
ZLIB_SYMBOL_PREFIX
WITH_REDUCED_MEM
WITH_ACLE WITH_NEON
+ WITH_ARMV6
WITH_DFLTCC_DEFLATE
WITH_DFLTCC_INFLATE
WITH_CRC32_VX
#
# Check for standard/system includes
#
+check_include_file(arm_acle.h HAVE_ARM_ACLE_H)
+if(HAVE_ARM_ACLE_H)
+ add_definitions(-DHAVE_ARM_ACLE_H)
+endif()
check_include_file(sys/auxv.h HAVE_SYS_AUXV_H)
if(HAVE_SYS_AUXV_H)
add_definitions(-DHAVE_SYS_AUXV_H)
set(WITH_NEON OFF)
endif()
endif()
+ if(WITH_ARMV6)
+ check_armv6_compiler_flag()
+ if(HAVE_ARMV6_INLINE_ASM OR HAVE_ARMV6_INTRIN)
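+      # Either inline asm or ACLE intrinsics suffices for ARM_SIMD; ARM_SIMD_INTRIN below records intrinsics support.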
+ add_definitions(-DARM_SIMD)
+ set(ARMV6_SRCS ${ARCHDIR}/slide_hash_armv6.c)
+ set_property(SOURCE ${ARMV6_SRCS} PROPERTY COMPILE_FLAGS "${ARMV6FLAG} ${NOLTOFLAG}")
+ list(APPEND ZLIB_ARCH_SRCS ${ARMV6_SRCS})
+ add_feature_info(ARMV6 1 "Support ARMv6 SIMD instructions in slide_hash, using \"${ARMV6FLAG}\"")
+ if(HAVE_ARMV6_INTRIN)
+ add_definitions(-DARM_SIMD_INTRIN)
+ endif()
+ else()
+ set(WITH_ARMV6 OFF)
+ endif()
+ else()
+ set(WITH_ARMV6 OFF)
+ endif()
elseif(BASEARCH_PPC_FOUND)
# Common arch detection code
if(WITH_ALTIVEC)
if(BASEARCH_ARM_FOUND)
add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE")
add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics")
+ add_feature_info(WITH_ARMV6 WITH_ARMV6 "Build with ARMv6 SIMD")
elseif(BASEARCH_PPC_FOUND)
add_feature_info(WITH_ALTIVEC WITH_ALTIVEC "Build with AltiVec optimisations")
add_feature_info(WITH_POWER8 WITH_POWER8 "Build with optimisations for POWER8")
* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
- * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
+ * Slide hash implementations using SSE2, AVX2, ARMv6, Neon, VMX & VSX
* Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
* Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
| WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
| WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
+| WITH_ARMV6 | --without-armv6 | Build with ARMv6 SIMD | ON |
| WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
| WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON |
| WITH_RVV | | Build with RVV intrinsics | ON |
ACLEFLAG=
NEONFLAG=
+ARMV6FLAG=
NOLTOFLAG=
SRCDIR=.
compare256_neon.o compare256_neon.lo \
crc32_acle.o crc32_acle.lo \
slide_hash_neon.o slide_hash_neon.lo \
+ slide_hash_armv6.o slide_hash_armv6.lo \
insert_string_acle.o insert_string_acle.lo
adler32_neon.o:
slide_hash_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_neon.c
+slide_hash_armv6.o:
+ $(CC) $(CFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
+slide_hash_armv6.lo:
+ $(CC) $(SFLAGS) $(ARMV6FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_armv6.c
+
insert_string_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_acle.c
--- /dev/null
+#ifndef ARM_ACLE_INTRINS_H
+#define ARM_ACLE_INTRINS_H
+
+#include <stdint.h>
+#ifdef _MSC_VER
+# include <intrin.h>
+#elif defined(HAVE_ARM_ACLE_H)
+# include <arm_acle.h>
+#endif
+
+#ifdef ARM_SIMD
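+/* ARMv6 SIMD packs two 16-bit values into one 32-bit GPR; UQSUB16
+ * subtracts each halfword with unsigned saturation, clamping at zero
+ * instead of wrapping. */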
+#ifdef _MSC_VER
+typedef uint32_t uint16x2_t;
+
+#define __uqsub16 _arm_uqsub16
+#elif !defined(ARM_SIMD_INTRIN)
+typedef uint32_t uint16x2_t;
+
+static inline uint16x2_t __uqsub16(uint16x2_t __a, uint16x2_t __b) {
+ uint16x2_t __c;
+ __asm__ __volatile__("uqsub16\t%0, %1, %2" : "=r" (__c) : "r"(__a), "r"(__b));
+ return __c;
+}
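+/* A scalar model of the operation above, for reference only (hypothetical
+ * helper, not compiled in):
+ *
+ *   static uint16x2_t uqsub16_ref(uint16x2_t a, uint16x2_t b) {
+ *       uint16_t al = a & 0xFFFF, bl = b & 0xFFFF;
+ *       uint16_t ah = a >> 16,    bh = b >> 16;
+ *       uint16_t lo = al > bl ? al - bl : 0;  // saturate at 0
+ *       uint16_t hi = ah > bh ? ah - bh : 0;
+ *       return (uint32_t)lo | ((uint32_t)hi << 16);
+ *   }
+ */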
+#endif
+#endif
+
+#endif // include guard ARM_ACLE_INTRINS_H
}
#endif
+/* AArch64 does not have ARMv6 SIMD. */
+#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
+static inline int arm_has_simd(void) {
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
+    const char *platform = (const char *)getauxval(AT_PLATFORM);
+    /* getauxval() returns 0 when AT_PLATFORM is absent. */
+    return platform != NULL
+        && (strncmp(platform, "v6l", 3) == 0
+            || strncmp(platform, "v7l", 3) == 0
+            || strncmp(platform, "v8l", 3) == 0);
+#elif defined(ARM_NOCHECK_SIMD)
+ return 1;
+#else
+ return 0;
+#endif
+}
+#endif
+
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+ features->has_simd = 0; /* never available */
features->has_neon = 1; /* always available */
#else
+ features->has_simd = arm_has_simd();
features->has_neon = arm_has_neon();
#endif
features->has_crc32 = arm_has_crc32();
#define ARM_H_
struct arm_cpu_features {
+ int has_simd;
int has_neon;
int has_crc32;
};
--- /dev/null
+/* slide_hash_armv6.c -- Optimized hash table shifting for ARMv6 with support for SIMD instructions
+ * Copyright (C) 2023 Cameron Cawley
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(ARM_SIMD)
+#include "acle_intrins.h"
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+/* SIMD version of hash_chain rebase */
+static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize) {
+ Z_REGISTER uint16x2_t v;
+ uint16x2_t p0, p1, p2, p3;
+ Z_REGISTER size_t n;
+
+ size_t size = entries*sizeof(table[0]);
+ Assert((size % (sizeof(uint16x2_t) * 4) == 0), "hash table size err");
+
+ Assert(sizeof(Pos) == 2, "Wrong Pos size");
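+    /* Broadcast wsize into both 16-bit lanes; uqsub16 then rebases two entries per operation. */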
+ v = wsize | (wsize << 16);
+
+ n = size / (sizeof(uint16x2_t) * 4);
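+    /* Unrolled 4x: each iteration rebases 4 words = 8 positions. */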
+ do {
+ p0 = *((const uint16x2_t *)(table));
+ p1 = *((const uint16x2_t *)(table+2));
+ p2 = *((const uint16x2_t *)(table+4));
+ p3 = *((const uint16x2_t *)(table+6));
+ p0 = __uqsub16(p0, v);
+ p1 = __uqsub16(p1, v);
+ p2 = __uqsub16(p2, v);
+ p3 = __uqsub16(p3, v);
+ *((uint16x2_t *)(table)) = p0;
+ *((uint16x2_t *)(table+2)) = p1;
+ *((uint16x2_t *)(table+4)) = p2;
+ *((uint16x2_t *)(table+6)) = p3;
+ table += 8;
+ } while (--n);
+}
+
+Z_INTERNAL void slide_hash_armv6(deflate_state *s) {
+ unsigned int wsize = s->w_size;
+
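+    /* Entries older than wsize saturate to 0 (= NIL), matching the scalar slide_hash. */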
+ slide_hash_chain(s->head, HASH_SIZE, wsize);
+ slide_hash_chain(s->prev, wsize, wsize);
+}
+#endif
endif()
endmacro()
+macro(check_armv6_compiler_flag)
+ if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+ if(NOT NATIVEFLAG)
+ check_c_compiler_flag("-march=armv6" HAVE_MARCH_ARMV6)
+ if(HAVE_MARCH_ARMV6)
+ set(ARMV6FLAG "-march=armv6" CACHE INTERNAL "Compiler option to enable ARMv6 support")
+ endif()
+ endif()
+ endif()
+ # Check whether compiler supports ARMv6 inline asm
+ set(CMAKE_REQUIRED_FLAGS "${ARMV6FLAG} ${NATIVEFLAG}")
+ check_c_source_compile_or_run(
+ "unsigned int f(unsigned int a, unsigned int b) {
+ unsigned int c;
+ __asm__ __volatile__ ( \"uqsub16 %0, %1, %2\" : \"=r\" (c) : \"r\" (a), \"r\" (b) );
+          return c;
+ }
+ int main(void) { return 0; }"
+ HAVE_ARMV6_INLINE_ASM
+ )
+ # Check whether compiler supports ARMv6 intrinsics
+ check_c_source_compile_or_run(
+ "#if defined(_MSC_VER)
+ #include <intrin.h>
+ #else
+ #include <arm_acle.h>
+ #endif
+ unsigned int f(unsigned int a, unsigned int b) {
+ #if defined(_MSC_VER)
+ return _arm_uqsub16(a, b);
+ #else
+ return __uqsub16(a, b);
+ #endif
+ }
+ int main(void) { return 0; }"
+ HAVE_ARMV6_INTRIN
+ )
+ set(CMAKE_REQUIRED_FLAGS)
+endmacro()
+
macro(check_avx512_intrinsics)
if(CMAKE_C_COMPILER_ID MATCHES "Intel")
if(CMAKE_HOST_UNIX OR APPLE)
build64=0
buildvpclmulqdq=1
buildacle=1
+buildarmv6=1
buildaltivec=1
buildpower8=1
buildpower9=1
xsaveflag="-mxsave"
acleflag=
neonflag=
+armv6flag=
noltoflag="-fno-lto"
vgfmaflag="-march=z13"
vmxflag="-maltivec"
echo ' [--without-new-strategies] Compiles without using new additional deflate strategies' | tee -a configure.log
echo ' [--without-acle] Compiles without ARM C Language Extensions' | tee -a configure.log
echo ' [--without-neon] Compiles without ARM Neon SIMD instruction set' | tee -a configure.log
+ echo ' [--without-armv6] Compiles without ARMv6 SIMD instruction set' | tee -a configure.log
echo ' [--without-altivec] Compiles without PPC AltiVec support' | tee -a configure.log
echo ' [--without-power8] Compiles without Power8 instruction set' | tee -a configure.log
echo ' [--with-dfltcc-deflate] Use DEFLATE CONVERSION CALL instruction for compression on IBM Z' | tee -a configure.log
--without-vpclmulqdq) buildvpclmulqdq=0; shift ;;
--without-acle) buildacle=0; shift ;;
--without-neon) buildneon=0; shift ;;
+ --without-armv6) buildarmv6=0; shift ;;
--without-altivec) buildaltivec=0 ; shift ;;
--without-power8) buildpower8=0 ; shift ;;
--without-power9) buildpower9=0 ; shift ;;
fi
}
+check_armv6_intrinsics() {
+ # Check whether -march=armv6 works correctly
+ cat > $test.c << EOF
+int main() { return 0; }
+EOF
+ if try $CC -c $CFLAGS -march=armv6 $test.c; then
+ armv6flag=-march=armv6
+ echo "Check whether -march=armv6 works ... Yes." | tee -a configure.log
+ else
+ echo "Check whether -march=armv6 works ... No." | tee -a configure.log
+ fi
+
+ # Check whether compiler supports ARMv6 inline asm
+ cat > $test.c << EOF
+unsigned int f(unsigned int a, unsigned int b) {
+ unsigned int c;
+ __asm__ __volatile__ ( "uqsub16 %0, %1, %2" : "=r" (c) : "r" (a), "r" (b) );
+ return c;
+}
+int main(void) { return 0; }
+EOF
+ if try ${CC} ${CFLAGS} ${armv6flag} $test.c; then
+ echo "Checking for ARMv6 inline assembly ... Yes." | tee -a configure.log
+ HAVE_ARMV6_INLINE_ASM=1
+ else
+ echo "Checking for ARMv6 inline assembly ... No." | tee -a configure.log
+ HAVE_ARMV6_INLINE_ASM=0
+ fi
+
+ # Check whether compiler supports ARMv6 intrinsics
+ cat > $test.c << EOF
+#include <arm_acle.h>
+unsigned int f(unsigned int a, unsigned int b) {
+ return __uqsub16(a, b);
+}
+int main(void) { return 0; }
+EOF
+ if try ${CC} ${CFLAGS} ${armv6flag} $test.c; then
+ echo "Checking for ARMv6 intrinsics ... Yes." | tee -a configure.log
+ HAVE_ARMV6_INTRIN=1
+ else
+ echo "Checking for ARMv6 intrinsics ... No." | tee -a configure.log
+ HAVE_ARMV6_INTRIN=0
+ fi
+}
+
check_pclmulqdq_intrinsics() {
# Check whether compiler supports PCLMULQDQ intrinsics
cat > $test.c << EOF
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} arm_features.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} arm_features.lo"
+ cat > $test.c <<EOF
+#include <arm_acle.h>
+EOF
+ if try $CC -c $CFLAGS $test.c; then
+ echo "Checking for arm_acle.h... Yes." | tee -a configure.log
+ CFLAGS="${CFLAGS} -DHAVE_ARM_ACLE_H"
+ SFLAGS="${SFLAGS} -DHAVE_ARM_ACLE_H"
+ else
+ echo "Checking for arm_acle.h... No." | tee -a configure.log
+ fi
+
if test $LINUX -eq 1; then
if test "$ARCH" = "aarch64"; then
cat > $test.c <<EOF
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
fi
fi
+
+ if test $buildarmv6 -eq 1; then
+ check_armv6_intrinsics
+
+ if test $HAVE_ARMV6_INTRIN -eq 1 || test $HAVE_ARMV6_INLINE_ASM -eq 1; then
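+            # ARM_SIMD selects the ARMv6 code path; ARM_SIMD_INTRIN additionally
+            # prefers <arm_acle.h> intrinsics over the inline-asm fallback.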
+ CFLAGS="${CFLAGS} -DARM_SIMD"
+ SFLAGS="${SFLAGS} -DARM_SIMD"
+
+ if test $HAVE_ARMV6_INTRIN -eq 1; then
+ CFLAGS="${CFLAGS} -DARM_SIMD_INTRIN"
+ SFLAGS="${SFLAGS} -DARM_SIMD_INTRIN"
+ fi
+
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_hash_armv6.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_hash_armv6.lo"
+ fi
+ fi
+
fi
;;
powerpc*)
echo xsaveflag = $xsaveflag >> configure.log
echo acleflag = $acleflag >> configure.log
echo neonflag = $neonflag >> configure.log
+echo armv6flag = $armv6flag >> configure.log
echo ARCHDIR = ${ARCHDIR} >> configure.log
echo ARCH_STATIC_OBJS = ${ARCH_STATIC_OBJS} >> configure.log
echo ARCH_SHARED_OBJS = ${ARCH_SHARED_OBJS} >> configure.log
/^XSAVEFLAG *=/s#=.*#=$xsaveflag#
/^ACLEFLAG *=/s#=.*#=$acleflag#
/^NEONFLAG *=/s#=.*#=$neonflag#
+/^ARMV6FLAG *=/s#=.*#=$armv6flag#
/^NOLTOFLAG *=/s#=.*#=$noltoflag#
/^VGFMAFLAG *=/s#=.*#=$vgfmaflag#
/^PPCFLAGS *=/s#=.*#=$vmxflag#
#ifdef X86_SSE2
extern void slide_hash_sse2(deflate_state *s);
-#elif defined(ARM_NEON)
+#endif
+#if defined(ARM_SIMD)
+extern void slide_hash_armv6(deflate_state *s);
+#endif
+#if defined(ARM_NEON)
extern void slide_hash_neon(deflate_state *s);
#endif
#if defined(PPC_VMX)
#endif
+ // ARM - SIMD
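+    // Registered before the NEON block below, which overrides slide_hash on cores that also have NEON.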
+#ifdef ARM_SIMD
+# ifndef ARM_NOCHECK_SIMD
+ if (cf.arm.has_simd)
+# endif
+ {
+ ft.slide_hash = &slide_hash_armv6;
+ }
+#endif
// ARM - NEON
#ifdef ARM_NEON
# ifndef ARM_NOCHECK_NEON
BENCHMARK_SLIDEHASH(c, slide_hash_c, 1);
+#ifdef ARM_SIMD
+BENCHMARK_SLIDEHASH(armv6, slide_hash_armv6, test_cpu_features.arm.has_simd);
+#endif
#ifdef ARM_NEON
BENCHMARK_SLIDEHASH(neon, slide_hash_neon, test_cpu_features.arm.has_neon);
#endif
ZLIB_COMPAT =
WITH_ACLE =
WITH_NEON =
+WITH_ARMV6 =
WITH_VFPV3 =
NEON_ARCH = /arch:VFPv4
SUFFIX =
#
OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj
!endif
+!if "$(WITH_ARMV6)" != ""
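+# ARM_NOCHECK_SIMD skips the runtime feature check; MSVC ARM builds assume ARMv6 SIMD is available.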
+WFLAGS = $(WFLAGS) \
+ -DARM_SIMD \
+ -DARM_NOCHECK_SIMD \
+ #
+OBJS = $(OBJS) slide_hash_armv6.obj
+!endif
# targets
all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \