if(WITH_NEON)
check_neon_compiler_flag()
if(MFPU_NEON_AVAILABLE)
- add_definitions(-DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH)
- set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c ${ARCHDIR}/slide_hash_neon.c)
+ add_definitions(-DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH)
+ set(NEON_SRCS ${ARCHDIR}/adler32_neon.c ${ARCHDIR}/chunkset_neon.c
+ ${ARCHDIR}/compare256_neon.c ${ARCHDIR}/slide_hash_neon.c)
list(APPEND ZLIB_ARCH_SRCS ${NEON_SRCS})
set_property(SOURCE ${NEON_SRCS} PROPERTY COMPILE_FLAGS "${NEONFLAG} ${NOLTOFLAG}")
if(MSVC)
* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
* Hash table implementation using CRC32-C intrinsics on x86 and ARM
* Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
- * Compare256 implementations using SSE2 & AVX2
+ * Compare256 implementations using SSE2, AVX2, & Neon
* Inflate chunk copying using SSE2, AVX, Neon & VSX
* Support for hardware-accelerated deflate using IBM Z DFLTCC
* Unaligned memory read/writes and large bit buffer improvements
adler32_neon.o adler32_neon.lo \
arm_features.o arm_features.lo \
chunkset_neon.o chunkset_neon.lo \
+ compare256_neon.o compare256_neon.lo \
crc32_acle.o crc32_acle.lo \
slide_hash_neon.o slide_hash_neon.lo \
insert_string_acle.o insert_string_acle.lo
chunkset_neon.lo:
$(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c
+compare256_neon.o:
+ $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
+compare256_neon.lo:
+ $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_neon.c
+
crc32_acle.o:
$(CC) $(CFLAGS) $(ACLEFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_acle.c
--- /dev/null
+/* compare256_neon.c - NEON version of compare256
+ * Copyright (C) 2022 Nathan Moinvaziri
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+#ifdef _M_ARM64
+# include <arm64_neon.h>
+#else
+# include <arm_neon.h>
+#endif
+#include "../../zbuild.h"
+
+static inline uint32_t compare256_neon_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+
+ do {
+ uint8x16_t a, b, cmp;
+ uint64_t lane;
+
+ a = vld1q_u8(src0);
+ b = vld1q_u8(src1);
+
+ cmp = veorq_u8(a, b);
+
+ lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 0);
+ if (lane) {
+ uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
+ return len + match_byte;
+ }
+ len += 8;
+ lane = vgetq_lane_u64(vreinterpretq_u64_u8(cmp), 1);
+ if (lane) {
+ uint32_t match_byte = (uint32_t)__builtin_ctzll(lane) / 8;
+ return len + match_byte;
+ }
+ len += 8;
+
+ src0 += 16, src1 += 16;
+ } while (len < 256);
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_neon_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_neon
+#define COMPARE256 compare256_neon_static
+
+#include "match_tpl.h"
+
+#endif
fi
if test $buildneon -eq 1; then
- if test $MFPU_NEON_AVAILABLE -eq 1;then
+ CFLAGS="${CFLAGS} -DARM_NEON"
+ SFLAGS="${SFLAGS} -DARM_NEON"
+
+ if test $MFPU_NEON_AVAILABLE -eq 1; then
neonflag="-mfpu=neon"
fi
CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
fi
fi
;;
fi
if test $buildneon -eq 1; then
+ CFLAGS="${CFLAGS} -DARM_NEON"
+ SFLAGS="${SFLAGS} -DARM_NEON"
+
if test $MFPU_NEON_AVAILABLE -eq 1;then
neonflag="-mfpu=neon"
fi
CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
fi
fi
;;
fi
if test $buildneon -eq 1; then
+ CFLAGS="${CFLAGS} -DARM_NEON"
+ SFLAGS="${SFLAGS} -DARM_NEON"
+
if test $MFPU_NEON_AVAILABLE -eq 1;then
neonflag="-mfpu=neon"
fi
CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
fi
fi
;;
if test $native -eq 0; then
ARCH="${ARCH}+simd"
fi
- CFLAGS="${CFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
- SFLAGS="${SFLAGS} -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o slide_hash_neon.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo slide_hash_neon.lo"
+ CFLAGS="${CFLAGS} -DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
+ SFLAGS="${SFLAGS} -DARM_NEON -DARM_NEON_ADLER32 -DARM_NEON_CHUNKSET -DARM_NEON_SLIDEHASH"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_neon.o chunkset_neon.o compare256_neon.o slide_hash_neon.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_neon.lo chunkset_neon.lo compare256_neon.lo slide_hash_neon.lo"
fi
fi
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1);
#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
+#endif
#ifdef DEFLATE_H_
/* insert_string */
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_avx2(deflate_state *const s, Pos cur_match);
#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
+#endif
/* longest_match_slow */
extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
extern uint32_t longest_match_slow_avx2(deflate_state *const s, Pos cur_match);
#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
+#endif
/* quick_insert_string */
extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
if (x86_cpu_has_avx2)
functable.longest_match = &longest_match_avx2;
#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+ if (arm_cpu_has_neon)
+ functable.longest_match = &longest_match_neon;
+#endif
return functable.longest_match(s, cur_match);
}
if (x86_cpu_has_avx2)
functable.longest_match_slow = &longest_match_slow_avx2;
#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+ if (arm_cpu_has_neon)
+ functable.longest_match_slow = &longest_match_slow_neon;
+#endif
return functable.longest_match_slow(s, cur_match);
}
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
BENCHMARK_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2);
#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+BENCHMARK_COMPARE256(neon, compare256_neon, arm_cpu_has_neon);
+#endif
#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
TEST_COMPARE256(avx2, compare256_avx2, x86_cpu_has_avx2)
#endif
+#if defined(ARM_NEON) && defined(HAVE_BUILTIN_CTZLL)
+TEST_COMPARE256(neon, compare256_neon, arm_cpu_has_neon)
+#endif
WFLAGS = $(WFLAGS) \
-DARM_ACLE_CRC_HASH \
-D__ARM_NEON__=1 \
+ -DARM_NEON \
-DARM_NEON_ADLER32 \
-DARM_NEON_CHUNKSET \
-DARM_NEON_SLIDEHASH \
-DARM_NOCHECK_NEON \
#
-OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj chunkset_neon.obj slide_hash_neon.obj
+OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj
# targets
all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \
CFLAGS = $(CFLAGS) $(NEON_ARCH)
WFLAGS = $(WFLAGS) \
-D__ARM_NEON__=1 \
+ -DARM_NEON \
-DARM_NEON_ADLER32 \
-DARM_NEON_CHUNKSET \
-DARM_NEON_SLIDEHASH \
-DARM_NOCHECK_NEON \
#
-OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj slide_hash_neon.obj
+OBJS = $(OBJS) adler32_neon.obj chunkset_neon.obj compare256_neon.obj slide_hash_neon.obj
!endif
# targets