"Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)" OFF)
if("${ARCH}" MATCHES "arm" OR "${ARCH}" MATCHES "aarch64")
option(WITH_ACLE "Build with ACLE CRC" OFF)
+ option(WITH_NEON "Build with NEON intrinsics" OFF)
endif()
if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${CMAKE_C_COMPILER} MATCHES "icl")
endif()
if("${ARCH}" MATCHES "arm")
add_definitions("-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1")
+ set(NEONFLAG "/arch:VFPv4")
endif()
if(WITH_NATIVE_INSTRUCTIONS)
message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not supported on this configuration")
endif()
if("${ARCH}" MATCHES "arm")
set(ACLEFLAG "-march=armv8-a+crc")
+ set(NEONFLAG "-mfloat-abi=softfp -mfpu=neon")
elseif("${ARCH}" MATCHES "aarch64")
set(ACLEFLAG "-march=armv8-a+crc")
+ set(NEONFLAG "-march=armv8-a+crc+simd")
endif()
else(NOT NATIVEFLAG)
set(SSE2FLAG ${NATIVEFLAG})
set(PCLMULFLAG ${NATIVEFLAG})
if("${ARCH}" MATCHES "arm")
set(ACLEFLAG "${NATIVEFLAG}")
+ set(NEONFLAG "-mfloat-abi=softfp -mfpu=neon")
elseif("${ARCH}" MATCHES "aarch64")
set(ACLEFLAG "${NATIVEFLAG}")
+ set(NEONFLAG "${NATIVEFLAG}")
endif()
endif(NOT NATIVEFLAG)
endif()
add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies")
if("${ARCH}" MATCHES "arm" OR "${ARCH}" MATCHES "aarch64")
add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE CRC")
+ add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics")
endif()
#
add_definitions("-DARM_ACLE_CRC_HASH")
add_feature_info(ACLE_CRC 1 "Support CRC hash generation using the ACLE instruction set, using \"${ACLEFLAG}\"")
endif()
+ if(WITH_NEON)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NEONFLAG}")
+ if(MSVC)
+ add_definitions("-D__ARM_NEON__=1")
+ endif(MSVC)
+ add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
+ endif()
elseif("${ARCH}" MATCHES "aarch64")
if(WITH_ACLE)
set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
add_definitions("-DARM_ACLE_CRC_HASH")
add_feature_info(ACLE_CRC 1 "Support CRC hash generation using the ACLE instruction set, using \"${ACLEFLAG}\"")
+ endif()
+ # We need to check WITH_NEON first
+ if(WITH_NEON)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NEONFLAG}")
+ add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
+ elseif(WITH_ACLE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ACLEFLAG}")
endif()
elseif("${ARCHDIR}" MATCHES "arch/x86")
-/* fill_window_arm.c -- Optimized hash table shifting for ARM
+/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
* Copyright (C) 2017 Mika T. Lindqvist
*
* Authors:
extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size);
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+
void fill_window_arm(deflate_state *s) {
register unsigned n;
unsigned long more; /* Amount of free space at the end of the window. */
later. (Using level 0 permanently is not an optimal usage of
zlib, so we don't care about this pathological case.)
*/
- {
- n = s->hash_size;
- for (i = 0; i < n; i++) {
- if (s->head[i] >= wsize)
- s->head[i] -= wsize;
- else
- s->head[i] = NIL;
- }
+
+ n = s->hash_size;
+#if __ARM_NEON
+ uint16x8_t neon_wsize = vdupq_n_u16(wsize);
+ uint16_t * p = s->head;
+ for (i = 0; i < n; i+=8) {
+ uint16x8_t h = vld1q_u16(p);
+ vst1q_u16(p, vqsubq_u16(h, neon_wsize));
+ p += 8;
}
- {
- for (i = 0; i < wsize; i++) {
- if (s->prev[i] >= wsize)
- s->prev[i] -= wsize;
- else
- s->prev[i] = NIL;
- }
+ p = s->prev;
+ for (i = 0; i < wsize; i+=8) {
+ uint16x8_t h = vld1q_u16(p);
+ vst1q_u16(p, vqsubq_u16(h, neon_wsize));
+ p+= 8;
+ }
+#else
+ for (i = 0; i < n; i++) {
+ if (s->head[i] >= wsize)
+ s->head[i] -= wsize;
+ else
+ s->head[i] = NIL;
+ }
+
+ for (i = 0; i < wsize; i++) {
+ if (s->prev[i] >= wsize)
+ s->prev[i] -= wsize;
+ else
+ s->prev[i] = NIL;
}
+#endif
more += wsize;
}
if (s->strm->avail_in == 0)
-/* fill_window_arm.c -- Optimized hash table shifting for ARM
+/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
* Copyright (C) 2017 Mika T. Lindqvist
*
* Authors:
extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size);
+#if __ARM_NEON__
+#include <arm_neon.h>
+#endif
+
void fill_window_arm(deflate_state *s) {
register unsigned n;
unsigned long more; /* Amount of free space at the end of the window. */
later. (Using level 0 permanently is not an optimal usage of
zlib, so we don't care about this pathological case.)
*/
- {
- n = s->hash_size;
- for (i = 0; i < n; i++) {
- if (s->head[i] >= wsize)
- s->head[i] -= wsize;
- else
- s->head[i] = NIL;
- }
+
+ n = s->hash_size;
+#if __ARM_NEON__
+ uint16x8_t neon_wsize = vdupq_n_u16(wsize);
+ uint16_t * p = s->head;
+ for (i = 0; i < n; i+=8) {
+ uint16x8_t h = vld1q_u16(p);
+ vst1q_u16(p, vqsubq_u16(h, neon_wsize));
+ p += 8;
}
- {
- for (i = 0; i < wsize; i++) {
- if (s->prev[i] >= wsize)
- s->prev[i] -= wsize;
- else
- s->prev[i] = NIL;
- }
+ p = s->prev;
+ for (i = 0; i < wsize; i+=8) {
+ uint16x8_t h = vld1q_u16(p);
+ vst1q_u16(p, vqsubq_u16(h, neon_wsize));
+ p+= 8;
+ }
+#else
+ for (i = 0; i < n; i++) {
+ if (s->head[i] >= wsize)
+ s->head[i] -= wsize;
+ else
+ s->head[i] = NIL;
+ }
+
+ for (i = 0; i < wsize; i++) {
+ if (s->prev[i] >= wsize)
+ s->prev[i] -= wsize;
+ else
+ s->prev[i] = NIL;
}
+#endif
more += wsize;
}
if (s->strm->avail_in == 0)
build32=0
build64=0
buildacle=0
+buildneon=0
native=0
sse2flag="-msse2"
sse4flag="-msse4"
echo ' configure [--zlib-compat] [--prefix=PREFIX] [--eprefix=EXPREFIX]' | tee -a configure.log
echo ' [--static] [--32] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log
echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log
- echo ' [--acle]' | tee -a configure.log
+ echo ' [--acle] [--neon]' | tee -a configure.log
exit 0 ;;
-p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;;
-e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;;
-3* | --32) build32=1; shift ;;
-6* | --64) build64=1; shift ;;
--acle) buildacle=1; shift ;;
+ --neon) buildneon=1; shift ;;
-n | --native) native=1; shift ;;
-a*=* | --archs=*) ARCHS=`echo $1 | sed 's/.*=//'`; shift ;;
--sysconfdir=*) echo "ignored option: --sysconfdir" | tee -a configure.log; shift ;;
;;
# ARM specific optimizations
- arm | armv3l | armv4b | armv4l | armv4tl | armv5tel | armv5tejl | armv6l | armv6hl | armv7l | armv7hl | armv7hnl | armv8-a | armv8-a+crc | armv8.1-a)
+ arm | armv[3467]l | armv4b | armv4tl | armv5tel | armv5tejl | armv[67]hl | armv7hnl | armv[78]-a | armv8-a+* | armv8.[12]-a)
ARCHDIR=arch/arm
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo"
CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
;;
- armv8-a+crc | armv8.1-a)
+ arm | armv7* | armv8-a | armv8-a+simd)
+ CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+ SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+
+ if test $buildneon -eq 1; then
+ CFLAGS="${CFLAGS} -mfloat-abi=softfp -mfpu=neon"
+ SFLAGS="${SFLAGS} -mfloat-abi=softfp -mfpu=neon"
+ fi
+ ;;
+ armv8-a+crc | armv8-a+crc+simd | armv8.[12]-a)
CFLAGS="-march=${ARCH} ${CFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS"
SFLAGS="-march=${ARCH} ${SFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo"
+ if test $native -eq 0; then
+ ARCH="armv8-a"
+ else
+ ARCH="native"
+ fi
if test $buildacle -eq 1; then
- CFLAGS="-march=armv8-a+crc ${CFLAGS} -DARM_ACLE_CRC_HASH"
- SFLAGS="-march=armv8-a+crc ${SFLAGS} -DARM_ACLE_CRC_HASH"
+ if test $native -eq 0; then
+ ARCH="${ARCH}+crc"
+ fi
+ CFLAGS="${CFLAGS} -DARM_ACLE_CRC_HASH"
+ SFLAGS="${SFLAGS} -DARM_ACLE_CRC_HASH"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o insert_string_acle.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo insert_string_acle.lo"
fi
- CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
- SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+ if test $buildneon -eq 1 && test $native -eq 0; then
+ ARCH="${ARCH}+simd"
+ fi
+ CFLAGS="-march=${ARCH} ${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+ SFLAGS="-march=${ARCH} ${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
;;
esac
DEFFILE = zlib.def
WITH_GZFILEOP =
WITH_ACLE =
+WITH_NEON =
+WITH_VFPV3 =
+NEON_ARCH = /arch:VFPv4
OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_slow.obj \
infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj fill_window_arm.obj
WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH
OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj
!endif
+!if "$(WITH_VFPV3)" != ""
+NEON_ARCH = /arch:VFPv3
+!endif
+!if "$(WITH_NEON)" != ""
+CFLAGS = $(CFLAGS) $(NEON_ARCH)
+WFLAGS = $(WFLAGS) -D__ARM_NEON__=1
+!endif
# targets
all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \