From: Mika Lindqvist Date: Sun, 10 Apr 2016 16:59:03 +0000 (+0300) Subject: Add initial support for ARM NEON vector instructions. X-Git-Tag: 1.9.9-b1~660^2~22^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=22cc8799de817b36c3bca69755a3749408354011;p=thirdparty%2Fzlib-ng.git Add initial support for ARM NEON vector instructions. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 98732ed2c..fb70ba2ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(WITH_NATIVE_INSTRUCTIONS "Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)" OFF) if("${ARCH}" MATCHES "arm" OR "${ARCH}" MATCHES "aarch64") option(WITH_ACLE "Build with ACLE CRC" OFF) + option(WITH_NEON "Build with NEON intrinsics" OFF) endif() if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${CMAKE_C_COMPILER} MATCHES "icl") @@ -91,6 +92,7 @@ elseif(MSVC) endif() if("${ARCH}" MATCHES "arm") add_definitions("-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1") + set(NEONFLAG "/arch:VFPv4") endif() if(WITH_NATIVE_INSTRUCTIONS) message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not supported on this configuration") @@ -125,8 +127,10 @@ else() endif() if("${ARCH}" MATCHES "arm") set(ACLEFLAG "-march=armv8-a+crc") + set(NEONFLAG "-mfloat-abi=softfp -mfpu=neon") elseif("${ARCH}" MATCHES "aarch64") set(ACLEFLAG "-march=armv8-a+crc") + set(NEONFLAG "-march=armv8-a+crc+simd") endif() else(NOT NATIVEFLAG) set(SSE2FLAG ${NATIVEFLAG}) @@ -134,8 +138,10 @@ else() set(PCLMULFLAG ${NATIVEFLAG}) if("${ARCH}" MATCHES "arm") set(ACLEFLAG "${NATIVEFLAG}") + set(NEONFLAG "-mfloat-abi=softfp -mfpu=neon") elseif("${ARCH}" MATCHES "aarch64") set(ACLEFLAG "${NATIVEFLAG}") + set(NEONFLAG "${NATIVEFLAG}") endif() endif(NOT NATIVEFLAG) endif() @@ -146,6 +152,7 @@ add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation") add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies") if("${ARCH}" MATCHES "arm" OR "${ARCH}" MATCHES "aarch64") add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE CRC") + add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics") endif() # @@ -356,11 +363,24 @@ if(WITH_OPTIM) add_definitions("-DARM_ACLE_CRC_HASH") add_feature_info(ACLE_CRC 1 "Support CRC hash generation using the ACLE instruction set, using \"${ACLEFLAG}\"") endif() + if(WITH_NEON) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NEONFLAG}") + if(MSVC) + add_definitions("-D__ARM_NEON__=1") + endif(MSVC) + add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"") + endif() elseif("${ARCH}" MATCHES "aarch64") if(WITH_ACLE) set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c) add_definitions("-DARM_ACLE_CRC_HASH") add_feature_info(ACLE_CRC 1 "Support CRC hash generation using the ACLE instruction set, using \"${ACLEFLAG}\"") + endif() + # We need to check WITH_NEON first + if(WITH_NEON) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NEONFLAG}") + add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"") + elseif(WITH_ACLE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ACLEFLAG}") endif() elseif("${ARCHDIR}" MATCHES "arch/x86") diff --git a/arch/aarch64/fill_window_arm.c b/arch/aarch64/fill_window_arm.c index 9608a25f9..5ddbfed52 100644 --- a/arch/aarch64/fill_window_arm.c +++ b/arch/aarch64/fill_window_arm.c @@ -1,4 +1,4 @@ -/* fill_window_arm.c -- Optimized hash table shifting for ARM +/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions * Copyright (C) 2017 Mika T. Lindqvist * * Authors: @@ -14,6 +14,10 @@ extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size); +#if __ARM_NEON +#include +#endif + void fill_window_arm(deflate_state *s) { register unsigned n; unsigned long more; /* Amount of free space at the end of the window. */ @@ -41,24 +45,38 @@ void fill_window_arm(deflate_state *s) { later. (Using level 0 permanently is not an optimal usage of zlib, so we don't care about this pathological case.) */ - { - n = s->hash_size; - for (i = 0; i < n; i++) { - if (s->head[i] >= wsize) - s->head[i] -= wsize; - else - s->head[i] = NIL; - } + + n = s->hash_size; +#if __ARM_NEON + uint16x8_t neon_wsize = vdupq_n_u16(wsize); + uint16_t * p = s->head; + for (i = 0; i < n; i+=8) { + uint16x8_t h = vld1q_u16(p); + vst1q_u16(p, vqsubq_u16(h, neon_wsize)); + p += 8; } - { - for (i = 0; i < wsize; i++) { - if (s->prev[i] >= wsize) - s->prev[i] -= wsize; - else - s->prev[i] = NIL; - } + p = s->prev; + for (i = 0; i < wsize; i+=8) { + uint16x8_t h = vld1q_u16(p); + vst1q_u16(p, vqsubq_u16(h, neon_wsize)); + p+= 8; + } +#else + for (i = 0; i < n; i++) { + if (s->head[i] >= wsize) + s->head[i] -= wsize; + else + s->head[i] = NIL; + } + + for (i = 0; i < wsize; i++) { + if (s->prev[i] >= wsize) + s->prev[i] -= wsize; + else + s->prev[i] = NIL; } +#endif more += wsize; } if (s->strm->avail_in == 0) diff --git a/arch/arm/fill_window_arm.c b/arch/arm/fill_window_arm.c index 9608a25f9..700993a65 100644 --- a/arch/arm/fill_window_arm.c +++ b/arch/arm/fill_window_arm.c @@ -1,4 +1,4 @@ -/* fill_window_arm.c -- Optimized hash table shifting for ARM +/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions * Copyright (C) 2017 Mika T. Lindqvist * * Authors: @@ -14,6 +14,10 @@ extern ZLIB_INTERNAL int read_buf (z_stream *strm, unsigned char *buf, unsigned size); +#if __ARM_NEON__ +#include +#endif + void fill_window_arm(deflate_state *s) { register unsigned n; unsigned long more; /* Amount of free space at the end of the window. */ @@ -41,24 +45,38 @@ void fill_window_arm(deflate_state *s) { later. (Using level 0 permanently is not an optimal usage of zlib, so we don't care about this pathological case.) */ - { - n = s->hash_size; - for (i = 0; i < n; i++) { - if (s->head[i] >= wsize) - s->head[i] -= wsize; - else - s->head[i] = NIL; - } + + n = s->hash_size; +#if __ARM_NEON__ + uint16x8_t neon_wsize = vdupq_n_u16(wsize); + uint16_t * p = s->head; + for (i = 0; i < n; i+=8) { + uint16x8_t h = vld1q_u16(p); + vst1q_u16(p, vqsubq_u16(h, neon_wsize)); + p += 8; } - { - for (i = 0; i < wsize; i++) { - if (s->prev[i] >= wsize) - s->prev[i] -= wsize; - else - s->prev[i] = NIL; - } + p = s->prev; + for (i = 0; i < wsize; i+=8) { + uint16x8_t h = vld1q_u16(p); + vst1q_u16(p, vqsubq_u16(h, neon_wsize)); + p+= 8; + } +#else + for (i = 0; i < n; i++) { + if (s->head[i] >= wsize) + s->head[i] -= wsize; + else + s->head[i] = NIL; + } + + for (i = 0; i < wsize; i++) { + if (s->prev[i] >= wsize) + s->prev[i] -= wsize; + else + s->prev[i] = NIL; } +#endif more += wsize; } if (s->strm->avail_in == 0) diff --git a/configure b/configure index 810721bea..a9e7fde76 100755 --- a/configure +++ b/configure @@ -106,6 +106,7 @@ cover=0 build32=0 build64=0 buildacle=0 +buildneon=0 native=0 sse2flag="-msse2" sse4flag="-msse4" @@ -146,7 +147,7 @@ case "$1" in echo ' configure [--zlib-compat] [--prefix=PREFIX] [--eprefix=EXPREFIX]' | tee -a configure.log echo ' [--static] [--32] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log echo ' [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log - echo ' [--acle]' | tee -a configure.log + echo ' [--acle] [--neon]' | tee -a configure.log exit 0 ;; -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;; -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;; @@ -165,6 +166,7 @@ case "$1" in -3* | --32) build32=1; shift ;; -6* | --64) build64=1; shift ;; --acle) buildacle=1; shift ;; + --neon) buildneon=1; shift ;; -n | --native) native=1; shift ;; -a*=* | --archs=*) ARCHS=`echo $1 | sed 's/.*=//'`; shift ;; --sysconfdir=*) echo "ignored option: --sysconfdir" | tee -a configure.log; shift ;; @@ -829,7 +831,7 @@ case "${ARCH}" in ;; # ARM specific optimizations - arm | armv3l | armv4b | armv4l | armv4tl | armv5tel | armv5tejl | armv6l | armv6hl | armv7l | armv7hl | armv7hnl | armv8-a | armv8-a+crc | armv8.1-a) + arm | armv[3467]l | armv4b | armv4tl | armv5tel | armv5tejl | armv[67]hl | armv7hnl | armv[78]-a | armv8-a+* | armv8.[12]-a) ARCHDIR=arch/arm ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo" @@ -841,7 +843,16 @@ case "${ARCH}" in CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" ;; - armv8-a+crc | armv8.1-a) + arm | armv7* | armv8-a | armv8-a+simd) + CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + + if test $buildneon -eq 1; then + CFLAGS="${CFLAGS} -mfloat-abi=softfp -mfpu=neon" + SFLAGS="${SFLAGS} -mfloat-abi=softfp -mfpu=neon" + fi + ;; + armv8-a+crc | armv8-a+crc+simd | armv8.[12]-a) CFLAGS="-march=${ARCH} ${CFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS" SFLAGS="-march=${ARCH} ${SFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS" @@ -858,14 +869,25 @@ case "${ARCH}" in ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo" + if test $native -eq 0; then + ARCH="armv8-a" + else + ARCH="native" + fi if test $buildacle -eq 1; then - CFLAGS="-march=armv8-a+crc ${CFLAGS} -DARM_ACLE_CRC_HASH" - SFLAGS="-march=armv8-a+crc ${SFLAGS} -DARM_ACLE_CRC_HASH" + if test $native -eq 0; then + ARCH="${ARCH}+crc" + fi + CFLAGS="${CFLAGS} -DARM_ACLE_CRC_HASH" + SFLAGS="${SFLAGS} -DARM_ACLE_CRC_HASH" ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o insert_string_acle.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo insert_string_acle.lo" fi - CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" - SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + if test $buildneon -eq 1 && test $native -eq 0; then + ARCH="${ARCH}+simd" + fi + CFLAGS="-march=${ARCH} ${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" + SFLAGS="-march=${ARCH} ${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS" ;; esac diff --git a/win32/Makefile.arm b/win32/Makefile.arm index fb330ce46..4997052a1 100644 --- a/win32/Makefile.arm +++ b/win32/Makefile.arm @@ -30,6 +30,9 @@ RCFLAGS = /dARM /r DEFFILE = zlib.def WITH_GZFILEOP = WITH_ACLE = +WITH_NEON = +WITH_VFPV3 = +NEON_ARCH = /arch:VFPv4 OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_slow.obj \ infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj fill_window_arm.obj @@ -42,6 +45,13 @@ DEFFILE = zlibcompat.def WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj !endif +!if "$(WITH_VFPV3)" != "" +NEON_ARCH = /arch:VFPv3 +!endif +!if "$(WITH_NEON)" != "" +CFLAGS = $(CFLAGS) $(NEON_ARCH) +WFLAGS = $(WFLAGS) -D__ARM_NEON__=1 +!endif # targets all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \