]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Add initial support for ARM NEON vector instructions.
authorMika Lindqvist <postmaster@raasu.org>
Sun, 10 Apr 2016 16:59:03 +0000 (19:59 +0300)
committerMika Lindqvist <postmaster@raasu.org>
Thu, 30 Mar 2017 12:07:34 +0000 (15:07 +0300)
CMakeLists.txt
arch/aarch64/fill_window_arm.c
arch/arm/fill_window_arm.c
configure
win32/Makefile.arm

index 98732ed2c71e8182123fabb97fe35fcf59c010aa..fb70ba2eaad77f91be4390799e822566950e1df0 100644 (file)
@@ -60,6 +60,7 @@ option(WITH_NATIVE_INSTRUCTIONS
     "Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)" OFF)
 if("${ARCH}" MATCHES "arm" OR "${ARCH}" MATCHES "aarch64")
     option(WITH_ACLE "Build with ACLE CRC" OFF)
+    option(WITH_NEON "Build with NEON intrinsics" OFF)
 endif()
 
 if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${CMAKE_C_COMPILER} MATCHES "icl")
@@ -91,6 +92,7 @@ elseif(MSVC)
     endif()
     if("${ARCH}" MATCHES "arm")
         add_definitions("-D_ARM_WINAPI_PARTITION_DESKTOP_SDK_AVAILABLE=1")
+        set(NEONFLAG "/arch:VFPv4")
     endif()
     if(WITH_NATIVE_INSTRUCTIONS)
         message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not supported on this configuration")
@@ -125,8 +127,10 @@ else()
         endif()
         if("${ARCH}" MATCHES "arm")
             set(ACLEFLAG "-march=armv8-a+crc")
+            set(NEONFLAG "-mfloat-abi=softfp -mfpu=neon")
         elseif("${ARCH}" MATCHES "aarch64")
             set(ACLEFLAG "-march=armv8-a+crc")
+            set(NEONFLAG "-march=armv8-a+crc+simd")
         endif()
     else(NOT NATIVEFLAG)
         set(SSE2FLAG ${NATIVEFLAG})
@@ -134,8 +138,10 @@ else()
         set(PCLMULFLAG ${NATIVEFLAG})
         if("${ARCH}" MATCHES "arm")
             set(ACLEFLAG "${NATIVEFLAG}")
+            set(NEONFLAG "-mfloat-abi=softfp -mfpu=neon")
         elseif("${ARCH}" MATCHES "aarch64")
             set(ACLEFLAG "${NATIVEFLAG}")
+            set(NEONFLAG "${NATIVEFLAG}")
         endif()
     endif(NOT NATIVEFLAG)
 endif()
@@ -146,6 +152,7 @@ add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation")
 add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies")
 if("${ARCH}" MATCHES "arm" OR "${ARCH}" MATCHES "aarch64")
     add_feature_info(WITH_ACLE WITH_ACLE "Build with ACLE CRC")
+    add_feature_info(WITH_NEON WITH_NEON "Build with NEON intrinsics")
 endif()
 
 #
@@ -356,11 +363,24 @@ if(WITH_OPTIM)
             add_definitions("-DARM_ACLE_CRC_HASH")
             add_feature_info(ACLE_CRC 1 "Support CRC hash generation using the ACLE instruction set, using \"${ACLEFLAG}\"")
         endif()
+        if(WITH_NEON)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NEONFLAG}")
+            if(MSVC)
+                add_definitions("-D__ARM_NEON__=1")
+            endif(MSVC)
+            add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
+        endif()
     elseif("${ARCH}" MATCHES "aarch64")
         if(WITH_ACLE)
             set(ZLIB_ARCH_SRCS ${ZLIB_ARCH_SRCS} ${ARCHDIR}/crc32_acle.c ${ARCHDIR}/insert_string_acle.c)
             add_definitions("-DARM_ACLE_CRC_HASH")
             add_feature_info(ACLE_CRC 1 "Support CRC hash generation using the ACLE instruction set, using \"${ACLEFLAG}\"")
+        endif()
+        # We need to check WITH_NEON first
+        if(WITH_NEON)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${NEONFLAG}")
+            add_feature_info(NEON_FILLWINDOW 1 "Support NEON instructions in fill_window_arm, using \"${NEONFLAG}\"")
+        elseif(WITH_ACLE)
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ACLEFLAG}")
         endif()
     elseif("${ARCHDIR}" MATCHES "arch/x86")
index 9608a25f9cee8a10497cfdabfa83cd92ea338d85..5ddbfed5275557d3b0ab66e02140bbf065d23258 100644 (file)
@@ -1,4 +1,4 @@
-/* fill_window_arm.c -- Optimized hash table shifting for ARM
+/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
  * Copyright (C) 2017 Mika T. Lindqvist
  *
  * Authors:
 
 extern ZLIB_INTERNAL int read_buf        (z_stream *strm, unsigned char *buf, unsigned size);
 
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+
 void fill_window_arm(deflate_state *s) {
     register unsigned n;
     unsigned long more;  /* Amount of free space at the end of the window. */
@@ -41,24 +45,38 @@ void fill_window_arm(deflate_state *s) {
                later. (Using level 0 permanently is not an optimal usage of
                zlib, so we don't care about this pathological case.)
              */
-            {
-                n = s->hash_size;
-                for (i = 0; i < n; i++) {
-                    if (s->head[i] >= wsize)
-                        s->head[i] -= wsize;
-                    else
-                        s->head[i] = NIL;
-                }
+
+            n = s->hash_size;
+#if __ARM_NEON
+            uint16x8_t neon_wsize = vdupq_n_u16(wsize);
+            uint16_t * p = s->head;
+            for (i = 0; i < n; i+=8) {
+                uint16x8_t h = vld1q_u16(p);
+                vst1q_u16(p, vqsubq_u16(h, neon_wsize));
+                p += 8;
             }
 
-            {
-                for (i = 0; i < wsize; i++) {
-                    if (s->prev[i] >= wsize)
-                        s->prev[i] -= wsize;
-                    else
-                        s->prev[i] = NIL;
-                }
+            p = s->prev;
+            for (i = 0; i < wsize; i+=8) {
+                uint16x8_t h = vld1q_u16(p);
+                vst1q_u16(p, vqsubq_u16(h, neon_wsize));
+                p+= 8;
+            }
+#else
+            for (i = 0; i < n; i++) {
+                if (s->head[i] >= wsize)
+                    s->head[i] -= wsize;
+                else
+                    s->head[i] = NIL;
+            }
+
+            for (i = 0; i < wsize; i++) {
+                if (s->prev[i] >= wsize)
+                    s->prev[i] -= wsize;
+                else
+                    s->prev[i] = NIL;
             }
+#endif
             more += wsize;
         }
         if (s->strm->avail_in == 0)
index 9608a25f9cee8a10497cfdabfa83cd92ea338d85..700993a6563a66af25bc8c66a1788d165c20cbc6 100644 (file)
@@ -1,4 +1,4 @@
-/* fill_window_arm.c -- Optimized hash table shifting for ARM
+/* fill_window_arm.c -- Optimized hash table shifting for ARM with support for NEON instructions
  * Copyright (C) 2017 Mika T. Lindqvist
  *
  * Authors:
 
 extern ZLIB_INTERNAL int read_buf        (z_stream *strm, unsigned char *buf, unsigned size);
 
+#if __ARM_NEON__
+#include <arm_neon.h>
+#endif
+
 void fill_window_arm(deflate_state *s) {
     register unsigned n;
     unsigned long more;  /* Amount of free space at the end of the window. */
@@ -41,24 +45,38 @@ void fill_window_arm(deflate_state *s) {
                later. (Using level 0 permanently is not an optimal usage of
                zlib, so we don't care about this pathological case.)
              */
-            {
-                n = s->hash_size;
-                for (i = 0; i < n; i++) {
-                    if (s->head[i] >= wsize)
-                        s->head[i] -= wsize;
-                    else
-                        s->head[i] = NIL;
-                }
+
+            n = s->hash_size;
+#if __ARM_NEON__
+            uint16x8_t neon_wsize = vdupq_n_u16(wsize);
+            uint16_t * p = s->head;
+            for (i = 0; i < n; i+=8) {
+                uint16x8_t h = vld1q_u16(p);
+                vst1q_u16(p, vqsubq_u16(h, neon_wsize));
+                p += 8;
             }
 
-            {
-                for (i = 0; i < wsize; i++) {
-                    if (s->prev[i] >= wsize)
-                        s->prev[i] -= wsize;
-                    else
-                        s->prev[i] = NIL;
-                }
+            p = s->prev;
+            for (i = 0; i < wsize; i+=8) {
+                uint16x8_t h = vld1q_u16(p);
+                vst1q_u16(p, vqsubq_u16(h, neon_wsize));
+                p+= 8;
+            }
+#else
+            for (i = 0; i < n; i++) {
+                if (s->head[i] >= wsize)
+                    s->head[i] -= wsize;
+                else
+                    s->head[i] = NIL;
+            }
+
+            for (i = 0; i < wsize; i++) {
+                if (s->prev[i] >= wsize)
+                    s->prev[i] -= wsize;
+                else
+                    s->prev[i] = NIL;
             }
+#endif
             more += wsize;
         }
         if (s->strm->avail_in == 0)
index 810721bea81f643463080c8da321878b690c7a29..a9e7fde76a452377e4cb31ee9d904dee72e0cc27 100755 (executable)
--- a/configure
+++ b/configure
@@ -106,6 +106,7 @@ cover=0
 build32=0
 build64=0
 buildacle=0
+buildneon=0
 native=0
 sse2flag="-msse2"
 sse4flag="-msse4"
@@ -146,7 +147,7 @@ case "$1" in
       echo '  configure [--zlib-compat] [--prefix=PREFIX]  [--eprefix=EXPREFIX]' | tee -a configure.log
       echo '    [--static] [--32] [--64] [--libdir=LIBDIR] [--sharedlibdir=LIBDIR]' | tee -a configure.log
       echo '    [--includedir=INCLUDEDIR] [--archs="-arch i386 -arch x86_64"]' | tee -a configure.log
-      echo '    [--acle]' | tee -a configure.log
+      echo '    [--acle] [--neon]' | tee -a configure.log
         exit 0 ;;
     -p*=* | --prefix=*) prefix=`echo $1 | sed 's/.*=//'`; shift ;;
     -e*=* | --eprefix=*) exec_prefix=`echo $1 | sed 's/.*=//'`; shift ;;
@@ -165,6 +166,7 @@ case "$1" in
     -3* | --32) build32=1; shift ;;
     -6* | --64) build64=1; shift ;;
     --acle) buildacle=1; shift ;;
+    --neon) buildneon=1; shift ;;
     -n | --native) native=1; shift ;;
     -a*=* | --archs=*) ARCHS=`echo $1 | sed 's/.*=//'`; shift ;;
     --sysconfdir=*) echo "ignored option: --sysconfdir" | tee -a configure.log; shift ;;
@@ -829,7 +831,7 @@ case "${ARCH}" in
     ;;
 
     # ARM specific optimizations
-    arm | armv3l | armv4b | armv4l | armv4tl | armv5tel | armv5tejl | armv6l | armv6hl | armv7l | armv7hl | armv7hnl | armv8-a | armv8-a+crc | armv8.1-a)
+    arm | armv[3467]l | armv4b | armv4tl | armv5tel | armv5tejl | armv[67]hl | armv7hnl | armv[78]-a | armv8-a+* | armv8.[12]-a)
        ARCHDIR=arch/arm
        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o"
        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo"
@@ -841,7 +843,16 @@ case "${ARCH}" in
                 CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
                 SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
             ;;
-            armv8-a+crc | armv8.1-a)
+            arm | armv7* | armv8-a | armv8-a+simd)
+                CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+                SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+
+                if test $buildneon -eq 1; then
+                    CFLAGS="${CFLAGS} -mfloat-abi=softfp -mfpu=neon"
+                    SFLAGS="${SFLAGS} -mfloat-abi=softfp -mfpu=neon"
+                fi
+            ;;
+            armv8-a+crc | armv8-a+crc+simd | armv8.[12]-a)
                 CFLAGS="-march=${ARCH} ${CFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS"
                 SFLAGS="-march=${ARCH} ${SFLAGS} -DARM_ACLE_CRC_HASH -DUNALIGNED_OK -DUNROLL_LESS"
 
@@ -858,14 +869,25 @@ case "${ARCH}" in
     ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} fill_window_arm.o"
     ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} fill_window_arm.lo"
 
+    if test $native -eq 0; then
+      ARCH="armv8-a"
+    else
+      ARCH="native"
+    fi
     if test $buildacle -eq 1; then
-       CFLAGS="-march=armv8-a+crc ${CFLAGS} -DARM_ACLE_CRC_HASH"
-       SFLAGS="-march=armv8-a+crc ${SFLAGS} -DARM_ACLE_CRC_HASH"
+       if test $native -eq 0; then
+         ARCH="${ARCH}+crc"
+       fi
+       CFLAGS="${CFLAGS} -DARM_ACLE_CRC_HASH"
+       SFLAGS="${SFLAGS} -DARM_ACLE_CRC_HASH"
        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} crc32_acle.o insert_string_acle.o"
        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc32_acle.lo insert_string_acle.lo"
     fi
-    CFLAGS="${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
-    SFLAGS="${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+    if test $buildneon -eq 1 && test $native -eq 0; then
+        ARCH="${ARCH}+simd"
+    fi
+    CFLAGS="-march=${ARCH} ${CFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
+    SFLAGS="-march=${ARCH} ${SFLAGS} -DUNALIGNED_OK -DUNROLL_LESS"
     ;;
 esac
 
index fb330ce4678c1e4db6ab5cb6e5293978d186d36c..4997052a1b0b94aaa0fc74f8f4e1b5499d73c374 100644 (file)
@@ -30,6 +30,9 @@ RCFLAGS = /dARM /r
 DEFFILE = zlib.def
 WITH_GZFILEOP =
 WITH_ACLE =
+WITH_NEON =
+WITH_VFPV3 =
+NEON_ARCH = /arch:VFPv4
 
 OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_slow.obj \
        infback.obj inflate.obj inftrees.obj inffast.obj match.obj trees.obj uncompr.obj zutil.obj fill_window_arm.obj
@@ -42,6 +45,13 @@ DEFFILE = zlibcompat.def
 WFLAGS = $(WFLAGS) -DARM_ACLE_CRC_HASH
 OBJS = $(OBJS) crc32_acle.obj insert_string_acle.obj
 !endif
+!if "$(WITH_VFPV3)" != ""
+NEON_ARCH = /arch:VFPv3
+!endif
+!if "$(WITH_NEON)" != ""
+CFLAGS = $(CFLAGS) $(NEON_ARCH)
+WFLAGS = $(WFLAGS) -D__ARM_NEON__=1
+!endif
 
 # targets
 all: $(STATICLIB) $(SHAREDLIB) $(IMPLIB) \