]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Add initial AVX2 support.
authorMika Lindqvist <postmaster@raasu.org>
Wed, 22 Jan 2020 20:58:35 +0000 (22:58 +0200)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Fri, 7 Feb 2020 19:49:52 +0000 (20:49 +0100)
CMakeLists.txt
arch/x86/Makefile.in
arch/x86/fill_window_sse.c
arch/x86/slide_avx.c [new file with mode: 0644]
arch/x86/x86.c
arch/x86/x86.h
configure
win32/Makefile.msc

index a02d5546ff6ea2bf16a7401a15095fd61c3f6494..352a2d1feabe380ac6cf764f99087d3790e1d683 100644 (file)
@@ -137,6 +137,7 @@ if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${
         set(WARNFLAGS_MAINTAINER "-W4 -Wcheck")
         set(WARNFLAGS_DISABLE "")
         if(BASEARCH_X86_FOUND)
+            set(AVX2FLAG "-mavx2")
             set(SSE2FLAG "-msse2")
             set(SSE4FLAG "-msse4.2")
         endif()
@@ -145,6 +146,7 @@ if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${
         set(WARNFLAGS_MAINTAINER "/W4 /Wcheck")
         set(WARNFLAGS_DISABLE "")
         if(BASEARCH_X86_FOUND)
+            set(AVX2FLAG "/arch:AVX2")
             set(SSE2FLAG "/arch:SSE2")
             set(SSE4FLAG "/arch:SSE4.2")
         endif()
@@ -189,6 +191,7 @@ else()
     if(NOT NATIVEFLAG)
         if (__GNUC__)
             if(BASEARCH_X86_FOUND)
+                set(AVX2FLAG "-mavx2")
                 set(SSE2FLAG "-msse2")
                 set(SSE4FLAG "-msse4")
                 set(PCLMULFLAG "-mpclmul")
@@ -223,6 +226,7 @@ else()
         endif()
     else()
         if(BASEARCH_X86_FOUND)
+            set(AVX2FLAG ${NATIVEFLAG})
             set(SSE2FLAG ${NATIVEFLAG})
             set(SSE4FLAG ${NATIVEFLAG})
             set(PCLMULFLAG ${NATIVEFLAG})
@@ -537,6 +541,25 @@ if(BASEARCH_X86_FOUND)
     endif()
     set(CMAKE_REQUIRED_FLAGS)
 
+    # Check whether compiler supports AVX2 intrinics
+    if(WITH_NATIVE_INSTRUCTIONS)
+        set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}")
+    else()
+        set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG}")
+    endif()
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            __m256i x = _mm256_set1_epi16(2);
+            const __m256i y = _mm256_set1_epi16(1);
+            x = _mm256_subs_epu16(x, y);
+            (void)x;
+            return 0;
+        }"
+        HAVE_AVX2_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+
     # FORCE_SSE2 option will only be shown if HAVE_SSE2_INTRIN is true
     if("${ARCH}" MATCHES "i[3-6]86")
         cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF)
@@ -613,6 +636,12 @@ if(WITH_OPTIM)
         if(MSVC)
             list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
         endif()
+        if(HAVE_AVX2_INTRIN)
+            add_definitions(-DX86_AVX2)
+            list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c)
+            add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2-optimized slide_hash, using \"${AVX2FLAG}\"")
+            add_intrinsics_option("${AVX2FLAG}")
+        endif()
         if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
             add_definitions(-DX86_SSE42_CRC_HASH)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/insert_string_sse.c)
index 95ad3682f9bfc4a6616ca2f8f0df5927025dfb48..187d06fdaef67e42cc3809c5839582093bd1b073 100644 (file)
@@ -8,6 +8,7 @@ SFLAGS=
 INCLUDES=
 SUFFIX=
 
+AVX2FLAG=-mavx2
 SSE2FLAG=-msse2
 SSE4FLAG=-msse4
 PCLMULFLAG=-mpclmul
@@ -16,7 +17,7 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)
 
-all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_sse.o
+all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
 
 x86.o:
        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -48,6 +49,12 @@ crc_folding.o:
 crc_folding.lo:
        $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
 
+slide_avx.o:
+       $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
+slide_avx.lo:
+       $(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
 slide_sse.o:
        $(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
 
index 275a2d11c0706dfe6bf5a26fa4b328bb11d34b9a..3cac1cb9359d1df0c3989e17352f71044424cb3f 100644 (file)
@@ -18,6 +18,9 @@
 
 extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
 void slide_hash_sse2(deflate_state *s);
+#ifdef X86_AVX2
+void slide_hash_avx2(deflate_state *s);
+#endif
 
 ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
     register unsigned n;
@@ -57,6 +60,11 @@ ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
                later. (Using level 0 permanently is not an optimal usage of
                zlib, so we don't care about this pathological case.)
              */
+#ifdef X86_AVX2
+            if (x86_cpu_has_avx2) {
+                slide_hash_avx2(s);
+            } else
+#endif
             slide_hash_sse2(s);
             more += wsize;
         }
diff --git a/arch/x86/slide_avx.c b/arch/x86/slide_avx.c
new file mode 100644 (file)
index 0000000..77221d6
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven   <arjan@linux.intel.com>
+ *   Jim Kukunas        <james.t.kukunas@linux.intel.com>
+ *   Mika T. Lindqvist  <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+ZLIB_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    Pos *p;
+    unsigned n;
+    unsigned wsize = s->w_size;
+    const __m256i zmm_wsize = _mm256_set1_epi16(s->w_size);
+
+    n = s->hash_size;
+    p = &s->head[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result= _mm256_subs_epu16(value, zmm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+
+    n = wsize;
+    p = &s->prev[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result= _mm256_subs_epu16(value, zmm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+}
index a3aee7b08b623dc3ec7b14b8df48645216a2a4c1..c50fd3472416473fed924f9040fd99b4ec2b90eb 100644 (file)
@@ -17,6 +17,7 @@
 #  include <cpuid.h>
 #endif
 
+ZLIB_INTERNAL int x86_cpu_has_avx2;
 ZLIB_INTERNAL int x86_cpu_has_sse2;
 ZLIB_INTERNAL int x86_cpu_has_sse42;
 ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
@@ -62,7 +63,10 @@ void ZLIB_INTERNAL x86_check_features(void) {
         // check BMI1 bit
         // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
         x86_cpu_has_tzcnt = ebx & 0x8;
+        // check AVX2 bit
+        x86_cpu_has_avx2 = ebx & 0x20;
     } else {
         x86_cpu_has_tzcnt = 0;
+        x86_cpu_has_avx2 = 0;
     }
 }
index 9d6f37569e4b2300d4740736a91e9d745e299288..3e212a48a6330d4f5f47995579546c2f80db7c36 100644 (file)
@@ -6,6 +6,7 @@
 #ifndef CPU_H_
 #define CPU_H_
 
+extern int x86_cpu_has_avx2;
 extern int x86_cpu_has_sse2;
 extern int x86_cpu_has_sse42;
 extern int x86_cpu_has_pclmulqdq;
index 9fde2ec2501007604115a0040783dfbf5fb67136..767fb7bde46e9c005afa52bf85eb820b7dd5f75f 100755 (executable)
--- a/configure
+++ b/configure
@@ -100,6 +100,7 @@ with_fuzzers=0
 floatabi=
 native=0
 forcesse2=0
+avx2flag="-mavx2"
 sse2flag="-msse2"
 sse4flag="-msse4"
 sse42flag="-msse4.2"
@@ -941,6 +942,30 @@ EOF
         ;;
 esac
 
+# Check for AVX2 intrinsics
+case "${ARCH}" in
+    i386 | i486 | i586 | i686 | x86_64)
+        cat > $test.c << EOF
+#include <immintrin.h>
+int main(void) {
+    __m256i x = _mm256_set1_epi16(2);
+    const __m256i y = _mm256_set1_epi16(1);
+    x = _mm256_subs_epu16(x, y);
+    (void)x;
+    return 0;
+}
+EOF
+        if try ${CC} ${CFLAGS} ${avx2flag} $test.c; then
+            echo "Checking for AVX2 intrinsics ... Yes." | tee -a configure.log
+            HAVE_AVX2_INTRIN=1
+        else
+            echo "Checking for AVX2 intrinsics ... No." | tee -a configure.log
+            HAVE_AVX2_INTRIN=0
+        fi
+        ;;
+esac
+
+
 # Check whether -mfpu=neon is available on ARM processors.
 case "${ARCH}" in
     arm*)
@@ -1018,6 +1043,13 @@ case "${ARCH}" in
                 SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN"
             fi
 
+            if test ${HAVE_AVX2_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_AVX2"
+                SFLAGS="${SFLAGS} -DX86_AVX2"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+            fi
+
             CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
             SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH"
             ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} insert_string_sse.o"
@@ -1060,6 +1092,13 @@ case "${ARCH}" in
                 ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo"
             fi
 
+            if test ${HAVE_AVX2_INTRIN} -eq 1; then
+                CFLAGS="${CFLAGS} -DX86_AVX2"
+                SFLAGS="${SFLAGS} -DX86_AVX2"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+            fi
+
             # Enable deflate_quick at level 1?
             if test $without_new_strategies -eq 0; then
                 CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY"
@@ -1450,6 +1489,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
 /^SRCDIR *=/s#=.*#=$SRCDIR/$ARCHDIR#
 /^SRCTOP *=/s#=.*#=$SRCDIR#
 /^TOPDIR *=/s#=.*#=$BUILDDIR#
+/^AVX2FLAG *=/s#=.*#=$avx2flag#
 /^SSE2FLAG *=/s#=.*#=$sse2flag#
 /^SSE4FLAG *=/s#=.*#=$sse4flag#
 /^PCLMULFLAG *=/s#=.*#=$pclmulflag#
index a1c73e33b6e6c6ae3835130e8b5fecfa272d8085..bd79e6f682d7bc0599e45795babf93483e313165 100644 (file)
@@ -23,7 +23,7 @@ AR = lib
 RC = rc
 CP = copy /y
 CFLAGS  = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
-WFLAGS  = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2 -DX86_CPUID -DX86_SSE42_CRC_HASH -DUNALIGNED_OK -DX86_QUICK_STRATEGY
+WFLAGS  = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2 -DX86_CPUID -DX86_SSE42_CRC_INTRIN -DX86_SSE42_CRC_HASH -DX86_AVX2 -DUNALIGNED_OK -DX86_QUICK_STRATEGY
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
 RCFLAGS = /dWIN32 /r
@@ -36,7 +36,7 @@ SUFFIX =
 
 OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj \
        deflate_medium.obj \
-       functable.obj infback.obj inflate.obj inftrees.obj inffast.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
+       functable.obj infback.obj inflate.obj inftrees.obj inffast.obj slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
        x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj
 !if "$(ZLIB_COMPAT)" != ""
 WITH_GZFILEOP = yes