From 53f9257a24362386e75504836eaeb2788360b1e8 Mon Sep 17 00:00:00 2001
From: Mika Lindqvist
Date: Wed, 22 Jan 2020 22:58:35 +0200
Subject: [PATCH] Add initial AVX2 support.

---
 CMakeLists.txt             | 29 +++++++++++++++++++++++
 arch/x86/Makefile.in       |  9 +++++++-
 arch/x86/fill_window_sse.c |  8 +++++++
 arch/x86/slide_avx.c       | 47 ++++++++++++++++++++++++++++++++++++++
 arch/x86/x86.c             |  4 ++++
 arch/x86/x86.h             |  1 +
 configure                  | 40 ++++++++++++++++++++++++++++++++
 win32/Makefile.msc         |  4 ++--
 8 files changed, 139 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/slide_avx.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a02d5546..352a2d1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,6 +137,7 @@ if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${
     set(WARNFLAGS_MAINTAINER "-W4 -Wcheck")
     set(WARNFLAGS_DISABLE "")
     if(BASEARCH_X86_FOUND)
+        set(AVX2FLAG "-mavx2")
         set(SSE2FLAG "-msse2")
         set(SSE4FLAG "-msse4.2")
     endif()
@@ -145,6 +146,7 @@ if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${
     set(WARNFLAGS_MAINTAINER "/W4 /Wcheck")
     set(WARNFLAGS_DISABLE "")
     if(BASEARCH_X86_FOUND)
+        set(AVX2FLAG "/arch:AVX2")
         set(SSE2FLAG "/arch:SSE2")
         set(SSE4FLAG "/arch:SSE4.2")
     endif()
@@ -189,6 +191,7 @@ else()
     if(NOT NATIVEFLAG)
         if (__GNUC__)
             if(BASEARCH_X86_FOUND)
+                set(AVX2FLAG "-mavx2")
                 set(SSE2FLAG "-msse2")
                 set(SSE4FLAG "-msse4")
                 set(PCLMULFLAG "-mpclmul")
@@ -223,6 +226,7 @@ else()
            endif()
        else()
            if(BASEARCH_X86_FOUND)
+                set(AVX2FLAG ${NATIVEFLAG})
                 set(SSE2FLAG ${NATIVEFLAG})
                 set(SSE4FLAG ${NATIVEFLAG})
                 set(PCLMULFLAG ${NATIVEFLAG})
@@ -537,6 +541,25 @@ if(BASEARCH_X86_FOUND)
     endif()
     set(CMAKE_REQUIRED_FLAGS)
 
+    # Check whether compiler supports AVX2 intrinsics
+    if(WITH_NATIVE_INSTRUCTIONS)
+        set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}")
+    else()
+        set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG}")
+    endif()
+    check_c_source_compile_or_run(
+        "#include <immintrin.h>
+        int main(void) {
+            __m256i x = _mm256_set1_epi16(2);
+            const __m256i y = _mm256_set1_epi16(1);
+            x = _mm256_subs_epu16(x, y);
+            (void)x;
+            return 0;
+        }"
+        HAVE_AVX2_INTRIN
+    )
+    set(CMAKE_REQUIRED_FLAGS)
+
     # FORCE_SSE2 option will only be shown if HAVE_SSE2_INTRIN is true
     if("${ARCH}" MATCHES "i[3-6]86")
         cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF)
@@ -613,6 +636,12 @@ if(WITH_OPTIM)
         if(MSVC)
             list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
         endif()
+        if(HAVE_AVX2_INTRIN)
+            add_definitions(-DX86_AVX2)
+            list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c)
+            add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2-optimized slide_hash, using \"${AVX2FLAG}\"")
+            add_intrinsics_option("${AVX2FLAG}")
+        endif()
         if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
             add_definitions(-DX86_SSE42_CRC_HASH)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/insert_string_sse.c)
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 95ad3682..187d06fd 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -8,6 +8,7 @@ SFLAGS=
 INCLUDES=
 SUFFIX=
 
+AVX2FLAG=-mavx2
 SSE2FLAG=-msse2
 SSE4FLAG=-msse4
 PCLMULFLAG=-mpclmul
@@ -16,7 +17,7 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)
 
-all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_sse.o
+all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
 
 x86.o:
 	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -48,6 +49,12 @@ crc_folding.o:
 
 crc_folding.lo:
 	$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
 
+slide_avx.o:
+	$(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
+slide_avx.lo:
+	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
 slide_sse.o:
 	$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
diff --git a/arch/x86/fill_window_sse.c b/arch/x86/fill_window_sse.c
index 275a2d11..3cac1cb9 100644
--- a/arch/x86/fill_window_sse.c
+++ b/arch/x86/fill_window_sse.c
@@ -18,6 +18,9 @@
 extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
 
 void slide_hash_sse2(deflate_state *s);
+#ifdef X86_AVX2
+void slide_hash_avx2(deflate_state *s);
+#endif
 
 ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
     register unsigned n;
@@ -57,6 +60,11 @@ ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
                later. (Using level 0 permanently is not an optimal usage of
                zlib, so we don't care about this pathological case.)
              */
+#ifdef X86_AVX2
+            if (x86_cpu_has_avx2) {
+                slide_hash_avx2(s);
+            } else
+#endif
             slide_hash_sse2(s);
             more += wsize;
         }
diff --git a/arch/x86/slide_avx.c b/arch/x86/slide_avx.c
new file mode 100644
index 00000000..77221d64
--- /dev/null
+++ b/arch/x86/slide_avx.c
@@ -0,0 +1,47 @@
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ *   Arjan van de Ven
+ *   Jim Kukunas
+ *   Mika T. Lindqvist
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+ZLIB_INTERNAL void slide_hash_avx2(deflate_state *s) {
+    Pos *p;
+    unsigned n;
+    unsigned wsize = s->w_size;
+    const __m256i zmm_wsize = _mm256_set1_epi16(s->w_size);
+
+    n = s->hash_size;
+    p = &s->head[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result= _mm256_subs_epu16(value, zmm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+
+    n = wsize;
+    p = &s->prev[n] - 16;
+    do {
+        __m256i value, result;
+
+        value = _mm256_loadu_si256((__m256i *)p);
+        result= _mm256_subs_epu16(value, zmm_wsize);
+        _mm256_storeu_si256((__m256i *)p, result);
+
+        p -= 16;
+        n -= 16;
+    } while (n > 0);
+}
diff --git a/arch/x86/x86.c b/arch/x86/x86.c
index a3aee7b0..c50fd347 100644
--- a/arch/x86/x86.c
+++ b/arch/x86/x86.c
@@ -17,6 +17,7 @@
 #  include <cpuid.h>
 #endif
 
+ZLIB_INTERNAL int x86_cpu_has_avx2;
 ZLIB_INTERNAL int x86_cpu_has_sse2;
 ZLIB_INTERNAL int x86_cpu_has_sse42;
 ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
@@ -62,7 +63,10 @@ void ZLIB_INTERNAL x86_check_features(void) {
         // check BMI1 bit
         // Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
         x86_cpu_has_tzcnt = ebx & 0x8;
+        // check AVX2 bit
+        x86_cpu_has_avx2 = ebx & 0x20;
     } else {
         x86_cpu_has_tzcnt = 0;
+        x86_cpu_has_avx2 = 0;
     }
 }
diff --git a/arch/x86/x86.h b/arch/x86/x86.h
index 9d6f3756..3e212a48 100644
--- a/arch/x86/x86.h
+++ b/arch/x86/x86.h
@@ -6,6 +6,7 @@
 #ifndef CPU_H_
 #define CPU_H_
 
+extern int x86_cpu_has_avx2;
 extern int x86_cpu_has_sse2;
 extern int x86_cpu_has_sse42;
 extern int x86_cpu_has_pclmulqdq;
diff --git a/configure b/configure
index 9fde2ec2..767fb7bd 100755
--- a/configure
+++ b/configure
@@ -100,6 +100,7 @@ with_fuzzers=0
 floatabi=
 native=0
 forcesse2=0
+avx2flag="-mavx2"
 sse2flag="-msse2"
 sse4flag="-msse4"
 sse42flag="-msse4.2"
@@ -941,6 +942,30 @@ EOF
     ;;
 esac
 
+# Check for AVX2 intrinsics
+case "${ARCH}" in
+    i386 | i486 | i586 | i686 | x86_64)
+        cat > $test.c << EOF
+#include <immintrin.h>
+int main(void) {
+    __m256i x = _mm256_set1_epi16(2);
+    const __m256i y = _mm256_set1_epi16(1);
+    x = _mm256_subs_epu16(x, y);
+    (void)x;
+    return 0;
+}
+EOF
+        if try ${CC} ${CFLAGS} ${avx2flag} $test.c; then
+            echo "Checking for AVX2 intrinsics ... Yes." | tee -a configure.log
+            HAVE_AVX2_INTRIN=1
+        else
+            echo "Checking for AVX2 intrinsics ... No." | tee -a configure.log
+            HAVE_AVX2_INTRIN=0
+        fi
+    ;;
+esac
+
+
 # Check whether -mfpu=neon is available on ARM processors.
 case "${ARCH}" in
     arm*)
@@ -1018,6 +1043,13 @@ case "${ARCH}" in
         SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN"
     fi
 
+    if test ${HAVE_AVX2_INTRIN} -eq 1; then
+        CFLAGS="${CFLAGS} -DX86_AVX2"
+        SFLAGS="${SFLAGS} -DX86_AVX2"
+        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
+        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+    fi
+
     CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
     SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH"
     ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} insert_string_sse.o"
@@ -1060,6 +1092,13 @@ case "${ARCH}" in
         ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo"
     fi
 
+    if test ${HAVE_AVX2_INTRIN} -eq 1; then
+        CFLAGS="${CFLAGS} -DX86_AVX2"
+        SFLAGS="${SFLAGS} -DX86_AVX2"
+        ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
+        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+    fi
+
     # Enable deflate_quick at level 1?
     if test $without_new_strategies -eq 0; then
         CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY"
@@ -1450,6 +1489,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in "
 /^SRCDIR *=/s#=.*#=$SRCDIR/$ARCHDIR#
 /^SRCTOP *=/s#=.*#=$SRCDIR#
 /^TOPDIR *=/s#=.*#=$BUILDDIR#
+/^AVX2FLAG *=/s#=.*#=$avx2flag#
 /^SSE2FLAG *=/s#=.*#=$sse2flag#
 /^SSE4FLAG *=/s#=.*#=$sse4flag#
 /^PCLMULFLAG *=/s#=.*#=$pclmulflag#
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index a1c73e33..bd79e6f6 100644
--- a/win32/Makefile.msc
+++ b/win32/Makefile.msc
@@ -23,7 +23,7 @@ AR = lib
 RC = rc
 CP = copy /y
 CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
-WFLAGS = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2 -DX86_CPUID -DX86_SSE42_CRC_HASH -DUNALIGNED_OK -DX86_QUICK_STRATEGY
+WFLAGS = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2 -DX86_CPUID -DX86_SSE42_CRC_INTRIN -DX86_SSE42_CRC_HASH -DX86_AVX2 -DUNALIGNED_OK -DX86_QUICK_STRATEGY
 LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
 ARFLAGS = -nologo
 RCFLAGS = /dWIN32 /r
@@ -36,7 +36,7 @@ SUFFIX =
 
 OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj \
        deflate_medium.obj \
-       functable.obj infback.obj inflate.obj inftrees.obj inffast.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
+       functable.obj infback.obj inflate.obj inftrees.obj inffast.obj slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
        x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj
 !if "$(ZLIB_COMPAT)" != ""
 WITH_GZFILEOP = yes
-- 
2.47.2
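
For reference, the new slide_hash_avx2 kernel rewrites each stored position in the head and prev tables with a saturating subtract of w_size, handling 16 of the 16-bit Pos entries per 256-bit register. Below is a minimal scalar sketch of the same per-entry operation on a plain uint16_t array; the function name and standalone interface are illustrative only and are not part of zlib-ng.

#include <stdint.h>
#include <stddef.h>

/* Scalar equivalent of the per-lane work done by _mm256_subs_epu16 in
 * slide_hash_avx2: subtract wsize from every position, clamping at 0 so
 * entries that would go negative become 0 ("no previous match"). */
static void slide_hash_scalar_sketch(uint16_t *table, size_t entries, uint16_t wsize) {
    size_t i;
    for (i = 0; i < entries; i++)
        table[i] = (uint16_t)((table[i] >= wsize) ? (table[i] - wsize) : 0);
}

_mm256_subs_epu16 performs exactly this unsigned saturating subtraction on 16 lanes at once, which is why both loops in slide_avx.c step p and n by 16 rather than the 8-element stride used by the SSE2 version in slide_sse.c.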