# Intel compiler, Unix-style driver (icc): maintainer warnings plus the
# per-feature x86 SIMD flags; AVX2FLAG is new alongside SSE2/SSE4.2.
set(WARNFLAGS_MAINTAINER "-W4 -Wcheck")
set(WARNFLAGS_DISABLE "")
if(BASEARCH_X86_FOUND)
+ set(AVX2FLAG "-mavx2")
set(SSE2FLAG "-msse2")
set(SSE4FLAG "-msse4.2")
endif()
# Intel compiler, MSVC-style driver (icl): slash-style equivalents of the
# flags above for the same feature set.
set(WARNFLAGS_MAINTAINER "/W4 /Wcheck")
set(WARNFLAGS_DISABLE "")
if(BASEARCH_X86_FOUND)
+ set(AVX2FLAG "/arch:AVX2")
set(SSE2FLAG "/arch:SSE2")
set(SSE4FLAG "/arch:SSE4.2")
endif()
if(NOT NATIVEFLAG)
if (__GNUC__)
if(BASEARCH_X86_FOUND)
# GCC-compatible compilers: one -m flag per instruction-set extension.
+ set(AVX2FLAG "-mavx2")
set(SSE2FLAG "-msse2")
set(SSE4FLAG "-msse4")
set(PCLMULFLAG "-mpclmul")
endif()
else()
if(BASEARCH_X86_FOUND)
# Native-instructions build: NATIVEFLAG already enables every extension the
# host supports, so reuse it for each feature flag.
+ set(AVX2FLAG ${NATIVEFLAG})
set(SSE2FLAG ${NATIVEFLAG})
set(SSE4FLAG ${NATIVEFLAG})
set(PCLMULFLAG ${NATIVEFLAG})
endif()
set(CMAKE_REQUIRED_FLAGS)
+ # Check whether compiler supports AVX2 intrinsics
+ if(WITH_NATIVE_INSTRUCTIONS)
+ set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}")
+ else()
+ set(CMAKE_REQUIRED_FLAGS "${AVX2FLAG}")
+ endif()
# The probe compiles (or runs) a minimal _mm256_subs_epu16 program; success
# sets HAVE_AVX2_INTRIN, which gates the slide_avx.c build below.
+ check_c_source_compile_or_run(
+ "#include <immintrin.h>
+ int main(void) {
+ __m256i x = _mm256_set1_epi16(2);
+ const __m256i y = _mm256_set1_epi16(1);
+ x = _mm256_subs_epu16(x, y);
+ (void)x;
+ return 0;
+ }"
+ HAVE_AVX2_INTRIN
+ )
# Reset so the probe flags do not leak into later checks.
+ set(CMAKE_REQUIRED_FLAGS)
+
# FORCE_SSE2 option will only be shown if HAVE_SSE2_INTRIN is true
if("${ARCH}" MATCHES "i[3-6]86")
cmake_dependent_option(FORCE_SSE2 "Always assume CPU is SSE2 capable" OFF "HAVE_SSE2_INTRIN" OFF)
if(MSVC)
list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
endif()
# When the AVX2 intrinsics probe passed: define X86_AVX2, compile
# slide_avx.c, and add the AVX2 compiler flag to the intrinsics options.
+ if(HAVE_AVX2_INTRIN)
+ add_definitions(-DX86_AVX2)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c)
+ add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2-optimized slide_hash, using \"${AVX2FLAG}\"")
+ add_intrinsics_option("${AVX2FLAG}")
+ endif()
if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
add_definitions(-DX86_SSE42_CRC_HASH)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/insert_string_sse.c)
INCLUDES=
SUFFIX=
# Compiler flag used to build the slide_avx objects below.
+AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSE4FLAG=-msse4
PCLMULFLAG=-mpclmul
SRCTOP=../..
TOPDIR=$(SRCTOP)
# 'all' now also builds slide_avx.o/.lo, and adds the previously missing
# slide_sse.lo PIC object.
-all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_sse.o
+all: x86.o x86.lo fill_window_sse.o fill_window_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
x86.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
crc_folding.lo:
$(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
# New rules: .o for the static build (CFLAGS), .lo for the shared/PIC build
# (SFLAGS -DPIC); both need AVX2FLAG so the intrinsics compile.
+slide_avx.o:
+ $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
+slide_avx.lo:
+ $(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+
slide_sse.o:
$(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
extern int read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned size);
/* slide_hash_sse2 is defined in slide_sse.c; slide_hash_avx2 in slide_avx.c
   and is only compiled/declared when the build defines X86_AVX2. */
void slide_hash_sse2(deflate_state *s);
+#ifdef X86_AVX2
+void slide_hash_avx2(deflate_state *s);
+#endif
ZLIB_INTERNAL void fill_window_sse(deflate_state *s) {
register unsigned n;
later. (Using level 0 permanently is not an optimal usage of
zlib, so we don't care about this pathological case.)
*/
+#ifdef X86_AVX2
+ if (x86_cpu_has_avx2) {
+ slide_hash_avx2(s);
+ } else
+#endif
slide_hash_sse2(s);
more += wsize;
}
--- /dev/null
+/*
+ * AVX2 optimized hash slide, based on Intel's slide_sse implementation
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ * Mika T. Lindqvist <postmaster@raasu.org>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+
+/* Slide the hash chains after the window has moved by w_size: subtract
+ * w_size from every 16-bit entry of s->head and s->prev with unsigned
+ * saturation, so entries that would underflow clamp to 0 ("nil").
+ * Each 256-bit vector holds 16 Pos entries, hence the stride of 16; the
+ * loops assume hash_size and w_size are multiples of 16 (both are powers
+ * of two in deflate), otherwise the unsigned counters would wrap.
+ */
+ZLIB_INTERNAL void slide_hash_avx2(deflate_state *s) {
+ Pos *p;
+ unsigned n;
+ unsigned wsize = s->w_size;
+ /* 256-bit vectors live in YMM registers (ZMM is AVX-512), so name it
+ accordingly; the explicit cast documents the intentional narrowing
+ of w_size (<= 32768, so the value is preserved as a short). */
+ const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
+
+ /* Walk the head table downward from the end, 16 entries at a time. */
+ n = s->hash_size;
+ p = &s->head[n] - 16;
+ do {
+ __m256i value, result;
+
+ value = _mm256_loadu_si256((__m256i *)p);
+ result = _mm256_subs_epu16(value, ymm_wsize);
+ _mm256_storeu_si256((__m256i *)p, result);
+ p -= 16;
+ n -= 16;
+ } while (n > 0);
+
+ /* Same saturating subtraction over the prev table (w_size entries). */
+ n = wsize;
+ p = &s->prev[n] - 16;
+ do {
+ __m256i value, result;
+
+ value = _mm256_loadu_si256((__m256i *)p);
+ result = _mm256_subs_epu16(value, ymm_wsize);
+ _mm256_storeu_si256((__m256i *)p, result);
+
+ p -= 16;
+ n -= 16;
+ } while (n > 0);
+}
# include <cpuid.h>
#endif
/* Runtime CPU feature flags, filled in by the cpuid probe below; nonzero
   means the corresponding instruction set is available on this CPU. */
+ZLIB_INTERNAL int x86_cpu_has_avx2;
ZLIB_INTERNAL int x86_cpu_has_sse2;
ZLIB_INTERNAL int x86_cpu_has_sse42;
ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
// check BMI1 bit
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
x86_cpu_has_tzcnt = ebx & 0x8;
+ // check AVX2 bit
+ x86_cpu_has_avx2 = ebx & 0x20;
} else {
x86_cpu_has_tzcnt = 0;
+ x86_cpu_has_avx2 = 0;
}
}
#ifndef CPU_H_
#define CPU_H_
/* CPU feature flags detected at runtime; defined in the x86 cpuid module. */
+extern int x86_cpu_has_avx2;
extern int x86_cpu_has_sse2;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
floatabi=
native=0
forcesse2=0
# Default flag used both to probe AVX2 support and (via sed) to build the
# arch Makefile's AVX2 objects.
+avx2flag="-mavx2"
sse2flag="-msse2"
sse4flag="-msse4"
sse42flag="-msse4.2"
;;
esac
+# Check for AVX2 intrinsics
+# Initialize to 0 unconditionally so later `test ${HAVE_AVX2_INTRIN} -eq 1`
+# checks are well-formed even when ARCH matches no x86 pattern below.
+HAVE_AVX2_INTRIN=0
+case "${ARCH}" in
+ i386 | i486 | i586 | i686 | x86_64)
+ cat > $test.c << EOF
+#include <immintrin.h>
+int main(void) {
+ __m256i x = _mm256_set1_epi16(2);
+ const __m256i y = _mm256_set1_epi16(1);
+ x = _mm256_subs_epu16(x, y);
+ (void)x;
+ return 0;
+}
+EOF
+ if try ${CC} ${CFLAGS} ${avx2flag} $test.c; then
+ echo "Checking for AVX2 intrinsics ... Yes." | tee -a configure.log
+ HAVE_AVX2_INTRIN=1
+ else
+ echo "Checking for AVX2 intrinsics ... No." | tee -a configure.log
+ HAVE_AVX2_INTRIN=0
+ fi
+ ;;
+esac
+
+
# Check whether -mfpu=neon is available on ARM processors.
case "${ARCH}" in
arm*)
SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN"
fi
# Probe succeeded: define X86_AVX2 for both static and shared flags and
# add the AVX2 slide_hash objects to the link lists.
+ if test ${HAVE_AVX2_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_AVX2"
+ SFLAGS="${SFLAGS} -DX86_AVX2"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+ fi
+
CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
SFLAGS="${SFLAGS} -DX86_SSE42_CRC_HASH"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} insert_string_sse.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} crc_folding.lo"
fi
# Same AVX2 enablement as the other configure branch: define X86_AVX2 and
# add the slide_avx objects when the intrinsics probe succeeded.
+ if test ${HAVE_AVX2_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_AVX2"
+ SFLAGS="${SFLAGS} -DX86_AVX2"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+ fi
+
# Enable deflate_quick at level 1?
if test $without_new_strategies -eq 0; then
CFLAGS="${CFLAGS} -DX86_QUICK_STRATEGY"
/^SRCDIR *=/s#=.*#=$SRCDIR/$ARCHDIR#
/^SRCTOP *=/s#=.*#=$SRCDIR#
/^TOPDIR *=/s#=.*#=$BUILDDIR#
+/^AVX2FLAG *=/s#=.*#=$avx2flag#
/^SSE2FLAG *=/s#=.*#=$sse2flag#
/^SSE4FLAG *=/s#=.*#=$sse4flag#
/^PCLMULFLAG *=/s#=.*#=$pclmulflag#
RC = rc
CP = copy /y
CFLAGS = -nologo -MD -W3 -O2 -Oy- -Zi -Fd"zlib" $(LOC)
# The replacement WFLAGS adds -DX86_SSE42_CRC_INTRIN and -DX86_AVX2 so the
# MSVC build compiles the intrinsic CRC hash and the AVX2 slide_hash path.
-WFLAGS = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2 -DX86_CPUID -DX86_SSE42_CRC_HASH -DUNALIGNED_OK -DX86_QUICK_STRATEGY
+WFLAGS = -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -DX86_PCLMULQDQ_CRC -DX86_SSE2 -DX86_CPUID -DX86_SSE42_CRC_INTRIN -DX86_SSE42_CRC_HASH -DX86_AVX2 -DUNALIGNED_OK -DX86_QUICK_STRATEGY
LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
ARFLAGS = -nologo
RCFLAGS = /dWIN32 /r
# slide_avx.obj joins the object list next to slide_sse.obj (comment kept
# outside the backslash-continued OBJS definition).
OBJS = adler32.obj compress.obj crc32.obj deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj \
deflate_medium.obj \
- functable.obj infback.obj inflate.obj inftrees.obj inffast.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
+ functable.obj infback.obj inflate.obj inftrees.obj inffast.obj slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \
x86.obj fill_window_sse.obj insert_string_sse.obj crc_folding.obj
!if "$(ZLIB_COMPAT)" != ""
WITH_GZFILEOP = yes