From: Nathan Moinvaziri Date: Mon, 29 Jun 2020 03:00:01 +0000 (-0700) Subject: Added AVX support to chunkset functions. X-Git-Tag: 1.9.9-b1~47 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4bc5bd65e52ecbc5f751fba11461bc803bd92428;p=thirdparty%2Fzlib-ng.git Added AVX support to chunkset functions. --- diff --git a/CMakeLists.txt b/CMakeLists.txt index cd3764b8..67c547c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -699,9 +699,11 @@ if(WITH_OPTIM) list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) endif() if(WITH_AVX2 AND HAVE_AVX2_INTRIN) - add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32) + add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET) set(AVX2_SRCS ${ARCHDIR}/slide_avx.c) add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") + list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx.c) + add_feature_info(AVX_CHUNKSET 1 "Support AVX optimized chunkset, using \"${AVX2FLAG}\"") list(APPEND AVX2_SRCS ${ARCHDIR}/compare258_avx.c) add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"") list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx.c) diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in index 1e163778..4f6594e6 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -22,6 +22,7 @@ all: \ x86.o x86.lo \ adler32_avx.o adler32.lo \ adler32_ssse3.o adler32_ssse3.lo \ + chunkset_avx.o chunkset_avx.lo \ chunkset_sse.o chunkset_sse.lo \ compare258_avx.o compare258_avx.lo \ compare258_sse.o compare258_sse.lo \ @@ -36,6 +37,12 @@ x86.o: x86.lo: $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c +chunkset_avx.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c + +chunkset_avx.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c + chunkset_sse.o: $(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c new file mode 100644 index 00000000..eb76c0db --- /dev/null +++ b/arch/x86/chunkset_avx.c @@ -0,0 +1,50 @@ +/* chunkset_avx.c -- AVX inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#include "zbuild.h" +#include "zutil.h" + +#ifdef X86_AVX_CHUNKSET +#include + +typedef __m256i chunk_t; + +#define HAVE_CHUNKMEMSET_1 +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 + +static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi8(*(int8_t *)from); +} + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi16(*(int16_t *)from); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi32(*(int32_t *)from); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + *chunk = _mm256_set1_epi64x(*(int64_t *)from); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm256_loadu_si256((__m256i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm256_storeu_si256((__m256i *)out, *chunk); +} + +#define CHUNKSIZE chunksize_avx +#define CHUNKCOPY chunkcopy_avx +#define CHUNKCOPY_SAFE chunkcopy_safe_avx +#define CHUNKUNROLL chunkunroll_avx +#define CHUNKMEMSET chunkmemset_avx +#define CHUNKMEMSET_SAFE chunkmemset_safe_avx + +#include "chunkset_tpl.h" + +#endif diff --git a/chunkset_tpl.h b/chunkset_tpl.h index 1cd52f1d..60a8b48f 100644 --- a/chunkset_tpl.h +++ b/chunkset_tpl.h @@ -38,6 +38,11 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { /* Behave like chunkcopy, but avoid writing beyond of legal output. */ Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) { if ((safe - out) < (ptrdiff_t)sizeof(chunk_t)) { + if (sizeof(chunk_t) > 16 && (len & 16)) { + memcpy(out, from, 16); + out += 16; + from += 16; + } if (len & 8) { memcpy(out, from, 8); out += 8; diff --git a/configure b/configure index 18891ce5..5e3d5bb1 100755 --- a/configure +++ b/configure @@ -1136,10 +1136,10 @@ case "${ARCH}" in ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo" if test ${HAVE_AVX2_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32" - SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o compare258_avx.o adler32_avx.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo compare258_avx.lo adler32_avx.lo" + CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET" + SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o chunkset_avx.o compare258_avx.o adler32_avx.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo chunkset_avx.lo compare258_avx.lo adler32_avx.lo" fi if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then diff --git a/functable.c b/functable.c index 6f4f8156..782e7fd0 100644 --- a/functable.c +++ b/functable.c @@ -72,6 +72,14 @@ extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len); extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len); extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif +#ifdef X86_AVX_CHUNKSET +extern uint32_t chunksize_avx(void); +extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len); +extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe); +extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len); +extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len); +extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left); +#endif #ifdef ARM_NEON_CHUNKSET extern uint32_t chunksize_neon(void); extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len); @@ -240,6 +248,10 @@ Z_INTERNAL uint32_t chunksize_stub(void) { # endif functable.chunksize = &chunksize_sse2; #endif +#ifdef X86_AVX_CHUNKSET + if (x86_cpu_has_avx2) + functable.chunksize = &chunksize_avx; +#endif #ifdef ARM_NEON_CHUNKSET if (arm_cpu_has_neon) functable.chunksize = &chunksize_neon; @@ -258,6 +270,10 @@ Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned l # endif functable.chunkcopy = &chunkcopy_sse2; #endif +#ifdef X86_AVX_CHUNKSET + if (x86_cpu_has_avx2) + functable.chunkcopy = &chunkcopy_avx; +#endif #ifdef ARM_NEON_CHUNKSET if (arm_cpu_has_neon) functable.chunkcopy = &chunkcopy_neon; @@ -276,6 +292,10 @@ Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsig # endif functable.chunkcopy_safe = &chunkcopy_safe_sse2; #endif +#ifdef X86_AVX_CHUNKSET + if (x86_cpu_has_avx2) + functable.chunkcopy_safe = &chunkcopy_safe_avx; +#endif #ifdef ARM_NEON_CHUNKSET if (arm_cpu_has_neon) functable.chunkcopy_safe = &chunkcopy_safe_neon; @@ -294,6 +314,10 @@ Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len # endif functable.chunkunroll = &chunkunroll_sse2; #endif +#ifdef X86_AVX_CHUNKSET + if (x86_cpu_has_avx2) + functable.chunkunroll = &chunkunroll_avx; +#endif #ifdef ARM_NEON_CHUNKSET if (arm_cpu_has_neon) functable.chunkunroll = &chunkunroll_neon; @@ -312,6 +336,10 @@ Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len) # endif functable.chunkmemset = &chunkmemset_sse2; #endif +#ifdef X86_AVX_CHUNKSET + if (x86_cpu_has_avx2) + functable.chunkmemset = &chunkmemset_avx; +#endif #ifdef ARM_NEON_CHUNKSET if (arm_cpu_has_neon) functable.chunkmemset = &chunkmemset_neon; @@ -330,6 +358,10 @@ Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned # endif functable.chunkmemset_safe = &chunkmemset_safe_sse2; #endif +#ifdef X86_AVX_CHUNKSET + if (x86_cpu_has_avx2) + functable.chunkmemset_safe = &chunkmemset_safe_avx; +#endif #ifdef ARM_NEON_CHUNKSET if (arm_cpu_has_neon) functable.chunkmemset_safe = &chunkmemset_safe_neon; diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 9bde1aa6..bf1c6881 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -32,6 +32,7 @@ WFLAGS = \ -DX86_SSE42_CRC_INTRIN \ -DX86_SSE42_CRC_HASH \ -DX86_AVX2 \ + -DX86_AVX_CHUNKSET \ -DX86_SSE2_CHUNKSET \ -DUNALIGNED_OK \ -DUNALIGNED64_OK \ @@ -49,6 +50,7 @@ SUFFIX = OBJS = \ adler32.obj \ chunkset.obj \ + chunkset_avx.obj \ chunkset_sse.obj \ compare258.obj \ compare258_avx.obj \ @@ -166,6 +168,7 @@ gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h +chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h chunkset_sse.obj: $(SRCDIR)/arch/x86/chunkset_sse.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h