git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Added AVX support to chunkset functions.
author Nathan Moinvaziri <nathan@solidstatenetworks.com>
Mon, 29 Jun 2020 03:00:01 +0000 (20:00 -0700)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Fri, 11 Sep 2020 11:01:28 +0000 (13:01 +0200)
CMakeLists.txt
arch/x86/Makefile.in
arch/x86/chunkset_avx.c [new file with mode: 0644]
chunkset_tpl.h
configure
functable.c
win32/Makefile.msc
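
For orientation: the chunkset helpers replicate and copy short repeated byte patterns when inflate resolves LZ77 matches at small distances. This commit adds an AVX flavour whose working chunk is a 32-byte __m256i instead of the 16-byte __m128i used by the SSE2 variant, and wires it into the CMake, configure, Makefile and functable plumbing below. As a minimal standalone sketch of the broadcast-and-store idea (hypothetical example, not code from this commit):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical illustration: fill a buffer with a repeating 2-byte pattern
 * using one 32-byte AVX register, the same idea chunkmemset_2() builds on. */
static void fill_pattern2_avx(uint8_t *out, const uint8_t *pattern, size_t len) {
    int16_t p;
    memcpy(&p, pattern, sizeof(p));           /* read the 2-byte pattern safely */
    __m256i chunk = _mm256_set1_epi16(p);     /* replicate it 16x into 32 bytes */
    while (len >= 32) {
        _mm256_storeu_si256((__m256i *)out, chunk);
        out += 32;
        len -= 32;
    }
    for (size_t i = 0; i < len; i++)          /* plain bytewise tail */
        out[i] = pattern[i & 1];
}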

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd3764b86547bb6f146f0b9e831815a3655a6b62..67c547c53826affef0b808bc87820484b41acf54 100644 (file)
@@ -699,9 +699,11 @@ if(WITH_OPTIM)
             list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
         endif()
         if(WITH_AVX2 AND HAVE_AVX2_INTRIN)
-            add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32)
+            add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET)
             set(AVX2_SRCS ${ARCHDIR}/slide_avx.c)
             add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
+            list(APPEND AVX2_SRCS ${ARCHDIR}/chunkset_avx.c)
+            add_feature_info(AVX_CHUNKSET 1 "Support AVX optimized chunkset, using \"${AVX2FLAG}\"")
             list(APPEND AVX2_SRCS ${ARCHDIR}/compare258_avx.c)
             add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"")
             list(APPEND AVX2_SRCS ${ARCHDIR}/adler32_avx.c)
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 1e163778dbbb27ca55829dcb2a59d2c7ffdbb2e1..4f6594e6df65f30cab9090c40778c573a6274346 100644 (file)
@@ -22,6 +22,7 @@ all: \
        x86.o x86.lo \
        adler32_avx.o adler32.lo \
        adler32_ssse3.o adler32_ssse3.lo \
+       chunkset_avx.o chunkset_avx.lo \
        chunkset_sse.o chunkset_sse.lo \
        compare258_avx.o compare258_avx.lo \
        compare258_sse.o compare258_sse.lo \
@@ -36,6 +37,12 @@ x86.o:
 x86.lo:
        $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
 
+chunkset_avx.o:
+       $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
+
+chunkset_avx.lo:
+       $(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
+
 chunkset_sse.o:
        $(CC) $(CFLAGS) $(SSE2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
 
diff --git a/arch/x86/chunkset_avx.c b/arch/x86/chunkset_avx.c
new file mode 100644 (file)
index 0000000..eb76c0d
--- /dev/null
+++ b/arch/x86/chunkset_avx.c
@@ -0,0 +1,50 @@
+/* chunkset_avx.c -- AVX inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "zbuild.h"
+#include "zutil.h"
+
+#ifdef X86_AVX_CHUNKSET
+#include <immintrin.h>
+
+typedef __m256i chunk_t;
+
+#define HAVE_CHUNKMEMSET_1
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi8(*(int8_t *)from);
+}
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi16(*(int16_t *)from);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi32(*(int32_t *)from);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+    *chunk = _mm256_set1_epi64x(*(int64_t *)from);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+    *chunk = _mm256_loadu_si256((__m256i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+    _mm256_storeu_si256((__m256i *)out, *chunk);
+}
+
+#define CHUNKSIZE        chunksize_avx
+#define CHUNKCOPY        chunkcopy_avx
+#define CHUNKCOPY_SAFE   chunkcopy_safe_avx
+#define CHUNKUNROLL      chunkunroll_avx
+#define CHUNKMEMSET      chunkmemset_avx
+#define CHUNKMEMSET_SAFE chunkmemset_safe_avx
+
+#include "chunkset_tpl.h"
+
+#endif
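
The new file only defines the 32-byte chunk type and the per-size broadcast/load/store primitives; the actual copy and memset loops come from chunkset_tpl.h, which is included last and instantiated under the chunksize_avx/chunkcopy_avx/... names set by the macros. A rough sketch of how such a template loop can be built on these primitives (simplified illustration, not the real chunkset_tpl.h code):

/* Simplified sketch: copy 'len' bytes in whole 32-byte chunks. Like the
 * template's non-safe variants it may write up to sizeof(chunk_t) - 1 bytes
 * past 'len', which is why a separate CHUNKCOPY_SAFE exists for use near the
 * end of the output buffer. */
static uint8_t *chunkcopy_sketch(uint8_t *out, uint8_t const *from, unsigned len) {
    chunk_t chunk;
    unsigned copied = 0;
    while (copied < len) {
        loadchunk(from + copied, &chunk);
        storechunk(out + copied, &chunk);
        copied += sizeof(chunk_t);
    }
    return out + len;
}
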
diff --git a/chunkset_tpl.h b/chunkset_tpl.h
index 1cd52f1d15c05f83d2c2ce8428daa92360af8146..60a8b48f21c16402dce2d4458b94f322cb5e9b18 100644 (file)
@@ -38,6 +38,11 @@ Z_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
 /* Behave like chunkcopy, but avoid writing beyond the legal output. */
 Z_INTERNAL uint8_t* CHUNKCOPY_SAFE(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe) {
     if ((safe - out) < (ptrdiff_t)sizeof(chunk_t)) {
+        if (sizeof(chunk_t) > 16 && (len & 16)) {
+            memcpy(out, from, 16);
+            out += 16;
+            from += 16;
+        }
         if (len & 8) {
             memcpy(out, from, 8);
             out += 8;
diff --git a/configure b/configure
index 18891ce5bf431cb066d97215ae72b92c8c4d53f2..5e3d5bb1c2a5bcaee5a1c347b1e35468b840cfc6 100755 (executable)
--- a/configure
+++ b/configure
@@ -1136,10 +1136,10 @@ case "${ARCH}" in
             ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo"
 
             if test ${HAVE_AVX2_INTRIN} -eq 1; then
-                CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32"
-                SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o compare258_avx.o adler32_avx.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo compare258_avx.lo adler32_avx.lo"
+                CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET"
+                SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32 -DX86_AVX_CHUNKSET"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o chunkset_avx.o compare258_avx.o adler32_avx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo chunkset_avx.lo compare258_avx.lo adler32_avx.lo"
             fi
 
             if test ${HAVE_SSE42CRC_INTRIN} -eq 1 || test ${HAVE_SSE42CRC_INLINE_ASM} -eq 1; then
diff --git a/functable.c b/functable.c
index 6f4f815665b5ac676b567eabf8e555a94f83e1f2..782e7fd0a189b0fa80a10930f19ba5c240fddba5 100644 (file)
@@ -72,6 +72,14 @@ extern uint8_t* chunkunroll_sse2(uint8_t *out, unsigned *dist, unsigned *len);
 extern uint8_t* chunkmemset_sse2(uint8_t *out, unsigned dist, unsigned len);
 extern uint8_t* chunkmemset_safe_sse2(uint8_t *out, unsigned dist, unsigned len, unsigned left);
 #endif
+#ifdef X86_AVX_CHUNKSET
+extern uint32_t chunksize_avx(void);
+extern uint8_t* chunkcopy_avx(uint8_t *out, uint8_t const *from, unsigned len);
+extern uint8_t* chunkcopy_safe_avx(uint8_t *out, uint8_t const *from, unsigned len, uint8_t *safe);
+extern uint8_t* chunkunroll_avx(uint8_t *out, unsigned *dist, unsigned *len);
+extern uint8_t* chunkmemset_avx(uint8_t *out, unsigned dist, unsigned len);
+extern uint8_t* chunkmemset_safe_avx(uint8_t *out, unsigned dist, unsigned len, unsigned left);
+#endif
 #ifdef ARM_NEON_CHUNKSET
 extern uint32_t chunksize_neon(void);
 extern uint8_t* chunkcopy_neon(uint8_t *out, uint8_t const *from, unsigned len);
@@ -240,6 +248,10 @@ Z_INTERNAL uint32_t chunksize_stub(void) {
 # endif
         functable.chunksize = &chunksize_sse2;
 #endif
+#ifdef X86_AVX_CHUNKSET
+    if (x86_cpu_has_avx2)
+        functable.chunksize = &chunksize_avx;
+#endif
 #ifdef ARM_NEON_CHUNKSET
     if (arm_cpu_has_neon)
         functable.chunksize = &chunksize_neon;
@@ -258,6 +270,10 @@ Z_INTERNAL uint8_t* chunkcopy_stub(uint8_t *out, uint8_t const *from, unsigned l
 # endif
         functable.chunkcopy = &chunkcopy_sse2;
 #endif
+#ifdef X86_AVX_CHUNKSET
+    if (x86_cpu_has_avx2)
+        functable.chunkcopy = &chunkcopy_avx;
+#endif
 #ifdef ARM_NEON_CHUNKSET
     if (arm_cpu_has_neon)
         functable.chunkcopy = &chunkcopy_neon;
@@ -276,6 +292,10 @@ Z_INTERNAL uint8_t* chunkcopy_safe_stub(uint8_t *out, uint8_t const *from, unsig
 # endif
         functable.chunkcopy_safe = &chunkcopy_safe_sse2;
 #endif
+#ifdef X86_AVX_CHUNKSET
+    if (x86_cpu_has_avx2)
+        functable.chunkcopy_safe = &chunkcopy_safe_avx;
+#endif
 #ifdef ARM_NEON_CHUNKSET
     if (arm_cpu_has_neon)
         functable.chunkcopy_safe = &chunkcopy_safe_neon;
@@ -294,6 +314,10 @@ Z_INTERNAL uint8_t* chunkunroll_stub(uint8_t *out, unsigned *dist, unsigned *len
 # endif
         functable.chunkunroll = &chunkunroll_sse2;
 #endif
+#ifdef X86_AVX_CHUNKSET
+    if (x86_cpu_has_avx2)
+        functable.chunkunroll = &chunkunroll_avx;
+#endif
 #ifdef ARM_NEON_CHUNKSET
     if (arm_cpu_has_neon)
         functable.chunkunroll = &chunkunroll_neon;
@@ -312,6 +336,10 @@ Z_INTERNAL uint8_t* chunkmemset_stub(uint8_t *out, unsigned dist, unsigned len)
 # endif
         functable.chunkmemset = &chunkmemset_sse2;
 #endif
+#ifdef X86_AVX_CHUNKSET
+    if (x86_cpu_has_avx2)
+        functable.chunkmemset = &chunkmemset_avx;
+#endif
 #ifdef ARM_NEON_CHUNKSET
     if (arm_cpu_has_neon)
         functable.chunkmemset = &chunkmemset_neon;
@@ -330,6 +358,10 @@ Z_INTERNAL uint8_t* chunkmemset_safe_stub(uint8_t *out, unsigned dist, unsigned
 # endif
         functable.chunkmemset_safe = &chunkmemset_safe_sse2;
 #endif
+#ifdef X86_AVX_CHUNKSET
+    if (x86_cpu_has_avx2)
+        functable.chunkmemset_safe = &chunkmemset_safe_avx;
+#endif
 #ifdef ARM_NEON_CHUNKSET
     if (arm_cpu_has_neon)
         functable.chunkmemset_safe = &chunkmemset_safe_neon;
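
All of the functable.c hunks follow the same lazy-dispatch pattern: each table entry initially points at a *_stub function that runs once, selects the best implementation the CPU supports (the AVX assignment sits after the SSE2 one, so the last matching check wins), overwrites the table slot and forwards the call. A condensed sketch of that pattern for chunksize; the chunksize_c baseline assignment and the trailing forwarding call are assumptions added here to make it self-contained, and the feature guards are simplified:

Z_INTERNAL uint32_t chunksize_stub(void) {
    functable.chunksize = &chunksize_c;        /* portable baseline (assumed name) */
#ifdef X86_SSE2_CHUNKSET
    if (x86_cpu_has_sse2)
        functable.chunksize = &chunksize_sse2;
#endif
#ifdef X86_AVX_CHUNKSET
    if (x86_cpu_has_avx2)                      /* new: prefer the AVX build when present */
        functable.chunksize = &chunksize_avx;
#endif
#ifdef ARM_NEON_CHUNKSET
    if (arm_cpu_has_neon)
        functable.chunksize = &chunksize_neon;
#endif
    return functable.chunksize();              /* later calls bypass the stub entirely */
}
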
diff --git a/win32/Makefile.msc b/win32/Makefile.msc
index 9bde1aa6795390b5b213670e7776e4ad048aa061..bf1c688186eea6d3a0b79bbc0494f69e69573c08 100644 (file)
@@ -32,6 +32,7 @@ WFLAGS  = \
        -DX86_SSE42_CRC_INTRIN \
        -DX86_SSE42_CRC_HASH \
        -DX86_AVX2 \
+       -DX86_AVX_CHUNKSET \
        -DX86_SSE2_CHUNKSET \
        -DUNALIGNED_OK \
        -DUNALIGNED64_OK \
@@ -49,6 +50,7 @@ SUFFIX =
 OBJS = \
        adler32.obj \
        chunkset.obj \
+       chunkset_avx.obj \
        chunkset_sse.obj \
        compare258.obj \
        compare258_avx.obj \
@@ -166,6 +168,7 @@ gzwrite.obj: $(SRCDIR)/gzwrite.c $(SRCDIR)/zbuild.h $(SRCDIR)/gzguts.h
 compress.obj: $(SRCDIR)/compress.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 uncompr.obj: $(SRCDIR)/uncompr.c $(SRCDIR)/zbuild.h $(SRCDIR)/zlib$(SUFFIX).h
 chunkset.obj: $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
+chunkset_avx.obj: $(SRCDIR)/arch/x86/chunkset_avx.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 chunkset_sse.obj: $(SRCDIR)/arch/x86/chunkset_sse.c $(SRCDIR)/zbuild.h $(SRCDIR)/zutil.h
 crc32.obj: $(SRCDIR)/crc32.c $(SRCDIR)/zbuild.h $(SRCDIR)/zendian.h $(SRCDIR)/deflate.h $(SRCDIR)/functable.h $(SRCDIR)/crc32_tbl.h
 deflate.obj: $(SRCDIR)/deflate.c $(SRCDIR)/zbuild.h $(SRCDIR)/deflate.h $(SRCDIR)/deflate_p.h $(SRCDIR)/functable.h