git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Added support for AVX2 intrinsics to compare258.
author: Nathan Moinvaziri <nathan@nathanm.com>
Thu, 7 May 2020 14:54:37 +0000 (07:54 -0700)
committer: Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 24 May 2020 11:53:25 +0000 (13:53 +0200)
CMakeLists.txt
arch/x86/Makefile.in
arch/x86/compare258_avx.c [new file with mode: 0644]
configure
functable.c
win32/Makefile.msc

index c3b3edcb0f7907ca592514aeb0bd8816daa526ab..ee223e421e116ccfc0adf1c178e4226bc0e2d9ee 100644 (file)
@@ -715,6 +715,8 @@ if(WITH_OPTIM)
             add_definitions(-DX86_AVX2)
             list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c)
             add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
+            list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/compare258_avx.c)
+            add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"")
             add_intrinsics_option("${AVX2FLAG}")
         endif()
         if(WITH_SSE4 AND (HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN))
index 365f58a6838b54f442c41d5e2ffb912f99e0fe9a..c119591073dad55df3bb05e1da6df9b53ad108d6 100644 (file)
@@ -17,7 +17,7 @@ SRCDIR=.
 SRCTOP=../..
 TOPDIR=$(SRCTOP)
 
-all: x86.o x86.lo compare258_sse.o compare258_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
+all: x86.o x86.lo compare258_avx.o compare258_avx.lo compare258_sse.o compare258_sse.lo deflate_quick.o deflate_quick.lo insert_string_sse.o insert_string_sse.lo crc_folding.o crc_folding.lo slide_avx.o slide_avx.lo slide_sse.o slide_sse.lo
 
 x86.o:
        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
@@ -25,6 +25,12 @@ x86.o:
 x86.lo:
        $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
 
+compare258_avx.o:
+       $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+
+compare258_avx.lo:
+       $(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+
 compare258_sse.o:
        $(CC) $(CFLAGS) $(SSE4FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
 
diff --git a/arch/x86/compare258_avx.c b/arch/x86/compare258_avx.c
new file mode 100644 (file)
index 0000000..10096ea
--- /dev/null
@@ -0,0 +1,56 @@
+/* compare258_avx.c -- AVX2 version of compare258
+ * Copyright Mika T. Lindqvist  <postmaster@raasu.org>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <immintrin.h>
+#ifdef _MSC_VER
+#  include <nmmintrin.h>
+#endif
+
+/* UNALIGNED_OK, AVX2 intrinsic comparison */
+int32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
+    const unsigned char *src0start = src0;
+    const unsigned char *src0end = src0 + 256;
+    do {
+        __m256i ymm_src0, ymm_src1, ymm_cmp;
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
+        int mask = _mm256_movemask_epi8(ymm_cmp); 
+        if ((unsigned int)mask != 0xFFFFFFFF) {
+            int match_byte = __builtin_ctz(~mask); /* Invert bits so identical = 0 */
+            return (int32_t)(src0 - src0start + match_byte);
+        }
+
+        src0 += 32, src1 += 32;
+
+        ymm_src0 = _mm256_loadu_si256((__m256i*)src0);
+        ymm_src1 = _mm256_loadu_si256((__m256i*)src1);
+        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
+        mask = _mm256_movemask_epi8(ymm_cmp); 
+        if ((unsigned int)mask != 0xFFFFFFFF) {
+            int match_byte = __builtin_ctz(~mask);
+            return (int32_t)(src0 - src0start + match_byte);
+        }
+
+        src0 += 32, src1 += 32;
+    } while (src0 < src0end);
+
+    if (*(uint16_t *)src0 == *(uint16_t *)src1)
+        src0 += 2, src1 += 2;
+    else if (*src0 == *src1)
+        src0 += 1, src1 += 1;
+
+    return (int32_t)(src0 - src0start);
+}
+
+#endif
index 463ae44c14e339d1189c2d030777ca52db059ca6..50172fddc080098f507d6c45049f734cbf8bd9bb 100755 (executable)
--- a/configure
+++ b/configure
@@ -1095,8 +1095,8 @@ case "${ARCH}" in
             if test ${HAVE_AVX2_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_AVX2"
                 SFLAGS="${SFLAGS} -DX86_AVX2"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo"
             fi
 
             CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
@@ -1144,8 +1144,8 @@ case "${ARCH}" in
             if test ${HAVE_AVX2_INTRIN} -eq 1; then
                 CFLAGS="${CFLAGS} -DX86_AVX2"
                 SFLAGS="${SFLAGS} -DX86_AVX2"
-                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} slide_avx.o"
-                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} slide_avx.lo"
+                ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o"
+                ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo"
             fi
 
             if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
index 943c2b41fd691be84cfd21330e298951767a8661..a95cfc5504b58b1a1536c18d2ce520e9700aed6d 100644 (file)
@@ -72,6 +72,9 @@ extern int32_t compare258_unaligned_64(const unsigned char *src0, const unsigned
 #ifdef X86_SSE42_CMP_STR
 extern int32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1);
 #endif
+#if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+extern int32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1);
+#endif
 #endif
 
 /* stub definitions */
@@ -223,6 +226,10 @@ ZLIB_INTERNAL int32_t compare258_stub(const unsigned char *src0, const unsigned
     if (x86_cpu_has_sse42)
         functable.compare258 = &compare258_unaligned_sse4;
 #  endif
+#  if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ)
+    if (x86_cpu_has_avx2)
+        functable.compare258 = &compare258_unaligned_avx2;
+#  endif
 #endif
 
     return functable.compare258(src0, src1);
index abb9808b92d2b0d98340ebb9d16cd04056d711c3..ec24b45f016323230518164d3a8dbc3c61646688 100644 (file)
@@ -34,7 +34,7 @@ WITH_GZFILEOP =
 ZLIB_COMPAT =
 SUFFIX =
 
-OBJS = adler32.obj compare258.obj compare258_sse.obj compress.obj crc32.obj \
+OBJS = adler32.obj compare258.obj compare258_avx.obj compare258_sse.obj compress.obj crc32.obj \
        deflate.obj deflate_fast.obj deflate_quick.obj deflate_slow.obj deflate_medium.obj \
        functable.obj infback.obj inflate.obj inftrees.obj inffast.obj insert_string.obj \
        slide_avx.obj slide_sse.obj trees.obj uncompr.obj zutil.obj \