From: Nathan Moinvaziri Date: Sat, 23 May 2020 04:35:26 +0000 (-0700) Subject: Added Adler32 SSSE3 and AVX2 implementations to functable. X-Git-Tag: 1.9.9-b1~226 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f8801a00b02c8796c6dddde8a30adc18a1a80477;p=thirdparty%2Fzlib-ng.git Added Adler32 SSSE3 and AVX2 implementations to functable. Co-authored-by: Brian Bockelman Co-authored-by: Mika T. Lindqvist --- diff --git a/CMakeLists.txt b/CMakeLists.txt index e3a292c0..4d1fac04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,12 +99,13 @@ elseif(BASEARCH_S360_FOUND AND "${ARCH}" MATCHES "s390x") elseif(BASEARCH_X86_FOUND) option(WITH_AVX2 "Build with AVX2" ON) option(WITH_SSE2 "Build with SSE2" ON) + option(WITH_SSSE3 "Build with SSSE3" ON) option(WITH_SSE4 "Build with SSE4" ON) option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON) endif() mark_as_advanced(FORCE ZLIB_DUAL_LINK WITH_ACLE WITH_NEON WITH_DFLTCC_DEFLATE WITH_DFLTCC_INFLATE - WITH_AVX2 WITH_SSE2 WITH_SSE4 WITH_PCLMULQDQ WITH_POWER8 WITH_INFLATE_STRICT WITH_INFLATE_ALLOW_INVALID_DIST) + WITH_AVX2 WITH_SSE2 WITH_SSSE3 WITH_SSE4 WITH_PCLMULQDQ WITH_POWER8 WITH_INFLATE_STRICT WITH_INFLATE_ALLOW_INVALID_DIST) add_feature_info(ZLIB_COMPAT ZLIB_COMPAT "Provide a zlib-compatible API") add_feature_info(WITH_GZFILEOP WITH_GZFILEOP "Compile with support for gzFile-related functions") @@ -145,6 +146,7 @@ if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${ if(BASEARCH_X86_FOUND) set(AVX2FLAG "-mavx2") set(SSE2FLAG "-msse2") + set(SSSE3FLAG "-mssse3") set(SSE4FLAG "-msse4.2") endif() else() @@ -154,6 +156,7 @@ if(${CMAKE_C_COMPILER} MATCHES "icc" OR ${CMAKE_C_COMPILER} MATCHES "icpc" OR ${ if(BASEARCH_X86_FOUND) set(AVX2FLAG "/arch:AVX2") set(SSE2FLAG "/arch:SSE2") + set(SSSE3FLAG "/arch:SSSE3") set(SSE4FLAG "/arch:SSE4.2") endif() endif() @@ -232,6 +235,7 @@ else() elseif(BASEARCH_X86_FOUND) set(AVX2FLAG "-mavx2") set(SSE2FLAG "-msse2") + set(SSSE3FLAG "-mssse3") set(SSE4FLAG "-msse4") set(PCLMULFLAG "-mpclmul") endif() @@ -245,6 +249,7 @@ else() elseif(BASEARCH_X86_FOUND) set(AVX2FLAG ${NATIVEFLAG}) set(SSE2FLAG ${NATIVEFLAG}) + set(SSSE3FLAG ${NATIVEFLAG}) set(SSE4FLAG ${NATIVEFLAG}) set(PCLMULFLAG ${NATIVEFLAG}) endif() @@ -456,8 +461,26 @@ elseif(BASEARCH_X86_FOUND) }" HAVE_SSE2_INTRIN ) - set(CMAKE_REQUIRED_FLAGS) + # Check whether compiler supports SSSE3 intrinsics + if(WITH_NATIVE_INSTRUCTIONS) + set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") + else() + set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG}") + endif() + check_c_source_compile_or_run( + "#include + int main(void) + { + __m128i u, v, w; + u = _mm_set1_epi32(1); + v = _mm_set1_epi32(2); + w = _mm_hadd_epi32(u, v); + (void)w; + return 0; + }" + HAVE_SSSE3_INTRIN + ) # Check whether compiler supports SSE4 CRC inline asm if(WITH_NATIVE_INSTRUCTIONS) set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") @@ -508,8 +531,6 @@ elseif(BASEARCH_X86_FOUND) }" HAVE_SSE42CMPSTR_INTRIN ) - set(CMAKE_REQUIRED_FLAGS) - # Check whether compiler supports PCLMULQDQ intrinsics if(WITH_NATIVE_INSTRUCTIONS) set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") @@ -533,8 +554,6 @@ elseif(BASEARCH_X86_FOUND) else() set(HAVE_PCLMULQDQ_INTRIN NO) endif() - set(CMAKE_REQUIRED_FLAGS) - # Check whether compiler supports AVX2 intrinics if(WITH_NATIVE_INSTRUCTIONS) set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}") @@ -679,11 +698,12 @@ if(WITH_OPTIM) list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h) endif() if(WITH_AVX2 AND HAVE_AVX2_INTRIN) - add_definitions(-DX86_AVX2) - list(APPEND ZLIB_ARCH_SRCS 
${ARCHDIR}/slide_avx.c) + add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c ${ARCHDIR}/adler32_avx.c) add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"") list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/compare258_avx.c) add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"") + add_feature_info(AVX2_ADLER32 1 "Support AVX2-accelerated adler32, using \"${AVX2FLAG}\"") add_intrinsics_option("${AVX2FLAG}") endif() if(WITH_SSE4 AND (HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)) @@ -711,6 +731,12 @@ if(WITH_OPTIM) endif() endif() endif() + if(WITH_SSSE3 AND HAVE_SSSE3_INTRIN) + add_definitions(-DX86_SSSE3 -DX86_SSSE3_ADLER32) + list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_ssse3.c) + add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"") + add_intrinsics_option("${SSSE3FLAG}") + endif() if(WITH_PCLMULQDQ AND HAVE_PCLMULQDQ_INTRIN) add_definitions(-DX86_PCLMULQDQ_CRC) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/crc_folding.c) diff --git a/adler32.cxx b/adler32.cxx deleted file mode 100644 index b30680f2..00000000 --- a/adler32.cxx +++ /dev/null @@ -1,412 +0,0 @@ -/* adler32.c -- compute the Adler-32 checksum of a data stream - * Copyright (C) 1995-2011 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -/* @(#) $Id$ */ - -#include "zutil.h" -#include -#include - -#include - -static uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2)); - -#define BASE 65521 /* largest prime smaller than 65536 */ -#define NMAX 5552 -/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ - -/* - * As we are using _signed_ integer arithmetic for the SSE/AVX2 implementations, - * we consider the max as 2^31-1 - */ -#define NMAX_VEC 5552 - -#define NMAX_VEC2 5552 - -#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;} -#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1); -#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2); -#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4); -#define DO16(buf) DO8(buf,0); DO8(buf,8); - -/* use NO_DIVIDE if your processor does not do division in hardware -- - try it both ways to see which is faster */ -#ifdef NO_DIVIDE -/* note that this assumes BASE is 65521, where 65536 % 65521 == 15 - (thank you to John Reiser for pointing this out) */ -# define CHOP(a) \ - do { \ - unsigned long tmp = a >> 16; \ - a &= 0xffffUL; \ - a += (tmp << 4) - tmp; \ - } while (0) -# define MOD28(a) \ - do { \ - CHOP(a); \ - if (a >= BASE) a -= BASE; \ - } while (0) -# define MOD(a) \ - do { \ - CHOP(a); \ - MOD28(a); \ - } while (0) -# define MOD63(a) \ - do { /* this assumes a is not negative */ \ - z_off64_t tmp = a >> 32; \ - a &= 0xffffffffL; \ - a += (tmp << 8) - (tmp << 5) + tmp; \ - tmp = a >> 16; \ - a &= 0xffffL; \ - a += (tmp << 4) - tmp; \ - tmp = a >> 16; \ - a &= 0xffffL; \ - a += (tmp << 4) - tmp; \ - if (a >= BASE) a -= BASE; \ - } while (0) -#else -# define MOD(a) a %= BASE -# define MOD28(a) a %= BASE -# define MOD63(a) a %= BASE -#endif - -/* ========================================================================= */ -extern "C" { -uLong ZEXPORT adler32_serial(uLong adler, const Bytef *buf, uInt len) -{ - - unsigned long sum2; - unsigned n; - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* in case user likes doing a byte at a time, keep it fast */ - if (len == 1) { - adler += buf[0]; - if (adler >= BASE) - adler -= BASE; 
- sum2 += adler; - if (sum2 >= BASE) - sum2 -= BASE; - return adler | (sum2 << 16); - } - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (buf == Z_NULL) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (len < 16) { - while (len--) { - adler += *buf++; - sum2 += adler; - } - if (adler >= BASE) - adler -= BASE; - MOD28(sum2); /* only added so many BASE's */ - return adler | (sum2 << 16); - } - - /* do length NMAX blocks -- requires just one modulo operation */ - while (len >= NMAX) { - len -= NMAX; - n = NMAX / 16; /* NMAX is divisible by 16 */ - do { - DO16(buf); /* 16 sums unrolled */ - buf += 16; - } while (--n); - MOD(adler); - MOD(sum2); - } - - /* do remaining bytes (less than NMAX, still just one modulo) */ - if (len) { /* avoid modulos if none remaining */ - while (len >= 16) { - len -= 16; - DO16(buf); - buf += 16; - } - while (len--) { - adler += *buf++; - sum2 += adler; - } - MOD(adler); - MOD(sum2); - } - - /* return recombined sums */ - return adler | (sum2 << 16); -} - -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) - -/* ========================================================================= */ -uLong ZEXPORT adler32_vec(uLong adler, const Bytef *buf, uInt len) -{ - - unsigned long sum2; - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* in case user likes doing a byte at a time, keep it fast */ - if (unlikely(len == 1)) { - adler += buf[0]; - if (adler >= BASE) - adler -= BASE; - sum2 += adler; - if (sum2 >= BASE) - sum2 -= BASE; - return adler | (sum2 << 16); - } - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (unlikely(buf == Z_NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (unlikely(len < 16)) { - while (len--) { - adler += *buf++; - sum2 += adler; - } - if (adler >= BASE) - adler -= BASE; - MOD28(sum2); /* only added so many BASE's */ - return adler | (sum2 << 16); - } - - uint32_t __attribute__ ((aligned(16))) s1[4], s2[4]; - s1[0] = s1[1] = s1[2] = 0; s1[3] = adler; - s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2; - char __attribute__ ((aligned(16))) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m128i dot1v = _mm_load_si128((__m128i*)dot1); - char __attribute__ ((aligned(16))) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - __m128i dot2v = _mm_load_si128((__m128i*)dot2); - short __attribute__ ((aligned(16))) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1}; - __m128i dot3v = _mm_load_si128((__m128i*)dot3); - // We will need to multiply by - //char __attribute__ ((aligned(16))) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4}; - char __attribute__ ((aligned(16))) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - __m128i shiftv = _mm_load_si128((__m128i*)shift); - while (len >= 16) { - __m128i vs1 = _mm_load_si128((__m128i*)s1); - __m128i vs2 = _mm_load_si128((__m128i*)s2); - __m128i vs1_0 = vs1; - int k = (len < NMAX_VEC ? (int)len : NMAX_VEC); - k -= k % 16; - len -= k; - while (k >= 16) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) - - NOTE: 256-bit equivalents are: - _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts - _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t. - We could rewrite the below to use 256-bit instructions instead of 128-bit. 
- */ - __m128i vbuf = _mm_loadu_si128((__m128i*)buf); - buf += 16; - k -= 16; - __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts. - __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t; - __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); - vs1 = _mm_add_epi32(vsum1, vs1); - __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); - vs1_0 = _mm_sll_epi32(vs1_0, shiftv); - vsum2 = _mm_add_epi32(vsum2, vs2); - vs2 = _mm_add_epi32(vsum2, vs1_0); - vs1_0 = vs1; - } - // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that - // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on. - uint32_t __attribute__((aligned(16))) s1_unpack[4]; - uint32_t __attribute__((aligned(16))) s2_unpack[4]; - _mm_store_si128((__m128i*)s1_unpack, vs1); - _mm_store_si128((__m128i*)s2_unpack, vs2); - adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE); - MOD(adler); - s1[3] = adler; - sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE); - MOD(sum2); - s2[3] = sum2; - } - - while (len--) { - adler += *buf++; - sum2 += adler; - } - MOD(adler); - MOD(sum2); - - /* return recombined sums */ - return adler | (sum2 << 16); -} - -/* ========================================================================= */ -uLong ZEXPORT adler32_avx(uLong adler, const Bytef *buf, uInt len) -{ - - unsigned long sum2; - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* in case user likes doing a byte at a time, keep it fast */ - if (unlikely(len == 1)) { - adler += buf[0]; - if (adler >= BASE) - adler -= BASE; - sum2 += adler; - if (sum2 >= BASE) - sum2 -= BASE; - return adler | (sum2 << 16); - } - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (unlikely(buf == Z_NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (unlikely(len < 32)) { - while (len--) { - adler += *buf++; - sum2 += adler; - } - if (adler >= BASE) - adler -= BASE; - MOD28(sum2); /* only added so many BASE's */ - return adler | (sum2 << 16); - } - - uint32_t __attribute__ ((aligned(32))) s1[8], s2[8]; - memset(s1, '\0', sizeof(uint32_t)*7); s1[7] = adler; // TODO: would a masked load be faster? - memset(s2, '\0', sizeof(uint32_t)*7); s2[7] = sum2; - char __attribute__ ((aligned(32))) dot1[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m256i dot1v = _mm256_load_si256((__m256i*)dot1); - char __attribute__ ((aligned(32))) dot2[32] = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - __m256i dot2v = _mm256_load_si256((__m256i*)dot2); - short __attribute__ ((aligned(32))) dot3[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m256i dot3v = _mm256_load_si256((__m256i*)dot3); - // We will need to multiply by - char __attribute__ ((aligned(16))) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - __m128i shiftv = _mm_load_si128((__m128i*)shift); - while (len >= 32) { - __m256i vs1 = _mm256_load_si256((__m256i*)s1); - __m256i vs2 = _mm256_load_si256((__m256i*)s2); - __m256i vs1_0 = vs1; - int k = (len < NMAX_VEC ? 
(int)len : NMAX_VEC); - k -= k % 32; - len -= k; - while (k >= 32) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) - */ - __m256i vbuf = _mm256_loadu_si256((__m256i*)buf); - buf += 32; - k -= 32; - __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts. - __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t; - __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); - vs1 = _mm256_add_epi32(vsum1, vs1); - __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); - vs1_0 = _mm256_sll_epi32(vs1_0, shiftv); - vsum2 = _mm256_add_epi32(vsum2, vs2); - vs2 = _mm256_add_epi32(vsum2, vs1_0); - vs1_0 = vs1; - } - // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that - // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on. - uint32_t __attribute__((aligned(32))) s1_unpack[8]; - uint32_t __attribute__((aligned(32))) s2_unpack[8]; - _mm256_store_si256((__m256i*)s1_unpack, vs1); - _mm256_store_si256((__m256i*)s2_unpack, vs2); - adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE); - MOD(adler); - s1[7] = adler; - sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) + (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE); - MOD(sum2); - s2[7] = sum2; - } - - while (len--) { - adler += *buf++; - sum2 += adler; - } - MOD(adler); - MOD(sum2); - - /* return recombined sums */ - return adler | (sum2 << 16); -} -} - -__attribute__ ((target ("default"))) -static uLong adler32_impl(uLong adler, const Bytef *buf, uInt len) -{ - return adler32_serial(adler, buf, len); -} - -__attribute__ ((target ("sse4.2"))) -//__attribute__ ((target ("mmx"))) -static uLong adler32_impl(uLong adler, const Bytef *buf, uInt len) -{ - return adler32_vec(adler, buf, len); -} - -__attribute__ ((target ("avx2"))) -static uLong adler32_impl(uLong adler, const Bytef *buf, uInt len) -{ - return adler32_avx(adler, buf, len); -} - -extern "C" { -uLong ZEXPORT adler32(uLong adler, const Bytef *buf, uInt len) {return adler32_impl(adler, buf, len);} -} - -/* ========================================================================= */ -static uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2) -{ - unsigned long sum1; - unsigned long sum2; - unsigned rem; - - /* for negative len, return invalid adler32 as a clue for debugging */ - if (len2 < 0) - return 0xffffffffUL; - - /* the derivation of this formula is left as an exercise for the reader */ - MOD63(len2); /* assumes len2 >= 0 */ - rem = (unsigned)len2; - sum1 = adler1 & 0xffff; - sum2 = rem * sum1; - MOD(sum2); - sum1 += (adler2 & 0xffff) + BASE - 1; - sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem; - if (sum1 >= BASE) sum1 -= BASE; - if (sum1 >= BASE) sum1 -= BASE; - if (sum2 >= (BASE << 1)) sum2 -= (BASE << 1); - if (sum2 >= BASE) sum2 -= BASE; - return sum1 | (sum2 << 16); -} - -extern "C" { -/* ========================================================================= */ -uLong adler32_combine(uLong adler1, uLong adler2, z_off_t len2) -{ - return adler32_combine_(adler1, adler2, len2); -} - -uLong adler32_combine64(uLong adler1, uLong adler2, z_off64_t len2) -{ - return adler32_combine_(adler1, adler2, len2); -} -} diff --git 
a/arch/x86/Makefile.in b/arch/x86/Makefile.in index 4f8a753b..f0f44526 100644 --- a/arch/x86/Makefile.in +++ b/arch/x86/Makefile.in @@ -10,6 +10,7 @@ SUFFIX= AVX2FLAG=-mavx2 SSE2FLAG=-msse2 +SSSE3FLAG=-mssse3 SSE4FLAG=-msse4 PCLMULFLAG=-mpclmul @@ -19,6 +20,8 @@ TOPDIR=$(SRCTOP) all: \ x86.o x86.lo \ + adler32_avx.o adler32.lo \ + adler32_ssse3.o adler32_ssse3.lo \ compare258_avx.o compare258_avx.lo \ compare258_sse.o compare258_sse.lo \ insert_string_sse.o insert_string_sse.lo \ @@ -68,6 +71,18 @@ slide_sse.o: slide_sse.lo: $(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c +adler32_avx.o: $(SRCDIR)/adler32_avx.c + $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c + +adler32_avx.lo: $(SRCDIR)/adler32_avx.c + $(CC) $(SFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c + +adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c + $(CC) $(CFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c + +adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c + $(CC) $(SFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c + mostlyclean: clean clean: rm -f *.o *.lo *~ diff --git a/arch/x86/adler32_avx.c b/arch/x86/adler32_avx.c new file mode 100644 index 00000000..69d8a11c --- /dev/null +++ b/arch/x86/adler32_avx.c @@ -0,0 +1,103 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Brian Bockelman + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_AVX2_H +#define ADLER32_AVX2_H + +#include "../../zbuild.h" +#include "../../zutil.h" + +#include "../../adler32_p.h" + +#include + +#ifdef X86_AVX2_ADLER32 + +uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) { + uint32_t sum2; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (len == 1) + return adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (buf == NULL) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (len < 16) + return adler32_len_16(adler, buf, len, sum2); + + uint32_t ALIGNED_(32) s1[8], s2[8]; + memset(s1, '\0', sizeof(uint32_t)*7); s1[7] = adler; // TODO: would a masked load be faster? + memset(s2, '\0', sizeof(uint32_t)*7); s2[7] = sum2; + char ALIGNED_(32) dot1[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m256i dot1v = _mm256_load_si256((__m256i*)dot1); + char ALIGNED_(32) dot2[32] = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + __m256i dot2v = _mm256_load_si256((__m256i*)dot2); + short ALIGNED_(32) dot3[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m256i dot3v = _mm256_load_si256((__m256i*)dot3); + // We will need to multiply by + char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + __m128i shiftv = _mm_load_si128((__m128i*)shift); + while (len >= 32) { + __m256i vs1 = _mm256_load_si256((__m256i*)s1); + __m256i vs2 = _mm256_load_si256((__m256i*)s2); + __m256i vs1_0 = vs1; + int k = (len < NMAX ? 
(int)len : NMAX); + k -= k % 32; + len -= k; + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + __m256i vbuf = _mm256_loadu_si256((__m256i*)buf); + buf += 32; + k -= 32; + __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts. + __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t; + __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); + vs1 = _mm256_add_epi32(vsum1, vs1); + __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); + vs1_0 = _mm256_sll_epi32(vs1_0, shiftv); + vsum2 = _mm256_add_epi32(vsum2, vs2); + vs2 = _mm256_add_epi32(vsum2, vs1_0); + vs1_0 = vs1; + } + // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that + // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on. + uint32_t ALIGNED_(32) s1_unpack[8]; + uint32_t ALIGNED_(32) s2_unpack[8]; + _mm256_store_si256((__m256i*)s1_unpack, vs1); + _mm256_store_si256((__m256i*)s2_unpack, vs2); + adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE); + MOD(adler); + s1[7] = adler; + sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) + (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE); + MOD(sum2); + s2[7] = sum2; + } + + while (len--) { + adler += *buf++; + sum2 += adler; + } + MOD(adler); + MOD(sum2); + + /* return recombined sums */ + return adler | (sum2 << 16); +} + +#endif + +#endif diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c new file mode 100644 index 00000000..45638e4f --- /dev/null +++ b/arch/x86/adler32_ssse3.c @@ -0,0 +1,109 @@ +/* adler32.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Brian Bockelman + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_SSSE3_H +#define ADLER32_SSSE3_H + +#include "../../zbuild.h" +#include "../../zutil.h" + +#include "../../adler32_p.h" + +#ifdef X86_SSSE3_ADLER32 + +#include + +uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) { + uint32_t sum2; + + /* split Adler-32 into component sums */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + /* in case user likes doing a byte at a time, keep it fast */ + if (len == 1) + return adler32_len_1(adler, buf, sum2); + + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (buf == NULL) + return 1L; + + /* in case short lengths are provided, keep it somewhat fast */ + if (len < 16) + return adler32_len_16(adler, buf, len, sum2); + + uint32_t ALIGNED_(16) s1[4], s2[4]; + s1[0] = s1[1] = s1[2] = 0; s1[3] = adler; + s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2; + char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + __m128i dot1v = _mm_load_si128((__m128i*)dot1); + char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + __m128i dot2v = _mm_load_si128((__m128i*)dot2); + short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1}; + __m128i dot3v = _mm_load_si128((__m128i*)dot3); + // We will need to multiply by + //char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4}; + char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + __m128i 
shiftv = _mm_load_si128((__m128i*)shift); + while (len >= 16) { + __m128i vs1 = _mm_load_si128((__m128i*)s1); + __m128i vs2 = _mm_load_si128((__m128i*)s2); + __m128i vs1_0 = vs1; + int k = (len < NMAX ? (int)len : NMAX); + k -= k % 16; + len -= k; + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + + NOTE: 256-bit equivalents are: + _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts + _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t. + We could rewrite the below to use 256-bit instructions instead of 128-bit. + */ + __m128i vbuf = _mm_loadu_si128((__m128i*)buf); + buf += 16; + k -= 16; + __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts. + __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t; + __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); + vs1 = _mm_add_epi32(vsum1, vs1); + __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vs1_0 = _mm_sll_epi32(vs1_0, shiftv); + vsum2 = _mm_add_epi32(vsum2, vs2); + vs2 = _mm_add_epi32(vsum2, vs1_0); + vs1_0 = vs1; + } + // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that + // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on. + uint32_t ALIGNED_(16) s1_unpack[4]; + uint32_t ALIGNED_(16) s2_unpack[4]; + _mm_store_si128((__m128i*)s1_unpack, vs1); + _mm_store_si128((__m128i*)s2_unpack, vs2); + adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE); + MOD(adler); + s1[3] = adler; + sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE); + MOD(sum2); + s2[3] = sum2; + } + + while (len--) { + adler += *buf++; + sum2 += adler; + } + MOD(adler); + MOD(sum2); + + /* return recombined sums */ + return adler | (sum2 << 16); +} + +#endif + +#endif \ No newline at end of file diff --git a/arch/x86/x86.c b/arch/x86/x86.c index 92f94004..79f42035 100644 --- a/arch/x86/x86.c +++ b/arch/x86/x86.c @@ -19,6 +19,7 @@ ZLIB_INTERNAL int x86_cpu_has_avx2; ZLIB_INTERNAL int x86_cpu_has_sse2; +ZLIB_INTERNAL int x86_cpu_has_ssse3; ZLIB_INTERNAL int x86_cpu_has_sse42; ZLIB_INTERNAL int x86_cpu_has_pclmulqdq; ZLIB_INTERNAL int x86_cpu_has_tzcnt; @@ -60,6 +61,7 @@ void ZLIB_INTERNAL x86_check_features(void) { cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx); x86_cpu_has_sse2 = edx & 0x4000000; + x86_cpu_has_ssse3 = ecx & 0x200; x86_cpu_has_sse42 = ecx & 0x100000; x86_cpu_has_pclmulqdq = ecx & 0x2; diff --git a/arch/x86/x86.h b/arch/x86/x86.h index 3e212a48..243a807f 100644 --- a/arch/x86/x86.h +++ b/arch/x86/x86.h @@ -8,6 +8,7 @@ extern int x86_cpu_has_avx2; extern int x86_cpu_has_sse2; +extern int x86_cpu_has_ssse3; extern int x86_cpu_has_sse42; extern int x86_cpu_has_pclmulqdq; extern int x86_cpu_has_tzcnt; diff --git a/configure b/configure index 66380d0f..c2cd6f17 100755 --- a/configure +++ b/configure @@ -102,6 +102,7 @@ native=0 forcesse2=0 avx2flag="-mavx2" sse2flag="-msse2" +ssse3flag="-mssse3" sse4flag="-msse4" sse42flag="-msse4.2" pclmulflag="-mpclmul" @@ -915,6 +916,28 @@ EOF ;; esac +# Check for SSSE3 intrinsics + +cat > $test.c << EOF +#include +int main(void) +{ + __m128i u, v, w; + u = _mm_set1_epi32(1); + v = _mm_set1_epi32(2); + w = _mm_hadd_epi32(u, v); + (void)w; + return 0; +} +EOF +if try ${CC} ${CFLAGS} ${ssse3flag} $test.c; then + echo "Checking for SSSE3 intrinsics ... Yes." 
| tee -a configure.log + HAVE_SSSE3_INTRIN=1 +else + echo "Checking for SSSE3 intrinsics ... No." | tee -a configure.log + HAVE_SSSE3_INTRIN=0 +fi + # Check for SSE4.2 CRC intrinsics case "${ARCH}" in i386 | i486 | i586 | i686 | x86_64) @@ -1098,6 +1121,13 @@ case "${ARCH}" in SFLAGS="${SFLAGS} -DX86_NOCHECK_SSE2" fi fi + + if test ${HAVE_SSSE3_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32" + SFLAGS="${SFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo" + fi if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN" @@ -1113,10 +1143,10 @@ case "${ARCH}" in fi if test ${HAVE_AVX2_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_AVX2" - SFLAGS="${SFLAGS} -DX86_AVX2" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo" + CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32" + SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o adler32_avx.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo adler32_avx.o" fi CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH" @@ -1149,6 +1179,13 @@ case "${ARCH}" in ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o slide_sse.o" ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo slide_sse.lo" + if test ${HAVE_SSSE3_INTRIN} -eq 1; then + CFLAGS="${CFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32" + SFLAGS="${SFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo" + fi + if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN" SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN" @@ -1162,10 +1199,10 @@ case "${ARCH}" in fi if test ${HAVE_AVX2_INTRIN} -eq 1; then - CFLAGS="${CFLAGS} -DX86_AVX2" - SFLAGS="${SFLAGS} -DX86_AVX2" - ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o" - ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo" + CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32" + SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32" + ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o adler32_avx.o" + ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo adler32_avx.lo" fi if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then @@ -1442,6 +1479,7 @@ echo prefix = $prefix >> configure.log echo sharedlibdir = $sharedlibdir >> configure.log echo uname = $uname >> configure.log echo sse2flag = $sse2flag >> configure.log +echo ssse3flag = $ssse3flag >> configure.log echo sse4flag = $sse4flag >> configure.log echo pclmulflag = $pclmulflag >> configure.log echo ARCHDIR = ${ARCHDIR} >> configure.log @@ -1571,6 +1609,7 @@ sed < $SRCDIR/$ARCHDIR/Makefile.in " /^TOPDIR *=/s#=.*#=$BUILDDIR# /^AVX2FLAG *=/s#=.*#=$avx2flag# /^SSE2FLAG *=/s#=.*#=$sse2flag# +/^SSSE3FLAG *=/s#=.*#=$ssse3flag# /^SSE4FLAG *=/s#=.*#=$sse4flag# /^PCLMULFLAG *=/s#=.*#=$pclmulflag# " > $ARCHDIR/Makefile diff --git a/functable.c b/functable.c index e25da331..4ba575cf 100644 --- a/functable.c +++ b/functable.c @@ -47,6 +47,12 @@ extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len); #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(ARM_NEON_ADLER32) extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len); #endif +#ifdef X86_SSSE3_ADLER32 
+extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX2_ADLER32
+extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
 
 /* CRC32 */
 ZLIB_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
@@ -195,6 +201,14 @@ ZLIB_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, si
 # endif
         functable.adler32 = &adler32_neon;
 #endif
+#ifdef X86_SSSE3_ADLER32
+    if (x86_cpu_has_ssse3)
+        functable.adler32 = &adler32_ssse3;
+#endif
+#ifdef X86_AVX2_ADLER32
+    if (x86_cpu_has_avx2)
+        functable.adler32 = &adler32_avx2;
+#endif
 
     return functable.adler32(adler, buf, len);
 }
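
Note: the deleted adler32.cxx and the new adler32_ssse3.c/adler32_avx.c kernels all rest on the same per-block identity stated in their loop comments (vs1 = adler + sum(c[i]); vs2 = sum2 + 16*vs1 + sum((16-i+1)*c[i]) for a 16-byte chunk). The scalar sketch below restates that identity in plain C so the _mm_maddubs_epi16/_mm_madd_epi16 sequence and the <<4 shift of vs1_0 are easier to follow. It is not part of this commit; the function names are illustrative only, and BASE is the 65521 constant from adler32_p.h.

/* Scalar sketch of the 16-byte block recurrence that the SIMD kernels vectorize. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BASE 65521U   /* largest prime smaller than 65536, as in zlib */

/* Reference: plain byte-at-a-time Adler-32. */
static uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
    while (len--) {
        s1 = (s1 + *buf++) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return (s2 << 16) | s1;
}

/* Blocked form: for each 16-byte chunk, with i = 0..15,
 *   s1' = s1 + sum(c[i])
 *   s2' = s2 + 16*s1 + sum((16 - i) * c[i])
 * The SSSE3 loop computes sum() and the weighted sum with _mm_maddubs_epi16
 * against the dot1/dot2 vectors, and the 16*s1 term via the shift of vs1_0. */
static uint32_t adler32_blocked16(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
    while (len >= 16) {
        uint32_t sum = 0, wsum = 0;
        for (int i = 0; i < 16; i++) {
            sum  += buf[i];
            wsum += (uint32_t)(16 - i) * buf[i];
        }
        s2 = (s2 + 16 * s1 + wsum) % BASE;  /* uses s1 from before this block */
        s1 = (s1 + sum) % BASE;
        buf += 16;
        len -= 16;
    }
    while (len--) {                         /* tail bytes, as in the SIMD versions */
        s1 = (s1 + *buf++) % BASE;
        s2 = (s2 + s1) % BASE;
    }
    return (s2 << 16) | s1;
}

int main(void) {
    unsigned char data[100];
    for (size_t i = 0; i < sizeof(data); i++)
        data[i] = (unsigned char)(i * 7 + 3);
    /* Both paths must agree for any input. */
    printf("scalar  = 0x%08x\n", (unsigned)adler32_scalar(1, data, sizeof(data)));
    printf("blocked = 0x%08x\n", (unsigned)adler32_blocked16(1, data, sizeof(data)));
    return 0;
}

Unlike the kernels above, which accumulate up to NMAX bytes before reducing modulo BASE, this sketch reduces after every 16-byte block purely for clarity; the NMAX bound exists so the deferred sums cannot overflow.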