elseif(BASEARCH_X86_FOUND)
option(WITH_AVX2 "Build with AVX2" ON)
option(WITH_SSE2 "Build with SSE2" ON)
+ option(WITH_SSSE3 "Build with SSSE3" ON)
option(WITH_SSE4 "Build with SSE4" ON)
option(WITH_PCLMULQDQ "Build with PCLMULQDQ" ON)
endif()
mark_as_advanced(FORCE ZLIB_DUAL_LINK WITH_ACLE WITH_NEON WITH_DFLTCC_DEFLATE WITH_DFLTCC_INFLATE
- WITH_AVX2 WITH_SSE2 WITH_SSE4 WITH_PCLMULQDQ WITH_POWER8 WITH_INFLATE_STRICT WITH_INFLATE_ALLOW_INVALID_DIST)
+ WITH_AVX2 WITH_SSE2 WITH_SSSE3 WITH_SSE4 WITH_PCLMULQDQ WITH_POWER8 WITH_INFLATE_STRICT WITH_INFLATE_ALLOW_INVALID_DIST)
add_feature_info(ZLIB_COMPAT ZLIB_COMPAT "Provide a zlib-compatible API")
add_feature_info(WITH_GZFILEOP WITH_GZFILEOP "Compile with support for gzFile-related functions")
if(BASEARCH_X86_FOUND)
set(AVX2FLAG "-mavx2")
set(SSE2FLAG "-msse2")
+ set(SSSE3FLAG "-mssse3")
set(SSE4FLAG "-msse4.2")
endif()
else()
if(BASEARCH_X86_FOUND)
set(AVX2FLAG "/arch:AVX2")
set(SSE2FLAG "/arch:SSE2")
+ set(SSSE3FLAG "/arch:SSSE3")
set(SSE4FLAG "/arch:SSE4.2")
endif()
endif()
elseif(BASEARCH_X86_FOUND)
set(AVX2FLAG "-mavx2")
set(SSE2FLAG "-msse2")
+ set(SSSE3FLAG "-mssse3")
set(SSE4FLAG "-msse4")
set(PCLMULFLAG "-mpclmul")
endif()
elseif(BASEARCH_X86_FOUND)
set(AVX2FLAG ${NATIVEFLAG})
set(SSE2FLAG ${NATIVEFLAG})
+ set(SSSE3FLAG ${NATIVEFLAG})
set(SSE4FLAG ${NATIVEFLAG})
set(PCLMULFLAG ${NATIVEFLAG})
endif()
}"
HAVE_SSE2_INTRIN
)
- set(CMAKE_REQUIRED_FLAGS)
+ # Check whether compiler supports SSSE3 intrinsics
+ if(WITH_NATIVE_INSTRUCTIONS)
+ set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}")
+ else()
+ set(CMAKE_REQUIRED_FLAGS "${SSSE3FLAG}")
+ endif()
+ check_c_source_compile_or_run(
+ "#include <immintrin.h>
+ int main(void)
+ {
+ __m128i u, v, w;
+ u = _mm_set1_epi32(1);
+ v = _mm_set1_epi32(2);
+ w = _mm_hadd_epi32(u, v);
+ (void)w;
+ return 0;
+ }"
+ HAVE_SSSE3_INTRIN
+ )
# Check whether compiler supports SSE4 CRC inline asm
if(WITH_NATIVE_INSTRUCTIONS)
set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}")
}"
HAVE_SSE42CMPSTR_INTRIN
)
- set(CMAKE_REQUIRED_FLAGS)
-
# Check whether compiler supports PCLMULQDQ intrinsics
if(WITH_NATIVE_INSTRUCTIONS)
set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}")
else()
set(HAVE_PCLMULQDQ_INTRIN NO)
endif()
- set(CMAKE_REQUIRED_FLAGS)
-
# Check whether compiler supports AVX2 intrinsics
if(WITH_NATIVE_INSTRUCTIONS)
set(CMAKE_REQUIRED_FLAGS "${NATIVEFLAG}")
list(APPEND ZLIB_ARCH_HDRS fallback_builtins.h)
endif()
if(WITH_AVX2 AND HAVE_AVX2_INTRIN)
- add_definitions(-DX86_AVX2)
- list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c)
+ add_definitions(-DX86_AVX2 -DX86_AVX2_ADLER32)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/slide_avx.c ${ARCHDIR}/adler32_avx.c)
add_feature_info(AVX2_SLIDEHASH 1 "Support AVX2 optimized slide_hash, using \"${AVX2FLAG}\"")
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/compare258_avx.c)
add_feature_info(AVX2_COMPARE258 1 "Support AVX2 optimized compare258, using \"${AVX2FLAG}\"")
+ add_feature_info(AVX2_ADLER32 1 "Support AVX2-accelerated adler32, using \"${AVX2FLAG}\"")
add_intrinsics_option("${AVX2FLAG}")
endif()
if(WITH_SSE4 AND (HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN))
endif()
endif()
endif()
+ if(WITH_SSSE3 AND HAVE_SSSE3_INTRIN)
+ add_definitions(-DX86_SSSE3 -DX86_SSSE3_ADLER32)
+ list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/adler32_ssse3.c)
+ add_feature_info(SSSE3_ADLER32 1 "Support SSSE3-accelerated adler32, using \"${SSSE3FLAG}\"")
+ add_intrinsics_option("${SSSE3FLAG}")
+ endif()
if(WITH_PCLMULQDQ AND HAVE_PCLMULQDQ_INTRIN)
add_definitions(-DX86_PCLMULQDQ_CRC)
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/crc_folding.c)
+++ /dev/null
-/* adler32.c -- compute the Adler-32 checksum of a data stream
- * Copyright (C) 1995-2011 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-/* @(#) $Id$ */
-
-#include "zutil.h"
-#include <xmmintrin.h>
-#include <tmmintrin.h>
-
-#include <immintrin.h>
-
-static uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
-
-#define BASE 65521 /* largest prime smaller than 65536 */
-#define NMAX 5552
-/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
-
-/*
- * As we are using _signed_ integer arithmetic for the SSE/AVX2 implementations,
- * we consider the max as 2^31-1
- */
-#define NMAX_VEC 5552
-
-#define NMAX_VEC2 5552
-
-#define DO1(buf,i) {adler += (buf)[i]; sum2 += adler;}
-#define DO2(buf,i) DO1(buf,i); DO1(buf,i+1);
-#define DO4(buf,i) DO2(buf,i); DO2(buf,i+2);
-#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
-#define DO16(buf) DO8(buf,0); DO8(buf,8);
-
-/* use NO_DIVIDE if your processor does not do division in hardware --
- try it both ways to see which is faster */
-#ifdef NO_DIVIDE
-/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
- (thank you to John Reiser for pointing this out) */
-# define CHOP(a) \
- do { \
- unsigned long tmp = a >> 16; \
- a &= 0xffffUL; \
- a += (tmp << 4) - tmp; \
- } while (0)
-# define MOD28(a) \
- do { \
- CHOP(a); \
- if (a >= BASE) a -= BASE; \
- } while (0)
-# define MOD(a) \
- do { \
- CHOP(a); \
- MOD28(a); \
- } while (0)
-# define MOD63(a) \
- do { /* this assumes a is not negative */ \
- z_off64_t tmp = a >> 32; \
- a &= 0xffffffffL; \
- a += (tmp << 8) - (tmp << 5) + tmp; \
- tmp = a >> 16; \
- a &= 0xffffL; \
- a += (tmp << 4) - tmp; \
- tmp = a >> 16; \
- a &= 0xffffL; \
- a += (tmp << 4) - tmp; \
- if (a >= BASE) a -= BASE; \
- } while (0)
-#else
-# define MOD(a) a %= BASE
-# define MOD28(a) a %= BASE
-# define MOD63(a) a %= BASE
-#endif
-
-/* ========================================================================= */
-extern "C" {
-uLong ZEXPORT adler32_serial(uLong adler, const Bytef *buf, uInt len)
-{
-
- unsigned long sum2;
- unsigned n;
-
- /* split Adler-32 into component sums */
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
-
- /* in case user likes doing a byte at a time, keep it fast */
- if (len == 1) {
- adler += buf[0];
- if (adler >= BASE)
- adler -= BASE;
- sum2 += adler;
- if (sum2 >= BASE)
- sum2 -= BASE;
- return adler | (sum2 << 16);
- }
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (buf == Z_NULL)
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (len < 16) {
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- if (adler >= BASE)
- adler -= BASE;
- MOD28(sum2); /* only added so many BASE's */
- return adler | (sum2 << 16);
- }
-
- /* do length NMAX blocks -- requires just one modulo operation */
- while (len >= NMAX) {
- len -= NMAX;
- n = NMAX / 16; /* NMAX is divisible by 16 */
- do {
- DO16(buf); /* 16 sums unrolled */
- buf += 16;
- } while (--n);
- MOD(adler);
- MOD(sum2);
- }
-
- /* do remaining bytes (less than NMAX, still just one modulo) */
- if (len) { /* avoid modulos if none remaining */
- while (len >= 16) {
- len -= 16;
- DO16(buf);
- buf += 16;
- }
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- MOD(adler);
- MOD(sum2);
- }
-
- /* return recombined sums */
- return adler | (sum2 << 16);
-}
-
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
-
-/* ========================================================================= */
-uLong ZEXPORT adler32_vec(uLong adler, const Bytef *buf, uInt len)
-{
-
- unsigned long sum2;
-
- /* split Adler-32 into component sums */
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
-
- /* in case user likes doing a byte at a time, keep it fast */
- if (unlikely(len == 1)) {
- adler += buf[0];
- if (adler >= BASE)
- adler -= BASE;
- sum2 += adler;
- if (sum2 >= BASE)
- sum2 -= BASE;
- return adler | (sum2 << 16);
- }
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (unlikely(buf == Z_NULL))
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (unlikely(len < 16)) {
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- if (adler >= BASE)
- adler -= BASE;
- MOD28(sum2); /* only added so many BASE's */
- return adler | (sum2 << 16);
- }
-
- uint32_t __attribute__ ((aligned(16))) s1[4], s2[4];
- s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
- s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
- char __attribute__ ((aligned(16))) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
- __m128i dot1v = _mm_load_si128((__m128i*)dot1);
- char __attribute__ ((aligned(16))) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
- __m128i dot2v = _mm_load_si128((__m128i*)dot2);
- short __attribute__ ((aligned(16))) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
- __m128i dot3v = _mm_load_si128((__m128i*)dot3);
- // We will need to multiply by
- //char __attribute__ ((aligned(16))) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
- char __attribute__ ((aligned(16))) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- __m128i shiftv = _mm_load_si128((__m128i*)shift);
- while (len >= 16) {
- __m128i vs1 = _mm_load_si128((__m128i*)s1);
- __m128i vs2 = _mm_load_si128((__m128i*)s2);
- __m128i vs1_0 = vs1;
- int k = (len < NMAX_VEC ? (int)len : NMAX_VEC);
- k -= k % 16;
- len -= k;
- while (k >= 16) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-
- NOTE: 256-bit equivalents are:
- _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
- _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
- We could rewrite the below to use 256-bit instructions instead of 128-bit.
- */
- __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
- buf += 16;
- k -= 16;
- __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
- __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
- __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
- vs1 = _mm_add_epi32(vsum1, vs1);
- __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
- vsum2 = _mm_add_epi32(vsum2, vs2);
- vs2 = _mm_add_epi32(vsum2, vs1_0);
- vs1_0 = vs1;
- }
- // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
- // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
- uint32_t __attribute__((aligned(16))) s1_unpack[4];
- uint32_t __attribute__((aligned(16))) s2_unpack[4];
- _mm_store_si128((__m128i*)s1_unpack, vs1);
- _mm_store_si128((__m128i*)s2_unpack, vs2);
- adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
- MOD(adler);
- s1[3] = adler;
- sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
- MOD(sum2);
- s2[3] = sum2;
- }
-
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- MOD(adler);
- MOD(sum2);
-
- /* return recombined sums */
- return adler | (sum2 << 16);
-}
-
-/* ========================================================================= */
-uLong ZEXPORT adler32_avx(uLong adler, const Bytef *buf, uInt len)
-{
-
- unsigned long sum2;
-
- /* split Adler-32 into component sums */
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
-
- /* in case user likes doing a byte at a time, keep it fast */
- if (unlikely(len == 1)) {
- adler += buf[0];
- if (adler >= BASE)
- adler -= BASE;
- sum2 += adler;
- if (sum2 >= BASE)
- sum2 -= BASE;
- return adler | (sum2 << 16);
- }
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (unlikely(buf == Z_NULL))
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (unlikely(len < 32)) {
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- if (adler >= BASE)
- adler -= BASE;
- MOD28(sum2); /* only added so many BASE's */
- return adler | (sum2 << 16);
- }
-
- uint32_t __attribute__ ((aligned(32))) s1[8], s2[8];
- memset(s1, '\0', sizeof(uint32_t)*7); s1[7] = adler; // TODO: would a masked load be faster?
- memset(s2, '\0', sizeof(uint32_t)*7); s2[7] = sum2;
- char __attribute__ ((aligned(32))) dot1[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
- __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
- char __attribute__ ((aligned(32))) dot2[32] = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
- __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
- short __attribute__ ((aligned(32))) dot3[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
- __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
- // We will need to multiply by
- char __attribute__ ((aligned(16))) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- __m128i shiftv = _mm_load_si128((__m128i*)shift);
- while (len >= 32) {
- __m256i vs1 = _mm256_load_si256((__m256i*)s1);
- __m256i vs2 = _mm256_load_si256((__m256i*)s2);
- __m256i vs1_0 = vs1;
- int k = (len < NMAX_VEC ? (int)len : NMAX_VEC);
- k -= k % 32;
- len -= k;
- while (k >= 32) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
- */
- __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
- buf += 32;
- k -= 32;
- __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
- __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
- __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
- vs1 = _mm256_add_epi32(vsum1, vs1);
- __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
- vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
- vsum2 = _mm256_add_epi32(vsum2, vs2);
- vs2 = _mm256_add_epi32(vsum2, vs1_0);
- vs1_0 = vs1;
- }
- // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
- // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
- uint32_t __attribute__((aligned(32))) s1_unpack[8];
- uint32_t __attribute__((aligned(32))) s2_unpack[8];
- _mm256_store_si256((__m256i*)s1_unpack, vs1);
- _mm256_store_si256((__m256i*)s2_unpack, vs2);
- adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
- MOD(adler);
- s1[7] = adler;
- sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) + (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
- MOD(sum2);
- s2[7] = sum2;
- }
-
- while (len--) {
- adler += *buf++;
- sum2 += adler;
- }
- MOD(adler);
- MOD(sum2);
-
- /* return recombined sums */
- return adler | (sum2 << 16);
-}
-}
-
-__attribute__ ((target ("default")))
-static uLong adler32_impl(uLong adler, const Bytef *buf, uInt len)
-{
- return adler32_serial(adler, buf, len);
-}
-
-__attribute__ ((target ("sse4.2")))
-//__attribute__ ((target ("mmx")))
-static uLong adler32_impl(uLong adler, const Bytef *buf, uInt len)
-{
- return adler32_vec(adler, buf, len);
-}
-
-__attribute__ ((target ("avx2")))
-static uLong adler32_impl(uLong adler, const Bytef *buf, uInt len)
-{
- return adler32_avx(adler, buf, len);
-}
-
-extern "C" {
-uLong ZEXPORT adler32(uLong adler, const Bytef *buf, uInt len) {return adler32_impl(adler, buf, len);}
-}
-
-/* ========================================================================= */
-static uLong adler32_combine_(uLong adler1, uLong adler2, z_off64_t len2)
-{
- unsigned long sum1;
- unsigned long sum2;
- unsigned rem;
-
- /* for negative len, return invalid adler32 as a clue for debugging */
- if (len2 < 0)
- return 0xffffffffUL;
-
- /* the derivation of this formula is left as an exercise for the reader */
- MOD63(len2); /* assumes len2 >= 0 */
- rem = (unsigned)len2;
- sum1 = adler1 & 0xffff;
- sum2 = rem * sum1;
- MOD(sum2);
- sum1 += (adler2 & 0xffff) + BASE - 1;
- sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
- if (sum1 >= BASE) sum1 -= BASE;
- if (sum1 >= BASE) sum1 -= BASE;
- if (sum2 >= (BASE << 1)) sum2 -= (BASE << 1);
- if (sum2 >= BASE) sum2 -= BASE;
- return sum1 | (sum2 << 16);
-}
-
-extern "C" {
-/* ========================================================================= */
-uLong adler32_combine(uLong adler1, uLong adler2, z_off_t len2)
-{
- return adler32_combine_(adler1, adler2, len2);
-}
-
-uLong adler32_combine64(uLong adler1, uLong adler2, z_off64_t len2)
-{
- return adler32_combine_(adler1, adler2, len2);
-}
-}
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
+SSSE3FLAG=-mssse3
SSE4FLAG=-msse4
PCLMULFLAG=-mpclmul
all: \
x86.o x86.lo \
+ adler32_avx.o adler32_avx.lo \
+ adler32_ssse3.o adler32_ssse3.lo \
compare258_avx.o compare258_avx.lo \
compare258_sse.o compare258_sse.lo \
insert_string_sse.o insert_string_sse.lo \
slide_sse.lo:
$(CC) $(SFLAGS) $(SSE2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+adler32_avx.o: $(SRCDIR)/adler32_avx.c
+ $(CC) $(CFLAGS) $(AVX2FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+
+adler32_avx.lo: $(SRCDIR)/adler32_avx.c
+	$(CC) $(SFLAGS) $(AVX2FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+
+adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
+ $(CC) $(CFLAGS) $(SSSE3FLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
+	$(CC) $(SFLAGS) $(SSSE3FLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
mostlyclean: clean
clean:
rm -f *.o *.lo *~
--- /dev/null
+/* adler32_avx.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_AVX2_H
+#define ADLER32_AVX2_H
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#include <immintrin.h>
+
+#ifdef X86_AVX2_ADLER32
+
+uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
+ uint32_t sum2;
+
+ /* split Adler-32 into component sums */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (len == 1)
+ return adler32_len_1(adler, buf, sum2);
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (buf == NULL)
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (len < 16)
+ return adler32_len_16(adler, buf, len, sum2);
+
+ uint32_t ALIGNED_(32) s1[8], s2[8];
+ memset(s1, '\0', sizeof(uint32_t)*7); s1[7] = adler; // TODO: would a masked load be faster?
+ memset(s2, '\0', sizeof(uint32_t)*7); s2[7] = sum2;
+ char ALIGNED_(32) dot1[32] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
+ char ALIGNED_(32) dot2[32] = {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+ __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
+ short ALIGNED_(32) dot3[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
+    // shift count used to multiply the previous vs1 by 32 (1 << 5) when folding it into vs2
+ char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ __m128i shiftv = _mm_load_si128((__m128i*)shift);
+ while (len >= 32) {
+ __m256i vs1 = _mm256_load_si256((__m256i*)s1);
+ __m256i vs2 = _mm256_load_si256((__m256i*)s2);
+ __m256i vs1_0 = vs1;
+ int k = (len < NMAX ? (int)len : NMAX);
+ k -= k % 32;
+ len -= k;
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+           vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+ */
+ __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
+ buf += 32;
+ k -= 32;
+            __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 16 shorts.
+            __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v);   // sum 16 shorts to 8 int32_t;
+ __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
+ vs1 = _mm256_add_epi32(vsum1, vs1);
+ __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
+ vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
+ vsum2 = _mm256_add_epi32(vsum2, vs2);
+ vs2 = _mm256_add_epi32(vsum2, vs1_0);
+ vs1_0 = vs1;
+ }
+ // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
+ // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
+ uint32_t ALIGNED_(32) s1_unpack[8];
+ uint32_t ALIGNED_(32) s2_unpack[8];
+ _mm256_store_si256((__m256i*)s1_unpack, vs1);
+ _mm256_store_si256((__m256i*)s2_unpack, vs2);
+ adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+ MOD(adler);
+ s1[7] = adler;
+ sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) + (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
+ MOD(sum2);
+ s2[7] = sum2;
+ }
+
+ while (len--) {
+ adler += *buf++;
+ sum2 += adler;
+ }
+ MOD(adler);
+ MOD(sum2);
+
+ /* return recombined sums */
+ return adler | (sum2 << 16);
+}
+
+#endif
+
+#endif
--- /dev/null
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_H
+#define ADLER32_SSSE3_H
+
+#include "../../zbuild.h"
+#include "../../zutil.h"
+
+#include "../../adler32_p.h"
+
+#ifdef X86_SSSE3_ADLER32
+
+#include <immintrin.h>
+
+uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
+ uint32_t sum2;
+
+ /* split Adler-32 into component sums */
+ sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ /* in case user likes doing a byte at a time, keep it fast */
+ if (len == 1)
+ return adler32_len_1(adler, buf, sum2);
+
+ /* initial Adler-32 value (deferred check for len == 1 speed) */
+ if (buf == NULL)
+ return 1L;
+
+ /* in case short lengths are provided, keep it somewhat fast */
+ if (len < 16)
+ return adler32_len_16(adler, buf, len, sum2);
+
+ uint32_t ALIGNED_(16) s1[4], s2[4];
+ s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
+ s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
+ char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+ __m128i dot1v = _mm_load_si128((__m128i*)dot1);
+ char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+ __m128i dot2v = _mm_load_si128((__m128i*)dot2);
+ short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
+ __m128i dot3v = _mm_load_si128((__m128i*)dot3);
+    // shift count used to multiply the previous vs1 by 16 (1 << 4) when folding it into vs2
+ char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ __m128i shiftv = _mm_load_si128((__m128i*)shift);
+ while (len >= 16) {
+ __m128i vs1 = _mm_load_si128((__m128i*)s1);
+ __m128i vs2 = _mm_load_si128((__m128i*)s2);
+ __m128i vs1_0 = vs1;
+ int k = (len < NMAX ? (int)len : NMAX);
+ k -= k % 16;
+ len -= k;
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+
+ NOTE: 256-bit equivalents are:
+ _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
+ _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
+ We could rewrite the below to use 256-bit instructions instead of 128-bit.
+ */
+ __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
+ buf += 16;
+ k -= 16;
+ __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
+ __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
+ __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ vs1 = _mm_add_epi32(vsum1, vs1);
+ __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
+ vsum2 = _mm_add_epi32(vsum2, vs2);
+ vs2 = _mm_add_epi32(vsum2, vs1_0);
+ vs1_0 = vs1;
+ }
+ // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
+ // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
+ uint32_t ALIGNED_(16) s1_unpack[4];
+ uint32_t ALIGNED_(16) s2_unpack[4];
+ _mm_store_si128((__m128i*)s1_unpack, vs1);
+ _mm_store_si128((__m128i*)s2_unpack, vs2);
+ adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
+ MOD(adler);
+ s1[3] = adler;
+ sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
+ MOD(sum2);
+ s2[3] = sum2;
+ }
+
+ while (len--) {
+ adler += *buf++;
+ sum2 += adler;
+ }
+ MOD(adler);
+ MOD(sum2);
+
+ /* return recombined sums */
+ return adler | (sum2 << 16);
+}
+
+#endif
+
+#endif
\ No newline at end of file
ZLIB_INTERNAL int x86_cpu_has_avx2;
ZLIB_INTERNAL int x86_cpu_has_sse2;
+ZLIB_INTERNAL int x86_cpu_has_ssse3;
ZLIB_INTERNAL int x86_cpu_has_sse42;
ZLIB_INTERNAL int x86_cpu_has_pclmulqdq;
ZLIB_INTERNAL int x86_cpu_has_tzcnt;
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
x86_cpu_has_sse2 = edx & 0x4000000;
+ x86_cpu_has_ssse3 = ecx & 0x200;
x86_cpu_has_sse42 = ecx & 0x100000;
x86_cpu_has_pclmulqdq = ecx & 0x2;
extern int x86_cpu_has_avx2;
extern int x86_cpu_has_sse2;
+extern int x86_cpu_has_ssse3;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
extern int x86_cpu_has_tzcnt;
forcesse2=0
avx2flag="-mavx2"
sse2flag="-msse2"
+ssse3flag="-mssse3"
sse4flag="-msse4"
sse42flag="-msse4.2"
pclmulflag="-mpclmul"
;;
esac
+# Check for SSSE3 intrinsics
+
+cat > $test.c << EOF
+#include <x86intrin.h>
+int main(void)
+{
+ __m128i u, v, w;
+ u = _mm_set1_epi32(1);
+ v = _mm_set1_epi32(2);
+ w = _mm_hadd_epi32(u, v);
+ (void)w;
+ return 0;
+}
+EOF
+if try ${CC} ${CFLAGS} ${ssse3flag} $test.c; then
+ echo "Checking for SSSE3 intrinsics ... Yes." | tee -a configure.log
+ HAVE_SSSE3_INTRIN=1
+else
+ echo "Checking for SSSE3 intrinsics ... No." | tee -a configure.log
+ HAVE_SSSE3_INTRIN=0
+fi
+
# Check for SSE4.2 CRC intrinsics
case "${ARCH}" in
i386 | i486 | i586 | i686 | x86_64)
SFLAGS="${SFLAGS} -DX86_NOCHECK_SSE2"
fi
fi
+
+ if test ${HAVE_SSSE3_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32"
+ SFLAGS="${SFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo"
+ fi
if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
fi
if test ${HAVE_AVX2_INTRIN} -eq 1; then
- CFLAGS="${CFLAGS} -DX86_AVX2"
- SFLAGS="${SFLAGS} -DX86_AVX2"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo"
+ CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32"
+ SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o adler32_avx.o"
+        ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo adler32_avx.lo"
fi
CFLAGS="${CFLAGS} -DX86_SSE42_CRC_HASH"
ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} x86.o insert_string_sse.o slide_sse.o"
ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} x86.lo insert_string_sse.lo slide_sse.lo"
+ if test ${HAVE_SSSE3_INTRIN} -eq 1; then
+ CFLAGS="${CFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32"
+ SFLAGS="${SFLAGS} -DX86_SSSE3 -DX86_SSSE3_ADLER32"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_ssse3.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_ssse3.lo"
+ fi
+
if test ${HAVE_SSE42CRC_INTRIN} -eq 1; then
CFLAGS="${CFLAGS} -DX86_SSE42_CRC_INTRIN"
SFLAGS="${SFLAGS} -DX86_SSE42_CRC_INTRIN"
fi
if test ${HAVE_AVX2_INTRIN} -eq 1; then
- CFLAGS="${CFLAGS} -DX86_AVX2"
- SFLAGS="${SFLAGS} -DX86_AVX2"
- ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o"
- ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo"
+ CFLAGS="${CFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32"
+ SFLAGS="${SFLAGS} -DX86_AVX2 -DX86_AVX2_ADLER32"
+ ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} compare258_avx.o slide_avx.o adler32_avx.o"
+ ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} compare258_avx.lo slide_avx.lo adler32_avx.lo"
fi
if test ${HAVE_SSE42CMPSTR_INTRIN} -eq 1; then
echo sharedlibdir = $sharedlibdir >> configure.log
echo uname = $uname >> configure.log
echo sse2flag = $sse2flag >> configure.log
+echo ssse3flag = $ssse3flag >> configure.log
echo sse4flag = $sse4flag >> configure.log
echo pclmulflag = $pclmulflag >> configure.log
echo ARCHDIR = ${ARCHDIR} >> configure.log
/^TOPDIR *=/s#=.*#=$BUILDDIR#
/^AVX2FLAG *=/s#=.*#=$avx2flag#
/^SSE2FLAG *=/s#=.*#=$sse2flag#
+/^SSSE3FLAG *=/s#=.*#=$ssse3flag#
/^SSE4FLAG *=/s#=.*#=$sse4flag#
/^PCLMULFLAG *=/s#=.*#=$pclmulflag#
" > $ARCHDIR/Makefile
#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(ARM_NEON_ADLER32)
extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
#endif
+#ifdef X86_SSSE3_ADLER32
+extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
+#ifdef X86_AVX2_ADLER32
+extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
+#endif
/* CRC32 */
ZLIB_INTERNAL uint32_t crc32_generic(uint32_t, const unsigned char *, uint64_t);
# endif
functable.adler32 = &adler32_neon;
#endif
+#ifdef X86_SSSE3_ADLER32
+ if (x86_cpu_has_ssse3)
+ functable.adler32 = &adler32_ssse3;
+#endif
+#ifdef X86_AVX2_ADLER32
+ if (x86_cpu_has_avx2)
+ functable.adler32 = &adler32_avx2;
+#endif
return functable.adler32(adler, buf, len);
}