From: Adam Stylinski
Date: Sun, 20 Mar 2022 15:44:32 +0000 (-0400)
Subject: Rename adler32_sse41 to adler32_ssse3
X-Git-Tag: 2.1.0-beta1~312
X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=7db13e652aad702f68331a929e1990679a8a771a;p=thirdparty%2Fzlib-ng.git

Rename adler32_sse41 to adler32_ssse3

As it turns out, the sum of absolute differences instruction _did_ exist
in SSSE3 all along. SSE41 introduced a stranger, less commonly used
variation of the sum of absolute difference instruction. Knowing this,
the old SSSE3 method can be axed entirely and the SSE41 method can now
be used on CPUs only having SSSE3.

Removing this extra functable entry shrinks the code and allows for a
simpler planned refactor later for the adler checksum and copy elision.
---
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e18a3945..883e7945 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -734,11 +734,7 @@ if(WITH_OPTIM)
         if(WITH_SSE41)
             check_sse41_intrinsics()
             if(HAVE_SSE41_INTRIN)
-                add_definitions(-DX86_SSE41 -DX86_SSE41_ADLER32)
-                set(SSE41_SRCS ${ARCHDIR}/adler32_sse41.c)
-                add_feature_info(SSE4_ADLER32 1 "Support SSE41-accelerated adler32, using \"${SSE41FLAG}\"")
-                list(APPEND ZLIB_ARCH_SRCS ${SSE41_SRCS})
-                set_property(SOURCE ${SSE41_SRCS} PROPERTY COMPILE_FLAGS "${SSE41FLAG} ${NOLTOFLAG}")
+                add_definitions(-DX86_SSE41)
             else()
                 set(WITH_SSE41 OFF)
             endif()
diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
index 2b90e2ad..05cf144b 100644
--- a/arch/x86/Makefile.in
+++ b/arch/x86/Makefile.in
@@ -28,7 +28,6 @@ all: \
 	adler32_avx2.o adler32_avx2.lo \
 	adler32_avx512.o adler32_avx512.lo \
 	adler32_avx512_vnni.o adler32_avx512_vnni.lo \
-	adler32_sse41.o adler32_sse41.lo \
 	adler32_ssse3.o adler32_ssse3.lo \
 	chunkset_avx.o chunkset_avx.lo \
 	chunkset_sse2.o chunkset_sse2.lo \
@@ -118,12 +117,6 @@ adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
 adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
 	$(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
 
-adler32_sse41.o: $(SRCDIR)/adler32_sse41.c
-	$(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse41.c
-
-adler32_sse41.lo: $(SRCDIR)/adler32_sse41.c
-	$(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse41.c
-
 adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
 	$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
 
diff --git a/arch/x86/adler32_sse41.c b/arch/x86/adler32_sse41.c
deleted file mode 100644
index 602f8ec1..00000000
--- a/arch/x86/adler32_sse41.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/* adler32_sse41.c -- compute the Adler-32 checksum of a data stream
- * Copyright (C) 1995-2011 Mark Adler
- * Authors:
- *   Adam Stylinski
- *   Brian Bockelman
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-
-#ifdef X86_SSE41_ADLER32
-
-#include <immintrin.h>
-
-static inline uint32_t partial_hsum(__m128i x) {
-    __m128i second_int = _mm_bsrli_si128(x, 8);
-    __m128i sum = _mm_add_epi32(x, second_int);
-    return _mm_cvtsi128_si32(sum);
-}
-
-static inline uint32_t hsum(__m128i x) {
-    __m128i sum1 = _mm_unpackhi_epi64(x, x);
-    __m128i sum2 = _mm_add_epi32(x, sum1);
-    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
-    __m128i sum4 = _mm_add_epi32(sum2, sum3);
-    return _mm_cvtsi128_si32(sum4);
-}
-
-Z_INTERNAL uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len) {
-    uint32_t sum2;
-
-    /* split Adler-32 into component sums */
-    sum2 = (adler >> 16) & 0xffff;
-    adler &= 0xffff;
-
-    /* in case user likes doing a byte at a time, keep it fast */
-    if (UNLIKELY(len == 1))
-        return adler32_len_1(adler, buf, sum2);
-
-    /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (UNLIKELY(buf == NULL))
-        return 1L;
-
-    /* in case short lengths are provided, keep it somewhat fast */
-    if (UNLIKELY(len < 16))
-        return adler32_len_16(adler, buf, len, sum2);
-
-    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
-    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
-    const __m128i dot3v = _mm_set1_epi16(1);
-    const __m128i zero = _mm_setzero_si128();
-
-    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
-            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
-
-    /* If our buffer is unaligned (likely), make the determination whether
-     * or not there's enough of a buffer to consume to make the scalar, aligning
-     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
-     * load. This is a pretty simple test, just test if 16 - the remainder + len is
-     * < 16 */
-    int max_iters = NMAX;
-    int rem = (uintptr_t)buf & 15;
-    int align_offset = 16 - rem;
-    int k = 0;
-    if (rem) {
-        if (len < 16 + align_offset) {
-            /* Let's eat the cost of this one unaligned load so that
-             * we don't completely skip over the vectorization. Doing
-             * 16 bytes at a time unaligned is better than 16 + <= 15
-             * sums */
-            vbuf = _mm_loadu_si128((__m128i*)buf);
-            len -= 16;
-            buf += 16;
-            vs1 = _mm_cvtsi32_si128(adler);
-            vs2 = _mm_cvtsi32_si128(sum2);
-            vs3 = _mm_setzero_si128();
-            vs1_0 = vs1;
-            goto unaligned_jmp;
-        }
-
-        for (int i = 0; i < align_offset; ++i) {
-            adler += *(buf++);
-            sum2 += adler;
-        }
-
-        /* lop off the max number of sums based on the scalar sums done
-         * above */
-        len -= align_offset;
-        max_iters -= align_offset;
-    }
-
-
-    while (len >= 16) {
-        vs1 = _mm_cvtsi32_si128(adler);
-        vs2 = _mm_cvtsi32_si128(sum2);
-        vs3 = _mm_setzero_si128();
-        vs2_0 = _mm_setzero_si128();
-        vs1_0 = vs1;
-
-        k = (len < max_iters ? (int)len : max_iters);
-        k -= k % 16;
-        len -= k;
-
-        while (k >= 32) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            vbuf = _mm_load_si128((__m128i*)buf);
-            vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
-            buf += 32;
-            k -= 32;
-
-            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
-            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
-            vs3 = _mm_add_epi32(vs1_0, vs3);
-
-            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
-            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
-            vs2 = _mm_add_epi32(vsum2, vs2);
-            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
-            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
-            vs1_0 = vs1;
-        }
-
-        vs2 = _mm_add_epi32(vs2_0, vs2);
-        vs3 = _mm_slli_epi32(vs3, 5);
-        vs2 = _mm_add_epi32(vs3, vs2);
-        vs3 = _mm_setzero_si128();
-
-        while (k >= 16) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-            */
-            vbuf = _mm_load_si128((__m128i*)buf);
-            buf += 16;
-            k -= 16;
-
-unaligned_jmp:
-            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
-            vs3 = _mm_add_epi32(vs1_0, vs3);
-            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
-            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            vs2 = _mm_add_epi32(vsum2, vs2);
-            vs1_0 = vs1;
-        }
-
-        vs3 = _mm_slli_epi32(vs3, 4);
-        vs2 = _mm_add_epi32(vs2, vs3);
-
-        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
-         * a partial reduction sum implicitly and only summing to integers in vector positions
-         * 0 and 2. This saves us some contention on the shuffle port(s) */
-        adler = partial_hsum(vs1) % BASE;
-        sum2 = hsum(vs2) % BASE;
-        max_iters = NMAX;
-    }
-
-    /* Process tail (len < 16). */
-    return adler32_len_16(adler, buf, len, sum2);
-}
-
-#endif
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 57357d59..1767572a 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -1,6 +1,7 @@
-/* adler32.c -- compute the Adler-32 checksum of a data stream
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
  * Copyright (C) 1995-2011 Mark Adler
  * Authors:
+ *   Adam Stylinski
  *   Brian Bockelman
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
@@ -12,6 +13,20 @@
 
 #include <immintrin.h>
 
+static inline uint32_t partial_hsum(__m128i x) {
+    __m128i second_int = _mm_bsrli_si128(x, 8);
+    __m128i sum = _mm_add_epi32(x, second_int);
+    return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+    __m128i sum1 = _mm_unpackhi_epi64(x, x);
+    __m128i sum2 = _mm_add_epi32(x, sum1);
+    __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+    __m128i sum4 = _mm_add_epi32(sum2, sum3);
+    return _mm_cvtsi128_si32(sum4);
+}
+
 Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t sum2;
 
@@ -31,74 +46,120 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size
     if (UNLIKELY(len < 16))
         return adler32_len_16(adler, buf, len, sum2);
 
-    uint32_t ALIGNED_(16) s1[4], s2[4];
-
-    s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
-    s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
-
-    char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m128i dot1v = _mm_load_si128((__m128i*)dot1);
-    char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-    __m128i dot2v = _mm_load_si128((__m128i*)dot2);
-    short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
-    __m128i dot3v = _mm_load_si128((__m128i*)dot3);
-
-    // We will need to multiply by
-    //char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    const __m128i zero = _mm_setzero_si128();
+
+    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+    /* If our buffer is unaligned (likely), make the determination whether
+     * or not there's enough of a buffer to consume to make the scalar, aligning
+     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+     * load. This is a pretty simple test, just test if 16 - the remainder + len is
+     * < 16 */
+    int max_iters = NMAX;
+    int rem = (uintptr_t)buf & 15;
+    int align_offset = 16 - rem;
+    int k = 0;
+    if (rem) {
+        if (len < 16 + align_offset) {
+            /* Let's eat the cost of this one unaligned load so that
+             * we don't completely skip over the vectorization. Doing
+             * 16 bytes at a time unaligned is better than 16 + <= 15
+             * sums */
+            vbuf = _mm_loadu_si128((__m128i*)buf);
+            len -= 16;
+            buf += 16;
+            vs1 = _mm_cvtsi32_si128(adler);
+            vs2 = _mm_cvtsi32_si128(sum2);
+            vs3 = _mm_setzero_si128();
+            vs1_0 = vs1;
+            goto unaligned_jmp;
+        }
+
+        for (int i = 0; i < align_offset; ++i) {
+            adler += *(buf++);
+            sum2 += adler;
+        }
+
+        /* lop off the max number of sums based on the scalar sums done
+         * above */
+        len -= align_offset;
+        max_iters -= align_offset;
+    }
 
-    char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    __m128i shiftv = _mm_load_si128((__m128i*)shift);
 
     while (len >= 16) {
-        __m128i vs1 = _mm_load_si128((__m128i*)s1);
-        __m128i vs2 = _mm_load_si128((__m128i*)s2);
-        __m128i vs1_0 = vs1;
-
-        int k = (len < NMAX ? (int)len : NMAX);
-        k -= k % 16;
-        len -= k;
-
-        while (k >= 16) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-
-               NOTE: 256-bit equivalents are:
-                _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
-                _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
-               We could rewrite the below to use 256-bit instructions instead of 128-bit.
-            */
-            __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
-            buf += 16;
-            k -= 16;
-
-            __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
-            __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v);   // sum 8 shorts to 4 int32_t;
-            __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
-            vs1 = _mm_add_epi32(vsum1, vs1);
-            __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
-            vsum2 = _mm_add_epi32(vsum2, vs2);
-            vs2 = _mm_add_epi32(vsum2, vs1_0);
-            vs1_0 = vs1;
-        }
-
-        // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
-        // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-
-        uint32_t ALIGNED_(16) s1_unpack[4];
-        uint32_t ALIGNED_(16) s2_unpack[4];
-
-        _mm_store_si128((__m128i*)s1_unpack, vs1);
-        _mm_store_si128((__m128i*)s2_unpack, vs2);
-
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
-        adler %= BASE;
-        s1[3] = adler;
-
-        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
-        sum2 %= BASE;
-        s2[3] = sum2;
+        vs1 = _mm_cvtsi32_si128(adler);
+        vs2 = _mm_cvtsi32_si128(sum2);
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        k = (len < max_iters ? (int)len : max_iters);
+        k -= k % 16;
+        len -= k;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+            buf += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            buf += 16;
+            k -= 16;
+
+unaligned_jmp:
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+         * a partial reduction sum implicitly and only summing to integers in vector positions
+         * 0 and 2. This saves us some contention on the shuffle port(s) */
+        adler = partial_hsum(vs1) % BASE;
+        sum2 = hsum(vs2) % BASE;
+        max_iters = NMAX;
     }
 
     /* Process tail (len < 16). */
diff --git a/configure b/configure
index 4bc0ee9c..b8216abf 100755
--- a/configure
+++ b/configure
@@ -1536,10 +1536,8 @@ case "${ARCH}" in
         check_sse41_intrinsics
 
         if test ${HAVE_SSE41_INTRIN} -eq 1; then
-            CFLAGS="${CFLAGS} -DX86_SSE41_ADLER32"
-            SFLAGS="${SFLAGS} -DX86_SSE41_ADLER32"
-            ARCH_STATIC_OBJS="${ARCH_STATIC_OBJS} adler32_sse41.o"
-            ARCH_SHARED_OBJS="${ARCH_SHARED_OBJS} adler32_sse41.lo"
+            CFLAGS="${CFLAGS} -DX86_SSE41"
+            SFLAGS="${SFLAGS} -DX86_SSE41"
         fi
 
         check_sse42_intrinsics
diff --git a/cpu_features.h b/cpu_features.h
index eb741ab8..7cc74a97 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -31,9 +31,6 @@ extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t le
 #ifdef PPC_VMX_ADLER32
 extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
 #endif
-#ifdef X86_SSE41_ADLER32
-extern uint32_t adler32_sse41(uint32_t adler, const unsigned char *buf, size_t len);
-#endif
 #ifdef X86_SSSE3_ADLER32
 extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
 #endif
diff --git a/functable.c b/functable.c
index f39db295..a4ad2978 100644
--- a/functable.c
+++ b/functable.c
@@ -161,10 +161,6 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_
     if (x86_cpu_has_ssse3)
         functable.adler32 = &adler32_ssse3;
 #endif
-#ifdef X86_SSE41_ADLER32
-    if (x86_cpu_has_sse41)
-        functable.adler32 = &adler32_sse41;
-#endif
 #ifdef X86_AVX2_ADLER32
     if (x86_cpu_has_avx2)
         functable.adler32 = &adler32_avx2;
diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc
index 7aa46c26..b691c23f 100644
--- a/test/benchmarks/benchmark_adler32.cc
+++ b/test/benchmarks/benchmark_adler32.cc
@@ -78,9 +78,6 @@ BENCHMARK_ADLER32(power8, adler32_power8, power_cpu_has_arch_2_07);
 #ifdef X86_SSSE3_ADLER32
 BENCHMARK_ADLER32(ssse3, adler32_ssse3, x86_cpu_has_ssse3);
 #endif
-#ifdef X86_SSE41_ADLER32
-BENCHMARK_ADLER32(sse41, adler32_sse41, x86_cpu_has_sse41);
-#endif
 #ifdef X86_AVX2_ADLER32
 BENCHMARK_ADLER32(avx2, adler32_avx2, x86_cpu_has_avx2);
 #endif
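
The observation driving the rename: the kernel added to adler32_ssse3.c above leans on _mm_sad_epu8 (psadbw, an SSE2-era instruction) and _mm_maddubs_epi16 (pmaddubsw, SSSE3); the SSE4.1 mpsadbw variant (_mm_mpsadbw_epu8) is never used, so gating this code on SSE4.1 was unnecessary. Below is a minimal standalone sketch of the same psadbw byte-summing trick, separate from the patch itself; the file and function names are illustrative, and it is assumed to build with nothing newer than -mssse3.

/* sad_demo.c -- illustrative only, not part of this commit.
 * Shows that the byte-summing building block used by adler32_ssse3()
 * needs nothing newer than SSE2/SSSE3.
 * Build (assumed toolchain): gcc -O2 -mssse3 sad_demo.c -o sad_demo */
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

/* Sum 16 unsigned bytes with psadbw: the instruction itself performs a
 * partial horizontal reduction, leaving one small partial sum in each
 * 64-bit lane, so only one shift, one add, and one extract remain. */
static uint32_t sum16_bytes(const unsigned char *p) {
    __m128i v   = _mm_loadu_si128((const __m128i *)p);
    __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128()); /* partial sums in lanes 0 and 2 */
    __m128i hi  = _mm_bsrli_si128(sad, 8);              /* move the upper partial sum down */
    return (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(sad, hi));
}

int main(void) {
    unsigned char buf[16];
    for (int i = 0; i < 16; i++)
        buf[i] = (unsigned char)(i + 1);  /* 1 + 2 + ... + 16 = 136 */
    printf("%u\n", sum16_bytes(buf));     /* prints 136 */
    return 0;
}

As the partial_hsum() helper moved into adler32_ssse3.c notes, psadbw already leaves its two partial sums in fixed vector positions, so the final reduction costs a single shift and add rather than a full shuffle cascade.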