From: Adam Stylinski Date: Fri, 8 Apr 2022 02:57:09 +0000 (-0400) Subject: Added inlined AVX512 adler checksum + copy X-Git-Tag: 2.1.0-beta1~241 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b8269bb7d4702f8e694441112bb4ba7c59ff2362;p=thirdparty%2Fzlib-ng.git Added inlined AVX512 adler checksum + copy While we're here, also simplify the "fold" signature, as reducing the number of rebases and horizontal sums did not prove to be meaningfully faster (slower in many circumstances). --- diff --git a/adler32_fold.c b/adler32_fold.c index 3f745cd9..688f8485 100644 --- a/adler32_fold.c +++ b/adler32_fold.c @@ -7,24 +7,7 @@ #include "functable.h" #include "adler32_fold.h" -Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler) { - /* So, for the "C" version, we'll just stash the value into nsums. * This is mostly a compatibility shim, these functions in the functable * will have more optimal versions that make use of adler and sum2. In order * to make each implementation bisectable, each new implementation will be a * new commit */ - adler->nsums = init_adler; -} - -Z_INTERNAL void adler32_fold_copy_c(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) { - adler->nsums = functable.adler32(adler->nsums, src, len); +Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { memcpy(dst, src, len); -} - -Z_INTERNAL void adler32_fold_c(adler32_fold *adler, const uint8_t *src, size_t len) { - adler->nsums = functable.adler32(adler->nsums, src, len); -} - -Z_INTERNAL uint32_t adler32_fold_final_c(adler32_fold *adler) { - return adler->nsums; + return functable.adler32(adler, src, len); } diff --git a/adler32_fold.h b/adler32_fold.h index ec4270a7..ea456adc 100644 --- a/adler32_fold.h +++ b/adler32_fold.h @@ -8,17 +8,6 @@ #include -typedef struct adler32_fold_s { - uint8_t adler[64]; // First half of component sums - uint8_t sum2[64]; // Second half of component sums - uint8_t leftover[16]; // A buffer for sub 16 sized carry over, sized for full loads and alignment - uint32_t nsums; // The number of scalar sums leftover - uint32_t bytes_leftover; // The number of leftover bytes from the previous sum -} adler32_fold; - -Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler); -Z_INTERNAL void adler32_fold_copy_c(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len); -Z_INTERNAL void adler32_fold_c(adler32_fold *adler, const uint8_t *src, size_t len); -Z_INTERNAL uint32_t adler32_fold_final_c(adler32_fold *adler); +Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c index be7bd6f0..fcca34ec 100644 --- a/arch/x86/adler32_avx2.c +++ b/arch/x86/adler32_avx2.c @@ -10,32 +10,24 @@ #include "../../fallback_builtins.h" #include "adler32_avx2_p.h" #include "../../adler32_fold.h" +#include #include #ifdef X86_AVX2_ADLER32 -Z_INTERNAL void adler32_fold_reset_avx2(adler32_fold *adler, uint32_t init_adler) { - adler->nsums = init_adler; -} - -Z_INTERNAL uint32_t adler32_fold_final_avx2(adler32_fold *adler) { - return adler->nsums; -} - #include "adler32_avx2_tpl.h" #undef ADLER32_AVX2_TPL_H_ #define COPY #include "adler32_avx2_tpl.h" #undef COPY +/* Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) { if (buf == NULL) return 1L; if (len == 0) return adler; - ALIGNED_(64) adler32_fold fold; - adler32_fold_reset_avx2(&fold, adler); -
adler32_fold_avx2(&fold, buf, len); - return adler32_fold_final_avx2(&fold); + return adler32_fold_avx2(adler, buf, len); } +*/ #endif diff --git a/arch/x86/adler32_avx2_tpl.h b/arch/x86/adler32_avx2_tpl.h index ff571672..7df51d57 100644 --- a/arch/x86/adler32_avx2_tpl.h +++ b/arch/x86/adler32_avx2_tpl.h @@ -14,40 +14,40 @@ #include "adler32_avx2_p.h" #ifdef X86_SSE42_ADLER32 -extern void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len); -extern void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len); +extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); #define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d) -#define sub32(a, b, c) adler32_fold_sse42(a, b, c) +#define sub32(a, b, c) adler32_ssse3(a, b, c) #else -#define copy_sub32(a, b, c, d) do { a->nsums = adler32_copy_len_16(adler0, c, b, d, adler1); } while (0) -#define sub32(a, b, c) do { a->nsums = adler32_len_16(adler0, b, c, adler1); } while (0) +#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1) +#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1) #endif #ifdef COPY -Z_INTERNAL void adler32_fold_copy_avx2(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) { +Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { #else -Z_INTERNAL void adler32_fold_avx2(adler32_fold *adler, const uint8_t *src, size_t len) { +Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) { #endif + if (src == NULL) return 1L; + if (len == 0) return adler; uint32_t adler0, adler1; - adler1 = (adler->nsums >> 16) & 0xffff; - adler0 = adler->nsums & 0xffff; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; rem_peel: if (len < 16) { #ifdef COPY - adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1); + return adler32_copy_len_16(adler0, src, dst, len, adler1); #else - adler->nsums = adler32_len_16(adler0, src, len, adler1); + return adler32_len_16(adler0, src, len, adler1); #endif - return; } else if (len < 32) { #ifdef COPY - copy_sub32(adler, dst, src, len); + return copy_sub32(adler, dst, src, len); #else - sub32(adler, src, len); + return sub32(adler, src, len); #endif - return; } __m256i vs1, vs2; @@ -63,7 +63,7 @@ rem_peel: __m256i vs1_0 = vs1; __m256i vs3 = _mm256_setzero_si256(); - size_t k = (len < NMAX ? len : NMAX); + size_t k = MIN(len, NMAX); k -= k % 32; len -= k; @@ -133,11 +133,13 @@ rem_peel: adler1 = hsum(vs2) % BASE; } - adler->nsums = adler0 | (adler1 << 16); + adler = adler0 | (adler1 << 16); if (len) { goto rem_peel; } + + return adler; } #endif diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c index 5571be45..e26b9cc5 100644 --- a/arch/x86/adler32_avx512.c +++ b/arch/x86/adler32_avx512.c @@ -12,90 +12,14 @@ #include "../../fallback_builtins.h" #include #include "adler32_avx512_p.h" +#include "../../adler32_fold.h" #ifdef X86_AVX512_ADLER32 -Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len) { - uint32_t sum2; - /* For impossibly tiny sizes, use the smaller width versions. 
We still need - * to check for compile time support for these but they are likely there */ -#ifdef X86_SSE41_ADLER32 - if (len < 32) - return adler32_sse41(adler, buf, len); -#endif - -#ifdef X86_AVX2_ADLER32 - if (len < 64) - return adler32_avx2(adler, buf, len); -#endif - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* Only capture these corner cases if we didn't compile with SSE41 and AVX2 support - * This should make for shorter compiled code */ -#if !defined(X86_AVX2_ADLER32) && !defined(X86_SSE41_ADLER32) - /* in case user likes doing a byte at a time, keep it fast */ - if (UNLIKELY(len == 1)) - return adler32_len_1(adler, buf, sum2); - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (UNLIKELY(buf == NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (UNLIKELY(len < 16)) - return adler32_len_16(adler, buf, len, sum2); -#endif - - __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler)); - __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2)); - - const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, - 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, 64); - const __m512i dot3v = _mm512_set1_epi16(1); - const __m512i zero = _mm512_setzero_si512(); - - while (len >= 64) { - __m512i vs1_0 = vs1; - __m512i vs3 = _mm512_setzero_si512(); - - int k = (len < NMAX ? (int)len : NMAX); - k -= k % 64; - len -= k; - - while (k >= 64) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) - */ - __m512i vbuf = _mm512_loadu_si512(buf); - buf += 64; - k -= 64; - - __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero); - __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v); - vs1 = _mm512_add_epi32(vs1_sad, vs1); - vs3 = _mm512_add_epi32(vs3, vs1_0); - __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v); - vs2 = _mm512_add_epi32(vsum2, vs2); - vs1_0 = vs1; - } - - vs3 = _mm512_slli_epi32(vs3, 6); - vs2 = _mm512_add_epi32(vs2, vs3); - - adler = partial_hsum(vs1) % BASE; - vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler)); - sum2 = _mm512_reduce_add_epu32(vs2) % BASE; - vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2)); - } - - /* Process tail (len < 64). 
*/ - return adler32_len_16(adler, buf, len, sum2); -} +#include "adler32_avx512_tpl.h" +#undef ADLER32_AVX512_TPL_H_ +#define COPY +#include "adler32_avx512_tpl.h" +#undef COPY #endif diff --git a/arch/x86/adler32_avx512_tpl.h b/arch/x86/adler32_avx512_tpl.h new file mode 100644 index 00000000..df5dd381 --- /dev/null +++ b/arch/x86/adler32_avx512_tpl.h @@ -0,0 +1,110 @@ +/* adler32_avx512_tpl.h -- adler32 avx512 vectorized function templates + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_AVX512_TPL_H_ +#define ADLER32_AVX512_TPL_H_ + +#include "../../zbuild.h" +#include "../../adler32_p.h" +#include "../../cpu_features.h" +#include "../../fallback_builtins.h" +#include +#include "adler32_avx512_p.h" +#include "../../adler32_fold.h" + +#ifdef X86_AVX512_ADLER32 + +#ifdef COPY +Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { +#else +Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) { +#endif + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + + if (src == NULL) return 1L; + if (len == 0) return adler; + +rem_peel: + if (len < 64) { + /* This handles the remaining copies, just call normal adler checksum after this */ +#ifdef COPY + __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len)); + __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src); + _mm512_mask_storeu_epi8(dst, storemask, copy_vec); +#endif + +#ifdef X86_AVX2_ADLER32 + return adler32_avx2(adler, src, len); +#elif defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); +#endif + } + + __m512i vbuf, vs1_0, vs3; + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + const __m512i dot3v = _mm512_set1_epi16(1); + const __m512i zero = _mm512_setzero_si512(); + size_t k; + + while (len >= 64) { + __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); + __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); + vs1_0 = vs1; + vs3 = _mm512_setzero_si512(); + + k = MIN(len, NMAX); + k -= k % 64; + len -= k; + + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf = _mm512_loadu_si512(src); +#ifdef COPY + _mm512_storeu_si512(dst, vbuf); + dst += 64; +#endif + src += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero); + __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v); + vs1 = _mm512_add_epi32(vs1_sad, vs1); + vs3 = _mm512_add_epi32(vs3, vs1_0); + __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm512_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = _mm512_reduce_add_epu32(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). 
*/ + if (len) { + goto rem_peel; + } + + return adler; +} + +#endif +#endif diff --git a/arch/x86/adler32_sse42.c b/arch/x86/adler32_sse42.c index dce15502..4f21702a 100644 --- a/arch/x86/adler32_sse42.c +++ b/arch/x86/adler32_sse42.c @@ -14,18 +14,109 @@ #ifdef X86_SSE42_ADLER32 -Z_INTERNAL void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler) { - adler->nsums = init_adler; -} +Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { -Z_INTERNAL uint32_t adler32_fold_final_sse42(adler32_fold *adler) { - return adler->nsums; -} + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 16) { + return adler32_copy_len_16(adler0, src, dst, len, adler1); + } + + __m128i vbuf, vbuf_0; + __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, + v_sad_sum2, vsum2, vsum2_0; + __m128i zero = _mm_setzero_si128(); + const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i dot3v = _mm_set1_epi16(1); + size_t k; + + while (len >= 16) { + + k = MIN(len, NMAX); + k -= k % 16; + len -= k; + + vs1 = _mm_cvtsi32_si128(adler0); + vs2 = _mm_cvtsi32_si128(adler1); + + vs3 = _mm_setzero_si128(); + vs2_0 = _mm_setzero_si128(); + vs1_0 = vs1; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_loadu_si128((__m128i*)src); + vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16)); + src += 32; + k -= 32; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); + _mm_storeu_si128((__m128i*)dst, vbuf); + _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0); + dst += 32; + + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); + v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); -#include "adler32_sse42_tpl.h" -#undef ADLER32_SSE42_TPL_H_ -#define COPY -#include "adler32_sse42_tpl.h" -#undef COPY + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); + vs1 = _mm_add_epi32(v_sad_sum2, vs1); + vs2 = _mm_add_epi32(vsum2, vs2); + vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm_add_epi32(vs2_0, vs2); + vs3 = _mm_slli_epi32(vs3, 5); + vs2 = _mm_add_epi32(vs3, vs2); + vs3 = _mm_setzero_si128(); + + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_loadu_si128((__m128i*)src); + src += 16; + k -= 16; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); + + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm_add_epi32(vsum2, vs2); + vs1_0 = vs1; + + _mm_storeu_si128((__m128i*)dst, vbuf); + dst += 16; + } + + vs3 = _mm_slli_epi32(vs3, 4); + vs2 = _mm_add_epi32(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = hsum(vs2) % BASE; + } + + /* If this is true, there's fewer than 16 elements remaining */ + if (len) { + goto rem_peel; + } + + return adler0 | (adler1 << 16); +} #endif diff --git a/arch/x86/adler32_sse42_tpl.h b/arch/x86/adler32_sse42_tpl.h deleted file mode 100644 index 71d1db81..00000000 --- a/arch/x86/adler32_sse42_tpl.h +++ /dev/null @@ -1,132 +0,0 @@ -/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function 
templates - * Copyright (C) 2022 Adam Stylinski - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#ifndef ADLER32_SSE42_TPL_H_ -#define ADLER32_SSE42_TPL_H_ - -#include "../../zbuild.h" -#include -#include "../../adler32_fold.h" -#include "../../adler32_p.h" -#include "adler32_ssse3_p.h" - -#ifdef COPY -Z_INTERNAL void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) { -#else -Z_INTERNAL void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len) { -#endif - - uint32_t adler0, adler1; - adler1 = (adler->nsums >> 16) & 0xffff; - adler0 = adler->nsums & 0xffff; - - if (len < 16) { -rem_peel: -#ifdef COPY - adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1); -#else - adler->nsums = adler32_len_16(adler0, src, len, adler1); -#endif - return; - } - - __m128i vbuf, vbuf_0; - __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, - v_sad_sum2, vsum2, vsum2_0; - __m128i zero = _mm_setzero_si128(); - const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); - const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); - const __m128i dot3v = _mm_set1_epi16(1); - size_t k; - - while (len >= 16) { - - k = MIN(len, NMAX); - k -= k % 16; - len -= k; - - vs1 = _mm_cvtsi32_si128(adler0); - vs2 = _mm_cvtsi32_si128(adler1); - - vs3 = _mm_setzero_si128(); - vs2_0 = _mm_setzero_si128(); - vs1_0 = vs1; - - while (k >= 32) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) - */ - vbuf = _mm_loadu_si128((__m128i*)src); - vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16)); - src += 32; - k -= 32; - - v_sad_sum1 = _mm_sad_epu8(vbuf, zero); - v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); -#ifdef COPY - _mm_storeu_si128((__m128i*)dst, vbuf); - _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0); - dst += 32; -#endif - v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); - v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); - - vs1 = _mm_add_epi32(v_sad_sum1, vs1); - vs3 = _mm_add_epi32(vs1_0, vs3); - - vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); - vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); - vs1 = _mm_add_epi32(v_sad_sum2, vs1); - vs2 = _mm_add_epi32(vsum2, vs2); - vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); - vs1_0 = vs1; - } - - vs2 = _mm_add_epi32(vs2_0, vs2); - vs3 = _mm_slli_epi32(vs3, 5); - vs2 = _mm_add_epi32(vs3, vs2); - vs3 = _mm_setzero_si128(); - - while (k >= 16) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) - */ - vbuf = _mm_loadu_si128((__m128i*)src); - src += 16; - k -= 16; - - v_sad_sum1 = _mm_sad_epu8(vbuf, zero); - v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); - - vs1 = _mm_add_epi32(v_sad_sum1, vs1); - vs3 = _mm_add_epi32(vs1_0, vs3); - vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); - vs2 = _mm_add_epi32(vsum2, vs2); - vs1_0 = vs1; - -#ifdef COPY - _mm_storeu_si128((__m128i*)dst, vbuf); - dst += 16; -#endif - } - - vs3 = _mm_slli_epi32(vs3, 4); - vs2 = _mm_add_epi32(vs2, vs3); - - adler0 = partial_hsum(vs1) % BASE; - adler1 = hsum(vs2) % BASE; - } - - /* If this is true, there's fewer than 16 elements remaining */ - if (len) { - goto rem_peel; - } - - adler->nsums = adler0 | (adler1 << 16); -} - -#endif diff --git a/cpu_features.h b/cpu_features.h index dbabb3ac..9e0d5cb9 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -36,20 +36,15 @@ extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len extern uint32_t 
adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len); #endif #ifdef X86_SSE42_ADLER32 -extern void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler); -extern void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len); -extern void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len); -extern uint32_t adler32_fold_final_sse42(adler32_fold *adler); +extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif #ifdef X86_AVX2_ADLER32 extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len); -extern void adler32_fold_reset_avx2(adler32_fold *adler, uint32_t init_adler); -extern void adler32_fold_copy_avx2(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len); -extern void adler32_fold_avx2(adler32_fold *adler, const uint8_t *src, size_t len); -extern uint32_t adler32_fold_final_avx2(adler32_fold *adler); +extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif #ifdef X86_AVX512_ADLER32 extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len); +extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); #endif #ifdef X86_AVX512VNNI_ADLER32 extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len); diff --git a/deflate.c b/deflate.c index 6818a860..006803c3 100644 --- a/deflate.c +++ b/deflate.c @@ -52,6 +52,7 @@ #include "deflate.h" #include "deflate_p.h" #include "functable.h" +#include const char PREFIX(deflate_copyright)[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler "; /* @@ -444,15 +445,12 @@ int32_t Z_EXPORT PREFIX(deflateResetKeep)(PREFIX3(stream) *strm) { INIT_STATE; #ifdef GZIP - if (s->wrap == 2) + if (s->wrap == 2) { + /* Ensure that there's always a reset, regardless of "wrap" */ strm->adler = functable.crc32_fold_reset(&s->crc_fold); - else + } else #endif - //strm->adler = ADLER32_INITIAL_VALUE; - { strm->adler = ADLER32_INITIAL_VALUE; - functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE); - } s->last_flush = -2; zng_tr_init(s); @@ -771,7 +769,6 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { if (s->strstart != 0) put_uint32_msb(s, strm->adler); strm->adler = ADLER32_INITIAL_VALUE; - functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE); s->status = BUSY_STATE; /* Compression must start with an empty pending buffer */ @@ -979,7 +976,6 @@ int32_t Z_EXPORT PREFIX(deflate)(PREFIX3(stream) *strm, int32_t flush) { } else #endif { - strm->adler = functable.adler32_fold_final(&s->adler_fold); if (s->wrap == 1) put_uint32_msb(s, strm->adler); } @@ -1092,10 +1088,9 @@ Z_INTERNAL unsigned read_buf(PREFIX3(stream) *strm, unsigned char *buf, unsigned #endif } else { if (strm->state->wrap == 1) - functable.adler32_fold_copy(&strm->state->adler_fold, buf, strm->next_in, len); + strm->adler = functable.adler32_fold_copy(strm->adler, buf, strm->next_in, len); else memcpy(buf, strm->next_in, len); - //strm->adler = functable.adler32(strm->adler, buf, len); } strm->next_in += len; strm->total_in += len; diff --git a/deflate.h b/deflate.h index 2d34c95e..abc87d8b 100644 --- a/deflate.h +++ b/deflate.h @@ -212,7 +212,6 @@ struct internal_state { int nice_match; /* Stop searching when current match exceeds this */ - struct adler32_fold_s ALIGNED_(64) adler_fold; struct crc32_fold_s 
ALIGNED_(16) crc_fold; /* used by trees.c: */ diff --git a/functable.c b/functable.c index 097c7b24..83283597 100644 --- a/functable.c +++ b/functable.c @@ -202,22 +202,9 @@ Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_ return functable.adler32(adler, buf, len); } -Z_INTERNAL void adler32_fold_reset_stub(adler32_fold *adler, uint32_t init_adler) { - functable.adler32_fold_reset = &adler32_fold_reset_c; -#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32) - if (x86_cpu_has_sse42) - functable.adler32_fold_reset = &adler32_fold_reset_sse42; -#ifdef X86_AVX2_ADLER32 - if (x86_cpu_has_avx2) - functable.adler32_fold_reset = &adler32_fold_reset_avx2; -#endif -#endif - functable.adler32_fold_reset(adler, init_adler); -} - -Z_INTERNAL void adler32_fold_copy_stub(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) { +Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { functable.adler32_fold_copy = &adler32_fold_copy_c; -#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32) +#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512VNNI_ADLER32) if (x86_cpu_has_sse42) functable.adler32_fold_copy = &adler32_fold_copy_sse42; #endif @@ -225,33 +212,11 @@ Z_INTERNAL void adler32_fold_copy_stub(adler32_fold *adler, uint8_t *dst, const if (x86_cpu_has_avx2) functable.adler32_fold_copy = &adler32_fold_copy_avx2; #endif - functable.adler32_fold_copy(adler, dst, src, len); -} - -Z_INTERNAL void adler32_fold_stub(adler32_fold *adler, const uint8_t *src, size_t len) { - functable.adler32_fold = &adler32_fold_c; -#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32) - if (x86_cpu_has_sse42) - functable.adler32_fold = &adler32_fold_sse42; -#endif -#ifdef X86_AVX2_ADLER32 - if (x86_cpu_has_avx2) - functable.adler32_fold = &adler32_fold_avx2; -#endif - functable.adler32_fold(adler, src, len); -} - -Z_INTERNAL uint32_t adler32_fold_final_stub(adler32_fold *adler) { - functable.adler32_fold_final = &adler32_fold_final_c; -#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32) - if (x86_cpu_has_sse42) - functable.adler32_fold_final = &adler32_fold_final_sse42; -#endif -#ifdef X86_AVX2_ADLER32 - if (x86_cpu_has_avx2) - functable.adler32_fold_final = &adler32_fold_final_avx2; +#ifdef X86_AVX512_ADLER32 + if (x86_cpu_has_avx512) + functable.adler32_fold_copy = &adler32_fold_copy_avx512; #endif - return functable.adler32_fold_final(adler); + return functable.adler32_fold_copy(adler, dst, src, len); } Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) { @@ -489,10 +454,7 @@ Z_INTERNAL uint32_t compare256_stub(const uint8_t *src0, const uint8_t *src1) { /* functable init */ Z_INTERNAL Z_TLS struct functable_s functable = { adler32_stub, - adler32_fold_reset_stub, adler32_fold_copy_stub, - adler32_fold_stub, - adler32_fold_final_stub, crc32_stub, crc32_fold_reset_stub, crc32_fold_copy_stub, diff --git a/functable.h b/functable.h index 8889e74a..da7726e4 100644 --- a/functable.h +++ b/functable.h @@ -12,10 +12,7 @@ struct functable_s { uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len); - void (* adler32_fold_reset) (adler32_fold *adler, uint32_t init_adler); - void (* adler32_fold_copy) (adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len); - void (* adler32_fold) (adler32_fold *adler, const 
uint8_t *src, size_t len); - uint32_t (* adler32_fold_final) (adler32_fold *adler); + uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len); uint32_t (* crc32_fold_reset) (crc32_fold *crc); void (* crc32_fold_copy) (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len); diff --git a/inflate.c b/inflate.c index 864ca888..8611e4bd 100644 --- a/inflate.c +++ b/inflate.c @@ -28,9 +28,7 @@ static inline void inf_chksum_cpy(PREFIX3(stream) *strm, uint8_t *dst, } else #endif { - /*strm->adler = state->check = functable.adler32(state->check, src, copy); - memcpy(dst, src, copy);*/ - functable.adler32_fold_copy(&state->adler_fold, dst, src, copy); + strm->adler = state->check = functable.adler32_fold_copy(state->check, dst, src, copy); } } @@ -42,8 +40,7 @@ static inline void inf_chksum(PREFIX3(stream) *strm, const uint8_t *src, uint32_ } else #endif { - //strm->adler = state->check = functable.adler32(state->check, src, len); - functable.adler32_fold(&state->adler_fold, src, len); + strm->adler = state->check = functable.adler32(state->check, src, len); } } @@ -466,7 +463,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { state->dmax = 1U << len; state->flags = 0; /* indicate zlib header */ Tracev((stderr, "inflate: zlib header ok\n")); - functable.adler32_fold_reset(&state->adler_fold, ADLER32_INITIAL_VALUE); strm->adler = state->check = ADLER32_INITIAL_VALUE; state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); @@ -615,7 +611,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { NEEDBITS(32); //strm->adler = state->check = ZSWAP32(hold); strm->adler = state->check = ZSWAP32(hold); - functable.adler32_fold_reset(&state->adler_fold, strm->adler); INITBITS(); state->mode = DICT; @@ -625,7 +620,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { return Z_NEED_DICT; } strm->adler = state->check = ADLER32_INITIAL_VALUE; - functable.adler32_fold_reset(&state->adler_fold, ADLER32_INITIAL_VALUE); state->mode = TYPE; case TYPE: @@ -1018,8 +1012,6 @@ int32_t Z_EXPORT PREFIX(inflate)(PREFIX3(stream) *strm, int32_t flush) { #ifdef GUNZIP if (state->flags) strm->adler = state->check = functable.crc32_fold_final(&state->crc_fold); - else - strm->adler = state->check = functable.adler32_fold_final(&state->adler_fold); #endif } out = left; diff --git a/inflate.h b/inflate.h index 8c65f1db..5761077a 100644 --- a/inflate.h +++ b/inflate.h @@ -104,7 +104,6 @@ struct inflate_state { uint32_t wnext; /* window write index */ unsigned char *window; /* allocated sliding window, if needed */ - struct adler32_fold_s ALIGNED_(64) adler_fold; struct crc32_fold_s ALIGNED_(16) crc_fold; /* bit accumulator */ diff --git a/win32/Makefile.msc b/win32/Makefile.msc index bce5c004..8db2633b 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -194,7 +194,7 @@ adler32_avx2.obj: $(SRCDIR)/arch/x86/adler32_avx2.c $(SRCDIR)/zbuild.h $(SRCDIR) adler32_avx512.obj: $(SRCDIR)/arch/x86/adler32_avx512.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h adler32_avx512_vnni.obj: $(SRCDIR)/arch/x86/adler32_avx512_vnni.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h adler32_sse42.obj: $(SRCDIR)/arch/x86/adler32_sse42.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \ - 
$(SRCDIR)/arch/x86/adler32_ssse3_p.h $(SRCDIR)/arch/x86/adler32_sse42_tpl.h + $(SRCDIR)/arch/x86/adler32_ssse3_p.h adler32_ssse3.obj: $(SRCDIR)/arch/x86/adler32_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \ $(SRCDIR)/arch/x86/adler32_ssse3_p.h adler32_fold.obj: $(SRCDIR)/adler32_fold.c $(SRCDIR)/zbuild.h $(SRCDIR)/adler32_fold.h $(SRCDIR)/functable.h
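
For reference (not part of the patch): with this change every adler32 "fold" entry point in the functable shares one shape, uint32_t fn(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len): the running checksum goes in by value and the updated checksum comes back, with no adler32_fold state struct and no separate reset/final steps. The scalar sketch below is only an illustration of that contract, fusing the copy with the checksum the way the SSE4.2/AVX2/AVX-512 bodies above do; the function name and macros are placeholders, not zlib-ng identifiers (the portable fallback in adler32_fold.c simply does memcpy followed by functable.adler32).

#include <stddef.h>
#include <stdint.h>

#define ADLER32_BASE 65521u  /* largest prime smaller than 65536 */
#define ADLER32_NMAX 5552u   /* max bytes per block before the 32-bit sums could overflow */

/* Illustrative single-pass equivalent of the simplified fold-and-copy signature. */
static uint32_t adler32_copy_scalar(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t s1 = adler & 0xffff;          /* low half: running byte sum */
    uint32_t s2 = (adler >> 16) & 0xffff;  /* high half: sum of the running s1 values */

    while (len > 0) {
        size_t n = len < ADLER32_NMAX ? len : ADLER32_NMAX;
        len -= n;
        while (n--) {
            *dst++ = *src;                 /* store the byte that is about to be summed */
            s1 += *src++;
            s2 += s1;
        }
        s1 %= ADLER32_BASE;                /* modulo deferred to once per block, as in the real code */
        s2 %= ADLER32_BASE;
    }
    return s1 | (s2 << 16);
}

The vectorized versions in this commit do the same bookkeeping one register at a time: store the 16/32/64 bytes that were just loaded, accumulate them into vs1/vs2 with SAD and multiply-add operations, and reduce modulo BASE once per NMAX-sized chunk; the masked load/store pair in adler32_avx512_tpl.h covers the sub-64-byte tail of the copy before handing the remainder to a narrower checksum.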