#include "functable.h"
#include "adler32_fold.h"
-Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler) {
- /* So, for the "C" version, we'll just stash the value into nsums.
- * This is mostly a compatibility shim, these functions in the functable
- * will have more optimal versions that make use of adler and sum2. In order
- * to make each implementation bisectable, each new implementation will be a
- * new commit */
- adler->nsums = init_adler;
-}
-
-Z_INTERNAL void adler32_fold_copy_c(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
- adler->nsums = functable.adler32(adler->nsums, src, len);
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
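+ /* Generic fallback: copy the data, then checksum the source with the fastest adler32 the functable resolves. */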
memcpy(dst, src, len);
-}
-
-Z_INTERNAL void adler32_fold_c(adler32_fold *adler, const uint8_t *src, size_t len) {
- adler->nsums = functable.adler32(adler->nsums, src, len);
-}
-
-Z_INTERNAL uint32_t adler32_fold_final_c(adler32_fold *adler) {
- return adler->nsums;
+ return functable.adler32(adler, src, len);
}
#include <stdint.h>
-typedef struct adler32_fold_s {
- uint8_t adler[64]; // First half of component sums
- uint8_t sum2[64]; // Second half of component sums
- uint8_t leftover[16]; // A buffer for sub 16 sized carry over, sized for full loads and alignment
- uint32_t nsums; // The number of scalar sums leftover
- uint32_t bytes_leftover; // The number of leftover bytes from the previous sum
-} adler32_fold;
-
-Z_INTERNAL void adler32_fold_reset_c(adler32_fold *adler, uint32_t init_adler);
-Z_INTERNAL void adler32_fold_copy_c(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len);
-Z_INTERNAL void adler32_fold_c(adler32_fold *adler, const uint8_t *src, size_t len);
-Z_INTERNAL uint32_t adler32_fold_final_c(adler32_fold *adler);
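+/* Computes the Adler-32 checksum of src while copying len bytes from src to dst; returns the updated checksum. */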
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#include "../../fallback_builtins.h"
#include "adler32_avx2_p.h"
#include "../../adler32_fold.h"
#include <immintrin.h>
#ifdef X86_AVX2_ADLER32
-Z_INTERNAL void adler32_fold_reset_avx2(adler32_fold *adler, uint32_t init_adler) {
- adler->nsums = init_adler;
-}
-
-Z_INTERNAL uint32_t adler32_fold_final_avx2(adler32_fold *adler) {
- return adler->nsums;
-}
-
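+/* The template below is included twice: once for the plain checksum (adler32_avx2) and once with COPY defined
+ * for the checksum-and-copy variant (adler32_fold_copy_avx2). */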
#include "adler32_avx2_tpl.h"
#undef ADLER32_AVX2_TPL_H_
#define COPY
#include "adler32_avx2_tpl.h"
#undef COPY
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
- if (buf == NULL) return 1L;
- if (len == 0) return adler;
- ALIGNED_(64) adler32_fold fold;
- adler32_fold_reset_avx2(&fold, adler);
- adler32_fold_avx2(&fold, buf, len);
- return adler32_fold_final_avx2(&fold);
-}
#endif
#include "adler32_avx2_p.h"
#ifdef X86_SSE42_ADLER32
-extern void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len);
-extern void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
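+/* Lengths in [16, 32) go to the SSE4.2 copy kernel or the SSSE3 checksum when available, otherwise to the scalar 16-byte helpers. */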
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
-#define sub32(a, b, c) adler32_fold_sse42(a, b, c)
+#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
-#define copy_sub32(a, b, c, d) do { a->nsums = adler32_copy_len_16(adler0, c, b, d, adler1); } while (0)
-#define sub32(a, b, c) do { a->nsums = adler32_len_16(adler0, b, c, adler1); } while (0)
+#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
+#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
#endif
#ifdef COPY
-Z_INTERNAL void adler32_fold_copy_avx2(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
#else
-Z_INTERNAL void adler32_fold_avx2(adler32_fold *adler, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
#endif
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
uint32_t adler0, adler1;
- adler1 = (adler->nsums >> 16) & 0xffff;
- adler0 = adler->nsums & 0xffff;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
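+ /* Lengths below 32 bytes are peeled off to narrower kernels; the main loop below also jumps back here for its tail. */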
rem_peel:
if (len < 16) {
#ifdef COPY
- adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
+ return adler32_copy_len_16(adler0, src, dst, len, adler1);
#else
- adler->nsums = adler32_len_16(adler0, src, len, adler1);
+ return adler32_len_16(adler0, src, len, adler1);
#endif
- return;
} else if (len < 32) {
#ifdef COPY
- copy_sub32(adler, dst, src, len);
+ return copy_sub32(adler, dst, src, len);
#else
- sub32(adler, src, len);
+ return sub32(adler, src, len);
#endif
- return;
}
__m256i vs1, vs2;
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
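+ /* Never accumulate more than NMAX bytes before reducing modulo BASE, so the 32-bit lanes cannot overflow;
+ * round the chunk down to a whole number of 32-byte blocks. */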
- size_t k = (len < NMAX ? len : NMAX);
+ size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
adler1 = hsum(vs2) % BASE;
}
- adler->nsums = adler0 | (adler1 << 16);
+ adler = adler0 | (adler1 << 16);
if (len) {
goto rem_peel;
}
+
+ return adler;
}
#endif
#include "../../fallback_builtins.h"
#include <immintrin.h>
#include "adler32_avx512_p.h"
+#include "../../adler32_fold.h"
#ifdef X86_AVX512_ADLER32
-Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len) {
- uint32_t sum2;
- /* For impossibly tiny sizes, use the smaller width versions. We still need
- * to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32
- if (len < 32)
- return adler32_sse41(adler, buf, len);
-#endif
-
-#ifdef X86_AVX2_ADLER32
- if (len < 64)
- return adler32_avx2(adler, buf, len);
-#endif
-
- /* split Adler-32 into component sums */
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
-
- /* Only capture these corner cases if we didn't compile with SSE41 and AVX2 support
- * This should make for shorter compiled code */
-#if !defined(X86_AVX2_ADLER32) && !defined(X86_SSE41_ADLER32)
- /* in case user likes doing a byte at a time, keep it fast */
- if (UNLIKELY(len == 1))
- return adler32_len_1(adler, buf, sum2);
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (UNLIKELY(buf == NULL))
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (UNLIKELY(len < 16))
- return adler32_len_16(adler, buf, len, sum2);
-#endif
-
- __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
- __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
-
- const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
- 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
- 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
- 56, 57, 58, 59, 60, 61, 62, 63, 64);
- const __m512i dot3v = _mm512_set1_epi16(1);
- const __m512i zero = _mm512_setzero_si512();
-
- while (len >= 64) {
- __m512i vs1_0 = vs1;
- __m512i vs3 = _mm512_setzero_si512();
-
- int k = (len < NMAX ? (int)len : NMAX);
- k -= k % 64;
- len -= k;
-
- while (k >= 64) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
- */
- __m512i vbuf = _mm512_loadu_si512(buf);
- buf += 64;
- k -= 64;
-
- __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
- __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
- vs1 = _mm512_add_epi32(vs1_sad, vs1);
- vs3 = _mm512_add_epi32(vs3, vs1_0);
- __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
- vs2 = _mm512_add_epi32(vsum2, vs2);
- vs1_0 = vs1;
- }
-
- vs3 = _mm512_slli_epi32(vs3, 6);
- vs2 = _mm512_add_epi32(vs2, vs3);
-
- adler = partial_hsum(vs1) % BASE;
- vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
- sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
- vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
- }
-
- /* Process tail (len < 64). */
- return adler32_len_16(adler, buf, len, sum2);
-}
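+/* Include the template twice: once for the plain checksum (adler32_avx512) and once with COPY defined
+ * for the checksum-and-copy variant (adler32_fold_copy_avx512). */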
+#include "adler32_avx512_tpl.h"
+#undef ADLER32_AVX512_TPL_H_
+#define COPY
+#include "adler32_avx512_tpl.h"
+#undef COPY
#endif
--- /dev/null
+/* adler32_avx512_tpl.h -- adler32 avx512 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_AVX512_TPL_H_
+#define ADLER32_AVX512_TPL_H_
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../cpu_features.h"
+#include "../../fallback_builtins.h"
+#include <immintrin.h>
+#include "adler32_avx512_p.h"
+#include "../../adler32_fold.h"
+
+#ifdef X86_AVX512_ADLER32
+
+#ifdef COPY
+Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
+#endif
+
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
+
+rem_peel:
+ if (len < 64) {
+ /* Copy the remaining bytes here; the checksum of the tail is handed to a narrower adler kernel below */
+#ifdef COPY
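+ /* A mask with the low len bits set lets a single masked load/store copy the sub-64-byte tail with no scalar loop. */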
+ __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
+ __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
+ _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
+#endif
+
+#ifdef X86_AVX2_ADLER32
+ return adler32_avx2(adler, src, len);
+#elif defined(X86_SSSE3_ADLER32)
+ return adler32_ssse3(adler, src, len);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
+#endif
+ }
+
+ __m512i vbuf, vs1_0, vs3;
+
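+ /* dot2v holds the byte weights 64 down to 1, so maddubs yields position-weighted 16-bit partial sums;
+ * dot3v is all ones, so madd just pairs and widens them into the 32-bit lanes of vs2. */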
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ const __m512i dot3v = _mm512_set1_epi16(1);
+ const __m512i zero = _mm512_setzero_si512();
+ size_t k;
+
+ while (len >= 64) {
+ __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+ __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+ vs1_0 = vs1;
+ vs3 = _mm512_setzero_si512();
+
+ k = MIN(len, NMAX);
+ k -= k % 64;
+ len -= k;
+
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ vbuf = _mm512_loadu_si512(src);
+#ifdef COPY
+ _mm512_storeu_si512(dst, vbuf);
+ dst += 64;
+#endif
+ src += 64;
+ k -= 64;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
+ __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
+ vs1 = _mm512_add_epi32(vs1_sad, vs1);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm512_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
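+ /* vs3 collected a snapshot of vs1 before each 64-byte block; shifting left by 6 multiplies by 64,
+ * applying the deferred "64 * vs1" term to vs2 in one step. */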
+ vs3 = _mm512_slli_epi32(vs3, 6);
+ vs2 = _mm512_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ /* Process tail (len < 64). */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+#endif
+#endif
#ifdef X86_SSE42_ADLER32
-Z_INTERNAL void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler) {
- adler->nsums = init_adler;
-}
+Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
-Z_INTERNAL uint32_t adler32_fold_final_sse42(adler32_fold *adler) {
- return adler->nsums;
-}
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 16) {
+ return adler32_copy_len_16(adler0, src, dst, len, adler1);
+ }
+
+ __m128i vbuf, vbuf_0;
+ __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ v_sad_sum2, vsum2, vsum2_0;
+ __m128i zero = _mm_setzero_si128();
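+ /* Two 16-byte chunks are processed per iteration: dot2v weights the first chunk 32..17 and dot2v_0 the
+ * second chunk 16..1; the single-chunk tail loop reuses dot2v_0. */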
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+ size_t k;
+
+ while (len >= 16) {
+
+ k = MIN(len, NMAX);
+ k -= k % 16;
+ len -= k;
+
+ vs1 = _mm_cvtsi32_si128(adler0);
+ vs2 = _mm_cvtsi32_si128(adler1);
+
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+ src += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+ dst += 32;
+
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
-#include "adler32_sse42_tpl.h"
-#undef ADLER32_SSE42_TPL_H_
-#define COPY
-#include "adler32_sse42_tpl.h"
-#undef COPY
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
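+ /* Fold in the second accumulator, then apply the deferred 32 * vs1 term (vs3 << 5) from the two-chunk loop. */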
+ vs2 = _mm_add_epi32(vs2_0, vs2);
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ src += 16;
+ k -= 16;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ dst += 16;
+ }
+
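+ /* Same trick for the single-chunk loop: vs3 << 4 applies the deferred 16 * vs1 term. */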
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = hsum(vs2) % BASE;
+ }
+
+ /* If this is true, fewer than 16 bytes remain */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler0 | (adler1 << 16);
+}
#endif
+++ /dev/null
-/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
- * Copyright (C) 2022 Adam Stylinski
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef ADLER32_SSE42_TPL_H_
-#define ADLER32_SSE42_TPL_H_
-
-#include "../../zbuild.h"
-#include <immintrin.h>
-#include "../../adler32_fold.h"
-#include "../../adler32_p.h"
-#include "adler32_ssse3_p.h"
-
-#ifdef COPY
-Z_INTERNAL void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
-#else
-Z_INTERNAL void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len) {
-#endif
-
- uint32_t adler0, adler1;
- adler1 = (adler->nsums >> 16) & 0xffff;
- adler0 = adler->nsums & 0xffff;
-
- if (len < 16) {
-rem_peel:
-#ifdef COPY
- adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
-#else
- adler->nsums = adler32_len_16(adler0, src, len, adler1);
-#endif
- return;
- }
-
- __m128i vbuf, vbuf_0;
- __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
- v_sad_sum2, vsum2, vsum2_0;
- __m128i zero = _mm_setzero_si128();
- const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
- const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
- const __m128i dot3v = _mm_set1_epi16(1);
- size_t k;
-
- while (len >= 16) {
-
- k = MIN(len, NMAX);
- k -= k % 16;
- len -= k;
-
- vs1 = _mm_cvtsi32_si128(adler0);
- vs2 = _mm_cvtsi32_si128(adler1);
-
- vs3 = _mm_setzero_si128();
- vs2_0 = _mm_setzero_si128();
- vs1_0 = vs1;
-
- while (k >= 32) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
- */
- vbuf = _mm_loadu_si128((__m128i*)src);
- vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
- src += 32;
- k -= 32;
-
- v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
- v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
-#ifdef COPY
- _mm_storeu_si128((__m128i*)dst, vbuf);
- _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
- dst += 32;
-#endif
- v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
- v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
-
- vs1 = _mm_add_epi32(v_sad_sum1, vs1);
- vs3 = _mm_add_epi32(vs1_0, vs3);
-
- vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
- vs1 = _mm_add_epi32(v_sad_sum2, vs1);
- vs2 = _mm_add_epi32(vsum2, vs2);
- vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
- vs1_0 = vs1;
- }
-
- vs2 = _mm_add_epi32(vs2_0, vs2);
- vs3 = _mm_slli_epi32(vs3, 5);
- vs2 = _mm_add_epi32(vs3, vs2);
- vs3 = _mm_setzero_si128();
-
- while (k >= 16) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
- */
- vbuf = _mm_loadu_si128((__m128i*)src);
- src += 16;
- k -= 16;
-
- v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
- v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
-
- vs1 = _mm_add_epi32(v_sad_sum1, vs1);
- vs3 = _mm_add_epi32(vs1_0, vs3);
- vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- vs2 = _mm_add_epi32(vsum2, vs2);
- vs1_0 = vs1;
-
-#ifdef COPY
- _mm_storeu_si128((__m128i*)dst, vbuf);
- dst += 16;
-#endif
- }
-
- vs3 = _mm_slli_epi32(vs3, 4);
- vs2 = _mm_add_epi32(vs2, vs3);
-
- adler0 = partial_hsum(vs1) % BASE;
- adler1 = hsum(vs2) % BASE;
- }
-
- /* If this is true, there's fewer than 16 elements remaining */
- if (len) {
- goto rem_peel;
- }
-
- adler->nsums = adler0 | (adler1 << 16);
-}
-
-#endif
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
#ifdef X86_SSE42_ADLER32
-extern void adler32_fold_reset_sse42(adler32_fold *adler, uint32_t init_adler);
-extern void adler32_fold_copy_sse42(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len);
-extern void adler32_fold_sse42(adler32_fold *adler, const uint8_t *src, size_t len);
-extern uint32_t adler32_fold_final_sse42(adler32_fold *adler);
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
-extern void adler32_fold_reset_avx2(adler32_fold *adler, uint32_t init_adler);
-extern void adler32_fold_copy_avx2(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len);
-extern void adler32_fold_avx2(adler32_fold *adler, const uint8_t *src, size_t len);
-extern uint32_t adler32_fold_final_avx2(adler32_fold *adler);
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512_ADLER32
extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI_ADLER32
extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
const char PREFIX(deflate_copyright)[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler ";
/*
INIT_STATE;
#ifdef GZIP
- if (s->wrap == 2)
+ if (s->wrap == 2) {
+ /* Ensure that there's always a reset, regardless of "wrap" */
strm->adler = functable.crc32_fold_reset(&s->crc_fold);
- else
+ } else
#endif
- //strm->adler = ADLER32_INITIAL_VALUE;
- {
strm->adler = ADLER32_INITIAL_VALUE;
- functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE);
- }
s->last_flush = -2;
zng_tr_init(s);
if (s->strstart != 0)
put_uint32_msb(s, strm->adler);
strm->adler = ADLER32_INITIAL_VALUE;
- functable.adler32_fold_reset(&s->adler_fold, ADLER32_INITIAL_VALUE);
s->status = BUSY_STATE;
/* Compression must start with an empty pending buffer */
} else
#endif
{
- strm->adler = functable.adler32_fold_final(&s->adler_fold);
if (s->wrap == 1)
put_uint32_msb(s, strm->adler);
}
#endif
} else {
if (strm->state->wrap == 1)
- functable.adler32_fold_copy(&strm->state->adler_fold, buf, strm->next_in, len);
+ strm->adler = functable.adler32_fold_copy(strm->adler, buf, strm->next_in, len);
else
memcpy(buf, strm->next_in, len);
- //strm->adler = functable.adler32(strm->adler, buf, len);
}
strm->next_in += len;
strm->total_in += len;
int nice_match; /* Stop searching when current match exceeds this */
- struct adler32_fold_s ALIGNED_(64) adler_fold;
struct crc32_fold_s ALIGNED_(16) crc_fold;
/* used by trees.c: */
return functable.adler32(adler, buf, len);
}
-Z_INTERNAL void adler32_fold_reset_stub(adler32_fold *adler, uint32_t init_adler) {
- functable.adler32_fold_reset = &adler32_fold_reset_c;
-#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
- if (x86_cpu_has_sse42)
- functable.adler32_fold_reset = &adler32_fold_reset_sse42;
-#ifdef X86_AVX2_ADLER32
- if (x86_cpu_has_avx2)
- functable.adler32_fold_reset = &adler32_fold_reset_avx2;
-#endif
-#endif
- functable.adler32_fold_reset(adler, init_adler);
-}
-
-Z_INTERNAL void adler32_fold_copy_stub(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
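+ /* On the first call, detect the best available implementation, patch the function table entry, then forward the call to it. */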
functable.adler32_fold_copy = &adler32_fold_copy_c;
-#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
if (x86_cpu_has_sse42)
functable.adler32_fold_copy = &adler32_fold_copy_sse42;
#endif
if (x86_cpu_has_avx2)
functable.adler32_fold_copy = &adler32_fold_copy_avx2;
#endif
- functable.adler32_fold_copy(adler, dst, src, len);
-}
-
-Z_INTERNAL void adler32_fold_stub(adler32_fold *adler, const uint8_t *src, size_t len) {
- functable.adler32_fold = &adler32_fold_c;
-#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
- if (x86_cpu_has_sse42)
- functable.adler32_fold = &adler32_fold_sse42;
-#endif
-#ifdef X86_AVX2_ADLER32
- if (x86_cpu_has_avx2)
- functable.adler32_fold = &adler32_fold_avx2;
-#endif
- functable.adler32_fold(adler, src, len);
-}
-
-Z_INTERNAL uint32_t adler32_fold_final_stub(adler32_fold *adler) {
- functable.adler32_fold_final = &adler32_fold_final_c;
-#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
- if (x86_cpu_has_sse42)
- functable.adler32_fold_final = &adler32_fold_final_sse42;
-#endif
-#ifdef X86_AVX2_ADLER32
- if (x86_cpu_has_avx2)
- functable.adler32_fold_final = &adler32_fold_final_avx2;
+#ifdef X86_AVX512_ADLER32
+ if (x86_cpu_has_avx512)
+ functable.adler32_fold_copy = &adler32_fold_copy_avx512;
#endif
- return functable.adler32_fold_final(adler);
+ return functable.adler32_fold_copy(adler, dst, src, len);
}
Z_INTERNAL uint32_t crc32_fold_reset_stub(crc32_fold *crc) {
/* functable init */
Z_INTERNAL Z_TLS struct functable_s functable = {
adler32_stub,
- adler32_fold_reset_stub,
adler32_fold_copy_stub,
- adler32_fold_stub,
- adler32_fold_final_stub,
crc32_stub,
crc32_fold_reset_stub,
crc32_fold_copy_stub,
struct functable_s {
uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len);
- void (* adler32_fold_reset) (adler32_fold *adler, uint32_t init_adler);
- void (* adler32_fold_copy) (adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len);
- void (* adler32_fold) (adler32_fold *adler, const uint8_t *src, size_t len);
- uint32_t (* adler32_fold_final) (adler32_fold *adler);
+ uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len);
uint32_t (* crc32_fold_reset) (crc32_fold *crc);
void (* crc32_fold_copy) (crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len);
} else
#endif
{
- /*strm->adler = state->check = functable.adler32(state->check, src, copy);
- memcpy(dst, src, copy);*/
- functable.adler32_fold_copy(&state->adler_fold, dst, src, copy);
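+ /* Compute the checksum and copy into the window in a single pass. */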
+ strm->adler = state->check = functable.adler32_fold_copy(state->check, dst, src, copy);
}
}
} else
#endif
{
- //strm->adler = state->check = functable.adler32(state->check, src, len);
- functable.adler32_fold(&state->adler_fold, src, len);
+ strm->adler = state->check = functable.adler32(state->check, src, len);
}
}
state->dmax = 1U << len;
state->flags = 0; /* indicate zlib header */
Tracev((stderr, "inflate: zlib header ok\n"));
- functable.adler32_fold_reset(&state->adler_fold, ADLER32_INITIAL_VALUE);
strm->adler = state->check = ADLER32_INITIAL_VALUE;
state->mode = hold & 0x200 ? DICTID : TYPE;
INITBITS();
NEEDBITS(32);
- //strm->adler = state->check = ZSWAP32(hold);
strm->adler = state->check = ZSWAP32(hold);
- functable.adler32_fold_reset(&state->adler_fold, strm->adler);
INITBITS();
state->mode = DICT;
return Z_NEED_DICT;
}
strm->adler = state->check = ADLER32_INITIAL_VALUE;
- functable.adler32_fold_reset(&state->adler_fold, ADLER32_INITIAL_VALUE);
state->mode = TYPE;
case TYPE:
#ifdef GUNZIP
if (state->flags)
strm->adler = state->check = functable.crc32_fold_final(&state->crc_fold);
- else
- strm->adler = state->check = functable.adler32_fold_final(&state->adler_fold);
#endif
}
out = left;
uint32_t wnext; /* window write index */
unsigned char *window; /* allocated sliding window, if needed */
- struct adler32_fold_s ALIGNED_(64) adler_fold;
struct crc32_fold_s ALIGNED_(16) crc_fold;
/* bit accumulator */
-adler32_avx512.obj: $(SRCDIR)/arch/x86/adler32_avx512.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h
+adler32_avx512.obj: $(SRCDIR)/arch/x86/adler32_avx512.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
+	$(SRCDIR)/arch/x86/adler32_avx512_p.h $(SRCDIR)/arch/x86/adler32_avx512_tpl.h
adler32_avx512_vnni.obj: $(SRCDIR)/arch/x86/adler32_avx512_vnni.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/arch/x86/adler32_avx512_p.h
adler32_sse42.obj: $(SRCDIR)/arch/x86/adler32_sse42.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
- $(SRCDIR)/arch/x86/adler32_ssse3_p.h $(SRCDIR)/arch/x86/adler32_sse42_tpl.h
+ $(SRCDIR)/arch/x86/adler32_ssse3_p.h
adler32_ssse3.obj: $(SRCDIR)/arch/x86/adler32_ssse3.c $(SRCDIR)/zbuild.h $(SRCDIR)/cpu_features.h $(SRCDIR)/adler32_p.h $(SRCDIR)/adler32_fold.h \
$(SRCDIR)/arch/x86/adler32_ssse3_p.h
adler32_fold.obj: $(SRCDIR)/adler32_fold.c $(SRCDIR)/zbuild.h $(SRCDIR)/adler32_fold.h $(SRCDIR)/functable.h