-/* crc32_fold.c -- adler32 folding interface
+/* adler32_fold.c -- adler32 folding interface
* Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_
-#include <stdint.h>
-
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
return adler | (sum2 << 16);
}
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
- while (len--) {
- *dst = *buf++;
- adler += *dst++;
+static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+ while (len) {
+ --len;
+ adler += *buf++;
sum2 += adler;
}
adler %= BASE;
return adler | (sum2 << 16);
}
-static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
- while (len) {
- --len;
- adler += *buf++;
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+ while (len--) {
+ *dst = *buf++;
+ adler += *dst++;
sum2 += adler;
}
adler %= BASE;
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../fallback_builtins.h"
-#include "adler32_avx2_p.h"
-#include "../../adler32_fold.h"
-#include <stdio.h>
-
#include <immintrin.h>
#ifdef X86_AVX2_ADLER32
#include "adler32_avx2_tpl.h"
-#undef ADLER32_AVX2_TPL_H_
+
#define COPY
#include "adler32_avx2_tpl.h"
-#undef COPY
-
-/*
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
- if (buf == NULL) return 1L;
- if (len == 0) return adler;
- return adler32_fold_avx2(adler, buf, len);
-}
-*/
#endif
#ifndef ADLER32_AVX2_P_H_
#define ADLER32_AVX2_P_H_
-#ifdef X86_AVX2_ADLER32
+#if defined(X86_AVX2_ADLER32) || defined(X86_AVX512VNNI_ADLER32)
/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
-static inline uint32_t hsum(__m256i x) {
+static inline uint32_t hsum256(__m256i x) {
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
_mm256_castsi256_si128(x));
    __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
    return (uint32_t)_mm_cvtsi128_si32(sum3);
}
-static inline uint32_t partial_hsum(__m256i x) {
+static inline uint32_t partial_hsum256(__m256i x) {
/* We need a permutation vector to extract every other integer. The
* rest are going to be zeros */
const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#ifndef ADLER32_AVX2_TPL_H_
-#define ADLER32_AVX2_TPL_H_
-
#include "../../zbuild.h"
#include <immintrin.h>
#include "../../adler32_fold.h"
rem_peel:
if (len < 16) {
#ifdef COPY
- return adler32_copy_len_16(adler0, src, dst, len, adler1);
+ return adler32_copy_len_16(adler0, src, dst, len, adler1);
#else
- return adler32_len_16(adler0, src, len, adler1);
+ return adler32_len_16(adler0, src, len, adler1);
#endif
} else if (len < 32) {
#ifdef COPY
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
* what the compiler is doing to avoid integer divisions. */
- adler0 = partial_hsum(vs1) % BASE;
- adler1 = hsum(vs2) % BASE;
+ adler0 = partial_hsum256(vs1) % BASE;
+ adler1 = hsum256(vs2) % BASE;
}
adler = adler0 | (adler1 << 16);
return adler;
}
-
-#endif
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#include "../../zbuild.h"
-#include "../../adler32_p.h"
-#include "../../cpu_features.h"
-#include "../../fallback_builtins.h"
-#include <immintrin.h>
-#include "adler32_avx512_p.h"
-#include "../../adler32_fold.h"
-
#ifdef X86_AVX512_ADLER32
#include "adler32_avx512_tpl.h"
-#undef ADLER32_AVX512_TPL_H_
+
#define COPY
#include "adler32_avx512_tpl.h"
-#undef COPY
#endif
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#ifndef ADLER32_AVX512_TPL_H_
-#define ADLER32_AVX512_TPL_H_
-
#include "../../zbuild.h"
#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
#include "../../cpu_features.h"
#include "../../fallback_builtins.h"
#include <immintrin.h>
#include "adler32_avx512_p.h"
-#include "../../adler32_fold.h"
#ifdef X86_AVX512_ADLER32
Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
#endif
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
+
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
- if (src == NULL) return 1L;
- if (len == 0) return adler;
-
rem_peel:
if (len < 64) {
/* This handles the remaining copies, just call normal adler checksum after this */
}
#endif
-#endif
* For conditions of distribution and use, see copyright notice in zlib.h
*/
+#ifdef X86_AVX512VNNI_ADLER32
+
#include "../../zbuild.h"
#include "../../adler32_p.h"
#include "../../cpu_features.h"
#include "../../fallback_builtins.h"
#include <immintrin.h>
+#include "../../adler32_fold.h"
#include "adler32_avx512_p.h"
+#include "adler32_avx2_p.h"
-#ifdef X86_AVX512VNNI_ADLER32
-Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len) {
- uint32_t sum2;
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
+
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
- /* For impossibly tiny sizes, use the smaller width versions. We still need
- * to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32
+rem_peel:
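+    /* For lengths shorter than a full 64-byte vector, defer to the narrower AVX2/SSSE3 or scalar routines. */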
if (len < 32)
- return adler32_sse41(adler, buf, len);
+#if defined(X86_SSSE3_ADLER32)
+ return adler32_ssse3(adler, src, len);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
#endif
-#ifdef X86_AVX2_ADLER32
if (len < 64)
- return adler32_avx2(adler, buf, len);
-#endif
-
- /* split Adler-32 into component sums */
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
-
- /* Only capture these corner cases if we didn't compile with SSE41 and AVX2 support
- * This should make for shorter compiled code */
-#if !defined(X86_AVX2_ADLER32) && !defined(X86_SSE41_ADLER32)
- /* in case user likes doing a byte at a time, keep it fast */
- if (UNLIKELY(len == 1))
- return adler32_len_1(adler, buf, sum2);
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (UNLIKELY(buf == NULL))
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (UNLIKELY(len < 16))
- return adler32_len_16(adler, buf, len, sum2);
+#ifdef X86_AVX2_ADLER32
+ return adler32_avx2(adler, src, len);
+#elif defined(X86_SSSE3_ADLER32)
+ return adler32_ssse3(adler, src, len);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
#endif
- /* We want to place initial adler sum at vector position 0, as it is one of the lanes that line up
- * with the sum of absolute differences' reduction sum. If we do this, we can get away with a partial,
- * less expensive horizontal sum for the vs1 component at the end. It also happens to be marginally better
- * (by a single cycle) to do this with the ancient vmovd insruction, and simply allow the register to be
- * aliased up to a 512 bit wide zmm */
- __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
- __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
-
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63, 64);
const __m512i zero = _mm512_setzero_si512();
+ __m512i vs1, vs2;
while (len >= 64) {
- int k = (len < NMAX ? (int)len : NMAX);
+ vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
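+ /* Cap each pass at NMAX bytes, the most that can be summed before the 32-bit accumulators
+  * could overflow; it is rounded down to whole 64-byte blocks and reduced modulo BASE below. */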
+ size_t k = MIN(len, NMAX);
k -= k % 64;
len -= k;
__m512i vs1_0 = vs1;
/* Remainder peeling */
if (k % 128) {
- vbuf1 = _mm512_loadu_si512(buf);
- buf += 64;
+ vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+ src += 64;
k -= 64;
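+ /* psadbw against zero sums each group of 8 bytes, yielding the per-lane byte totals folded into vs1. */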
__m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
vs1 = adler + sum(c[i])
vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
*/
- vbuf0 = _mm512_loadu_si512(buf);
- vbuf1 = _mm512_loadu_si512(buf + 64);
- buf += 128;
+ vbuf0 = _mm512_loadu_si512((__m512i*)src);
+ vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+ src += 128;
k -= 128;
__m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
vs2 = _mm512_add_epi32(vs2, vs3);
vs2 = _mm512_add_epi32(vs2, vs2_1);
- adler = partial_hsum(vs1) % BASE;
- vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler));
- sum2 = _mm512_reduce_add_epu32(vs2) % BASE;
- vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(sum2));
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ /* Process tail (len < 64). */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
+
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel_copy:
+ if (len < 32) {
+ /* This handles the remaining copies, just call normal adler checksum after this */
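+ /* A mask with the low len bits set lets the masked load/store below move the sub-32-byte tail without a scalar copy loop. */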
+ __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+ __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+ _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+#if defined(X86_SSSE3_ADLER32)
+ return adler32_ssse3(adler, src, len);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
+#endif
+ }
+
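+ /* Byte i of dot2v holds 32 - i (set_epi8 lists bytes high to low), so vpdpbusd accumulates
+  * sum((32 - i) * c[i]) for each 32-byte block into vs2; the 32 * vs1 term is added via vs3. */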
+ const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i vs1, vs2;
+
+ while (len >= 32) {
+ vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+ size_t k = MIN(len, NMAX);
+ k -= k % 32;
+ len -= k;
+ __m256i vs1_0 = vs1;
+ __m256i vs3 = _mm256_setzero_si256();
+ /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+ __m256i vs2_1 = _mm256_setzero_si256();
+ __m256i vbuf0, vbuf1;
+
+ /* Remainder peeling */
+ if (k % 64) {
+ vbuf1 = _mm256_loadu_si256((__m256i*)src);
+ _mm256_storeu_si256((__m256i*)dst, vbuf1);
+ dst += 32;
+
+ src += 32;
+ k -= 32;
+
+ __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+ vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+ /* Manually unrolled this loop by 2 for a decent amount of ILP */
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ vbuf0 = _mm256_loadu_si256((__m256i*)src);
+ vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+ _mm256_storeu_si256((__m256i*)dst, vbuf0);
+ _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+ dst += 64;
+ src += 64;
+ k -= 64;
+
+ __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+ /* multiply-add, resulting in 8 ints. Fuse with sum stage from prior versions, as we now have the dp
+ * instructions to eliminate them */
+ vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+ vs3 = _mm256_add_epi32(vs3, vs1);
+ vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
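+ /* The shift by 5 multiplies the accumulated vs1 history by 32, its per-block weight in sum2. */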
+ vs3 = _mm256_slli_epi32(vs3, 5);
+ vs2 = _mm256_add_epi32(vs2, vs3);
+ vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+ adler0 = partial_hsum256(vs1) % BASE;
+ adler1 = hsum256(vs2) % BASE;
}
+ adler = adler0 | (adler1 << 16);
+
/* Process tail (len < 64). */
- return adler32_len_16(adler, buf, len, sum2);
+ if (len) {
+ goto rem_peel_copy;
+ }
+
+ return adler;
}
#endif
-/* adler32_sse4.c -- compute the Adler-32 checksum of a data stream
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
* Adam Stylinski <kungfujesus06@gmail.com>
#ifdef X86_SSE42_ADLER32
Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
-
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
+++ /dev/null
-/* adler32_ssse3_tpl.h -- adler32 ssse3 vectorized function templates
- * Copyright (C) 2022 Adam Stylinski
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef ADLER32_SSSE3_TPL_H_
-#define ADLER32_SSSE3_TPL_H_
-
-#include "../../zbuild.h"
-#include <immintrin.h>
-#include "../../adler32_fold.h"
-#include "../../adler32_p.h"
-#include "adler32_ssse3_p.h"
-
-#ifdef COPY
-Z_INTERNAL void adler32_fold_copy_ssse3(adler32_fold *adler, uint8_t *dst, const uint8_t *src, size_t len) {
-#else
-Z_INTERNAL void adler32_fold_ssse3(adler32_fold *adler, const uint8_t *src, size_t len) {
-#endif
- uint32_t adler0, adler1;
-
- /* split Adler-32 into component sums */
- adler1 = (adler->nsums >> 16) & 0xffff;
- adler0 = adler->nsums & 0xffff;
-
- /* in case user likes doing a byte at a time, keep it fast */
- if (UNLIKELY(len == 1)) {
-#ifdef COPY
- *(dst++) = *src;
-#endif
- adler->nsums = adler32_len_1(adler0, src, adler1);
- return;
- }
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (UNLIKELY(src == NULL)) {
- adler->nsums = 1L;
- return;
- }
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (UNLIKELY(len < 16)) {
- goto sub16;
- }
-
- const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
- const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
- const __m128i dot3v = _mm_set1_epi16(1);
- const __m128i zero = _mm_setzero_si128();
-
- __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
- vbuf_0, v_sad_sum2, vsum2, vsum2_0;
-
- /* If our buffer is unaligned (likely), make the determination whether
- * or not there's enough of a buffer to consume to make the scalar, aligning
- * additions worthwhile or if it's worth it to just eat the cost of an unaligned
- * load. This is a pretty simple test, just test if 16 - the remainder + len is
- * < 16 */
- size_t max_iters = NMAX;
- size_t rem = (uintptr_t)src & 15;
- size_t align_offset = 16 - rem;
- size_t k = 0;
- if (rem) {
- if (len < 16 + align_offset) {
- /* Let's eat the cost of this one unaligned load so that
- * we don't completely skip over the vectorization. Doing
- * 16 bytes at a time unaligned is is better than 16 + <= 15
- * sums */
- vbuf = _mm_loadu_si128((__m128i*)src);
- len -= 16;
- src += 16;
-#ifdef COPY
- _mm_storeu_si128((__m128i*)dst, vbuf);
- dst += 16;
-#endif
- vs1 = _mm_cvtsi32_si128(adler0);
- vs2 = _mm_cvtsi32_si128(adler1);
- vs3 = _mm_setzero_si128();
- vs1_0 = vs1;
- goto unaligned_jmp;
- }
-
-#ifdef COPY
- memcpy(dst, src, align_offset);
- dst += align_offset;
-#endif
- for (size_t i = 0; i < align_offset; ++i) {
- adler0 += *(src++);
- adler1 += adler0;
- }
-
- /* lop off the max number of sums based on the scalar sums done
- * above */
- len -= align_offset;
- max_iters -= align_offset;
- }
-
-
- while (len >= 16) {
- vs1 = _mm_cvtsi32_si128(adler0);
- vs2 = _mm_cvtsi32_si128(adler1);
- vs3 = _mm_setzero_si128();
- vs2_0 = _mm_setzero_si128();
- vs1_0 = vs1;
-
- k = (len < max_iters ? len : max_iters);
- k -= k % 16;
- len -= k;
-
- while (k >= 32) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
- */
- vbuf = _mm_load_si128((__m128i*)src);
- vbuf_0 = _mm_load_si128((__m128i*)(src + 16));
- src += 32;
- k -= 32;
-
- v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
- v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
-#ifdef COPY
- _mm_storeu_si128((__m128i*)dst, vbuf);
- _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
- dst += 32;
-#endif
- vs1 = _mm_add_epi32(v_sad_sum1, vs1);
- vs3 = _mm_add_epi32(vs1_0, vs3);
-
- vs1 = _mm_add_epi32(v_sad_sum2, vs1);
- v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
- vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
- vs2 = _mm_add_epi32(vsum2, vs2);
- vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
- vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
- vs1_0 = vs1;
- }
-
- vs2 = _mm_add_epi32(vs2_0, vs2);
- vs3 = _mm_slli_epi32(vs3, 5);
- vs2 = _mm_add_epi32(vs3, vs2);
- vs3 = _mm_setzero_si128();
-
- while (k >= 16) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
- */
- vbuf = _mm_load_si128((__m128i*)src);
- src += 16;
- k -= 16;
-
-unaligned_jmp:
- v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
-#ifdef COPY
- _mm_storeu_si128((__m128i*)dst, vbuf);
- dst += 16;
-#endif
- vs1 = _mm_add_epi32(v_sad_sum1, vs1);
- vs3 = _mm_add_epi32(vs1_0, vs3);
- v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
- vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- vs2 = _mm_add_epi32(vsum2, vs2);
- vs1_0 = vs1;
- }
-
- vs3 = _mm_slli_epi32(vs3, 4);
- vs2 = _mm_add_epi32(vs2, vs3);
-
- /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
- * a partial reduction sum implicitly and only summing to integers in vector positions
- * 0 and 2. This saves us some contention on the shuffle port(s) */
- adler0 = partial_hsum(vs1) % BASE;
- adler1 = hsum(vs2) % BASE;
- max_iters = NMAX;
- }
-
-sub16:
-#ifdef COPY
- adler->nsums = adler32_copy_len_16(adler0, src, dst, len, adler1);
-#else
- /* Process tail (len < 16). */
- adler->nsums = adler32_len_16(adler0, src, len, adler1);
-#endif
-}
-
-#endif
#ifndef CPU_FEATURES_H_
#define CPU_FEATURES_H_
-#include "crc32_fold.h"
#include "adler32_fold.h"
+#include "crc32_fold.h"
#if defined(X86_FEATURES)
# include "arch/x86/x86_features.h"
#ifdef X86_SSSE3_ADLER32
extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
#endif
-#ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-#endif
#ifdef X86_AVX2_ADLER32
extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
-extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512_ADLER32
extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
-extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI_ADLER32
extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
#endif
+/* adler32 folding */
+#ifdef X86_SSE42_ADLER32
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX2_ADLER32
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512_ADLER32
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
+
/* CRC32 folding */
#ifdef X86_PCLMULQDQ_CRC
extern uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc);
#include "deflate.h"
#include "deflate_p.h"
#include "functable.h"
-#include <stdio.h>
const char PREFIX(deflate_copyright)[] = " deflate 1.2.11.f Copyright 1995-2016 Jean-loup Gailly and Mark Adler ";
/*
#ifdef GZIP
if (s->wrap == 2) {
- /* Ensure that there's always a reset, regardless of "wrap" */
strm->adler = functable.crc32_fold_reset(&s->crc_fold);
} else
#endif
subject to change. Applications should only use zlib.h.
*/
-#include "adler32_fold.h"
#include "zutil.h"
#include "zendian.h"
+#include "adler32_fold.h"
#include "crc32_fold.h"
/* define NO_GZIP when compiling if you want to disable gzip header and
Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
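+    /* On the first call, select the best implementation for this CPU, overwrite the function pointer, then invoke it. */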
functable.adler32_fold_copy = &adler32_fold_copy_c;
-#if (defined X86_SSE42_ADLER32) && !defined(X86_AVX512VNNI_ADLER32)
+#if (defined X86_SSE42_ADLER32)
if (x86_cpu_has_sse42)
functable.adler32_fold_copy = &adler32_fold_copy_sse42;
#endif
#ifdef X86_AVX512_ADLER32
if (x86_cpu_has_avx512)
functable.adler32_fold_copy = &adler32_fold_copy_avx512;
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+ if (x86_cpu_has_avx512vnni)
+ functable.adler32_fold_copy = &adler32_fold_copy_avx512_vnni;
#endif
return functable.adler32_fold_copy(adler, dst, src, len);
}
#endif
case DICTID:
NEEDBITS(32);
- //strm->adler = state->check = ZSWAP32(hold);
strm->adler = state->check = ZSWAP32(hold);
INITBITS();
state->mode = DICT;
#ifndef INFLATE_H_
#define INFLATE_H_
-#include "crc32_fold.h"
#include "adler32_fold.h"
+#include "crc32_fold.h"
/* define NO_GZIP when compiling if you want to disable gzip header and trailer decoding by inflate().
NO_GZIP would be used to avoid linking in the crc code when it is not needed.
add_executable(benchmark_zlib
benchmark_adler32.cc
+ benchmark_adler32_copy.cc
benchmark_compare256.cc
benchmark_crc32.cc
benchmark_main.cc
--- /dev/null
+/* benchmark_adler32_copy.cc -- benchmark adler32 (elided copy) variants
+ * Copyright (C) 2022 Nathan Moinvaziri, Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#include <benchmark/benchmark.h>
+
+extern "C" {
+# include "zbuild.h"
+# include "zutil_p.h"
+# include "cpu_features.h"
+}
+
+#define MAX_RANDOM_INTS (1024 * 1024)
+#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
+
+typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, size_t len);
+
+class adler32_copy: public benchmark::Fixture {
+private:
+ uint32_t *random_ints_src;
+ uint32_t *random_ints_dst;
+
+public:
+ void SetUp(const ::benchmark::State& state) {
+ /* Control the alignment so that we have the best case scenario for loads. With
+ * AVX512, unaligned loads can mean we're crossing a cacheline boundary at every load.
+ * And while this is a realistic scenario, it makes it difficult to compare benchmark
+ * to benchmark because one allocation could have been aligned perfectly for the loads
+ * while the subsequent one happened not to be. This is not meant to give AVX512 an advantage
+ * (indeed, all lesser SIMD implementations benefit from the aligned allocation), but to
+ * control the _consistency_ of the results */
+ random_ints_src = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+ random_ints_dst = (uint32_t *)zng_alloc(MAX_RANDOM_INTS_SIZE);
+ assert(random_ints_src != NULL);
+ assert(random_ints_dst != NULL);
+
+ for (int32_t i = 0; i < MAX_RANDOM_INTS; i++) {
+ random_ints_src[i] = rand();
+ }
+ }
+
+ void Bench(benchmark::State& state, adler32_cpy_func adler32_func) {
+ uint32_t hash = 0;
+
+ for (auto _ : state) {
+ hash = adler32_func(hash, (unsigned char *)random_ints_dst,
+ (const unsigned char*)random_ints_src, state.range(0));
+ }
+
+ benchmark::DoNotOptimize(hash);
+ }
+
+ void TearDown(const ::benchmark::State& state) {
+ zng_free(random_ints_src);
+ zng_free(random_ints_dst);
+ }
+};
+
+#define BENCHMARK_ADLER32_COPY(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+ if (!support_flag) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, fptr); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
+#define BENCHMARK_ADLER32_BASELINE_COPY(name, fptr, support_flag) \
+ BENCHMARK_DEFINE_F(adler32_copy, name)(benchmark::State& state) { \
+ if (!support_flag) { \
+ state.SkipWithError("CPU does not support " #name); \
+ } \
+ Bench(state, [](uint32_t init_sum, unsigned char *dst, \
+ const unsigned char *buf, size_t len) -> uint32_t { \
+ memcpy(dst, buf, len); \
+ return fptr(init_sum, buf, len); \
+ }); \
+ } \
+ BENCHMARK_REGISTER_F(adler32_copy, name)->Range(8192, MAX_RANDOM_INTS_SIZE);
+
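+/* The *_BASELINE_COPY variants time a separate memcpy plus the existing checksum routine, while the
+ * *_COPY variants time the fused fold-copy kernels for comparison. */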
+BENCHMARK_ADLER32_BASELINE_COPY(c, adler32_c, 1);
+
+#ifdef ARM_NEON_ADLER32
+/* If we inline this copy for neon, the function would go here */
+//BENCHMARK_ADLER32_COPY(neon, adler32_neon, arm_cpu_has_neon);
+BENCHMARK_ADLER32_BASELINE_COPY(neon_copy_baseline, adler32_neon, arm_cpu_has_neon);
+#endif
+
+#ifdef PPC_VMX_ADLER32
+//BENCHMARK_ADLER32_COPY(vmx_inline_copy, adler32_fold_copy_vmx, power_cpu_has_altivec);
+BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, power_cpu_has_altivec);
+#endif
+#ifdef POWER8_VSX_ADLER32
+//BENCHMARK_ADLER32_COPY(power8_inline_copy, adler32_fold_copy_power8, power_cpu_has_arch_2_07);
+BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, power_cpu_has_arch_2_07);
+#endif
+
+#ifdef X86_SSE42_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, x86_cpu_has_ssse3);
+BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, x86_cpu_has_sse42);
+#endif
+#ifdef X86_AVX2_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx2_baseline, adler32_avx2, x86_cpu_has_avx2);
+BENCHMARK_ADLER32_COPY(avx2, adler32_fold_copy_avx2, x86_cpu_has_avx2);
+#endif
+#ifdef X86_AVX512_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_baseline, adler32_avx512, x86_cpu_has_avx512);
+BENCHMARK_ADLER32_COPY(avx512, adler32_fold_copy_avx512, x86_cpu_has_avx512);
+#endif
+#ifdef X86_AVX512VNNI_ADLER32
+BENCHMARK_ADLER32_BASELINE_COPY(avx512_vnni_baseline, adler32_avx512_vnni, x86_cpu_has_avx512vnni);
+BENCHMARK_ADLER32_COPY(avx512_vnni, adler32_fold_copy_avx512_vnni, x86_cpu_has_avx512vnni);
+#endif
-DX86_FEATURES \
-DX86_PCLMULQDQ_CRC \
-DX86_SSE2 \
- -DX86_SSE42_ADLER32 \
+ -DX86_SSE42_ADLER32 \
-DX86_SSE42_CRC_INTRIN \
-DX86_SSE42_CRC_HASH \
- -DX86_SSSE3_ADLER32 \
+ -DX86_SSSE3_ADLER32 \
-DX86_AVX2 \
- -DX86_AVX2_ADLER32 \
+ -DX86_AVX2_ADLER32 \
-DX86_AVX_CHUNKSET \
- -DX86_SSE2_CHUNKSET \
- #
+ -DX86_SSE2_CHUNKSET
+
LDFLAGS = -nologo -debug -incremental:no -opt:ref -manifest
ARFLAGS = -nologo
RCFLAGS = /dWIN32 /r
OBJS = \
adler32.obj \
- adler32_avx2.obj \
- adler32_avx512.obj \
- adler32_avx512_vnni.obj \
- adler32_sse42.obj \
- adler32_ssse3.obj \
- adler32_fold.obj \
+ adler32_avx2.obj \
+ adler32_avx512.obj \
+ adler32_avx512_vnni.obj \
+ adler32_sse42.obj \
+ adler32_ssse3.obj \
+ adler32_fold.obj \
chunkset.obj \
chunkset_avx.obj \
chunkset_sse2.obj \