#include "adler32_p.h"
/* ========================================================================= */
-Z_INTERNAL uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len) {
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const unsigned char *buf, uint64_t len) {
uint32_t sum2;
unsigned n;
/* adler32_fold.c -- adler32 folding interface
- * Copyright (C) 2022 Adam Stylinski
+ * Copyright (C) 2022 Adam Stylinski
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "functable.h"
#include "adler32_fold.h"
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
- memcpy(dst, src, len);
- return functable.adler32(adler, src, len);
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+ adler = functable.adler32(adler, src, len);
+ while (len > SIZE_MAX) {
+ memcpy(dst, src, SIZE_MAX);
+ dst += SIZE_MAX;
+ src += SIZE_MAX;
+ len -= SIZE_MAX;
+ }
+ if (len) {
+ memcpy(dst, src, (size_t)len);
+ }
+ return adler;
}
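/* A minimal standalone sketch of the chunked copy above (illustrative, not part of
 * this patch): memcpy takes a size_t, so a uint64_t length has to be split into
 * SIZE_MAX-sized pieces on targets where size_t is narrower than 64 bits. The
 * helper name chunked_memcpy is hypothetical. */
#include <stdint.h>
#include <string.h>

static void chunked_memcpy(uint8_t *dst, const uint8_t *src, uint64_t len) {
    while (len > SIZE_MAX) {              /* only reachable when sizeof(size_t) < 8 */
        memcpy(dst, src, SIZE_MAX);
        dst += SIZE_MAX;
        src += SIZE_MAX;
        len -= SIZE_MAX;
    }
    if (len)
        memcpy(dst, src, (size_t)len);    /* remaining tail fits in one memcpy */
}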
#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
#endif
return adler | (sum2 << 16);
}
-static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf, uint64_t len, uint32_t sum2) {
while (len) {
--len;
adler += *buf++;
return adler | (sum2 << 16);
}
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const unsigned char *buf, uint8_t *dst, uint64_t len, uint32_t sum2) {
while (len--) {
- *dst = *buf++;
+ *dst = *buf++;
adler += *dst++;
sum2 += adler;
}
return adler | (sum2 << 16);
}
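/* For reference, a self-contained scalar Adler-32 (illustrative, not part of this
 * patch) showing what the helpers above and the SIMD paths below compute: s1
 * accumulates the bytes (starting from 1 for a fresh checksum), s2 accumulates the
 * running s1 values, both reduced mod 65521, recombined as s1 | (s2 << 16).
 * ADLER_BASE and ADLER_NMAX stand in for the BASE/NMAX constants from adler32_p.h. */
#include <stdint.h>

#define ADLER_BASE 65521U   /* largest prime smaller than 65536 */
#define ADLER_NMAX 5552U    /* largest block for which the deferred 32-bit sums cannot overflow */

static uint32_t adler32_reference(uint32_t adler, const unsigned char *buf, uint64_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    while (len > 0) {
        uint64_t n = len < ADLER_NMAX ? len : ADLER_NMAX;
        len -= n;
        while (n--) {
            s1 += *buf++;     /* byte sum */
            s2 += s1;         /* sum of running s1 values */
        }
        s1 %= ADLER_BASE;     /* deferred modulus, safe because of ADLER_NMAX */
        s2 %= ADLER_BASE;
    }
    return s1 | (s2 << 16);   /* same recombination as the helpers above */
}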
-static inline uint32_t adler32_len_64(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+static inline uint32_t adler32_len_64(uint32_t adler, const unsigned char *buf, uint64_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
while (len >= 16) {
len -= 16;
#include "../../adler32_p.h"
#include "../../fallback_builtins.h"
-static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
+static void NEON_accum32(uint32_t *s, const unsigned char *buf, uint64_t len) {
static const uint16_t ALIGNED_(16) taps[64] = {
64, 63, 62, 61, 60, 59, 58, 57,
56, 55, 54, 53, 52, 51, 50, 49,
s[1] = vget_lane_u32(as, 1);
}
-static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
+static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, uint64_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
}
}
-uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
+uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, uint64_t len) {
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
return __a;
}
-uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len) {
+uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, uint64_t len) {
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;
#define vmx_zero() (vec_splat_u32(0))
-static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, size_t len) {
+static inline void vmx_handle_head_or_tail(uint32_t *pair, const unsigned char *buf, uint64_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
}
}
-static void vmx_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
+static void vmx_accum32(uint32_t *s, const unsigned char *buf, uint64_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
vec_ste(s2acc, 0, s+1);
}
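/* The descending taps above (and the NEON taps table earlier) come from unrolling
 * the s2 recurrence over one block: processing n bytes from sums (s1, s2) gives
 * s1' = s1 + sum(buf[i]) and s2' = s2 + n*s1 + sum((n - i) * buf[i]), so each byte
 * is weighted by a tap of n, n-1, ..., 1. A scalar model of that identity follows
 * (illustrative only, names are mine); in the vector kernels the n*s1 term appears
 * to be what the separately accumulated copy of vs1 accounts for. */
#include <stdint.h>

static void adler_block_update(uint32_t *s1, uint32_t *s2,
                               const unsigned char *buf, unsigned n) {
    uint32_t byte_sum = 0, weighted = 0;
    unsigned i;
    for (i = 0; i < n; ++i) {
        byte_sum += buf[i];
        weighted += (n - i) * buf[i];   /* taps[i] * buf[i] */
    }
    *s2 += n * *s1 + weighted;
    *s1 += byte_sum;
}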
-uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len) {
+uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, uint64_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
#include "adler32_avx2_p.h"
#ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
-extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, uint64_t len);
+
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#else
#endif
#ifdef COPY
-Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
#else
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, uint64_t len) {
#endif
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
- adler0 = adler & 0xffff;
+ adler0 = adler & 0xffff;
rem_peel:
if (len < 16) {
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
- size_t k = MIN(len, NMAX);
+ uint64_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
/* The compiler is generating the following sequence for this integer modulus
* when done the scalar way, in GPRs:
-
+
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
...
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
- imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
+ imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
shr $0x2f,%rsi // shift right by 47
- imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+ imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
...
// repeats for each element with vpextract instructions
This is tricky with AVX2 for a number of reasons:
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
- back down to 32 bit precision later (there is in AVX512)
+ back down to 32 bit precision later (there is in AVX512)
3.) Full width integer multiplications aren't cheap
- We can, however, use a relatively cheap sequence for horizontal sums.
+ We can, however, use a relatively cheap sequence for horizontal sums.
Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
performed on the maximum possible inputs before overflow
*/
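/* A quick check of the overflow argument above (illustrative, not part of the
 * patch): with worst-case inputs, i.e. both sums starting at BASE-1 and every byte
 * equal to 0xff, the running 32-bit accumulators stay below UINT32_MAX for up to
 * NMAX = 5552 bytes, which is why a single scalar modulus after the horizontal sum
 * is sufficient. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint32_t base = 65521, nmax = 5552;
    uint64_t s1 = base - 1, s2 = base - 1;   /* worst-case starting sums */
    for (uint32_t n = 0; n < nmax; ++n) {
        s1 += 0xff;                          /* worst-case byte value */
        s2 += s1;
    }
    /* s2 ends at 4294690200, just below UINT32_MAX = 4294967295 */
    printf("s1 = %llu, s2 = %llu\n", (unsigned long long)s1, (unsigned long long)s2);
    return 0;
}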
-
+
/* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
#ifdef X86_AVX512_ADLER32
#ifdef COPY
-Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
#else
-Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, uint64_t len) {
#endif
if (src == NULL) return 1L;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
- adler0 = adler & 0xffff;
+ adler0 = adler & 0xffff;
rem_peel:
if (len < 64) {
#elif defined(X86_SSSE3_ADLER32)
return adler32_ssse3(adler, src, len);
#else
- return adler32_len_16(adler0, src, len, adler1);
+ return adler32_len_16(adler0, src, len, adler1);
#endif
}
56, 57, 58, 59, 60, 61, 62, 63, 64);
const __m512i dot3v = _mm512_set1_epi16(1);
const __m512i zero = _mm512_setzero_si512();
- size_t k;
+ uint64_t k;
while (len >= 64) {
__m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
goto rem_peel;
}
- return adler;
+ return adler;
}
#endif
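/* The rem_peel label and the goto back to it implement remainder peeling: the wide
 * loop only consumes whole vector blocks, and whatever is left afterwards is sent
 * back through the small-length entry, which dispatches to a narrower kernel or the
 * scalar helper. An illustrative scalar-only sketch of that control flow follows
 * (hypothetical function, stand-in arithmetic in place of the SIMD block body). */
#include <stdint.h>

static uint32_t adler32_wide_sketch(uint32_t adler, const unsigned char *src, uint64_t len) {
    uint32_t adler0 = adler & 0xffff;
    uint32_t adler1 = (adler >> 16) & 0xffff;

rem_peel:
    if (len < 64) {
        /* small input or leftover tail: scalar loop stands in for the
         * adler32_len_16 / SSSE3 fallbacks used by the real code */
        while (len--) {
            adler0 += *src++;
            adler1 += adler0;
        }
        return (adler0 % 65521U) | ((adler1 % 65521U) << 16);
    }

    while (len >= 64) {
        unsigned i;
        /* stand-in for one vectorized 64-byte block */
        for (i = 0; i < 64; ++i) {
            adler0 += src[i];
            adler1 += adler0;
        }
        adler0 %= 65521U;
        adler1 %= 65521U;
        src += 64;
        len -= 64;
    }

    adler = adler0 | (adler1 << 16);
    if (len) {
        goto rem_peel;   /* peel the sub-block remainder */
    }
    return adler;
}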
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
-Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, uint64_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
- adler0 = adler & 0xffff;
+ adler0 = adler & 0xffff;
rem_peel:
if (len < 32)
#if defined(X86_SSSE3_ADLER32)
return adler32_ssse3(adler, src, len);
#else
- return adler32_len_16(adler0, src, len, adler1);
+ return adler32_len_16(adler0, src, len, adler1);
#endif
if (len < 64)
#elif defined(X86_SSSE3_ADLER32)
return adler32_ssse3(adler, src, len);
#else
- return adler32_len_16(adler0, src, len, adler1);
+ return adler32_len_16(adler0, src, len, adler1);
#endif
const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
while (len >= 64) {
vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
- size_t k = MIN(len, NMAX);
+ uint64_t k = MIN(len, NMAX);
k -= k % 64;
len -= k;
__m512i vs1_0 = vs1;
goto rem_peel;
}
- return adler;
+ return adler;
}
-Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
- adler0 = adler & 0xffff;
+ adler0 = adler & 0xffff;
rem_peel_copy:
if (len < 32) {
#if defined(X86_SSSE3_ADLER32)
return adler32_ssse3(adler, src, len);
#else
- return adler32_len_16(adler0, src, len, adler1);
+ return adler32_len_16(adler0, src, len, adler1);
#endif
}
while (len >= 32) {
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
- size_t k = MIN(len, NMAX);
+ uint64_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
__m256i vs1_0 = vs1;
goto rem_peel_copy;
}
- return adler;
+ return adler;
}
#endif
#ifdef X86_SSE42_ADLER32
-Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
- adler0 = adler & 0xffff;
+ adler0 = adler & 0xffff;
rem_peel:
if (len < 16) {
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i dot3v = _mm_set1_epi16(1);
- size_t k;
+ uint64_t k;
while (len >= 16) {
#include <immintrin.h>
-Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len) {
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, uint64_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
* additions worthwhile or if it's worth it to just eat the cost of an unaligned
* load. This is a pretty simple test, just test if 16 - the remainder + len is
* < 16 */
- size_t max_iters = NMAX;
- size_t rem = (uintptr_t)buf & 15;
- size_t align_offset = 16 - rem;
- size_t k = 0;
+ uint64_t max_iters = NMAX;
+ uint64_t rem = (uintptr_t)buf & 15;
+ uint64_t align_offset = 16 - rem;
+ uint64_t k = 0;
if (rem) {
if (len < 16 + align_offset) {
/* Let's eat the cost of this one unaligned load so that
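/* The decision above, in scalar terms (illustrative, names are mine): the comment's
 * "16 - the remainder + len" test corresponds to checking whether, after spending
 * align_offset = 16 - rem bytes to reach a 16-byte boundary, at least one full
 * 16-byte block would remain; if not, it is cheaper to just issue the single
 * unaligned load. */
#include <stdbool.h>
#include <stdint.h>

static bool worth_aligning(const unsigned char *buf, uint64_t len) {
    uint64_t rem = (uintptr_t)buf & 15;   /* bytes past the previous 16-byte boundary */
    uint64_t align_offset = 16 - rem;     /* scalar bytes needed to re-align */
    if (rem == 0)
        return false;                     /* already aligned, nothing to peel off */
    return len >= 16 + align_offset;      /* a full aligned block is still left */
}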
extern void cpu_check_features(void);
/* adler32 */
-typedef uint32_t (*adler32_func)(uint32_t adler, const unsigned char *buf, size_t len);
+typedef uint32_t (*adler32_func)(uint32_t adler, const unsigned char *buf, uint64_t len);
-extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t len);
+extern uint32_t adler32_c(uint32_t adler, const unsigned char *buf, uint64_t len);
#ifdef ARM_NEON_ADLER32
-extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len);
+extern uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, uint64_t len);
#endif
#ifdef PPC_VMX_ADLER32
-extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, size_t len);
+extern uint32_t adler32_vmx(uint32_t adler, const unsigned char *buf, uint64_t len);
#endif
#ifdef X86_SSSE3_ADLER32
-extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, uint64_t len);
#endif
#ifdef X86_AVX2_ADLER32
-extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len);
+extern uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, uint64_t len);
#endif
#ifdef X86_AVX512_ADLER32
-extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, size_t len);
+extern uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, uint64_t len);
#endif
#ifdef X86_AVX512VNNI_ADLER32
-extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, size_t len);
+extern uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf, uint64_t len);
#endif
#ifdef POWER8_VSX_ADLER32
-extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, size_t len);
+extern uint32_t adler32_power8(uint32_t adler, const unsigned char* buf, uint64_t len);
#endif
/* adler32 folding */
#ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
#endif
#ifdef X86_AVX2_ADLER32
-extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
#endif
#ifdef X86_AVX512_ADLER32
-extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
#endif
#ifdef X86_AVX512VNNI_ADLER32
-extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
#endif
/* CRC32 folding */
return functable.longest_match_slow(s, cur_match);
}
-Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, size_t len) {
+Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const unsigned char *buf, uint64_t len) {
// Initialize default
functable.adler32 = &adler32_c;
cpu_check_features();
return functable.adler32(adler, buf, len);
}
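/* The stub functions here implement lazy dispatch: the first call runs the CPU
 * feature checks, overwrites the functable entry with the best available kernel,
 * and then forwards the call through the table, so subsequent calls skip the
 * selection entirely. A generic sketch of that pattern (hypothetical names, a
 * single entry, no thread-safety handling): */
#include <stdint.h>

typedef uint32_t (*checksum_fn)(uint32_t seed, const unsigned char *buf, uint64_t len);

static uint32_t checksum_generic(uint32_t seed, const unsigned char *buf, uint64_t len) {
    while (len--)
        seed = seed * 31 + *buf++;        /* placeholder kernel, not Adler-32 */
    return seed;
}

static uint32_t checksum_stub(uint32_t seed, const unsigned char *buf, uint64_t len);

static struct { checksum_fn checksum; } table = { checksum_stub };

static uint32_t checksum_stub(uint32_t seed, const unsigned char *buf, uint64_t len) {
    /* first call: select an implementation, patch the table, forward the call */
    table.checksum = checksum_generic;    /* a real table would pick per CPU features */
    return table.checksum(seed, buf, len);
}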
-Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
functable.adler32_fold_copy = &adler32_fold_copy_c;
#if (defined X86_SSE42_ADLER32)
if (x86_cpu_has_sse42)
#include "adler32_fold.h"
struct functable_s {
- uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, size_t len);
- uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+ uint32_t (* adler32) (uint32_t adler, const unsigned char *buf, uint64_t len);
+ uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
uint32_t (* crc32) (uint32_t crc, const unsigned char *buf, uint64_t len);
uint32_t (* crc32_fold_reset) (crc32_fold *crc);
void (* crc32_fold_copy) (crc32_fold *crc, uint8_t *dst, const uint8_t *src, uint64_t len);
#define MAX_RANDOM_INTS (1024 * 1024)
#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
-typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, size_t len);
+typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const unsigned char *buf, uint64_t len);
class adler32_copy: public benchmark::Fixture {
private:
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, [](uint32_t init_sum, unsigned char *dst, \
- const unsigned char *buf, size_t len) -> uint32_t { \
+ const unsigned char *buf, uint64_t len) -> uint32_t { \
memcpy(dst, buf, len); \
return fptr(init_sum, buf, len); \
}); \