#include "adler32_p.h"
/* ========================================================================= */
-Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, uint64_t len) {
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
unsigned n;
#include <limits.h>
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
adler = functable.adler32(adler, src, len);
-/* Test that we don't try to copy more than actually fits in available address space */
-#if INTPTR_MAX > SSIZE_MAX
- while (len > SSIZE_MAX) {
- memcpy(dst, src, SSIZE_MAX);
- dst += SSIZE_MAX;
- src += SSIZE_MAX;
- len -= SSIZE_MAX;
- }
-#endif
- if (len) {
- memcpy(dst, src, (size_t)len);
- }
+ memcpy(dst, src, len);
return adler;
}
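/*
 * For reference: with a size_t length the fold-copy reduces to "checksum the
 * source, then one memcpy". A minimal usage sketch, assuming only the
 * prototype of the function above (copy_with_checksum is an illustrative
 * wrapper name, not a zlib-ng symbol; 1 is the standard Adler-32 seed):
 */
#include <stddef.h>
#include <stdint.h>

uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);

/* Copies src into dst and returns the Adler-32 of the copied bytes. */
static uint32_t copy_with_checksum(uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_fold_copy_c(1, dst, src, len);
}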
#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
return adler | (sum2 << 16);
}
-static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
while (len) {
--len;
adler += *buf++;
return adler | (sum2 << 16);
}
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
while (len--) {
*dst = *buf++;
adler += *dst++;
return adler | (sum2 << 16);
}
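/*
 * Context for the helpers in this header: they all maintain the two Adler-32
 * component sums, just unrolled to different widths. A plain, unoptimized
 * reference of the recurrence they implement (ADLER_BASE = 65521 is the
 * standard Adler-32 modulus; adler32_reference is an illustrative name, not a
 * zlib-ng function):
 */
#include <stddef.h>
#include <stdint.h>

#define ADLER_BASE 65521u   /* largest prime smaller than 65536 */

/* s1 = 1 + sum of all bytes, s2 = sum of the running s1 values, both reduced
 * modulo ADLER_BASE; the 32-bit checksum is (s2 << 16) | s1. */
static uint32_t adler32_reference(uint32_t adler, const uint8_t *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    for (size_t i = 0; i < len; ++i) {
        s1 = (s1 + buf[i]) % ADLER_BASE;
        s2 = (s2 + s1) % ADLER_BASE;
    }
    return (s2 << 16) | s1;
}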
-static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
while (len >= 16) {
len -= 16;
#include "../../zbuild.h"
#include "../../adler32_p.h"
-static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
+static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
static const uint16_t ALIGNED_(16) taps[64] = {
64, 63, 62, 61, 60, 59, 58, 57,
56, 55, 54, 53, 52, 51, 50, 49,
uint16x8_t s2_4, s2_5, s2_6, s2_7;
s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
- uint64_t num_iter = len >> 2;
+ size_t num_iter = len >> 2;
int rem = len & 3;
- for (uint64_t i = 0; i < num_iter; ++i) {
+ for (size_t i = 0; i < num_iter; ++i) {
uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
s[1] = vget_lane_u32(as, 1);
}
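/*
 * The taps table above holds the per-byte weights for the s2 sum: within a
 * 64-byte block, byte i contributes (64 - i) copies of itself to s2 once the
 * whole block has been folded in. A scalar sketch of that blockwise update
 * (illustrative only; how zlib-ng splits this work between NEON_accum32 and
 * its caller may differ in detail):
 */
#include <stdint.h>

/* One 64-byte block: s1' = s1 + sum(buf[i]),
 *                    s2' = s2 + 64*s1 + sum((64 - i) * buf[i]). */
static void accum_block64_scalar(uint32_t *s, const uint8_t *buf) {
    uint32_t s1 = s[0], s2 = s[1];
    s2 += 64 * s1;
    for (int i = 0; i < 64; ++i) {
        s1 += buf[i];
        s2 += (uint32_t)(64 - i) * buf[i];
    }
    s[0] = s1;
    s[1] = s2;
}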
-static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
+static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
}
}
-uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
return __a;
}
-uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;
#define vmx_zero() (vec_splat_u32(0))
-static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
+static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
}
}
-static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
vec_ste(s2acc, 0, s+1);
}
-uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
#include "adler32_avx2_p.h"
#ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
-extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
#define sub32(a, b, c) adler32_ssse3(a, b, c)
#endif
#ifdef COPY
-Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
#else
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
#endif
if (src == NULL) return 1L;
if (len == 0) return adler;
__m256i vs1_0 = vs1;
__m256i vs3 = _mm256_setzero_si256();
- uint64_t k = MIN(len, NMAX);
+ size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
#ifdef X86_AVX512_ADLER32
#ifdef COPY
-Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
#else
-Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
#endif
if (src == NULL) return 1L;
56, 57, 58, 59, 60, 61, 62, 63, 64);
const __m512i dot3v = _mm512_set1_epi16(1);
const __m512i zero = _mm512_setzero_si512();
- uint64_t k;
+ size_t k;
while (len >= 64) {
__m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
#include "adler32_avx512_p.h"
#include "adler32_avx2_p.h"
-Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
while (len >= 64) {
vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
- uint64_t k = MIN(len, NMAX);
+ size_t k = MIN(len, NMAX);
k -= k % 64;
len -= k;
__m512i vs1_0 = vs1;
return adler;
}
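/*
 * The MIN(len, NMAX) chunking used by these SIMD kernels defers the modulo
 * reduction: NMAX (5552, from zlib) is the largest block length for which the
 * unreduced sums still fit in 32 bits. A small standalone check of that
 * bound, using zlib's definitions of BASE and NMAX:
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BASE 65521u   /* largest prime below 2^16 */
#define NMAX 5552u    /* largest n with 255*n*(n+1)/2 + (n+1)*(BASE-1) <= 2^32 - 1 */

int main(void) {
    /* Worst case: start from already-reduced sums s1 = s2 = BASE-1 and feed
     * NMAX bytes of 0xff; the s2 accumulator must not overflow before the
     * next reduction. */
    uint64_t worst = 255ull * NMAX * (NMAX + 1) / 2 + (uint64_t)(NMAX + 1) * (BASE - 1);
    uint64_t next  = 255ull * (NMAX + 1) * (NMAX + 2) / 2 + (uint64_t)(NMAX + 2) * (BASE - 1);
    assert(worst <= UINT32_MAX);   /* holds: 4294690200 <= 4294967295 */
    assert(next > UINT32_MAX);     /* one extra byte per block would overflow */
    printf("NMAX bound ok: %llu <= %llu\n",
           (unsigned long long)worst, (unsigned long long)UINT32_MAX);
    return 0;
}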
-Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
if (src == NULL) return 1L;
if (len == 0) return adler;
while (len >= 32) {
vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
- uint64_t k = MIN(len, NMAX);
+ size_t k = MIN(len, NMAX);
k -= k % 32;
len -= k;
__m256i vs1_0 = vs1;
#ifdef X86_SSE42_ADLER32
-Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
uint32_t adler0, adler1;
adler1 = (adler >> 16) & 0xffff;
adler0 = adler & 0xffff;
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i dot3v = _mm_set1_epi16(1);
- uint64_t k;
+ size_t k;
while (len >= 16) {
#include <immintrin.h>
-Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, uint64_t len) {
+Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
/* split Adler-32 into component sums */
* additions worthwhile or if it's worth it to just eat the cost of an unaligned
* load. This is a pretty simple test, just test if 16 - the remainder + len is
* < 16 */
- uint64_t max_iters = NMAX;
- uint64_t rem = (uintptr_t)buf & 15;
- uint64_t align_offset = 16 - rem;
- uint64_t k = 0;
+ size_t max_iters = NMAX;
+ size_t rem = (uintptr_t)buf & 15;
+ size_t align_offset = 16 - rem;
+ size_t k = 0;
if (rem) {
if (len < 16 + align_offset) {
/* Let's eat the cost of this one unaligned load so that
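/*
 * The prologue above decides between a scalar head loop (to reach 16-byte
 * alignment) and simply issuing one unaligned load when the input is short.
 * A hypothetical helper expressing the same test (prefer_unaligned_load is an
 * illustrative name, not part of zlib-ng):
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* True when fewer than one full aligned 16-byte vector would remain after
 * skipping ahead to the next 16-byte boundary. */
static bool prefer_unaligned_load(const uint8_t *buf, size_t len) {
    size_t rem = (uintptr_t)buf & 15;      /* bytes past the previous 16-byte boundary */
    size_t align_offset = 16 - rem;        /* bytes needed to reach the next boundary */
    return rem != 0 && len < 16 + align_offset;
}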
extern void cpu_check_features(void);
/* adler32 */
-typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, uint64_t len);
+typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);
-extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len);
#ifdef ARM_NEON_ADLER32
-extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef PPC_VMX_ADLER32
-extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_SSSE3_ADLER32
-extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
-extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_avx2(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512_ADLER32
-extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_avx512(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef X86_AVX512VNNI_ADLER32
-extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *buf, size_t len);
#endif
#ifdef POWER8_VSX_ADLER32
-extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, uint64_t len);
+extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
#endif
/* adler32 folding */
#ifdef X86_SSE42_ADLER32
-extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX2_ADLER32
-extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512_ADLER32
-extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
#ifdef X86_AVX512VNNI_ADLER32
-extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+extern uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
#endif
/* CRC32 folding */
return functable.longest_match_slow(s, cur_match);
}
-Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const uint8_t *buf, uint64_t len) {
+Z_INTERNAL uint32_t adler32_stub(uint32_t adler, const uint8_t *buf, size_t len) {
// Initialize default
functable.adler32 = &adler32_c;
cpu_check_features();
return functable.adler32(adler, buf, len);
}
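/*
 * The stub above implements lazy one-time dispatch: the functable entry
 * initially points at a stub that runs CPU feature detection, installs the
 * best implementation, and forwards the first call. A reduced, hypothetical
 * sketch of the same pattern (adler32_impl, adler32_dispatch and
 * adler32_generic are illustrative names, not zlib-ng symbols):
 */
#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*adler32_func)(uint32_t adler, const uint8_t *buf, size_t len);

/* Placeholder target; zlib-ng would install adler32_c or a SIMD variant here. */
static uint32_t adler32_generic(uint32_t adler, const uint8_t *buf, size_t len) {
    (void)buf; (void)len;
    return adler;
}

static uint32_t adler32_dispatch(uint32_t adler, const uint8_t *buf, size_t len);

/* All callers go through this pointer; only the very first call pays for dispatch. */
static adler32_func adler32_impl = adler32_dispatch;

static uint32_t adler32_dispatch(uint32_t adler, const uint8_t *buf, size_t len) {
    adler32_impl = adler32_generic;   /* select once, e.g. based on CPU feature flags */
    return adler32_impl(adler, buf, len);
}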
-Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_stub(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
functable.adler32_fold_copy = &adler32_fold_copy_c;
#if (defined X86_SSE42_ADLER32)
if (x86_cpu_has_sse42)
#include "adler32_fold.h"
struct functable_s {
- uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, uint64_t len);
- uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+ uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len);
+ uint32_t (* adler32_fold_copy) (uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
uint32_t (* crc32) (uint32_t crc, const uint8_t *buf, size_t len);
uint32_t (* crc32_fold_reset) (struct crc32_fold_s *crc);
void (* crc32_fold_copy) (struct crc32_fold_s *crc, uint8_t *dst, const uint8_t *src, size_t len);
#define MAX_RANDOM_INTS (1024 * 1024)
#define MAX_RANDOM_INTS_SIZE (MAX_RANDOM_INTS * sizeof(uint32_t))
-typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, uint64_t len);
+typedef uint32_t (*adler32_cpy_func)(uint32_t adler, unsigned char *dst, const uint8_t *buf, size_t len);
class adler32_copy: public benchmark::Fixture {
private:
state.SkipWithError("CPU does not support " #name); \
} \
Bench(state, [](uint32_t init_sum, unsigned char *dst, \
- const uint8_t *buf, uint64_t len) -> uint32_t { \
+ const uint8_t *buf, size_t len) -> uint32_t { \
memcpy(dst, buf, (size_t)len); \
return fptr(init_sum, buf, len); \
}); \