#define ADLER_DO8(sum1, sum2, buf, i) {ADLER_DO4(sum1, sum2, buf, i); ADLER_DO4(sum1, sum2, buf, i+4);} /* two ADLER_DO4 expansions, at offsets i and i+4 */
#define ADLER_DO16(sum1, sum2, buf) {ADLER_DO8(sum1, sum2, buf, 0); ADLER_DO8(sum1, sum2, buf, 8);} /* two ADLER_DO8 expansions, at offsets 0 and 8 */
-static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_t sum2) {
- adler += buf[0];
+static inline uint32_t adler32_copy_len_1(uint32_t adler, uint8_t *dst, const uint8_t *buf, uint32_t sum2, const int COPY) { /* single-byte step: adds *buf to adler, folds adler into sum2, and stores the byte to dst when COPY is non-zero */
+ uint8_t c = *buf; /* read the source byte once; used for both the optional store and the sum */
+ if (COPY) { /* dst is dereferenced only on this path; the checksum-only call sites in this patch pass NULL with COPY = 0 */
+ *dst = c;
+ }
+ adler += c;
adler %= BASE;
sum2 += adler; /* sum2 is advanced by the already-reduced adler */
sum2 %= BASE;
return adler | (sum2 << 16); /* both sums packed into one word: adler OR'd with sum2 shifted left by 16 */
}
-static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
- while (len) {
- --len;
- adler += *buf++;
- sum2 += adler;
- }
- adler %= BASE;
- sum2 %= BASE; /* only added so many BASE's */
- /* return recombined sums */
- return adler | (sum2 << 16);
-}
-
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
+static inline uint32_t adler32_copy_len_16(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) { /* byte-at-a-time loop over len bytes with an optional copy to dst; note dst now precedes buf in the parameter list, the reverse of the removed signature */
while (len--) {
- *dst = *buf++;
- adler += *dst++;
+ uint8_t c = *buf++; /* the sum now uses this local instead of reading the byte back through dst */
+ if (COPY) { /* dst is written and advanced only when COPY is non-zero */
+ *dst++ = c;
+ }
+ adler += c;
sum2 += adler;
}
adler %= BASE;
return adler | (sum2 << 16); /* NOTE(review): the removed adler32_len_16 also did `sum2 %= BASE` before packing; no such line is visible in this view -- confirm it is still present as unchanged context */
}
-static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
+static inline uint32_t adler32_copy_len_64(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, uint32_t sum2, const int COPY) { /* checksums buf in 16-byte steps plus a tail, then copies the whole input to dst in one memcpy when COPY is non-zero */
+ const uint8_t *src = buf; /* entry values of buf and len, saved for the memcpy at the end */
+ const size_t src_len = len;
#ifdef UNROLL_MORE
while (len >= 16) {
len -= 16;
#endif
}
/* Process tail (len < 16). */
- return adler32_len_16(adler, buf, len, sum2);
+ adler = adler32_copy_len_16(adler, NULL, buf, len, sum2, 0); /* tail is checksummed only: dst = NULL, COPY = 0 */
+ if (COPY) {
+ memcpy(dst, src, src_len); /* single bulk copy of src_len bytes starting at the saved src pointer */
+ }
+ return adler;
}
#endif /* ADLER32_P_H */
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
- if (len == 1) {
- if (COPY)
- *dst = *src;
- return adler32_len_1(adler, src, sum2);
- }
+ if (len == 1)
+ return adler32_copy_len_1(adler, dst, src, sum2, COPY);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (src == NULL)
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
- if (len < 16) {
- if (COPY)
- return adler32_copy_len_16(adler, src, dst, len, sum2);
- else
- return adler32_len_16(adler, src, len, sum2);
- }
+ if (len < 16)
+ return adler32_copy_len_16(adler, dst, src, len, sum2, COPY);
uint32_t pair[2];
int n = NMAX;
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_len_1(adler, buf, sum2);
+ return adler32_copy_len_1(adler, NULL, buf, sum2, 0);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
- return adler32_len_16(adler, buf, len, sum2);
+ return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
}
/* do remaining bytes (less than NMAX, still just one modulo) */
- return adler32_len_64(adler, buf, len, sum2);
+ return adler32_copy_len_64(adler, NULL, buf, len, sum2, 0);
}
Z_INTERNAL uint32_t adler32_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
rem_peel:
if (len < 16) {
- if (COPY) {
- return adler32_copy_len_16(adler0, src, dst, len, adler1);
- } else {
- return adler32_len_16(adler0, src, len, adler1);
- }
+ return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY);
} else if (len < 32) {
if (COPY) {
return adler32_copy_lsx(adler, dst, src, len);
adler0 = adler & 0xffff;
rem_peel:
- if (len < 16) {
- if (COPY) {
- return adler32_copy_len_16(adler0, src, dst, len, adler1);
- } else {
- return adler32_len_16(adler0, src, len, adler1);
- }
- }
+ if (len < 16)
+ return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY);
__m128i vbuf, vbuf_0;
__m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_len_1(s1, buf, s2);
+ return adler32_copy_len_1(s1, NULL, buf, s2, 0);
/* If buffer is empty or len=0 we need to return adler initial value. */
if (UNLIKELY(buf == NULL))
/* This is faster than VSX code for len < 64. */
if (len < 64)
- return adler32_len_64(s1, buf, len, s2);
+ return adler32_copy_len_64(s1, NULL, buf, len, s2, 0);
/* Use POWER VSX instructions for len >= 64. */
const vector unsigned int v_zeros = { 0 };
s2 = vs2[0] % BASE;
/* Process tail (len < 16). */
- return adler32_len_16(s1, buf, len, s2);
+ return adler32_copy_len_16(s1, NULL, buf, len, s2, 0);
}
#endif /* POWER8_VSX */
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_len_1(adler, buf, sum2);
+ return adler32_copy_len_1(adler, NULL, buf, sum2, 0);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
- return adler32_len_16(adler, buf, len, sum2);
+ return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
// Align buffer
unsigned int al = 0;
adler &= 0xffff;
/* in case user likes doing a byte at a time, keep it fast */
- if (len == 1) {
- if (COPY) memcpy(dst, src, 1);
- return adler32_len_1(adler, src, sum2);
- }
+ if (len == 1)
+ return adler32_copy_len_1(adler, dst, src, sum2, COPY);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (src == NULL)
return 1L;
/* in case short lengths are provided, keep it somewhat fast */
- if (len < 16) {
- if (COPY) memcpy(dst, src, len);
- return adler32_len_16(adler, src, len, sum2);
- }
+ if (len < 16)
+ return adler32_copy_len_16(adler, dst, src, len, sum2, COPY);
size_t left = len;
size_t vl = __riscv_vsetvlmax_e8m1();
rem_peel:
if (len < 16) {
- if (COPY) {
- return adler32_copy_len_16(adler0, src, dst, len, adler1);
- } else {
- return adler32_len_16(adler0, src, len, adler1);
- }
+ return adler32_copy_len_16(adler0, dst, src, len, adler1, COPY);
} else if (len < 32) {
if (COPY) {
return adler32_copy_sse42(adler, dst, src, len);
adler0 = adler & 0xffff;
rem_peel:
- if (len < 16) {
- return adler32_copy_len_16(adler0, src, dst, len, adler1);
- }
+ if (len < 16)
+ return adler32_copy_len_16(adler0, dst, src, len, adler1, 1);
__m128i vbuf, vbuf_0;
__m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
/* in case user likes doing a byte at a time, keep it fast */
if (UNLIKELY(len == 1))
- return adler32_len_1(adler, buf, sum2);
+ return adler32_copy_len_1(adler, NULL, buf, sum2, 0);
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (UNLIKELY(buf == NULL))
/* in case short lengths are provided, keep it somewhat fast */
if (UNLIKELY(len < 16))
- return adler32_len_16(adler, buf, len, sum2);
+ return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
}
/* Process tail (len < 16). */
- return adler32_len_16(adler, buf, len, sum2);
+ return adler32_copy_len_16(adler, NULL, buf, len, sum2, 0);
}
#endif