From: Simon Hosie
Date: Mon, 6 Nov 2023 23:39:34 +0000 (-0800)
Subject: Add adler32_fold_copy_rvv implementation.
X-Git-Tag: 2.1.6~33
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f3211aba349a1d4781d0d41cb00d29fb8325af06;p=thirdparty%2Fzlib-ng.git

Add adler32_fold_copy_rvv implementation.
---

diff --git a/arch/riscv/adler32_rvv.c b/arch/riscv/adler32_rvv.c
index 9442f280..da46f37e 100644
--- a/arch/riscv/adler32_rvv.c
+++ b/arch/riscv/adler32_rvv.c
@@ -12,23 +12,25 @@
 #include "../../zbuild.h"
 #include "../../adler32_p.h"
 
-Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
+static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
     /* split Adler-32 into component sums */
     uint32_t sum2 = (adler >> 16) & 0xffff;
     adler &= 0xffff;
 
     /* in case user likes doing a byte at a time, keep it fast */
     if (len == 1) {
-        return adler32_len_1(adler, buf, sum2);
+        if (COPY) memcpy(dst, src, 1);
+        return adler32_len_1(adler, src, sum2);
     }
 
     /* initial Adler-32 value (deferred check for len == 1 speed) */
-    if (buf == NULL)
+    if (src == NULL)
         return 1L;
 
     /* in case short lengths are provided, keep it somewhat fast */
     if (len < 16) {
-        return adler32_len_16(adler, buf, len, sum2);
+        if (COPY) memcpy(dst, src, len);
+        return adler32_len_16(adler, src, len, sum2);
     }
 
     size_t left = len;
@@ -56,10 +58,12 @@ Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len)
         v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
         size_t subprob = block_size;
         while (subprob > 0) {
-            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
+            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+            if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
             v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
             v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-            buf += vl;
+            src += vl;
+            if (COPY) dst += vl;
             subprob -= vl;
         }
         v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
@@ -75,10 +79,12 @@ Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len)
         v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
         size_t res = left;
         while (left >= vl) {
-            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
+            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
+            if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
             v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
             v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
-            buf += vl;
+            src += vl;
+            if (COPY) dst += vl;
             left -= vl;
         }
         v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
@@ -104,7 +110,8 @@ Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len)
     adler += adler_sum;
 
     while (left--) {
-        adler += *buf++;
+        if (COPY) *dst++ = *src;
+        adler += *src++;
         sum2 += adler;
     }
 
@@ -114,4 +121,12 @@ Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len)
     return adler | (sum2 << 16);
 }
 
+Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+    return adler32_rvv_impl(adler, dst, src, len, 1);
+}
+
+Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
+    return adler32_rvv_impl(adler, NULL, buf, len, 0);
+}
+
 #endif // RISCV_RVV
diff --git a/cpu_features.h b/cpu_features.h
index aed1eaf9..00fa6c74 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -70,6 +70,9 @@ extern uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
 
 /* adler32 folding */
+#ifdef RISCV_RVV
+extern uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+#endif
 #ifdef X86_SSE42
 extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 #endif
diff --git a/functable.c b/functable.c
index df35bae7..f664d280 100644
--- a/functable.c
+++ b/functable.c
@@ -215,6 +215,7 @@ static void init_functable(void) {
 #ifdef RISCV_RVV
     if (cf.riscv.has_rvv) {
         ft.adler32 = &adler32_rvv;
+        ft.adler32_fold_copy = &adler32_fold_copy_rvv;
         ft.chunkmemset_safe = &chunkmemset_safe_rvv;
         ft.chunksize = &chunksize_rvv;
         ft.compare256 = &compare256_rvv;
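
Note (not part of the commit): the patch folds the copy and no-copy paths into one
inline worker, adler32_rvv_impl(), and selects the behaviour with a constant COPY
argument passed by two thin Z_INTERNAL wrappers. Because each wrapper passes a literal
0 or 1, the compiler should be able to drop the dead `if (COPY)` branches after
inlining, so the plain adler32_rvv() path pays no extra cost. A minimal standalone
sketch of the same pattern follows; the names (sum_impl, sum_only, sum_and_copy) are
hypothetical and are not zlib-ng code:

    #include <stddef.h>
    #include <stdint.h>

    /* Worker: COPY is a compile-time constant at every call site below. */
    static inline uint32_t sum_impl(uint32_t acc, uint8_t *dst, const uint8_t *src,
                                    size_t len, int COPY) {
        for (size_t i = 0; i < len; i++) {
            if (COPY) dst[i] = src[i];   /* dead code when COPY == 0 */
            acc += src[i];
        }
        return acc;
    }

    /* Two entry points, mirroring adler32_rvv() and adler32_fold_copy_rvv(). */
    uint32_t sum_only(uint32_t acc, const uint8_t *src, size_t len) {
        return sum_impl(acc, NULL, src, len, 0);
    }

    uint32_t sum_and_copy(uint32_t acc, uint8_t *dst, const uint8_t *src, size_t len) {
        return sum_impl(acc, dst, src, len, 1);
    }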
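
One way to sanity-check the new entry point is to compare it against the existing
non-copying function plus a buffer comparison. The harness below is hypothetical (it is
not part of the patch); it assumes it is built inside the zlib-ng tree with RISCV_RVV
defined, run on RVV-capable hardware, and that both functions are declared as in the
cpu_features.h hunk above:

    /* hypothetical local check, not shipped with zlib-ng */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>
    #include "zbuild.h"
    #include "cpu_features.h"   /* adler32_rvv, adler32_fold_copy_rvv */

    static void check_fold_copy(const uint8_t *src, size_t len) {
        uint8_t dst[4096];
        assert(len <= sizeof(dst));
        uint32_t expected = adler32_rvv(1, src, len);               /* checksum only */
        uint32_t actual = adler32_fold_copy_rvv(1, dst, src, len);  /* checksum + copy */
        assert(actual == expected);          /* same Adler-32 result */
        assert(memcmp(dst, src, len) == 0);  /* copy matches the source */
    }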