From: alexsifivetw
Date: Sat, 1 Jul 2023 17:40:06 +0000 (-0700)
Subject: Optimize adler32 using rvv
X-Git-Tag: 2.1.4~58
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6eed7416ed38a7740da77e86f2e5be5e7bce586d;p=thirdparty%2Fzlib-ng.git

Optimize adler32 using rvv
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 420a5c78d..003e66db7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -736,7 +736,7 @@ if(WITH_OPTIM)
         list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c)
         # FIXME: we will not set compile flags for riscv_features.c when
         # the kernels update hwcap or hwprobe for riscv
-        set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c)
+        set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c)
         list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS})
         set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}")
     else()
diff --git a/arch/riscv/adler32_rvv.c b/arch/riscv/adler32_rvv.c
new file mode 100644
index 000000000..c2ef40c16
--- /dev/null
+++ b/arch/riscv/adler32_rvv.c
@@ -0,0 +1,110 @@
+/* adler32_rvv.c - RVV version of adler32
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include <riscv_vector.h>
+#include <stdint.h>
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+
+Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
+    /* split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    /* in case the user likes doing a byte at a time, keep it fast */
+    if (len == 1) {
+        return adler32_len_1(adler, buf, sum2);
+    }
+
+    /* initial Adler-32 value (deferred check for len == 1 speed) */
+    if (buf == NULL)
+        return 1L;
+
+    /* in case short lengths are provided, keep it somewhat fast */
+    if (len < 16) {
+        return adler32_len_16(adler, buf, len, sum2);
+    }
+
+    size_t left = len;
+    size_t vl = __riscv_vsetvlmax_e8m1();
+    vl = vl > 256 ? 256 : vl;
+    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
+    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
+    vuint16m2_t v_buf16_accu;
+
+    /*
+     * We accumulate 8-bit data, and to prevent overflow we have to use a 32-bit accumulator.
+     * However, adding 8-bit data directly into a 32-bit accumulator is inefficient, so we pair
+     * a 16-bit accumulator with the 32-bit one to boost performance.
+     *
+     * block_size is the largest multiple of vl that is <= 256: a 16-bit lane can absorb at most
+     * 256 bytes before it could overflow (255 * 256 = 65280 <= UINT16_MAX).
+     *
+     * We accumulate 8-bit data into the 16-bit accumulator and then
+     * move it into the 32-bit accumulator once the inner loop finishes.
+     */
+    size_t block_size = (256 / vl) * vl;
+    while (left >= block_size) {
+        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+        size_t subprob = block_size;
+        while (subprob > 0) {
+            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
+            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+            buf += vl;
+            subprob -= vl;
+        }
+        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
+        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+        left -= block_size;
+    }
+    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+
+    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+    size_t res = left;
+    while (left >= vl) {
+        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
+        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+        buf += vl;
+        left -= vl;
+    }
+    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
+    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+
+    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
+    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
+    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
+
+    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
+
+    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
+    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
+    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
+
+    sum2 += (sum2_sum + adler * (len - left));
+
+    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
+    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
+    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
+
+    adler += adler_sum;
+
+    while (left--) {
+        adler += *buf++;
+        sum2 += adler;
+    }
+
+    sum2 %= BASE;
+    adler %= BASE;
+
+    return adler | (sum2 << 16);
+}
+
+#endif // RISCV_RVV
diff --git a/cpu_features.h b/cpu_features.h
index f47ddf0d4..870f6e656 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -50,6 +50,9 @@ extern uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len);
 #ifdef PPC_VMX
 extern uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
+#ifdef RISCV_RVV
+extern uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len);
+#endif
 #ifdef X86_SSSE3
 extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *buf, size_t len);
 #endif
diff --git a/functable.c b/functable.c
index 449edaa0b..6e195acea 100644
--- a/functable.c
+++ b/functable.c
@@ -205,6 +205,7 @@ static void init_functable(void) {
     // RISCV - RVV
 #ifdef RISCV_RVV
     if (cf.riscv.has_rvv) {
+        ft.adler32 = &adler32_rvv;
         ft.compare256 = &compare256_rvv;
         ft.longest_match = &longest_match_rvv;
         ft.longest_match_slow = &longest_match_slow_rvv;
diff --git a/test/benchmarks/benchmark_adler32.cc b/test/benchmarks/benchmark_adler32.cc
index 5b0b65d67..0be3a707a 100644
--- a/test/benchmarks/benchmark_adler32.cc
+++ b/test/benchmarks/benchmark_adler32.cc
@@ -75,6 +75,10 @@ BENCHMARK_ADLER32(vmx, adler32_vmx, test_cpu_features.power.has_altivec);
 BENCHMARK_ADLER32(power8, adler32_power8, test_cpu_features.power.has_arch_2_07);
 #endif
 
+#ifdef RISCV_RVV
+BENCHMARK_ADLER32(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+
 #ifdef X86_SSSE3
 BENCHMARK_ADLER32(ssse3, adler32_ssse3, test_cpu_features.x86.has_ssse3);
 #endif
diff --git a/test/benchmarks/benchmark_adler32_copy.cc b/test/benchmarks/benchmark_adler32_copy.cc
index cbee780b7..9a4a5ff42 100644
--- a/test/benchmarks/benchmark_adler32_copy.cc
+++ b/test/benchmarks/benchmark_adler32_copy.cc
@@ -100,6 +100,11 @@ BENCHMARK_ADLER32_BASELINE_COPY(vmx_copy_baseline, adler32_vmx, test_cpu_feature
 BENCHMARK_ADLER32_BASELINE_COPY(power8, adler32_power8, test_cpu_features.power.has_arch_2_07);
 #endif
 
+#ifdef RISCV_RVV
+//BENCHMARK_ADLER32_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+BENCHMARK_ADLER32_BASELINE_COPY(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv);
+#endif
+
 #ifdef X86_SSE42
 BENCHMARK_ADLER32_BASELINE_COPY(sse42_baseline, adler32_ssse3, test_cpu_features.x86.has_ssse3);
 BENCHMARK_ADLER32_COPY(sse42, adler32_fold_copy_sse42, test_cpu_features.x86.has_sse42);
diff --git a/test/test_adler32.cc b/test/test_adler32.cc
index 4dfe63f20..1ed6e2248 100644
--- a/test/test_adler32.cc
+++ b/test/test_adler32.cc
@@ -370,6 +370,8 @@ TEST_ADLER32(neon, adler32_neon, test_cpu_features.arm.has_neon)
 TEST_ADLER32(power8, adler32_power8, test_cpu_features.power.has_arch_2_07)
 #elif defined(PPC_VMX)
 TEST_ADLER32(vmx, adler32_vmx, test_cpu_features.power.has_altivec)
+#elif defined(RISCV_RVV)
+TEST_ADLER32(rvv, adler32_rvv, test_cpu_features.riscv.has_rvv)
 #endif
 
 #ifdef X86_SSSE3
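
For reference, the kernel above relies on the standard block decomposition of
Adler-32. The scalar sketch below is illustrative only and is not part of the
patch (the helper name adler32_block_sketch is made up); it computes the same
two reductions the vector code extracts at the end: the plain byte sum that
v_buf32_accu reduces to, and the position-weighted sum built from
__riscv_vid_v_u32m4/__riscv_vrsub_vx_u32m4 plus the vl-scaled carry held in
v_adler32_prev_accu.

/* Scalar sketch (assumed names, not part of the patch): appending a block
 * b[0..n-1] to the running state (adler, sum2) satisfies
 *
 *   adler' = adler + b[0] + ... + b[n-1]
 *   sum2'  = sum2 + n*adler + n*b[0] + (n-1)*b[1] + ... + 1*b[n-1]
 *
 * which is the identity the RVV kernel evaluates lane-wise. */
#include <stddef.h>
#include <stdint.h>

#define BASE 65521U /* largest prime smaller than 65536 */

static uint32_t adler32_block_sketch(uint32_t adler, const uint8_t *b, size_t n) {
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    uint32_t byte_sum = 0;     /* -> new adler component */
    uint32_t weighted_sum = 0; /* -> new sum2 component  */
    for (size_t i = 0; i < n; i++) {
        byte_sum += b[i];
        weighted_sum += (uint32_t)(n - i) * b[i];
    }

    /* Deferring the modulo like this is only safe while the sums fit in
     * 32 bits; zlib's scalar code caps blocks at NMAX = 5552 bytes for
     * the same reason. */
    sum2 = (sum2 + weighted_sum + adler * (uint32_t)n) % BASE;
    adler = (adler + byte_sum) % BASE;
    return adler | (sum2 << 16);
}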
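
The functable hook is the only integration point, so selection happens at
runtime: the first checksum call runs init_functable(), and when
cf.riscv.has_rvv is set the table entry is pointed at adler32_rvv(). A minimal
smoke test against zlib-ng's native API, assuming a build with the native
zlib-ng.h header and RVV support enabled (this program is illustrative, not
part of the patch):

#include <stdio.h>
#include <stdint.h>
#include "zlib-ng.h"

int main(void) {
    const unsigned char msg[] = "The quick brown fox jumps over the lazy dog";
    /* 1 is the mandatory initial Adler-32 value; on RVV hardware this call
     * is dispatched to adler32_rvv() through the functable. */
    uint32_t a = zng_adler32(1, msg, sizeof(msg) - 1);
    /* Known-answer check: the Adler-32 of this string is 0x5bdc0fda. */
    printf("adler32 = 0x%08x (%s)\n", a, a == 0x5bdc0fda ? "ok" : "MISMATCH");
    return a == 0x5bdc0fda ? 0 : 1;
}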