From: alexsifivetw Date: Tue, 16 May 2023 10:01:37 +0000 (-0700) Subject: Optimize compare256 with rvv X-Git-Tag: 2.1.3~8 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=de1b640ffbd0b5b3eccc083fbe22124333824284;p=thirdparty%2Fzlib-ng.git Optimize compare256 with rvv --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 424fddf61..ef4f239b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -731,6 +731,11 @@ if(WITH_OPTIM) add_definitions(-DRISCV_RVV) list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c) + # FIXME: we will not set compile flags for riscv_features.c when + # the kernels update hwcap or hwprobe for riscv + set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/compare256_rvv.c) + list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS}) + set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}") else() set(WITH_RVV OFF) endif() diff --git a/README.md b/README.md index 75b716b60..c83b8487f 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Features * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z * Hash table implementation using CRC32-C intrinsics on x86 and ARM * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX - * Compare256 implementations using SSE2, AVX2, Neon, & POWER9 + * Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX * Support for hardware-accelerated deflate using IBM Z DFLTCC * Unaligned memory read/writes and large bit buffer improvements diff --git a/arch/riscv/compare256_rvv.c b/arch/riscv/compare256_rvv.c new file mode 100644 index 000000000..acb28035f --- /dev/null +++ b/arch/riscv/compare256_rvv.c @@ -0,0 +1,47 @@ +/* compare256_rvv.c - RVV version of compare256 + * Copyright (C) 2023 SiFive, Inc. All rights reserved. + * Contributed by Alex Chiang + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef RISCV_RVV + +#include "../../zbuild.h" +#include "fallback_builtins.h" + +#include + +static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + size_t vl; + long found_diff; + do { + vl = __riscv_vsetvl_e8m4(256 - len); + vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl); + vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl); + vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl); + found_diff = __riscv_vfirst_m_b2(v_mask, vl); + if (found_diff >= 0) + return len + (uint32_t)found_diff; + src0 += vl, src1 += vl, len += vl; + } while (len < 256); + + return 256; +} + +Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) { + return compare256_rvv_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_rvv +#define COMPARE256 compare256_rvv_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_rvv +#define COMPARE256 compare256_rvv_static + +#include "match_tpl.h" + +#endif // RISCV_RVV diff --git a/cpu_features.h b/cpu_features.h index 647d027f6..fb43d90a2 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -180,6 +180,9 @@ extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1); #ifdef POWER9 extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1); #endif +#ifdef RISCV_RVV +extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1); +#endif #ifdef DEFLATE_H_ /* insert_string */ @@ -213,6 +216,9 @@ extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match); #ifdef POWER9 extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match); #endif +#ifdef RISCV_RVV +extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match); +#endif /* longest_match_slow */ extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); @@ -235,6 +241,9 @@ extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match); #ifdef POWER9 extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match); #endif +#ifdef RISCV_RVV +extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match); +#endif /* quick_insert_string */ extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str); diff --git a/functable.c b/functable.c index d20098292..60d4137b3 100644 --- a/functable.c +++ b/functable.c @@ -202,6 +202,16 @@ static void init_functable(void) { #endif + // RISCV - RVV +#ifdef RISCV_RVV + if (cf.riscv.has_rvv) { + ft.compare256 = &compare256_rvv; + ft.longest_match = &longest_match_rvv; + ft.longest_match_slow = &longest_match_slow_rvv; + } +#endif + + // S390 #ifdef S390_CRC32_VX if (cf.s390.has_vx) diff --git a/test/benchmarks/benchmark_compare256.cc b/test/benchmarks/benchmark_compare256.cc index 3ab04d202..db5ba83f6 100644 --- a/test/benchmarks/benchmark_compare256.cc +++ b/test/benchmarks/benchmark_compare256.cc @@ -82,3 +82,6 @@ BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon); #ifdef POWER9 BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00); #endif +#ifdef RISCV_RVV +BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv); +#endif diff --git a/test/test_compare256.cc b/test/test_compare256.cc index 0e656da37..8900902f9 100644 --- a/test/test_compare256.cc +++ b/test/test_compare256.cc @@ -81,3 +81,6 @@ TEST_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon) #ifdef POWER9 TEST_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00) #endif +#ifdef RISCV_RVV +TEST_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv) +#endif