From: alexsifivetw Date: Thu, 31 Aug 2023 08:22:20 +0000 (-0700) Subject: General optimized chunkset X-Git-Tag: 2.1.4~10 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=fe6aaedaf8be680d43d5726a2e2b3ed948406800;p=thirdparty%2Fzlib-ng.git General optimized chunkset --- diff --git a/CMakeLists.txt b/CMakeLists.txt index bf0423b95..b7dc46be0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -739,7 +739,7 @@ if(WITH_OPTIM) list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c) # FIXME: we will not set compile flags for riscv_features.c when # the kernels update hwcap or hwprobe for riscv - set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c) + set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/chunkset_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c) list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS}) set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}") else() diff --git a/arch/riscv/chunkset_rvv.c b/arch/riscv/chunkset_rvv.c new file mode 100644 index 000000000..034063ba2 --- /dev/null +++ b/arch/riscv/chunkset_rvv.c @@ -0,0 +1,95 @@ +/* chunkset_rvv.c - General version of chunkset + * Copyright (C) 2023 SiFive, Inc. All rights reserved. + * Contributed by Alex Chiang + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" + +/* + * It's not an optimized implementation using RISC-V RVV, but a general optimized one. + * + * RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC, + * so we prefer using a large chunk size and copying as much memory as possible. + */ +#define CHUNK_SIZE 32 + +/* We don't have a 32-byte datatype for RISC-V arch.
*/ +typedef struct chunk_s { + uint8_t data[CHUNK_SIZE]; +} chunk_t; + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + memcpy(chunk->data, s, CHUNK_SIZE); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + memcpy(out, chunk->data, CHUNK_SIZE); +} + +#define CHUNKSIZE chunksize_rvv +#define CHUNKCOPY chunkcopy_rvv +#define CHUNKUNROLL chunkunroll_rvv +#define CHUNKMEMSET chunkmemset_rvv +#define CHUNKMEMSET_SAFE chunkmemset_safe_rvv + +#define HAVE_CHUNKCOPY + +/* + * Assuming that the length is non-zero, and that `from` lags `out` by at least + * sizeof chunk_t bytes, please see the comments in chunkset_tpl.h. + * + * We load/store a single chunk once in the `CHUNKCOPY`. + * However, RISC-V glibc would enable RVV optimized memcpy at runtime by IFUNC, + * so we prefer to copy a large amount of memory at once to make good use of the RVV advantage. + * + * To be aligned to the other platforms, we didn't modify `CHUNKCOPY` method a lot, + * but we still copy as much memory as possible for some conditions. + * + * case 1: out - from >= len (no overlap) + * We can use memcpy to copy `len` size once + * because the memory layout would be the same. + * + * case 2: overlap + * We copy N chunks using memcpy at once, aiming to achieve our goal: + * to copy as much memory as possible. + * + * After using a single memcpy to copy N chunks, we have to use a series of + * loadchunk and storechunk to ensure the result is correct.
+ */ +static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { + Assert(len > 0, "chunkcopy should never have a length 0"); + int32_t align = ((len - 1) % sizeof(chunk_t)) + 1; + chunk_t chunk; + memcpy(out, from, sizeof(chunk)); + out += align; + from += align; + len -= align; + ptrdiff_t dist = out - from; + if (dist >= len) { + memcpy(out, from, len); + out += len; + from += len; + return out; + } + if (dist >= sizeof(chunk_t)) { + dist = (dist / sizeof(chunk_t)) * sizeof(chunk_t); + memcpy(out, from, dist); + out += dist; + from += dist; + len -= dist; + } + while (len > 0) { + memcpy(out, from, sizeof(chunk)); + out += sizeof(chunk_t); + from += sizeof(chunk_t); + len -= sizeof(chunk_t); + } + return out; +} + +#include "chunkset_tpl.h" + +#define INFLATE_FAST inflate_fast_rvv + +#include "inffast_tpl.h" diff --git a/cpu_features.h b/cpu_features.h index faca52ad4..aed1eaf90 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -121,6 +121,10 @@ extern uint8_t* chunkmemset_safe_neon(uint8_t *out, unsigned dist, unsigned len, extern uint32_t chunksize_power8(void); extern uint8_t* chunkmemset_safe_power8(uint8_t *out, unsigned dist, unsigned len, unsigned left); #endif +#ifdef RISCV_RVV +extern uint32_t chunksize_rvv(void); +extern uint8_t* chunkmemset_safe_rvv(uint8_t *out, unsigned dist, unsigned len, unsigned left); +#endif #ifdef ZLIB_COMPAT typedef struct z_stream_s z_stream; @@ -145,6 +149,9 @@ extern void inflate_fast_neon(PREFIX3(stream) *strm, uint32_t start); #ifdef POWER8_VSX extern void inflate_fast_power8(PREFIX3(stream) *strm, uint32_t start); #endif +#ifdef RISCV_RVV +extern void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start); +#endif /* CRC32 */ typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, size_t len); diff --git a/functable.c b/functable.c index 3ef840cff..df35bae7c 100644 --- a/functable.c +++ b/functable.c @@ -215,7 +215,10 @@ static void init_functable(void) { #ifdef RISCV_RVV if 
(cf.riscv.has_rvv) { ft.adler32 = &adler32_rvv; + ft.chunkmemset_safe = &chunkmemset_safe_rvv; + ft.chunksize = &chunksize_rvv; ft.compare256 = &compare256_rvv; + ft.inflate_fast = &inflate_fast_rvv; ft.longest_match = &longest_match_rvv; ft.longest_match_slow = &longest_match_slow_rvv; ft.slide_hash = &slide_hash_rvv;