From: LFF Date: Sat, 7 Jun 2025 08:23:29 +0000 (+0800) Subject: Optimize chunkcopy_rvv: X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=962ab42190c0fd197595c8f47e2c9ea91fd7c0aa;p=thirdparty%2Fzlib-ng.git Optimize chunkcopy_rvv: 1. Skip aligning memcpy when dist >= len. The aligning memcpy is redundant when dist >= len, and it adds very slow extra load/store instructions. By adding a printf to chunkcopy_rvv and running apt install (a narrow test case, but a reasonable one), I noticed that dist is far larger than len in most cases. So move the comparison before the aligning memcpy, since the alignment is only needed in the overlapping case. 2. Make the largest copy while len > dist. Chunkcopy_rvv only makes the largest possible copy once after the aligning memcpy, then finishes the rest of the copy in sizeof(chunk_t) steps. Instead, we should keep making the largest copy as long as len > dist. --- diff --git a/arch/riscv/chunkset_rvv.c b/arch/riscv/chunkset_rvv.c index e0915dfc..45ff8d33 100644 --- a/arch/riscv/chunkset_rvv.c +++ b/arch/riscv/chunkset_rvv.c @@ -86,11 +86,6 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { */ static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { Assert(len > 0, "chunkcopy should never have a length 0"); - int32_t align = ((len - 1) % sizeof(chunk_t)) + 1; - memcpy(out, from, sizeof(chunk_t)); - out += align; - from += align; - len -= align; ptrdiff_t dist = out - from; if (dist < 0 || dist >= len) { memcpy(out, from, len); @@ -98,18 +93,24 @@ static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len from += len; return out; } - if (dist >= sizeof(chunk_t)) { - dist = (dist / sizeof(chunk_t)) * sizeof(chunk_t); - memcpy(out, from, dist); - out += dist; - from += dist; - len -= dist; + + int32_t align = ((len - 1) % sizeof(chunk_t)) + 1; + memcpy(out, from, sizeof(chunk_t)); + out += align; + from += align; + len -= align; + + size_t vl = (dist / sizeof(chunk_t)) * 
sizeof(chunk_t); + while (len > dist) { + memcpy(out, from, vl); + out += vl; + from += vl; + len -= vl; } - while (len > 0) { - memcpy(out, from, sizeof(chunk_t)); - out += sizeof(chunk_t); - from += sizeof(chunk_t); - len -= sizeof(chunk_t); + + if (len > 0) { + memcpy(out, from, len); + out += len; } return out; } @@ -118,4 +119,4 @@ static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len #define INFLATE_FAST inflate_fast_rvv -#include "inffast_tpl.h" +#include "inffast_tpl.h" \ No newline at end of file