-/* chunkset_rvv.c - General version of chunkset
+/* chunkset_rvv.c - RVV version of chunkset
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
* Contributed by Alex Chiang <alex.chiang@sifive.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-
+#include <riscv_vector.h>
#include "zbuild.h"
/*
- * It's not a optimized implemantation using RISC-V RVV, but a general optimized one.
- *
* RISC-V glibc enables an RVV-optimized memcpy at runtime via IFUNC,
* so we prefer using a large chunk size and copying as much memory as possible at once.
*/
#define CHUNK_SIZE 32
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+
+#define CHUNK_MEMSET_RVV_IMPL(elen) \
+do { \
+ size_t vl, len = CHUNK_SIZE / sizeof(uint##elen##_t); \
+ uint##elen##_t val = *(uint##elen##_t*)from; \
+ uint##elen##_t* chunk_p = (uint##elen##_t*)chunk; \
+ do { \
+ vl = __riscv_vsetvl_e##elen##m4(len); \
+ vuint##elen##m4_t v_val = __riscv_vmv_v_x_u##elen##m4(val, vl); \
+ __riscv_vse##elen##_v_u##elen##m4(chunk_p, v_val, vl); \
+ len -= vl; chunk_p += vl; \
+ } while (len > 0); \
+} while (0)
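
For reference (this expansion is written out by hand here and is not part of the patch itself), instantiating CHUNK_MEMSET_RVV_IMPL with elen = 16, as the chunkmemset_2 helper below does, is roughly equivalent to the following strip-mined loop: the 2-byte pattern at `from` is broadcast into an LMUL=4 register group and stored across the 32-byte chunk, with vsetvl capping how many elements each pass handles.

    size_t vl, len = CHUNK_SIZE / sizeof(uint16_t);          /* 16 elements of 2 bytes */
    uint16_t val = *(uint16_t *)from;                        /* the repeating 2-byte pattern */
    uint16_t *chunk_p = (uint16_t *)chunk;
    do {
        vl = __riscv_vsetvl_e16m4(len);                      /* elements handled this pass */
        vuint16m4_t v_val = __riscv_vmv_v_x_u16m4(val, vl);  /* broadcast val into a vector */
        __riscv_vse16_v_u16m4(chunk_p, v_val, vl);           /* store vl elements into chunk */
        len -= vl; chunk_p += vl;
    } while (len > 0);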
+
/* We don't have a 32-byte datatype for RISC-V arch. */
typedef struct chunk_s {
- uint8_t data[CHUNK_SIZE];
+ uint64_t data[4];
} chunk_t;
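
A compile-time check along these lines (not in the patch, just an illustration) would document that the four 64-bit words exactly cover the 32-byte chunk the rest of the code assumes:

    /* hypothetical sanity check, assuming C11 */
    _Static_assert(sizeof(chunk_t) == CHUNK_SIZE, "chunk_t must be exactly CHUNK_SIZE bytes");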
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ CHUNK_MEMSET_RVV_IMPL(16);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ CHUNK_MEMSET_RVV_IMPL(32);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ CHUNK_MEMSET_RVV_IMPL(64);
+}
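
These helpers are presumably invoked by the generic chunkset template when the match distance is exactly 2, 4, or 8 bytes (that is the path the HAVE_CHUNKMEMSET_* defines above opt into). As a standalone sketch with a made-up pattern buffer, the 2-byte case behaves like this on little-endian RISC-V:

    uint8_t pattern[2] = {0xAB, 0xCD};  /* hypothetical 2-byte repeating unit */
    chunk_t chunk;

    chunkmemset_2(pattern, &chunk);
    /* chunk.data now holds 0xAB, 0xCD repeated 16 times, i.e. CHUNK_SIZE bytes */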
+
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
- memcpy(chunk->data, s, CHUNK_SIZE);
+ memcpy(chunk->data, (uint8_t *)s, CHUNK_SIZE);
}
static inline void storechunk(uint8_t *out, chunk_t *chunk) {
* However, RISC-V glibc enables an RVV-optimized memcpy at runtime via IFUNC,
* so we prefer to copy a large block of memory at once to make good use of the RVV advantage.
*
- * To be aligned to the other platforms, we did't modify `CHUNKCOPY` method a lot,
+ * To stay aligned with the other platforms, we didn't modify the `CHUNKCOPY` method much,
* but we still copy as much memory as possible under some conditions.
*
* case 1: out - from >= len (no overlap)
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
Assert(len > 0, "chunkcopy should never have a length 0");
int32_t align = ((len - 1) % sizeof(chunk_t)) + 1;
- chunk_t chunk;
- memcpy(out, from, sizeof(chunk));
+ memcpy(out, from, sizeof(chunk_t));
out += align;
from += align;
len -= align;
len -= dist;
}
while (len > 0) {
- memcpy(out, from, sizeof(chunk));
+ memcpy(out, from, sizeof(chunk_t));
out += sizeof(chunk_t);
from += sizeof(chunk_t);
len -= sizeof(chunk_t);