]> git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Optimize compare256 with rvv
authoralexsifivetw <alex.chiang@sifive.com>
Tue, 16 May 2023 10:01:37 +0000 (03:01 -0700)
committerHans Kristian Rosbach <hk-github@circlestorm.org>
Tue, 13 Jun 2023 10:25:48 +0000 (12:25 +0200)
CMakeLists.txt
README.md
arch/riscv/compare256_rvv.c [new file with mode: 0644]
cpu_features.h
functable.c
test/benchmarks/benchmark_compare256.cc
test/test_compare256.cc

index 424fddf6180c135e127668676f7e5d09f8e77719..ef4f239b6b00dc6e724a3896677dc0b3c3565754 100644 (file)
@@ -731,6 +731,11 @@ if(WITH_OPTIM)
                 add_definitions(-DRISCV_RVV)
                 list(APPEND ZLIB_ARCH_HDRS ${ARCHDIR}/riscv_features.h)
                 list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c)
+                # FIXME: we will not set compile flags for riscv_features.c when 
+                # the kernels update hwcap or hwprobe for riscv
+                set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/compare256_rvv.c)
+                list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS})
+                set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}")
             else()
                 set(WITH_RVV OFF)
             endif()
index 75b716b606fa0bf6fb49caaee4b12187f0957ff5..c83b8487f1a02f1e7d2f5876751492bd40ad3649 100644 (file)
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ Features
   * CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
   * Hash table implementation using CRC32-C intrinsics on x86 and ARM
   * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
-  * Compare256 implementations using SSE2, AVX2, Neon, & POWER9
+  * Compare256 implementations using SSE2, AVX2, Neon, POWER9 & RVV
   * Inflate chunk copying using SSE2, SSSE3, AVX, Neon & VSX
   * Support for hardware-accelerated deflate using IBM Z DFLTCC
 * Unaligned memory read/writes and large bit buffer improvements
diff --git a/arch/riscv/compare256_rvv.c b/arch/riscv/compare256_rvv.c
new file mode 100644 (file)
index 0000000..acb2803
--- /dev/null
@@ -0,0 +1,47 @@
+/* compare256_rvv.c - RVV version of compare256
+ * Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * Contributed by Alex Chiang <alex.chiang@sifive.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef RISCV_RVV
+
+#include "../../zbuild.h"
+#include "fallback_builtins.h"
+
+#include <riscv_vector.h>
+
+static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t *src1) {
+    uint32_t len = 0;
+    size_t vl;
+    long found_diff;
+    do {
+        vl = __riscv_vsetvl_e8m4(256 - len);
+        vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
+        vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
+        vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
+        found_diff = __riscv_vfirst_m_b2(v_mask, vl);
+        if (found_diff >= 0) 
+            return len + (uint32_t)found_diff;
+        src0 += vl, src1 += vl, len += vl;
+    } while (len < 256);
+
+    return 256;
+}
+
+Z_INTERNAL uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1) {
+    return compare256_rvv_static(src0, src1);
+}
+
+#define LONGEST_MATCH       longest_match_rvv
+#define COMPARE256          compare256_rvv_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH       longest_match_slow_rvv
+#define COMPARE256          compare256_rvv_static
+
+#include "match_tpl.h"
+
+#endif // RISCV_RVV
index 647d027f6ec2d04202d9ce141a96c23afb30b1e7..fb43d90a2689708a3b0771b1e3feb8e9cd214289 100644 (file)
@@ -180,6 +180,9 @@ extern uint32_t compare256_neon(const uint8_t *src0, const uint8_t *src1);
 #ifdef POWER9
 extern uint32_t compare256_power9(const uint8_t *src0, const uint8_t *src1);
 #endif
+#ifdef RISCV_RVV
+extern uint32_t compare256_rvv(const uint8_t *src0, const uint8_t *src1);
+#endif
 
 #ifdef DEFLATE_H_
 /* insert_string */
@@ -213,6 +216,9 @@ extern uint32_t longest_match_neon(deflate_state *const s, Pos cur_match);
 #ifdef POWER9
 extern uint32_t longest_match_power9(deflate_state *const s, Pos cur_match);
 #endif
+#ifdef RISCV_RVV
+extern uint32_t longest_match_rvv(deflate_state *const s, Pos cur_match);
+#endif
 
 /* longest_match_slow */
 extern uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match);
@@ -235,6 +241,9 @@ extern uint32_t longest_match_slow_neon(deflate_state *const s, Pos cur_match);
 #ifdef POWER9
 extern uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match);
 #endif
+#ifdef RISCV_RVV
+extern uint32_t longest_match_slow_rvv(deflate_state *const s, Pos cur_match);
+#endif
 
 /* quick_insert_string */
 extern Pos quick_insert_string_c(deflate_state *const s, const uint32_t str);
index d2009829221677d683377ca562ff7a8e10bf42f2..60d4137b37cbe210f138a4125518ce12049a7711 100644 (file)
@@ -202,6 +202,16 @@ static void init_functable(void) {
 #endif
 
 
+    // RISCV - RVV
+#ifdef RISCV_RVV
+    if (cf.riscv.has_rvv) {
+        ft.compare256 = &compare256_rvv;
+        ft.longest_match = &longest_match_rvv;
+        ft.longest_match_slow = &longest_match_slow_rvv;
+    }
+#endif
+
+
     // S390
 #ifdef S390_CRC32_VX
     if (cf.s390.has_vx)
index 3ab04d2022117f386501ad11b8a5217427b61cac..db5ba83f6ccf3fd9cf44d3e31a90145c1b4d2f28 100644 (file)
@@ -82,3 +82,6 @@ BENCHMARK_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon);
 #ifdef POWER9
 BENCHMARK_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00);
 #endif
+#ifdef RISCV_RVV
+BENCHMARK_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv);
+#endif
index 0e656da3713766e3d97777f7509ede2ea0e11ece..8900902f912ff556377b37c14aa69f24e196a049 100644 (file)
@@ -81,3 +81,6 @@ TEST_COMPARE256(neon, compare256_neon, test_cpu_features.arm.has_neon)
 #ifdef POWER9
 TEST_COMPARE256(power9, compare256_power9, test_cpu_features.power.has_arch_3_00)
 #endif
+#ifdef RISCV_RVV
+TEST_COMPARE256(rvv, compare256_rvv, test_cpu_features.riscv.has_rvv)
+#endif