git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Do mod BASE once each block of NMAX size
author alexsifivetw <alex.chiang@sifive.com>
Tue, 11 Jul 2023 12:47:29 +0000 (05:47 -0700)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 16 Jul 2023 10:44:25 +0000 (12:44 +0200)
arch/riscv/adler32_rvv.c

index c2ef40c16f22ddd2f6b83735470c1f822cbf8c8e..0e78c92875c05e43dbad4ce2aeb41505f5014b5f 100644 (file)
@@ -50,6 +50,8 @@ Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len)
      * move the data into the 32-bit accumulator at the last iteration.
      */
     size_t block_size = (256 / vl) * vl;
+    size_t nmax_limit = (NMAX / block_size);
+    size_t cnt = 0;
     while (left >= block_size) {
         v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
         size_t subprob = block_size;
@@ -63,9 +65,13 @@ Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len)
         v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
         v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
         left -= block_size;
+        /* do modulo once each block of NMAX size */
+        if (++cnt >= nmax_limit) {
+            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+            cnt = 0;
+        }
     }
-    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
-
+    /* the remaining length is <= 256 now, so we can safely use the 16-bit accumulator */
     v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
     size_t res = left;
     while (left >= vl) {
@@ -76,6 +82,7 @@ Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len)
         left -= vl;
     }
     v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
+    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
     v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
 
     vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);