/* reduce modulo BASE once per NMAX-sized block so the 32-bit accumulators cannot overflow */
if (++cnt >= nmax_limit) {
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
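+ /* the per-lane adler accumulator can also exceed 32 bits on long inputs,
+    so it needs the same periodic reduction */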
+ v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);
cnt = 0;
}
}
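+ /* horizontally fold the vector accumulator feeding sum2 into a scalar */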
vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
- uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
+ uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum) % BASE;
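+ /* adler is multiplied by the number of bytes consumed so far; since
+    (a * b) % BASE == (a * (b % BASE)) % BASE, reducing (len - left) first
+    keeps the 32-bit product from overflowing on large inputs (adler is
+    still the low 16-bit half of the checksum at this point, so the
+    reduced product fits in a uint32_t) */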
- sum2 += (sum2_sum + adler * (len - left));
+ sum2 += (sum2_sum + adler * ((len - left) % BASE));
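+ /* likewise fold the vector adler accumulator into a scalar */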
vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
- uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
+ uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum) % BASE;
adler += adler_sum;
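+ /* bring both halves of the checksum back below BASE before the scalar tail */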
+ sum2 %= BASE;
+ adler %= BASE;
+
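+ /* scalar tail: fold in the bytes left over after the vector loop */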
while (left--) {
if (COPY) *dst++ = *src;
adler += *src++;