From: Vladislav Shchapov
Date: Fri, 20 Jun 2025 16:56:47 +0000 (+0500)
Subject: LoongArch64 micro-optimizations
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=caba1ef6fdf138953b1ed8b43f5252643e8403bf;p=thirdparty%2Fzlib-ng.git

LoongArch64 micro-optimizations

Co-authored-by: junchao-zhao
Signed-off-by: Vladislav Shchapov
---
diff --git a/arch/loongarch/chunkset_lasx.c b/arch/loongarch/chunkset_lasx.c
index 8b232327..5626ca1f 100644
--- a/arch/loongarch/chunkset_lasx.c
+++ b/arch/loongarch/chunkset_lasx.c
@@ -36,8 +36,7 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
 }
 
 static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
-    halfchunk_t half = __lsx_vld(from, 0);
-    *chunk = lasx_inserti128_si256(lasx_castsi128_si256(half), half, 1);
+    *chunk = lasx_broadcastsi128_si256(__lsx_vld(from, 0));
 }
 
 static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
@@ -63,11 +62,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
         /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
          * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
          * shuffles and combining the halves later */
-        const __m256i permute_xform = lasx_inserti128_si256(__lasx_xvreplgr2vr_b(0), __lsx_vreplgr2vr_b(16), 1);
+        const __m256i permute_xform = lasx_set_si128(__lsx_vreplgr2vr_b(16), __lsx_vreplgr2vr_b(0));
         __m256i perm_vec = __lasx_xvld(permute_table+lut_rem.idx, 0);
         __m128i ret_vec0 = __lsx_vld(buf, 0);
         perm_vec = __lasx_xvadd_b(perm_vec, permute_xform);
-        ret_vec = lasx_inserti128_si256(lasx_castsi128_si256(ret_vec0), ret_vec0, 1);
+        ret_vec = lasx_set_si128(ret_vec0, ret_vec0);
         ret_vec = lasx_shuffle_b(ret_vec, perm_vec);
     } else {
         __m128i ret_vec0 = __lsx_vld(buf, 0);
@@ -79,7 +78,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
         /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
          * shuffle those values */
         __m128i latter_half = __lsx_vbitsel_v(ret_vec1, xlane_res, xlane_permutes);
-        ret_vec = lasx_inserti128_si256(lasx_castsi128_si256(ret_vec0), latter_half, 1);
+        ret_vec = lasx_set_si128(latter_half, ret_vec0);
     }
 
     return ret_vec;
diff --git a/arch/loongarch/lasxintrin_ext.h b/arch/loongarch/lasxintrin_ext.h
index b2d0be5f..833267de 100644
--- a/arch/loongarch/lasxintrin_ext.h
+++ b/arch/loongarch/lasxintrin_ext.h
@@ -9,6 +9,65 @@
 #include <lasxintrin.h>
 
+#ifdef __clang__
+# define LA_VREGS_PREFIX "$vr"
+# define LA_XREGS_PREFIX "$xr"
+#else /* GCC */
+# define LA_VREGS_PREFIX "$f"
+# define LA_XREGS_PREFIX "$f"
+#endif
+#define LA_ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
+
+static inline __m256i lasx_zextsi128_si256(__m128i in) {
+    __m256i out = __lasx_xvldi(0);
+    __asm__ volatile (
+        ".irp i," LA_ALL_REGS "\n\t"
+        " .ifc %[out], " LA_XREGS_PREFIX"\\i \n\t"
+        "  .irp j," LA_ALL_REGS "\n\t"
+        "   .ifc %[in], " LA_VREGS_PREFIX "\\j \n\t"
+        "    xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
+        "   .endif \n\t"
+        "  .endr \n\t"
+        " .endif \n\t"
+        ".endr \n\t"
+        : [out] "+f" (out) : [in] "f" (in)
+    );
+    return out;
+}
+
+static inline __m256i lasx_set_si128(__m128i inhi, __m128i inlo) {
+    __m256i out;
+    __asm__ volatile (
+        ".irp i," LA_ALL_REGS "\n\t"
+        " .ifc %[hi], " LA_VREGS_PREFIX "\\i \n\t"
+        "  .irp j," LA_ALL_REGS "\n\t"
+        "   .ifc %[lo], " LA_VREGS_PREFIX "\\j \n\t"
+        "    xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
+        "   .endif \n\t"
+        "  .endr \n\t"
+        " .endif \n\t"
+        ".endr \n\t"
+        ".ifnc %[out], %[hi] \n\t"
+        ".irp i," LA_ALL_REGS "\n\t"
+        " .ifc %[out], " LA_XREGS_PREFIX "\\i \n\t"
+        "  .irp j," LA_ALL_REGS "\n\t"
+        "   .ifc %[hi], " LA_VREGS_PREFIX "\\j \n\t"
+        "    xvori.b $xr\\i, $xr\\j, 0 \n\t"
+        "   .endif \n\t"
+        "  .endr \n\t"
+        " .endif \n\t"
+        ".endr \n\t"
+        ".endif \n\t"
+        : [out] "=f" (out), [hi] "+f" (inhi)
+        : [lo] "f" (inlo)
+    );
+    return out;
+}
+
+static inline __m256i lasx_broadcastsi128_si256(__m128i in) {
+    return lasx_set_si128(in, in);
+}
+
 
 static inline __m256i lasx_sad_bu(__m256i a, __m256i b) {
     __m256i tmp = __lasx_xvabsd_bu(a, b);
     tmp = __lasx_xvhaddw_hu_bu(tmp, tmp);
@@ -21,23 +80,6 @@ static inline int lasx_movemask_b(__m256i v) {
     return __lasx_xvpickve2gr_w(v, 0) | (__lasx_xvpickve2gr_w(v, 4) << 16);
 }
 
-static inline __m256i lasx_castsi128_si256(__m128i v)
-{
-    return (__m256i) { v[0], v[1], 0, 0 };
-}
-
-static inline __m256i lasx_inserti128_si256(__m256i a, __m128i b, const int imm8) {
-    if (imm8 == 0)
-        return __lasx_xvpermi_q(a, lasx_castsi128_si256(b), 0x30);
-    else
-        return __lasx_xvpermi_q(a, lasx_castsi128_si256(b), 0x02);
-}
-
-static inline __m256i lasx_zextsi128_si256(__m128i v) {
-    return (__m256i) { v[0], v[1], 0, 0 };
-    /* return lasx_inserti128_si256(__lasx_xvreplgr2vr_w(0), v, 0); */
-}
-
 /* See: lsx_shuffle_b */
 static inline __m256i lasx_shuffle_b(__m256i a, __m256i b) {
     __m256i msb_mask = __lasx_xvslti_b(b, 0);
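
Note on the call-site pattern: every caller change above collapses the old cast-then-insert idiom, lasx_inserti128_si256(lasx_castsi128_si256(x), y, 1), into a single lasx_set_si128(hi, lo), or lasx_broadcastsi128_si256() when both halves are the same. A minimal caller-side sketch is given below; it assumes the helpers added to arch/loongarch/lasxintrin_ext.h by this patch are in scope and that the translation unit is built for LoongArch64 with LSX/LASX enabled. The wrapper function name is illustrative and not part of the patch.

#include <stdint.h>
#include <lsxintrin.h>
#include "lasxintrin_ext.h"   /* provides lasx_set_si128() / lasx_broadcastsi128_si256() */

/* Illustrative only: load 16 bytes and splat them into both 128-bit lanes of
 * a 256-bit LASX register, mirroring the new chunkmemset_16() pattern. */
static inline __m256i splat128_example(uint8_t *src) {
    __m128i half = __lsx_vld(src, 0);
    /* Previously: lasx_inserti128_si256(lasx_castsi128_si256(half), half, 1);
     * now a single helper call; note that lasx_set_si128() takes (hi, lo). */
    return lasx_broadcastsi128_si256(half);   /* same as lasx_set_si128(half, half) */
}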