}
static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
- *chunk = lasx_broadcastsi128_si256(__lsx_vld(from, 0));
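+ /* Load 16 bytes once and replicate them into both 128-bit halves of the chunk */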
+ *chunk = lasx_broadcast_128(__lsx_vld(from, 0));
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
* shuffles and combining the halves later */
__m256i perm_vec = __lasx_xvld(permute_table+lut_rem.idx, 0);
__m128i ret_vec0 = __lsx_vld(buf, 0);
- ret_vec = lasx_set_si128(ret_vec0, ret_vec0);
+ ret_vec = __lasx_concat_128(ret_vec0, ret_vec0);
ret_vec = lasx_shuffle_b(ret_vec, perm_vec);
} else {
__m128i ret_vec0 = __lsx_vld(buf, 0);
/* Since we can't wrap twice, we can simply keep the latter half exactly how it is instead of having to _also_
* shuffle those values */
__m128i latter_half = __lsx_vbitsel_v(ret_vec1, xlane_res, xlane_permutes);
- ret_vec = lasx_set_si128(latter_half, ret_vec0);
+ ret_vec = __lasx_concat_128(ret_vec0, latter_half);
}
return ret_vec;
static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
/* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
* unlikely to be actually written or read from */
- return lasx_zextsi128_si256(*chunk);
+ return lasx_zext_128(*chunk);
}
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
#include <lasxintrin.h>
-#ifdef __clang__
-# define LA_VREGS_PREFIX "$vr"
-# define LA_XREGS_PREFIX "$xr"
-#else /* GCC */
-# define LA_VREGS_PREFIX "$f"
-# define LA_XREGS_PREFIX "$f"
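+/* Zero-extend a 128-bit LSX vector to 256 bits: src fills the low half, the high half is zeroed */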
+static inline __m256i lasx_zext_128(__m128i src) {
+#ifdef __loongarch_asx_sx_conv
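+ /* The toolchain provides the LASX/LSX conversion builtins: insert src into the low 128 bits of a zeroed vector */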
+ return __lasx_insert_128_lo(__lasx_xvldi(0), src);
+#else
+ __m256i dest = __lasx_xvldi(0);
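+ /* xvpermi.q 0x30: low half of dest <- low 128 bits of src, high half of dest keeps its zeroes */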
+ __asm__ volatile ("xvpermi.q %u0,%u2,0x30\n" : "=f"(dest) : "0"(dest), "f"(src));
+ return dest;
#endif
-#define LA_ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
-
-static inline __m256i lasx_zextsi128_si256(__m128i in) {
- __m256i out = __lasx_xvldi(0);
- __asm__ volatile (
- ".irp i," LA_ALL_REGS "\n\t"
- " .ifc %[out], " LA_XREGS_PREFIX"\\i \n\t"
- " .irp j," LA_ALL_REGS "\n\t"
- " .ifc %[in], " LA_VREGS_PREFIX "\\j \n\t"
- " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
- " .endif \n\t"
- " .endr \n\t"
- " .endif \n\t"
- ".endr \n\t"
- : [out] "+f" (out) : [in] "f" (in)
- );
- return out;
}
-static inline __m256i lasx_set_si128(__m128i inhi, __m128i inlo) {
- __m256i out;
- __asm__ volatile (
- ".irp i," LA_ALL_REGS "\n\t"
- " .ifc %[hi], " LA_VREGS_PREFIX "\\i \n\t"
- " .irp j," LA_ALL_REGS "\n\t"
- " .ifc %[lo], " LA_VREGS_PREFIX "\\j \n\t"
- " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
- " .endif \n\t"
- " .endr \n\t"
- " .endif \n\t"
- ".endr \n\t"
- ".ifnc %[out], %[hi] \n\t"
- ".irp i," LA_ALL_REGS "\n\t"
- " .ifc %[out], " LA_XREGS_PREFIX "\\i \n\t"
- " .irp j," LA_ALL_REGS "\n\t"
- " .ifc %[hi], " LA_VREGS_PREFIX "\\j \n\t"
- " xvori.b $xr\\i, $xr\\j, 0 \n\t"
- " .endif \n\t"
- " .endr \n\t"
- " .endif \n\t"
- ".endr \n\t"
- ".endif \n\t"
- : [out] "=f" (out), [hi] "+f" (inhi)
- : [lo] "f" (inlo)
- );
- return out;
+#ifndef __loongarch_asx_sx_conv
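+/* Fallback for toolchains without the LASX/LSX conversion builtins: build a 256-bit vector
+ * with lo in bits 0..127 and hi in bits 128..255 */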
+static inline __m256i __lasx_concat_128(__m128i lo, __m128i hi) {
+ __m256i dest;
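+ /* xvpermi.q 0x02: low half of dest keeps lo (tied via the "0" constraint), high half of dest <- low 128 bits of hi */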
+ __asm__ volatile ("xvpermi.q %u0,%u2,0x02\n" : "=f"(dest) : "0"(lo), "f"(hi));
+ return dest;
}
+#endif
-static inline __m256i lasx_broadcastsi128_si256(__m128i in) {
- return lasx_set_si128(in, in);
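+/* Duplicate a 128-bit vector into both 128-bit halves of a 256-bit vector */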
+static inline __m256i lasx_broadcast_128(__m128i in) {
+ return __lasx_concat_128(in, in);
}
static inline __m256i lasx_sad_bu(__m256i a, __m256i b) {