git.ipfire.org Git - thirdparty/zlib-ng.git/commitdiff
Simplify LoongArch64 assembler: GCC 16 and LLVM 22 provide LASX and LSX conversion intrinsics.
author Vladislav Shchapov <vladislav@shchapov.ru>
Sat, 20 Dec 2025 22:38:50 +0000 (03:38 +0500)
committer Hans Kristian Rosbach <hk-github@circlestorm.org>
Sun, 28 Dec 2025 21:34:38 +0000 (22:34 +0100)
Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
arch/loongarch/chunkset_lasx.c
arch/loongarch/lasxintrin_ext.h
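
The commit replaces the .irp-based inline assembler below with either the new GCC 16 / LLVM 22 conversion intrinsics (behind __loongarch_asx_sx_conv) or a single xvpermi.q instruction. As a byte-level reference for what the three renamed helpers compute, here is a plain-C model; the v128_model/v256_model structs and model_* names are illustrative only, not part of the commit:

#include <stdint.h>
#include <string.h>

typedef struct { uint8_t b[16]; } v128_model;  /* models __m128i */
typedef struct { uint8_t b[32]; } v256_model;  /* models __m256i; b[0..15] is the low lane */

/* lasx_zext_128: input becomes the low 128 bits, the high 128 bits are zeroed. */
static v256_model model_zext_128(v128_model src) {
    v256_model r = {{0}};
    memcpy(r.b, src.b, 16);
    return r;
}

/* __lasx_concat_128(lo, hi): first argument fills the low lane, second the high lane. */
static v256_model model_concat_128(v128_model lo, v128_model hi) {
    v256_model r;
    memcpy(r.b, lo.b, 16);
    memcpy(r.b + 16, hi.b, 16);
    return r;
}

/* lasx_broadcast_128: both lanes receive a copy of the 128-bit input. */
static v256_model model_broadcast_128(v128_model in) {
    return model_concat_128(in, in);
}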

diff --git a/arch/loongarch/chunkset_lasx.c b/arch/loongarch/chunkset_lasx.c
index 03a0a9c85482f4f8af131d336d79e2cd1af2156b..6ac43909799177fb3ee37f84c9379ff56b5674bb 100644
@@ -36,7 +36,7 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
 }
 
 static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
-    *chunk = lasx_broadcastsi128_si256(__lsx_vld(from, 0));
+    *chunk = lasx_broadcast_128(__lsx_vld(from, 0));
 }
 
 static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
@@ -64,7 +64,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
          * shuffles and combining the halves later */
         __m256i perm_vec = __lasx_xvld(permute_table+lut_rem.idx, 0);
         __m128i ret_vec0 = __lsx_vld(buf, 0);
-        ret_vec = lasx_set_si128(ret_vec0, ret_vec0);
+        ret_vec = __lasx_concat_128(ret_vec0, ret_vec0);
         ret_vec = lasx_shuffle_b(ret_vec, perm_vec);
     }  else {
         __m128i ret_vec0 = __lsx_vld(buf, 0);
@@ -76,7 +76,7 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
         /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_
          * shuffle those values */
         __m128i latter_half = __lsx_vbitsel_v(ret_vec1, xlane_res, xlane_permutes);
-        ret_vec = lasx_set_si128(latter_half, ret_vec0);
+        ret_vec = __lasx_concat_128(ret_vec0, latter_half);
     }
 
     return ret_vec;
@@ -93,7 +93,7 @@ static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
 static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
     /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
      * unlikely to be actually written or read from */
-    return lasx_zextsi128_si256(*chunk);
+    return lasx_zext_128(*chunk);
 }
 
 static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
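
One subtlety in the hunk above: the old lasx_set_si128 took its operands as (hi, lo), while the new __lasx_concat_128 takes (lo, hi), which is why the last call site swaps latter_half and ret_vec0 rather than renaming one-for-one. A minimal usage sketch of the new convention (the demo function and its constants are hypothetical, not from the commit):

#include <lsxintrin.h>
#include "lasxintrin_ext.h"

static __m256i concat_order_demo(void) {
    __m128i lo = __lsx_vreplgr2vr_b(0x11);  /* all 16 bytes = 0x11 */
    __m128i hi = __lsx_vreplgr2vr_b(0x22);  /* all 16 bytes = 0x22 */
    /* Bytes 0..15 come from the first argument, bytes 16..31 from the second. */
    return __lasx_concat_128(lo, hi);
}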
diff --git a/arch/loongarch/lasxintrin_ext.h b/arch/loongarch/lasxintrin_ext.h
index 833267de313645ecf0c91ab723b29cd23a9e015a..d97eab8df511575368fe984d11d8ee265fbdf54f 100644
@@ -9,63 +9,26 @@
 #include <lasxintrin.h>
 
 
-#ifdef __clang__
-#  define LA_VREGS_PREFIX "$vr"
-#  define LA_XREGS_PREFIX "$xr"
-#else /* GCC */
-#  define LA_VREGS_PREFIX "$f"
-#  define LA_XREGS_PREFIX "$f"
+static inline __m256i lasx_zext_128(__m128i src) {
+#ifdef __loongarch_asx_sx_conv
+    return __lasx_insert_128_lo(__lasx_xvldi(0), src);
+#else
+    __m256i dest = __lasx_xvldi(0);
+    __asm__ volatile ("xvpermi.q %u0,%u2,0x30\n" : "=f"(dest) : "0"(dest), "f"(src));
+    return dest;
 #endif
-#define LA_ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
-
-static inline __m256i lasx_zextsi128_si256(__m128i in) {
-    __m256i out = __lasx_xvldi(0);
-    __asm__ volatile (
-        ".irp i," LA_ALL_REGS                  "\n\t"
-        " .ifc %[out], " LA_XREGS_PREFIX"\\i    \n\t"
-        "  .irp j," LA_ALL_REGS                "\n\t"
-        "   .ifc %[in], " LA_VREGS_PREFIX "\\j  \n\t"
-        "    xvpermi.q $xr\\i, $xr\\j, 0x20     \n\t"
-        "   .endif                              \n\t"
-        "  .endr                                \n\t"
-        " .endif                                \n\t"
-        ".endr                                  \n\t"
-        : [out] "+f" (out) : [in] "f" (in)
-    );
-    return out;
 }
 
-static inline __m256i lasx_set_si128(__m128i inhi, __m128i inlo) {
-    __m256i out;
-    __asm__ volatile (
-        ".irp i," LA_ALL_REGS                  "\n\t"
-        " .ifc %[hi], " LA_VREGS_PREFIX "\\i    \n\t"
-        "  .irp j," LA_ALL_REGS                "\n\t"
-        "   .ifc %[lo], " LA_VREGS_PREFIX "\\j  \n\t"
-        "    xvpermi.q $xr\\i, $xr\\j, 0x20     \n\t"
-        "   .endif                              \n\t"
-        "  .endr                                \n\t"
-        " .endif                                \n\t"
-        ".endr                                  \n\t"
-        ".ifnc %[out], %[hi]                    \n\t"
-        ".irp i," LA_ALL_REGS                  "\n\t"
-        " .ifc %[out], " LA_XREGS_PREFIX "\\i   \n\t"
-        "  .irp j," LA_ALL_REGS                "\n\t"
-        "   .ifc %[hi], " LA_VREGS_PREFIX "\\j  \n\t"
-        "    xvori.b $xr\\i, $xr\\j, 0          \n\t"
-        "   .endif                              \n\t"
-        "  .endr                                \n\t"
-        " .endif                                \n\t"
-        ".endr                                  \n\t"
-        ".endif                                 \n\t"
-        : [out] "=f" (out), [hi] "+f" (inhi)
-        : [lo] "f" (inlo)
-    );
-    return out;
+#ifndef __loongarch_asx_sx_conv
+static inline __m256i __lasx_concat_128(__m128i lo, __m128i hi) {
+    __m256i dest;
+    __asm__ volatile ("xvpermi.q %u0,%u2,0x02\n" : "=f"(dest) : "0"(lo), "f"(hi));
+    return dest;
 }
+#endif
 
-static inline __m256i lasx_broadcastsi128_si256(__m128i in) {
-    return lasx_set_si128(in, in);
+static inline __m256i lasx_broadcast_128(__m128i in) {
+    return __lasx_concat_128(in, in);
 }
 
 static inline __m256i lasx_sad_bu(__m256i a, __m256i b) {
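
Both asm fallbacks above rely on xvpermi.q's 128-bit lane selection: as used here, bits [1:0] of the immediate pick the low result lane and bits [5:4] pick the high lane, out of the four lanes {0: xj.lo, 1: xj.hi, 2: xd.lo, 3: xd.hi}. The scalar model below is inferred from the two call sites (0x30 for zero-extend, 0x02 for concatenate); consult the LoongArch ASX manual for the authoritative definition:

#include <stdint.h>
#include <string.h>

typedef struct { uint8_t lane[2][16]; } x256_model;  /* lane[0] = low 128 bits */

static x256_model model_xvpermi_q(x256_model xd, x256_model xj, uint8_t imm) {
    const uint8_t *lanes[4] = { xj.lane[0], xj.lane[1], xd.lane[0], xd.lane[1] };
    x256_model r;
    memcpy(r.lane[0], lanes[imm & 0x3], 16);         /* imm[1:0] -> low lane  */
    memcpy(r.lane[1], lanes[(imm >> 4) & 0x3], 16);  /* imm[5:4] -> high lane */
    return r;
}

/* 0x30: lo = xj.lo, hi = xd.hi -> zero-extend when xd was pre-zeroed. */
/* 0x02: lo = xd.lo, hi = xj.lo -> concatenate two 128-bit values.    */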