}
static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
- halfchunk_t half = __lsx_vld(from, 0);
- *chunk = lasx_inserti128_si256(lasx_castsi128_si256(half), half, 1);
+ *chunk = lasx_broadcastsi128_si256(__lsx_vld(from, 0));
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
/* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
* broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
* shuffles and combining the halves later */
- const __m256i permute_xform = lasx_inserti128_si256(__lasx_xvreplgr2vr_b(0), __lsx_vreplgr2vr_b(16), 1);
+ const __m256i permute_xform = lasx_set_si128(__lsx_vreplgr2vr_b(16), __lsx_vreplgr2vr_b(0));
__m256i perm_vec = __lasx_xvld(permute_table+lut_rem.idx, 0);
__m128i ret_vec0 = __lsx_vld(buf, 0);
perm_vec = __lasx_xvadd_b(perm_vec, permute_xform);
- ret_vec = lasx_inserti128_si256(lasx_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = lasx_set_si128(ret_vec0, ret_vec0);
ret_vec = lasx_shuffle_b(ret_vec, perm_vec);
} else {
__m128i ret_vec0 = __lsx_vld(buf, 0);
    /* Since we can't wrap twice, we can simply keep the latter half exactly how it is instead of having to _also_
     * shuffle those values */
__m128i latter_half = __lsx_vbitsel_v(ret_vec1, xlane_res, xlane_permutes);
- ret_vec = lasx_inserti128_si256(lasx_castsi128_si256(ret_vec0), latter_half, 1);
+ ret_vec = lasx_set_si128(latter_half, ret_vec0);
}
return ret_vec;
#include <lasxintrin.h>
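+/* GCC and Clang spell SIMD register operands differently in inline asm:
+ * Clang substitutes the vector names ($vrN/$xrN) while GCC substitutes the
+ * FP names ($fN). These prefixes feed the .ifc register matching in the
+ * helpers below so the same asm works under both compilers. */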
+#ifdef __clang__
+# define LA_VREGS_PREFIX "$vr"
+# define LA_XREGS_PREFIX "$xr"
+#else /* GCC */
+# define LA_VREGS_PREFIX "$f"
+# define LA_XREGS_PREFIX "$f"
+#endif
+#define LA_ALL_REGS "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
+
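+/* Zero-extend a 128-bit vector into a 256-bit one (upper lane cleared),
+ * roughly the LASX counterpart of _mm256_zextsi128_si256. The .irp/.ifc scan
+ * discovers which physical register the compiler assigned to each operand so
+ * xvpermi.q can merge 'in' into the zeroed 'out' without a round trip through
+ * memory. */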
+static inline __m256i lasx_zextsi128_si256(__m128i in) {
+ __m256i out = __lasx_xvldi(0);
+ __asm__ volatile (
+ ".irp i," LA_ALL_REGS "\n\t"
+ " .ifc %[out], " LA_XREGS_PREFIX"\\i \n\t"
+ " .irp j," LA_ALL_REGS "\n\t"
+ " .ifc %[in], " LA_VREGS_PREFIX "\\j \n\t"
+ " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ : [out] "+f" (out) : [in] "f" (in)
+ );
+ return out;
+}
+
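+/* Combine two 128-bit vectors into a 256-bit one, 'inhi' in the upper lane
+ * and 'inlo' in the lower lane, roughly the LASX counterpart of
+ * _mm256_set_m128i. xvpermi.q forms the result in the register holding
+ * 'inhi'; the trailing xvori.b copies it to 'out' when the compiler assigned
+ * 'out' a different register. */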
+static inline __m256i lasx_set_si128(__m128i inhi, __m128i inlo) {
+ __m256i out;
+ __asm__ volatile (
+ ".irp i," LA_ALL_REGS "\n\t"
+ " .ifc %[hi], " LA_VREGS_PREFIX "\\i \n\t"
+ " .irp j," LA_ALL_REGS "\n\t"
+ " .ifc %[lo], " LA_VREGS_PREFIX "\\j \n\t"
+ " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ ".ifnc %[out], %[hi] \n\t"
+ ".irp i," LA_ALL_REGS "\n\t"
+ " .ifc %[out], " LA_XREGS_PREFIX "\\i \n\t"
+ " .irp j," LA_ALL_REGS "\n\t"
+ " .ifc %[hi], " LA_VREGS_PREFIX "\\j \n\t"
+ " xvori.b $xr\\i, $xr\\j, 0 \n\t"
+ " .endif \n\t"
+ " .endr \n\t"
+ " .endif \n\t"
+ ".endr \n\t"
+ ".endif \n\t"
+ : [out] "=f" (out), [hi] "+f" (inhi)
+ : [lo] "f" (inlo)
+ );
+ return out;
+}
+
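+/* Broadcast a 128-bit vector into both lanes of a 256-bit vector, roughly
+ * the LASX counterpart of _mm256_broadcastsi128_si256. */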
+static inline __m256i lasx_broadcastsi128_si256(__m128i in) {
+ return lasx_set_si128(in, in);
+}
+
static inline __m256i lasx_sad_bu(__m256i a, __m256i b) {
__m256i tmp = __lasx_xvabsd_bu(a, b);
tmp = __lasx_xvhaddw_hu_bu(tmp, tmp);
    tmp = __lasx_xvhaddw_wu_hu(tmp, tmp);
    return __lasx_xvhaddw_du_wu(tmp, tmp);
}
-static inline __m256i lasx_castsi128_si256(__m128i v)
-{
- return (__m256i) { v[0], v[1], 0, 0 };
-}
-
-static inline __m256i lasx_inserti128_si256(__m256i a, __m128i b, const int imm8) {
- if (imm8 == 0)
- return __lasx_xvpermi_q(a, lasx_castsi128_si256(b), 0x30);
- else
- return __lasx_xvpermi_q(a, lasx_castsi128_si256(b), 0x02);
-}
-
-static inline __m256i lasx_zextsi128_si256(__m128i v) {
- return (__m256i) { v[0], v[1], 0, 0 };
- /* return lasx_inserti128_si256(__lasx_xvreplgr2vr_w(0), v, 0); */
-}
-
/* See: lsx_shuffle_b */
static inline __m256i lasx_shuffle_b(__m256i a, __m256i b) {
__m256i msb_mask = __lasx_xvslti_b(b, 0);