DEF_HELPER_FLAGS_6(gvec_bfmlal, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmlsl, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_bfmlsl, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_6(gvec_bfmlal_idx, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_bfmlsl_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
+DEF_HELPER_FLAGS_6(gvec_ah_bfmlsl_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, fpst, i32)
DEF_HELPER_FLAGS_5(gvec_sclamp_b, TCG_CALL_NO_RWG,
void, ptr, ptr, ptr, ptr, i32)
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
- float_status *stat, uint32_t desc)
+static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
+ float_status *stat, uint32_t desc, int negx, int negf)
{
intptr_t i, opr_sz = simd_oprsz(desc);
- intptr_t sel = simd_data(desc);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
+ intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
for (i = 0; i < opr_sz / 4; ++i) {
- float32 nn = n[H2(i * 2 + sel)] << 16;
+ float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16;
float32 mm = m[H2(i * 2 + sel)] << 16;
- d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
+ d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
- void *va, float_status *stat, uint32_t desc)
+void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0);
+}
+
+void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0);
+}
+
+void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
+}
+
+static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a,
+ float_status *stat, uint32_t desc, int negx, int negf)
{
intptr_t i, j, opr_sz = simd_oprsz(desc);
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
intptr_t elements = opr_sz / 4;
intptr_t eltspersegment = MIN(16 / 4, elements);
- float32 *d = vd, *a = va;
- bfloat16 *n = vn, *m = vm;
for (i = 0; i < elements; i += eltspersegment) {
float32 m_idx = m[H2(2 * i + index)] << 16;
for (j = i; j < i + eltspersegment; j++) {
- float32 n_j = n[H2(2 * j + sel)] << 16;
- d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
+ float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16;
+ d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat);
}
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0);
+}
+
+void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0);
+}
+
+void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va,
+ float_status *stat, uint32_t desc)
+{
+ do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product);
+}
+
#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
{ \