DEF_HELPER_FLAGS_3(sme2_uunpk4_hs, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_uunpk4_sd, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_zip2_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(sme2_uzp2_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uzp2_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uzp2_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uzp2_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(sme2_uzp2_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_3(sme2_zip4_b, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_zip4_h, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(sme2_zip4_s, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
UQRSHRN_dh 11000001 1.1 ..... 110111 ...01 ..... @rshr_dh
SQRSHRUN_sb 11000001 011 ..... 110111 ...10 ..... @rshr_sb
SQRSHRUN_dh 11000001 1.1 ..... 110111 ...10 ..... @rshr_dh
+
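+# SME2 ZIP and UZP (two registers)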
+&zzz_e zd zn zm esz
+
+ZIP_2 11000001 esz:2 1 zm:5 110100 zn:5 .... 0 \
+ &zzz_e zd=%zd_ax2
+ZIP_2 11000001 00 1 zm:5 110101 zn:5 .... 0 \
+ &zzz_e zd=%zd_ax2 esz=4
+
+UZP_2 11000001 esz:2 1 zm:5 110100 zn:5 .... 1 \
+ &zzz_e zd=%zd_ax2
+UZP_2 11000001 00 1 zm:5 110101 zn:5 .... 1 \
+ &zzz_e zd=%zd_ax2 esz=4
}
}
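+/*
+ * ZIP (two registers): interleave the elements of Zn and Zm across a
+ * pair of destination vectors.  The first destination register takes
+ * the interleave of the low halves of the sources, the second that of
+ * the high halves.
+ */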
+#define ZIP2(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+    ARMVectorReg scratch[2]; \
+    size_t oprsz = simd_oprsz(desc); \
+    size_t pairs = oprsz / (sizeof(TYPE) * 2); \
+    TYPE *n = vn, *m = vm; \
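+    /* If a source overlaps the destination pair, work from a copy. */ \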
+    if (vectors_overlap(vd, 2, vn, 1)) { \
+        n = memcpy(&scratch[0], vn, oprsz); \
+    } \
+    if (vectors_overlap(vd, 2, vm, 1)) { \
+        m = memcpy(&scratch[1], vm, oprsz); \
+    } \
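+    /* \
+     * Register 0 of the destination pair interleaves the low halves \
+     * of the sources; register 1 interleaves the high halves. \
+     */ \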
+    for (size_t r = 0; r < 2; ++r) { \
+        TYPE *d = vd + r * sizeof(ARMVectorReg); \
+        size_t base = r * pairs; \
+        for (size_t p = 0; p < pairs; ++p) { \
+            d[H(2 * p + 0)] = n[base + H(p)]; \
+            d[H(2 * p + 1)] = m[base + H(p)]; \
+        } \
+    } \
+}
+
+ZIP2(sme2_zip2_b, uint8_t, H1)
+ZIP2(sme2_zip2_h, uint16_t, H2)
+ZIP2(sme2_zip2_s, uint32_t, H4)
+ZIP2(sme2_zip2_d, uint64_t, )
+ZIP2(sme2_zip2_q, Int128, )
+
+#undef ZIP2
+
#define ZIP4(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
{ \
#undef ZIP4
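+/*
+ * UZP (two registers): de-interleave.  The even-numbered elements of
+ * Zn and Zm are concatenated into the first destination register, the
+ * odd-numbered elements into the second.
+ */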
+#define UZP2(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
+{ \
+    ARMVectorReg scratch[2]; \
+    size_t oprsz = simd_oprsz(desc); \
+    size_t pairs = oprsz / (sizeof(TYPE) * 2); \
+    TYPE *d0 = vd, *d1 = vd + sizeof(ARMVectorReg); \
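+    /* If a source overlaps the destination pair, work from a copy. */ \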
+    if (vectors_overlap(vd, 2, vn, 1)) { \
+        vn = memcpy(&scratch[0], vn, oprsz); \
+    } \
+    if (vectors_overlap(vd, 2, vm, 1)) { \
+        vm = memcpy(&scratch[1], vm, oprsz); \
+    } \
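+    /* \
+     * Collect the even-numbered elements of Zn then Zm into the first \
+     * destination register, the odd-numbered elements into the second. \
+     */ \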
+    for (size_t r = 0; r < 2; ++r) { \
+        TYPE *s = r ? vm : vn; \
+        size_t base = r * pairs; \
+        for (size_t p = 0; p < pairs; ++p) { \
+            d0[base + H(p)] = s[H(2 * p + 0)]; \
+            d1[base + H(p)] = s[H(2 * p + 1)]; \
+        } \
+    } \
+}
+
+UZP2(sme2_uzp2_b, uint8_t, H1)
+UZP2(sme2_uzp2_h, uint16_t, H2)
+UZP2(sme2_uzp2_s, uint32_t, H4)
+UZP2(sme2_uzp2_d, uint64_t, )
+UZP2(sme2_uzp2_q, Int128, )
+
+#undef UZP2
+
#define UZP4(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vs, uint32_t desc) \
{ \
TRANS_FEAT(UQRSHRN_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_uqrshrn_dh)
TRANS_FEAT(SQRSHRUN_sb, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrun_sb)
TRANS_FEAT(SQRSHRUN_dh, aa64_sme2, do_zz_rshr, a, gen_helper_sme2_sqrshrun_dh)
+
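+/*
+ * Expand ZIP/UZP (two registers) via an out-of-line helper selected by
+ * element size.  A pair of elements (2 << esz bytes) must fit within
+ * the streaming vector length, otherwise the encoding is unallocated.
+ */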
+static bool do_zipuzp_2(DisasContext *s, arg_zzz_e *a,
+                        gen_helper_gvec_3 * const fn[5])
+{
+    int bytes_per_op = 2 << a->esz;
+
+    /* MO_128 can fail the size test. */
+    if (s->max_svl < bytes_per_op) {
+        unallocated_encoding(s);
+    } else if (sme_sm_enabled_check(s)) {
+        int svl = streaming_vec_reg_size(s);
+        if (svl < bytes_per_op) {
+            unallocated_encoding(s);
+        } else {
+            tcg_gen_gvec_3_ool(vec_full_reg_offset(s, a->zd),
+                               vec_full_reg_offset(s, a->zn),
+                               vec_full_reg_offset(s, a->zm),
+                               svl, svl, 0, fn[a->esz]);
+        }
+    }
+    return true;
+}
+
+static gen_helper_gvec_3 * const zip2_fns[] = {
+    gen_helper_sme2_zip2_b,
+    gen_helper_sme2_zip2_h,
+    gen_helper_sme2_zip2_s,
+    gen_helper_sme2_zip2_d,
+    gen_helper_sme2_zip2_q,
+};
+TRANS_FEAT(ZIP_2, aa64_sme2, do_zipuzp_2, a, zip2_fns)
+
+static gen_helper_gvec_3 * const uzp2_fns[] = {
+    gen_helper_sme2_uzp2_b,
+    gen_helper_sme2_uzp2_h,
+    gen_helper_sme2_uzp2_s,
+    gen_helper_sme2_uzp2_d,
+    gen_helper_sme2_uzp2_q,
+};
+TRANS_FEAT(UZP_2, aa64_sme2, do_zipuzp_2, a, uzp2_fns)