FMLS_nx_h 11000001 0001 .... 1 .. 1.. ...00 1 .... @azx_4x1_i3_o3
FMLS_nx_s 11000001 0101 .... 1 .. 0.. ...00 10 ... @azx_4x1_i2_o3
FMLS_nx_d 11000001 1101 .... 1 .. 00. ...00 10 ... @azx_4x1_i1_o3
+
+### SME2 Add / Sub array accumulators
+
+ADD_aaz_s 11000001 101 000000 .. 111 ....0 10 ... @az_2x2_o3
+ADD_aaz_s 11000001 101 000010 .. 111 ...00 10 ... @az_4x4_o3
+ADD_aaz_d 11000001 111 000000 .. 111 ....0 10 ... @az_2x2_o3
+ADD_aaz_d 11000001 111 000010 .. 111 ...00 10 ... @az_4x4_o3
+
+SUB_aaz_s 11000001 101 000000 .. 111 ....0 11 ... @az_2x2_o3
+SUB_aaz_s 11000001 101 000010 .. 111 ...00 11 ... @az_4x4_o3
+SUB_aaz_d 11000001 111 000000 .. 111 ....0 11 ... @az_2x2_o3
+SUB_aaz_d 11000001 111 000010 .. 111 ...00 11 ... @az_4x4_o3
TRANS_FEAT(ADD_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a, MO_64, tcg_gen_gvec_add_var)
TRANS_FEAT(SUB_azz_nn_d, aa64_sme2_i16i64, do_azz_nn, a, MO_64, tcg_gen_gvec_sub_var)
+/*
+ * Add/Sub each ZA[d*N] += Z[m*N]
+ *
+ * Shared expander for the SME2 ADD/SUB array accumulator patterns.
+ * For each i in [0, a->n), @fn is called with the same ZA location
+ * (t_za + o_za) as both destination and first source, and with the
+ * Z register (a->zm + i) inside tcg_env as second source, so every
+ * ZA vector of the group is updated in place.
+ *
+ * @esz is forwarded unchanged to @fn as its element-size argument;
+ * @fn selects the operation (the users pass the gvec add or sub
+ * expander).
+ *
+ * Always returns true, including the early exit taken when
+ * sme_smza_enabled_check() fails and no operation is emitted
+ * (presumably the check has already arranged for the exception;
+ * confirm against its definition).
+ */
+static bool do_aaz(DisasContext *s, arg_az_n *a, int esz, GVecGen3FnVar *fn)
+{
+ TCGv_ptr t_za;
+ int svl, n;
+
+ if (!sme_smza_enabled_check(s)) {
+ return true;
+ }
+
+ n = a->n; /* number of ZA / Z vector pairs to process */
+ t_za = get_zarray(s, a->rv, a->off, n, 0); /* base pointer for the ZA accesses below */
+ svl = streaming_vec_reg_size(s); /* passed as both trailing size arguments of fn */
+
+ for (int i = 0; i < n; ++i) {
+ int o_za = (svl / n * sizeof(ARMVectorReg)) * i; /* step of (svl / n) ARMVectorReg slots per vector */
+ int o_zm = vec_full_reg_offset(s, a->zm + i); /* consecutive Z registers starting at zm */
+
+ fn(esz, t_za, o_za, t_za, o_za, tcg_env, o_zm, svl, svl); /* ZA = ZA <op> Z */
+ }
+ return true;
+}
+
+TRANS_FEAT(ADD_aaz_s, aa64_sme2, do_aaz, a, MO_32, tcg_gen_gvec_add_var) /* MO_32 elements, gated on aa64_sme2 */
+TRANS_FEAT(SUB_aaz_s, aa64_sme2, do_aaz, a, MO_32, tcg_gen_gvec_sub_var) /* MO_32 elements, gated on aa64_sme2 */
+TRANS_FEAT(ADD_aaz_d, aa64_sme2_i16i64, do_aaz, a, MO_64, tcg_gen_gvec_add_var) /* MO_64 elements, gated on aa64_sme2_i16i64 */
+TRANS_FEAT(SUB_aaz_d, aa64_sme2_i16i64, do_aaz, a, MO_64, tcg_gen_gvec_sub_var) /* MO_64 elements, gated on aa64_sme2_i16i64 */
+
/*
* Expand array multi-vector single (n1), array multi-vector (nn),
* and array multi-vector indexed (nx), for floating-point accumulate.