aarch64_def_or_undef (TARGET_SME, "__ARM_FEATURE_SME", pfile);
aarch64_def_or_undef (TARGET_SME_I16I64, "__ARM_FEATURE_SME_I16I64", pfile);
+ aarch64_def_or_undef (AARCH64_HAVE_ISA (SME_B16B16),
+ "__ARM_FEATURE_SME_B16B16", pfile);
aarch64_def_or_undef (AARCH64_HAVE_ISA (SME_F16F16),
"__ARM_FEATURE_SME_F16F16", pfile);
aarch64_def_or_undef (TARGET_SME_F64F64, "__ARM_FEATURE_SME_F64F64", pfile);
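A user-side sketch (my illustration, not part of the patch): the macro defined above allows compile-time selection of the bf16 ZA path, in the same way as the existing __ARM_FEATURE_SME_F16F16 macro.

/* Hypothetical example: detect FEAT_SME_B16B16 support at compile time
   via the macro defined by the hunk above.  */
#if defined (__ARM_FEATURE_SME_B16B16)
/* The za16_bf16 ACLE intrinsics (svadd, svmla, svmopa, ...) are usable
   in streaming code here.  */
#else
/* Fall back to a non-bf16 accumulation path.  */
#endif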
AARCH64_OPT_FMV_EXTENSION("sme2", SME2, (SME), (), (), "sme2")
+AARCH64_OPT_EXTENSION("sme-b16b16", SME_B16B16, (SME2, SVE_B16B16), (), (), "")
+
AARCH64_OPT_EXTENSION("sme-f16f16", SME_F16F16, (SME2), (), (), "")
AARCH64_OPT_EXTENSION("mops", MOPS, (), (), (), "")
;; -------------------------------------------------------------------------
;; Includes:
;; - ADD
+;; - BFADD
+;; - BFSUB
;; - FADD
;; - FSUB
;; - SUB
(match_operand:SME_ZA_HSDFx24 1 "aligned_register_operand" "Uw<vector_count>")]
SME_BINARY_SLICE_HSDF))]
"TARGET_STREAMING_SME2"
- "<optab>\tza.<Vetype>[%w0, 0, vgx<vector_count>], %1"
+ "<b><optab>\tza.<Vetype>[%w0, 0, vgx<vector_count>], %1"
)
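The only functional change to these output templates (here and in the ternary patterns below) is the added "<b>" prefix; a note in the .md comment style, inferred from the output strings rather than taken from the patch text:

;; <b> is assumed to expand to "b" for the BF16 modes and to the empty
;; string otherwise, so the existing FADD/FSUB template also prints the
;; BFADD/BFSUB forms listed in the Includes comment above.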
(define_insn "*aarch64_sme_<optab><mode>_plus"
(match_operand:SME_ZA_HSDFx24 2 "aligned_register_operand" "Uw<vector_count>")]
SME_BINARY_SLICE_HSDF))]
"TARGET_STREAMING_SME2"
- "<optab>\tza.<Vetype>[%w0, %1, vgx<vector_count>], %2"
+ "<b><optab>\tza.<Vetype>[%w0, %1, vgx<vector_count>], %2"
)
;; -------------------------------------------------------------------------
;; ---- [FP] Ternary arithmetic on ZA slice
;; -------------------------------------------------------------------------
;; Includes:
+;; - BFMLA
+;; - BFMLS
;; - FMLA
;; - FMLS
;; -------------------------------------------------------------------------
(match_operand:SME_ZA_HSDFx24 2 "aligned_register_operand" "Uw<vector_count>")]
SME_FP_TERNARY_SLICE))]
"TARGET_STREAMING_SME2"
- "<optab>\tza.<Vetype>[%w0, 0, vgx<vector_count>], %1, %2"
+ "<b><optab>\tza.<Vetype>[%w0, 0, vgx<vector_count>], %1, %2"
)
(define_insn "*aarch64_sme_<optab><mode><mode>_plus"
(match_operand:SME_ZA_HSDFx24 3 "aligned_register_operand" "Uw<vector_count>")]
SME_FP_TERNARY_SLICE))]
"TARGET_STREAMING_SME2"
- "<optab>\tza.<Vetype>[%w0, %1, vgx<vector_count>], %2, %3"
+ "<b><optab>\tza.<Vetype>[%w0, %1, vgx<vector_count>], %2, %3"
)
(define_insn "@aarch64_sme_single_<optab><mode><mode>"
(match_operand:<SME_ZA_HSDFx24:VSINGLE> 2 "register_operand" "x"))]
SME_FP_TERNARY_SLICE))]
"TARGET_STREAMING_SME2"
- "<optab>\tza.<Vetype>[%w0, 0, vgx<vector_count>], %1, %2.<Vetype>"
+ "<b><optab>\tza.<Vetype>[%w0, 0, vgx<vector_count>], %1, %2.<Vetype>"
)
(define_insn "*aarch64_sme_single_<optab><mode><mode>_plus"
(match_operand:<SME_ZA_HSDFx24:VSINGLE> 3 "register_operand" "x"))]
SME_FP_TERNARY_SLICE))]
"TARGET_STREAMING_SME2"
- "<optab>\tza.<Vetype>[%w0, %1, vgx<vector_count>], %2, %3.<Vetype>"
+ "<b><optab>\tza.<Vetype>[%w0, %1, vgx<vector_count>], %2, %3.<Vetype>"
)
(define_insn "@aarch64_sme_lane_<optab><mode><mode>"
UNSPEC_SVE_LANE_SELECT)]
SME_FP_TERNARY_SLICE))]
"TARGET_STREAMING_SME2"
- "<optab>\tza.<Vetype>[%w0, 0, vgx<vector_count>], %1, %2.<Vetype>[%3]"
+ "<b><optab>\tza.<Vetype>[%w0, 0, vgx<vector_count>], %1, %2.<Vetype>[%3]"
)
(define_insn "*aarch64_sme_lane_<optab><mode><mode>"
UNSPEC_SVE_LANE_SELECT)]
SME_FP_TERNARY_SLICE))]
"TARGET_STREAMING_SME2"
- "<optab>\tza.<Vetype>[%w0, %1, vgx<vector_count>], %2, %3.<Vetype>[%4]"
+ "<b><optab>\tza.<Vetype>[%w0, %1, vgx<vector_count>], %2, %3.<Vetype>[%4]"
)
;; -------------------------------------------------------------------------
DEF_SME_ZA_FUNCTION_GS (svsub, unary_za_slice, za_h_float, vg1x24, none)
#undef REQUIRED_EXTENSIONS
+#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SME_B16B16)
+DEF_SME_ZA_FUNCTION_GS (svadd, unary_za_slice, za_h_bfloat, vg1x24, none)
+DEF_SME_ZA_FUNCTION_GS (svmla, binary_za_slice_opt_single, za_h_bfloat,
+ vg1x24, none)
+DEF_SME_ZA_FUNCTION_GS (svmla_lane, binary_za_slice_lane, za_h_bfloat,
+ vg1x24, none)
+DEF_SME_ZA_FUNCTION_GS (svmls, binary_za_slice_opt_single, za_h_bfloat,
+ vg1x24, none)
+DEF_SME_ZA_FUNCTION_GS (svmls_lane, binary_za_slice_lane, za_h_bfloat,
+ vg1x24, none)
+DEF_SME_ZA_FUNCTION (svmopa, binary_za_m, za_h_bfloat, za_m)
+DEF_SME_ZA_FUNCTION (svmops, binary_za_m, za_h_bfloat, za_m)
+DEF_SME_ZA_FUNCTION_GS (svsub, unary_za_slice, za_h_bfloat, vg1x24, none)
+#undef REQUIRED_EXTENSIONS
+
#undef DEF_SME_ZA_FUNCTION
#undef DEF_SME_ZA_FUNCTION_GS
#undef DEF_SME_FUNCTION
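As a rough sketch of what the new DEF_SME_ZA_FUNCTION* entries expose at the C level (an assumption based on the intrinsic names exercised by the new tests, not text from the patch); the function must be streaming and share ZA:

#include <arm_sme.h>

#pragma GCC target "+sme-b16b16"

/* Sketch only: accumulate bf16 vector pairs into ZA slice SLICE using the
   new za16_bf16 forms.  The __arm_streaming / __arm_inout ("za") keywords
   are the usual SME ACLE requirements for ZA intrinsics.  */
void
bf16_accumulate (uint32_t slice, svbfloat16x2_t acc,
		 svbfloat16x2_t x, svbfloat16x2_t y)
  __arm_streaming __arm_inout ("za")
{
  svadd_za16_bf16_vg1x2 (slice, acc);			   /* BFADD  */
  svmla_za16_bf16_vg1x2 (slice, x, y);			   /* BFMLA  */
  svmls_lane_za16_bf16_vg1x2 (slice, x, svget2 (y, 0), 0); /* BFMLS (lane)  */
}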
TYPES_za_bhsd_data (S, D), \
TYPES_reinterpret1 (D, za128)
+/* _za16_bf16. */
+#define TYPES_za_h_bfloat(S, D) \
+ D (za16, bf16)
+
/* _za16_f16. */
#define TYPES_za_h_float(S, D) \
D (za16, f16)
DEF_SVE_TYPES_ARRAY (d_za);
DEF_SVE_TYPES_ARRAY (za_bhsd_data);
DEF_SVE_TYPES_ARRAY (za_all_data);
+DEF_SVE_TYPES_ARRAY (za_h_bfloat);
DEF_SVE_TYPES_ARRAY (za_h_float);
DEF_SVE_TYPES_ARRAY (za_s_b_signed);
DEF_SVE_TYPES_ARRAY (za_s_b_unsigned);
/* The FEAT_SME_I16I64 extension to SME, enabled through +sme-i16i64. */
#define TARGET_SME_I16I64 AARCH64_HAVE_ISA (SME_I16I64)
+/* The FEAT_SME_B16B16 extension to SME, enabled through +sme-b16b16. */
+#define TARGET_STREAMING_SME_B16B16 \
+ (AARCH64_HAVE_ISA (SME_B16B16) && TARGET_STREAMING)
+
/* The FEAT_SME_F16F16 extension to SME, enabled through +sme-f16f16. */
#define TARGET_STREAMING_SME_F16F16 \
(AARCH64_HAVE_ISA (SME_F16F16) && TARGET_STREAMING)
(VNx4DF "TARGET_SME_F64F64")
(VNx8DF "TARGET_SME_F64F64")
(VNx16HF "TARGET_STREAMING_SME_F16F16")
- (VNx32HF "TARGET_STREAMING_SME_F16F16")])
+ (VNx32HF "TARGET_STREAMING_SME_F16F16")
+ (VNx16BF "TARGET_STREAMING_SME_B16B16")
+ (VNx32BF "TARGET_STREAMING_SME_B16B16")])
;; The modes for which outer product instructions are supported.
(define_mode_iterator SME_MOP_BHI [VNx16QI (VNx8HI "TARGET_SME_I16I64")])
(define_mode_iterator SME_MOP_HSDF [VNx4SF
(VNx2DF "TARGET_SME_F64F64")
- (VNx8HF "TARGET_STREAMING_SME_F16F16")])
+ (VNx8HF "TARGET_STREAMING_SME_F16F16")
+ (VNx8BF "TARGET_STREAMING_SME_B16B16")])
;; ------------------------------------------------------------------
;; Unspec enumerations for Advance SIMD. These could well go into
instructions.
@item sme2
Enable the Scalable Matrix Extension 2. This also enables SME instructions.
+@item sme-b16b16
+Enable the FEAT_SME_B16B16 extension to SME. This also enables SME2
+and SVE_B16B16 instructions.
@item sme-f16f16
Enable the FEAT_SME_F16F16 extension to SME. This also enables SME2
instructions.
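For illustration (mine, not part of the documentation change): the feature can be enabled per translation unit with a target pragma, as the new tests below do, and the dependent features listed above become visible through their macros.

/* Enabling +sme-b16b16 (via -march=...+sme-b16b16 or the pragma below)
   also turns on SME2 and SVE_B16B16, matching the dependency list.  */
#pragma GCC target "+sme-b16b16"

#if !defined (__ARM_FEATURE_SME2) || !defined (__ARM_FEATURE_SVE_B16B16)
#error "sme-b16b16 should imply SME2 and SVE_B16B16"
#endif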
#ifdef __ARM_FEATURE_SME_I16I64
#error Foo
#endif
+#ifdef __ARM_FEATURE_SME_B16B16
+#error Foo
+#endif
#ifdef __ARM_FEATURE_SME_F16F16
#error Foo
#endif
#error Foo
#endif
+#pragma GCC target "+nothing+sme-b16b16"
+#ifndef __ARM_FEATURE_SME_B16B16
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_SME
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_SME2
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_SVE_B16B16
+#error Foo
+#endif
+
#pragma GCC target "+nothing+sme-f16f16"
#ifndef __ARM_FEATURE_SME_F16F16
#error Foo
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** add_0_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfadd za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_0_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (0, z0),
+ svadd_za16_vg1x2 (0, z0))
+
+/*
+** add_w0_z0:
+** mov (w8|w9|w10|w11), w0
+** bfadd za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_w0_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w0, z0),
+ svadd_za16_vg1x2 (w0, z0))
+
+/*
+** add_w7_z0:
+** mov (w8|w9|w10|w11), w7
+** bfadd za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_w7_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w7, z0),
+ svadd_za16_vg1x2 (w7, z0))
+
+/*
+** add_w8_z0:
+** bfadd za\.h\[w8, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w8, z0),
+ svadd_za16_vg1x2 (w8, z0))
+
+/*
+** add_w11_z0:
+** bfadd za\.h\[w11, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_w11_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w11, z0),
+ svadd_za16_vg1x2 (w11, z0))
+
+
+/*
+** add_w12_z0:
+** mov (w8|w9|w10|w11), w12
+** bfadd za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_w12_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w12, z0),
+ svadd_za16_vg1x2 (w12, z0))
+
+/*
+** add_w8p7_z0:
+** bfadd za\.h\[w8, 7, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8p7_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w8 + 7, z0),
+ svadd_za16_vg1x2 (w8 + 7, z0))
+
+/*
+** add_w8p8_z0:
+** add (w8|w9|w10|w11), w8, #?8
+** bfadd za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8p8_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w8 + 8, z0),
+ svadd_za16_vg1x2 (w8 + 8, z0))
+
+/*
+** add_w8m1_z0:
+** sub (w8|w9|w10|w11), w8, #?1
+** bfadd za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8m1_z0, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w8 - 1, z0),
+ svadd_za16_vg1x2 (w8 - 1, z0))
+
+/*
+** add_w8_z18:
+** bfadd za\.h\[w8, 0, vgx2\], {z18\.h - z19\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8_z18, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w8, z18),
+ svadd_za16_vg1x2 (w8, z18))
+
+/* Leave the assembler to check for correctness for misaligned registers. */
+
+/*
+** add_w8_z23:
+** mov [^\n]+
+** mov [^\n]+
+** bfadd za\.h\[w8, 0, vgx2\], [^\n]+
+** ret
+*/
+TEST_ZA_XN (add_w8_z23, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w8, z23),
+ svadd_za16_vg1x2 (w8, z23))
+
+/*
+** add_w8_z28:
+** bfadd za\.h\[w8, 0, vgx2\], {z28\.h - z29\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8_z28, svbfloat16x2_t,
+ svadd_za16_bf16_vg1x2 (w8, z28),
+ svadd_za16_vg1x2 (w8, z28))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** add_0_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfadd za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_0_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (0, z0),
+ svadd_za16_vg1x4 (0, z0))
+
+/*
+** add_w0_z0:
+** mov (w8|w9|w10|w11), w0
+** bfadd za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_w0_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w0, z0),
+ svadd_za16_vg1x4 (w0, z0))
+
+/*
+** add_w7_z0:
+** mov (w8|w9|w10|w11), w7
+** bfadd za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_w7_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w7, z0),
+ svadd_za16_vg1x4 (w7, z0))
+
+/*
+** add_w8_z0:
+** bfadd za\.h\[w8, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w8, z0),
+ svadd_za16_vg1x4 (w8, z0))
+
+/*
+** add_w11_z0:
+** bfadd za\.h\[w11, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_w11_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w11, z0),
+ svadd_za16_vg1x4 (w11, z0))
+
+
+/*
+** add_w12_z0:
+** mov (w8|w9|w10|w11), w12
+** bfadd za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_w12_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w12, z0),
+ svadd_za16_vg1x4 (w12, z0))
+
+/*
+** add_w8p7_z0:
+** bfadd za\.h\[w8, 7, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8p7_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w8 + 7, z0),
+ svadd_za16_vg1x4 (w8 + 7, z0))
+
+/*
+** add_w8p8_z0:
+** add (w8|w9|w10|w11), w8, #?8
+** bfadd za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8p8_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w8 + 8, z0),
+ svadd_za16_vg1x4 (w8 + 8, z0))
+
+/*
+** add_w8m1_z0:
+** sub (w8|w9|w10|w11), w8, #?1
+** bfadd za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8m1_z0, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w8 - 1, z0),
+ svadd_za16_vg1x4 (w8 - 1, z0))
+
+/*
+** add_w8_z4:
+** bfadd za\.h\[w8, 0, vgx4\], {z4\.h - z7\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8_z4, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w8, z4),
+ svadd_za16_vg1x4 (w8, z4))
+
+/* Leave the assembler to check for correctness for misaligned registers. */
+
+/*
+** add_w8_z18:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfadd za\.h\[w8, 0, vgx4\], [^\n]+
+** ret
+*/
+TEST_ZA_XN (add_w8_z18, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w8, z18),
+ svadd_za16_vg1x4 (w8, z18))
+
+/*
+** add_w8_z23:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfadd za\.h\[w8, 0, vgx4\], [^\n]+
+** ret
+*/
+TEST_ZA_XN (add_w8_z23, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w8, z23),
+ svadd_za16_vg1x4 (w8, z23))
+
+/*
+** add_w8_z28:
+** bfadd za\.h\[w8, 0, vgx4\], {z28\.h - z31\.h}
+** ret
+*/
+TEST_ZA_XN (add_w8_z28, svbfloat16x4_t,
+ svadd_za16_bf16_vg1x4 (w8, z28),
+ svadd_za16_vg1x4 (w8, z28))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mla_lane_0_z0_z4_0:
+** mov (w8|w9|w10|w11), #?0
+** bfmla za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, z4\.h\[0\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_0_z0_z4_0, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (0, z0, z4, 0),
+ svmla_lane_za16_vg1x2 (0, z0, z4, 0))
+
+/*
+** mla_lane_w0_z0_z7_1:
+** mov (w8|w9|w10|w11), w0
+** bfmla za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, z7\.h\[1\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w0_z0_z7_1, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w0, z0, z7, 1),
+ svmla_lane_za16_vg1x2 (w0, z0, z7, 1))
+
+/*
+** mla_lane_w8_z28_z4_2:
+** bfmla za\.h\[w8, 0, vgx2\], {z28\.h - z29\.h}, z4\.h\[2\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8_z28_z4_2, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w8, z28, z4, 2),
+ svmla_lane_za16_vg1x2 (w8, z28, z4, 2))
+
+/*
+** mla_lane_w8p7_z0_z4_3:
+** bfmla za\.h\[w8, 7, vgx2\], {z0\.h - z1\.h}, z4\.h\[3\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8p7_z0_z4_3, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w8 + 7, z0, z4, 3),
+ svmla_lane_za16_vg1x2 (w8 + 7, z0, z4, 3))
+
+/*
+** mla_lane_w8p8_z0_z4_4:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmla za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, z4\.h\[4\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8p8_z0_z4_4, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w8 + 8, z0, z4, 4),
+ svmla_lane_za16_vg1x2 (w8 + 8, z0, z4, 4))
+
+/*
+** mla_lane_w0m1_z0_z4_5:
+** sub (w8|w9|w10|w11), w0, #?1
+** bfmla za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, z4\.h\[5\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w0m1_z0_z4_5, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w0 - 1, z0, z4, 5),
+ svmla_lane_za16_vg1x2 (w0 - 1, z0, z4, 5))
+
+/*
+** mla_lane_w8_z4_z15_6:
+** str d15, \[sp, #?-16\]!
+** bfmla za\.h\[w8, 0, vgx2\], {z4\.h - z5\.h}, z15\.h\[6\]
+** ldr d15, \[sp\], #?16
+** ret
+*/
+TEST_ZA_LANE_Z15 (mla_lane_w8_z4_z15_6, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w8, z4, z15, 6),
+ svmla_lane_za16_vg1x2 (w8, z4, z15, 6))
+
+/*
+** mla_lane_w8_z28_z16_7:
+** mov (z[0-7]).d, z16.d
+** bfmla za\.h\[w8, 0, vgx2\], {z28\.h - z29\.h}, \1\.h\[7\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8_z28_z16_7, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w8, z28, z16, 7),
+ svmla_lane_za16_vg1x2 (w8, z28, z16, 7))
+
+/*
+** mla_lane_w8_z17_z7_0:
+** mov [^\n]+
+** mov [^\n]+
+** bfmla za\.h\[w8, 0, vgx2\], [^\n]+, z7\.h\[0\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8_z17_z7_0, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w8, z17, z7, 0),
+ svmla_lane_za16_vg1x2 (w8, z17, z7, 0))
+
+/*
+** mla_lane_w8_z22_z4_1:
+** bfmla za\.h\[w8, 0, vgx2\], {z22\.h - z23\.h}, z4\.h\[1\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8_z22_z4_1, svbfloat16x2_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x2 (w8, z22, z4, 1),
+ svmla_lane_za16_vg1x2 (w8, z22, z4, 1))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mla_lane_0_z0_z4_0:
+** mov (w8|w9|w10|w11), #?0
+** bfmla za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, z4\.h\[0\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_0_z0_z4_0, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (0, z0, z4, 0),
+ svmla_lane_za16_vg1x4 (0, z0, z4, 0))
+
+/*
+** mla_lane_w0_z0_z7_1:
+** mov (w8|w9|w10|w11), w0
+** bfmla za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, z7\.h\[1\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w0_z0_z7_1, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w0, z0, z7, 1),
+ svmla_lane_za16_vg1x4 (w0, z0, z7, 1))
+
+/*
+** mla_lane_w8_z28_z4_2:
+** bfmla za\.h\[w8, 0, vgx4\], {z28\.h - z31\.h}, z4\.h\[2\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8_z28_z4_2, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w8, z28, z4, 2),
+ svmla_lane_za16_vg1x4 (w8, z28, z4, 2))
+
+/*
+** mla_lane_w8p7_z0_z4_3:
+** bfmla za\.h\[w8, 7, vgx4\], {z0\.h - z3\.h}, z4\.h\[3\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8p7_z0_z4_3, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w8 + 7, z0, z4, 3),
+ svmla_lane_za16_vg1x4 (w8 + 7, z0, z4, 3))
+
+/*
+** mla_lane_w8p8_z0_z4_4:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmla za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, z4\.h\[4\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8p8_z0_z4_4, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w8 + 8, z0, z4, 4),
+ svmla_lane_za16_vg1x4 (w8 + 8, z0, z4, 4))
+
+/*
+** mla_lane_w0m1_z0_z4_5:
+** sub (w8|w9|w10|w11), w0, #?1
+** bfmla za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, z4\.h\[5\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w0m1_z0_z4_5, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w0 - 1, z0, z4, 5),
+ svmla_lane_za16_vg1x4 (w0 - 1, z0, z4, 5))
+
+/*
+** mla_lane_w8_z4_z15_6:
+** str d15, \[sp, #?-16\]!
+** bfmla za\.h\[w8, 0, vgx4\], {z4\.h - z7\.h}, z15\.h\[6\]
+** ldr d15, \[sp\], #?16
+** ret
+*/
+TEST_ZA_LANE_Z15 (mla_lane_w8_z4_z15_6, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w8, z4, z15, 6),
+ svmla_lane_za16_vg1x4 (w8, z4, z15, 6))
+
+/*
+** mla_lane_w8_z28_z16_7:
+** mov (z[0-7]).d, z16.d
+** bfmla za\.h\[w8, 0, vgx4\], {z28\.h - z31\.h}, \1\.h\[7\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8_z28_z16_7, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w8, z28, z16, 7),
+ svmla_lane_za16_vg1x4 (w8, z28, z16, 7))
+
+/*
+** mla_lane_w8_z17_z7_0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmla za\.h\[w8, 0, vgx4\], [^\n]+, z7\.h\[0\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8_z17_z7_0, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w8, z17, z7, 0),
+ svmla_lane_za16_vg1x4 (w8, z17, z7, 0))
+
+/*
+** mla_lane_w8_z22_z4_1:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmla za\.h\[w8, 0, vgx4\], [^\n]+, z4\.h\[1\]
+** ret
+*/
+TEST_ZA_LANE (mla_lane_w8_z22_z4_1, svbfloat16x4_t, svbfloat16_t,
+ svmla_lane_za16_bf16_vg1x4 (w8, z22, z4, 1),
+ svmla_lane_za16_vg1x4 (w8, z22, z4, 1))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mla_0_z0_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfmla za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mla_0_z0_z0, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (0, z0, z0),
+ svmla_za16_vg1x2 (0, z0, z0))
+
+/*
+** mla_w0_z0_z0:
+** mov (w8|w9|w10|w11), w0
+** bfmla za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w0_z0_z0, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w0, z0, z0),
+ svmla_za16_vg1x2 (w0, z0, z0))
+
+/*
+** mla_w8_z0_z4:
+** bfmla za\.h\[w8, 0, vgx2\], {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8_z0_z4, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w8, z0, z4),
+ svmla_za16_vg1x2 (w8, z0, z4))
+
+/*
+** mla_w8_z4_z18:
+** bfmla za\.h\[w8, 0, vgx2\], {z4\.h - z5\.h}, {z18\.h - z19\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8_z4_z18, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w8, z4, z18),
+ svmla_za16_vg1x2 (w8, z4, z18))
+
+/* Leave the assembler to check for correctness for misaligned registers. */
+
+/*
+** mla_w8_z23_z0:
+** ...
+** bfmla za\.h\[w8, 0, vgx2\], [^\n]+, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8_z23_z0, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w8, z23, z0),
+ svmla_za16_vg1x2 (w8, z23, z0))
+
+/*
+** mla_w8_z18_z23:
+** ...
+** bfmla za\.h\[w8, 0, vgx2\], {z18\.h - z19\.h}, [^\n]+
+** ret
+*/
+TEST_ZA_XN (mla_w8_z18_z23, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w8, z18, z23),
+ svmla_za16_vg1x2 (w8, z18, z23))
+
+/*
+** mla_w8_z4_z28:
+** bfmla za\.h\[w8, 0, vgx2\], {z4\.h - z5\.h}, {z28\.h - z29\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8_z4_z28, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w8, z4, z28),
+ svmla_za16_vg1x2 (w8, z4, z28))
+
+/*
+** mla_w8p7_z4_z0:
+** bfmla za\.h\[w8, 7, vgx2\], {z4\.h - z5\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8p7_z4_z0, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w8 + 7, z4, z0),
+ svmla_za16_vg1x2 (w8 + 7, z4, z0))
+
+/*
+** mla_w8p8_z4_z4:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmla za\.h\[\1, 0, vgx2\], {z4\.h - z5\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8p8_z4_z4, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w8 + 8, z4, z4),
+ svmla_za16_vg1x2 (w8 + 8, z4, z4))
+
+/*
+** mla_w8m1_z4_z0:
+** sub (w8|w9|w10|w11), w8, #?1
+** bfmla za\.h\[\1, 0, vgx2\], {z4\.h - z5\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8m1_z4_z0, svbfloat16x2_t,
+ svmla_za16_bf16_vg1x2 (w8 - 1, z4, z0),
+ svmla_za16_vg1x2 (w8 - 1, z4, z0))
+
+/*
+** mla_single_0_z1_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfmla za\.h\[\1, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_0_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x2 (0, z1, z0),
+ svmla_za16_vg1x2 (0, z1, z0))
+
+/*
+** mla_single_w0_z1_z0:
+** mov (w8|w9|w10|w11), w0
+** bfmla za\.h\[\1, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w0_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x2 (w0, z1, z0),
+ svmla_za16_vg1x2 (w0, z1, z0))
+
+/*
+** mla_single_w8_z1_z0:
+** bfmla za\.h\[w8, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w8_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x2 (w8, z1, z0),
+ svmla_za16_vg1x2 (w8, z1, z0))
+
+/*
+** mla_single_w8p7_z1_z0:
+** bfmla za\.h\[w8, 7, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w8p7_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x2 (w8 + 7, z1, z0),
+ svmla_za16_vg1x2 (w8 + 7, z1, z0))
+
+/*
+** mla_single_w8p8_z1_z0:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmla za\.h\[\1, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w8p8_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x2 (w8 + 8, z1, z0),
+ svmla_za16_vg1x2 (w8 + 8, z1, z0))
+
+/*
+** mla_single_w0m1_z1_z0:
+** sub (w8|w9|w10|w11), w0, #?1
+** bfmla za\.h\[\1, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w0m1_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x2 (w0 - 1, z1, z0),
+ svmla_za16_vg1x2 (w0 - 1, z1, z0))
+
+/*
+** mla_single_w8_z0_z15:
+** str d15, \[sp, #?-16\]!
+** bfmla za\.h\[w8, 0, vgx2\], {z0\.h - z1\.h}, z15\.h
+** ldr d15, \[sp\], #?16
+** ret
+*/
+TEST_ZA_SINGLE_Z15 (mla_single_w8_z0_z15, svbfloat16x2_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x2 (w8, z0, z15),
+ svmla_za16_vg1x2 (w8, z0, z15))
+
+/*
+** mla_single_w8_z20_z16:
+** mov (z[0-7]).d, z16.d
+** bfmla za\.h\[w8, 0, vgx2\], {z20\.h - z21\.h}, \1\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w8_z20_z16, svbfloat16x2_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x2 (w8, z20, z16),
+ svmla_za16_vg1x2 (w8, z20, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mla_0_z0_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfmla za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (mla_0_z0_z0, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (0, z0, z0),
+ svmla_za16_vg1x4 (0, z0, z0))
+
+/*
+** mla_w0_z0_z0:
+** mov (w8|w9|w10|w11), w0
+** bfmla za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w0_z0_z0, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (w0, z0, z0),
+ svmla_za16_vg1x4 (w0, z0, z0))
+
+/*
+** mla_w8_z0_z4:
+** bfmla za\.h\[w8, 0, vgx4\], {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8_z0_z4, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (w8, z0, z4),
+ svmla_za16_vg1x4 (w8, z0, z4))
+
+/* Leave the assembler to check for correctness for misaligned registers. */
+
+/*
+** mla_w8_z0_z18:
+** ...
+** bfmla za\.h\[w8, 0, vgx4\], {z0\.h - z3\.h}, [^\n]+
+** ret
+*/
+TEST_ZA_XN (mla_w8_z0_z18, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (w8, z0, z18),
+ svmla_za16_vg1x4 (w8, z0, z18))
+
+/*
+** mla_w8_z18_z28:
+** ...
+** bfmla za\.h\[w8, 0, vgx4\], [^\n]+, {z28\.h - z31\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8_z18_z28, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (w8, z18, z28),
+ svmla_za16_vg1x4 (w8, z18, z28))
+
+/*
+** mla_w8_z28_z23:
+** ...
+** bfmla za\.h\[w8, 0, vgx4\], {z28\.h - z31\.h}, [^\n]+
+** ret
+*/
+TEST_ZA_XN (mla_w8_z28_z23, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (w8, z28, z23),
+ svmla_za16_vg1x4 (w8, z28, z23))
+
+/*
+** mla_w8p7_z4_z0:
+** bfmla za\.h\[w8, 7, vgx4\], {z4\.h - z7\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8p7_z4_z0, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (w8 + 7, z4, z0),
+ svmla_za16_vg1x4 (w8 + 7, z4, z0))
+
+/*
+** mla_w8p8_z4_z4:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmla za\.h\[\1, 0, vgx4\], {z4\.h - z7\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8p8_z4_z4, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (w8 + 8, z4, z4),
+ svmla_za16_vg1x4 (w8 + 8, z4, z4))
+
+/*
+** mla_w8m1_z4_z0:
+** sub (w8|w9|w10|w11), w8, #?1
+** bfmla za\.h\[\1, 0, vgx4\], {z4\.h - z7\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (mla_w8m1_z4_z0, svbfloat16x4_t,
+ svmla_za16_bf16_vg1x4 (w8 - 1, z4, z0),
+ svmla_za16_vg1x4 (w8 - 1, z4, z0))
+
+/*
+** mla_single_0_z1_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfmla za\.h\[\1, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_0_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x4 (0, z1, z0),
+ svmla_za16_vg1x4 (0, z1, z0))
+
+/*
+** mla_single_w0_z1_z0:
+** mov (w8|w9|w10|w11), w0
+** bfmla za\.h\[\1, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w0_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x4 (w0, z1, z0),
+ svmla_za16_vg1x4 (w0, z1, z0))
+
+/*
+** mla_single_w8_z1_z0:
+** bfmla za\.h\[w8, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w8_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x4 (w8, z1, z0),
+ svmla_za16_vg1x4 (w8, z1, z0))
+
+/*
+** mla_single_w8p7_z1_z0:
+** bfmla za\.h\[w8, 7, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w8p7_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x4 (w8 + 7, z1, z0),
+ svmla_za16_vg1x4 (w8 + 7, z1, z0))
+
+/*
+** mla_single_w8p8_z1_z0:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmla za\.h\[\1, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w8p8_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x4 (w8 + 8, z1, z0),
+ svmla_za16_vg1x4 (w8 + 8, z1, z0))
+
+/*
+** mla_single_w0m1_z1_z0:
+** sub (w8|w9|w10|w11), w0, #?1
+** bfmla za\.h\[\1, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w0m1_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x4 (w0 - 1, z1, z0),
+ svmla_za16_vg1x4 (w0 - 1, z1, z0))
+
+/*
+** mla_single_w8_z0_z15:
+** str d15, \[sp, #?-16\]!
+** bfmla za\.h\[w8, 0, vgx4\], {z0\.h - z3\.h}, z15\.h
+** ldr d15, \[sp\], #?16
+** ret
+*/
+TEST_ZA_SINGLE_Z15 (mla_single_w8_z0_z15, svbfloat16x4_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x4 (w8, z0, z15),
+ svmla_za16_vg1x4 (w8, z0, z15))
+
+/*
+** mla_single_w8_z20_z16:
+** mov (z[0-7]).d, z16.d
+** bfmla za\.h\[w8, 0, vgx4\], {z20\.h - z23\.h}, \1\.h
+** ret
+*/
+TEST_ZA_SINGLE (mla_single_w8_z20_z16, svbfloat16x4_t, svbfloat16_t,
+ svmla_single_za16_bf16_vg1x4 (w8, z20, z16),
+ svmla_za16_vg1x4 (w8, z20, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mls_lane_0_z0_z4_0:
+** mov (w8|w9|w10|w11), #?0
+** bfmls za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, z4\.h\[0\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_0_z0_z4_0, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (0, z0, z4, 0),
+ svmls_lane_za16_vg1x2 (0, z0, z4, 0))
+
+/*
+** mls_lane_w0_z0_z7_1:
+** mov (w8|w9|w10|w11), w0
+** bfmls za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, z7\.h\[1\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w0_z0_z7_1, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w0, z0, z7, 1),
+ svmls_lane_za16_vg1x2 (w0, z0, z7, 1))
+
+/*
+** mls_lane_w8_z28_z4_2:
+** bfmls za\.h\[w8, 0, vgx2\], {z28\.h - z29\.h}, z4\.h\[2\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8_z28_z4_2, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w8, z28, z4, 2),
+ svmls_lane_za16_vg1x2 (w8, z28, z4, 2))
+
+/*
+** mls_lane_w8p7_z0_z4_3:
+** bfmls za\.h\[w8, 7, vgx2\], {z0\.h - z1\.h}, z4\.h\[3\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8p7_z0_z4_3, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w8 + 7, z0, z4, 3),
+ svmls_lane_za16_vg1x2 (w8 + 7, z0, z4, 3))
+
+/*
+** mls_lane_w8p8_z0_z4_4:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmls za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, z4\.h\[4\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8p8_z0_z4_4, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w8 + 8, z0, z4, 4),
+ svmls_lane_za16_vg1x2 (w8 + 8, z0, z4, 4))
+
+/*
+** mls_lane_w0m1_z0_z4_5:
+** sub (w8|w9|w10|w11), w0, #?1
+** bfmls za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, z4\.h\[5\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w0m1_z0_z4_5, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w0 - 1, z0, z4, 5),
+ svmls_lane_za16_vg1x2 (w0 - 1, z0, z4, 5))
+
+/*
+** mls_lane_w8_z4_z15_6:
+** str d15, \[sp, #?-16\]!
+** bfmls za\.h\[w8, 0, vgx2\], {z4\.h - z5\.h}, z15\.h\[6\]
+** ldr d15, \[sp\], #?16
+** ret
+*/
+TEST_ZA_LANE_Z15 (mls_lane_w8_z4_z15_6, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w8, z4, z15, 6),
+ svmls_lane_za16_vg1x2 (w8, z4, z15, 6))
+
+/*
+** mls_lane_w8_z28_z16_7:
+** mov (z[0-7]).d, z16.d
+** bfmls za\.h\[w8, 0, vgx2\], {z28\.h - z29\.h}, \1\.h\[7\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8_z28_z16_7, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w8, z28, z16, 7),
+ svmls_lane_za16_vg1x2 (w8, z28, z16, 7))
+
+/*
+** mls_lane_w8_z17_z7_0:
+** mov [^\n]+
+** mov [^\n]+
+** bfmls za\.h\[w8, 0, vgx2\], [^\n]+, z7\.h\[0\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8_z17_z7_0, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w8, z17, z7, 0),
+ svmls_lane_za16_vg1x2 (w8, z17, z7, 0))
+
+/*
+** mls_lane_w8_z22_z4_1:
+** bfmls za\.h\[w8, 0, vgx2\], {z22\.h - z23\.h}, z4\.h\[1\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8_z22_z4_1, svbfloat16x2_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x2 (w8, z22, z4, 1),
+ svmls_lane_za16_vg1x2 (w8, z22, z4, 1))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mls_lane_0_z0_z4_0:
+** mov (w8|w9|w10|w11), #?0
+** bfmls za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, z4\.h\[0\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_0_z0_z4_0, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (0, z0, z4, 0),
+ svmls_lane_za16_vg1x4 (0, z0, z4, 0))
+
+/*
+** mls_lane_w0_z0_z7_1:
+** mov (w8|w9|w10|w11), w0
+** bfmls za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, z7\.h\[1\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w0_z0_z7_1, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w0, z0, z7, 1),
+ svmls_lane_za16_vg1x4 (w0, z0, z7, 1))
+
+/*
+** mls_lane_w8_z28_z4_2:
+** bfmls za\.h\[w8, 0, vgx4\], {z28\.h - z31\.h}, z4\.h\[2\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8_z28_z4_2, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w8, z28, z4, 2),
+ svmls_lane_za16_vg1x4 (w8, z28, z4, 2))
+
+/*
+** mls_lane_w8p7_z0_z4_3:
+** bfmls za\.h\[w8, 7, vgx4\], {z0\.h - z3\.h}, z4\.h\[3\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8p7_z0_z4_3, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w8 + 7, z0, z4, 3),
+ svmls_lane_za16_vg1x4 (w8 + 7, z0, z4, 3))
+
+/*
+** mls_lane_w8p8_z0_z4_4:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmls za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, z4\.h\[4\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8p8_z0_z4_4, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w8 + 8, z0, z4, 4),
+ svmls_lane_za16_vg1x4 (w8 + 8, z0, z4, 4))
+
+/*
+** mls_lane_w0m1_z0_z4_5:
+** sub (w8|w9|w10|w11), w0, #?1
+** bfmls za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, z4\.h\[5\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w0m1_z0_z4_5, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w0 - 1, z0, z4, 5),
+ svmls_lane_za16_vg1x4 (w0 - 1, z0, z4, 5))
+
+/*
+** mls_lane_w8_z4_z15_6:
+** str d15, \[sp, #?-16\]!
+** bfmls za\.h\[w8, 0, vgx4\], {z4\.h - z7\.h}, z15\.h\[6\]
+** ldr d15, \[sp\], #?16
+** ret
+*/
+TEST_ZA_LANE_Z15 (mls_lane_w8_z4_z15_6, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w8, z4, z15, 6),
+ svmls_lane_za16_vg1x4 (w8, z4, z15, 6))
+
+/*
+** mls_lane_w8_z28_z16_7:
+** mov (z[0-7]).d, z16.d
+** bfmls za\.h\[w8, 0, vgx4\], {z28\.h - z31\.h}, \1\.h\[7\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8_z28_z16_7, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w8, z28, z16, 7),
+ svmls_lane_za16_vg1x4 (w8, z28, z16, 7))
+
+/*
+** mls_lane_w8_z17_z7_0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmls za\.h\[w8, 0, vgx4\], [^\n]+, z7\.h\[0\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8_z17_z7_0, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w8, z17, z7, 0),
+ svmls_lane_za16_vg1x4 (w8, z17, z7, 0))
+
+/*
+** mls_lane_w8_z22_z4_1:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmls za\.h\[w8, 0, vgx4\], [^\n]+, z4\.h\[1\]
+** ret
+*/
+TEST_ZA_LANE (mls_lane_w8_z22_z4_1, svbfloat16x4_t, svbfloat16_t,
+ svmls_lane_za16_bf16_vg1x4 (w8, z22, z4, 1),
+ svmls_lane_za16_vg1x4 (w8, z22, z4, 1))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mls_0_z0_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfmls za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mls_0_z0_z0, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (0, z0, z0),
+ svmls_za16_vg1x2 (0, z0, z0))
+
+/*
+** mls_w0_z0_z0:
+** mov (w8|w9|w10|w11), w0
+** bfmls za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w0_z0_z0, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w0, z0, z0),
+ svmls_za16_vg1x2 (w0, z0, z0))
+
+/*
+** mls_w8_z0_z4:
+** bfmls za\.h\[w8, 0, vgx2\], {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8_z0_z4, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w8, z0, z4),
+ svmls_za16_vg1x2 (w8, z0, z4))
+
+/*
+** mls_w8_z4_z18:
+** bfmls za\.h\[w8, 0, vgx2\], {z4\.h - z5\.h}, {z18\.h - z19\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8_z4_z18, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w8, z4, z18),
+ svmls_za16_vg1x2 (w8, z4, z18))
+
+/* Leave the assembler to check for correctness for misaligned registers. */
+
+/*
+** mls_w8_z23_z0:
+** ...
+** bfmls za\.h\[w8, 0, vgx2\], [^\n]+, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8_z23_z0, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w8, z23, z0),
+ svmls_za16_vg1x2 (w8, z23, z0))
+
+/*
+** mls_w8_z18_z23:
+** ...
+** bfmls za\.h\[w8, 0, vgx2\], {z18\.h - z19\.h}, [^\n]+
+** ret
+*/
+TEST_ZA_XN (mls_w8_z18_z23, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w8, z18, z23),
+ svmls_za16_vg1x2 (w8, z18, z23))
+
+/*
+** mls_w8_z4_z28:
+** bfmls za\.h\[w8, 0, vgx2\], {z4\.h - z5\.h}, {z28\.h - z29\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8_z4_z28, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w8, z4, z28),
+ svmls_za16_vg1x2 (w8, z4, z28))
+
+/*
+** mls_w8p7_z4_z0:
+** bfmls za\.h\[w8, 7, vgx2\], {z4\.h - z5\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8p7_z4_z0, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w8 + 7, z4, z0),
+ svmls_za16_vg1x2 (w8 + 7, z4, z0))
+
+/*
+** mls_w8p8_z4_z4:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmls za\.h\[\1, 0, vgx2\], {z4\.h - z5\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8p8_z4_z4, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w8 + 8, z4, z4),
+ svmls_za16_vg1x2 (w8 + 8, z4, z4))
+
+/*
+** mls_w8m1_z4_z0:
+** sub (w8|w9|w10|w11), w8, #?1
+** bfmls za\.h\[\1, 0, vgx2\], {z4\.h - z5\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8m1_z4_z0, svbfloat16x2_t,
+ svmls_za16_bf16_vg1x2 (w8 - 1, z4, z0),
+ svmls_za16_vg1x2 (w8 - 1, z4, z0))
+
+/*
+** mls_single_0_z1_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfmls za\.h\[\1, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_0_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x2 (0, z1, z0),
+ svmls_za16_vg1x2 (0, z1, z0))
+
+/*
+** mls_single_w0_z1_z0:
+** mov (w8|w9|w10|w11), w0
+** bfmls za\.h\[\1, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w0_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x2 (w0, z1, z0),
+ svmls_za16_vg1x2 (w0, z1, z0))
+
+/*
+** mls_single_w8_z1_z0:
+** bfmls za\.h\[w8, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w8_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x2 (w8, z1, z0),
+ svmls_za16_vg1x2 (w8, z1, z0))
+
+/*
+** mls_single_w8p7_z1_z0:
+** bfmls za\.h\[w8, 7, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w8p7_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x2 (w8 + 7, z1, z0),
+ svmls_za16_vg1x2 (w8 + 7, z1, z0))
+
+/*
+** mls_single_w8p8_z1_z0:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmls za\.h\[\1, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w8p8_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x2 (w8 + 8, z1, z0),
+ svmls_za16_vg1x2 (w8 + 8, z1, z0))
+
+/*
+** mls_single_w0m1_z1_z0:
+** sub (w8|w9|w10|w11), w0, #?1
+** bfmls za\.h\[\1, 0, vgx2\], {z1\.h - z2\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w0m1_z1_z0, svbfloat16x2_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x2 (w0 - 1, z1, z0),
+ svmls_za16_vg1x2 (w0 - 1, z1, z0))
+
+/*
+** mls_single_w8_z0_z15:
+** str d15, \[sp, #?-16\]!
+** bfmls za\.h\[w8, 0, vgx2\], {z0\.h - z1\.h}, z15\.h
+** ldr d15, \[sp\], #?16
+** ret
+*/
+TEST_ZA_SINGLE_Z15 (mls_single_w8_z0_z15, svbfloat16x2_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x2 (w8, z0, z15),
+ svmls_za16_vg1x2 (w8, z0, z15))
+
+/*
+** mls_single_w8_z20_z16:
+** mov (z[0-7]).d, z16.d
+** bfmls za\.h\[w8, 0, vgx2\], {z20\.h - z21\.h}, \1\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w8_z20_z16, svbfloat16x2_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x2 (w8, z20, z16),
+ svmls_za16_vg1x2 (w8, z20, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mls_0_z0_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfmls za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (mls_0_z0_z0, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (0, z0, z0),
+ svmls_za16_vg1x4 (0, z0, z0))
+
+/*
+** mls_w0_z0_z0:
+** mov (w8|w9|w10|w11), w0
+** bfmls za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w0_z0_z0, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (w0, z0, z0),
+ svmls_za16_vg1x4 (w0, z0, z0))
+
+/*
+** mls_w8_z0_z4:
+** bfmls za\.h\[w8, 0, vgx4\], {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8_z0_z4, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (w8, z0, z4),
+ svmls_za16_vg1x4 (w8, z0, z4))
+
+/* Leave the assembler to check for correctness for misaligned registers. */
+
+/*
+** mls_w8_z0_z18:
+** ...
+** bfmls za\.h\[w8, 0, vgx4\], {z0\.h - z3\.h}, [^\n]+
+** ret
+*/
+TEST_ZA_XN (mls_w8_z0_z18, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (w8, z0, z18),
+ svmls_za16_vg1x4 (w8, z0, z18))
+
+/*
+** mls_w8_z18_z28:
+** ...
+** bfmls za\.h\[w8, 0, vgx4\], [^\n]+, {z28\.h - z31\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8_z18_z28, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (w8, z18, z28),
+ svmls_za16_vg1x4 (w8, z18, z28))
+
+/*
+** mls_w8_z28_z23:
+** ...
+** bfmls za\.h\[w8, 0, vgx4\], {z28\.h - z31\.h}, [^\n]+
+** ret
+*/
+TEST_ZA_XN (mls_w8_z28_z23, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (w8, z28, z23),
+ svmls_za16_vg1x4 (w8, z28, z23))
+
+/*
+** mls_w8p7_z4_z0:
+** bfmls za\.h\[w8, 7, vgx4\], {z4\.h - z7\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8p7_z4_z0, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (w8 + 7, z4, z0),
+ svmls_za16_vg1x4 (w8 + 7, z4, z0))
+
+/*
+** mls_w8p8_z4_z4:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmls za\.h\[\1, 0, vgx4\], {z4\.h - z7\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8p8_z4_z4, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (w8 + 8, z4, z4),
+ svmls_za16_vg1x4 (w8 + 8, z4, z4))
+
+/*
+** mls_w8m1_z4_z0:
+** sub (w8|w9|w10|w11), w8, #?1
+** bfmls za\.h\[\1, 0, vgx4\], {z4\.h - z7\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (mls_w8m1_z4_z0, svbfloat16x4_t,
+ svmls_za16_bf16_vg1x4 (w8 - 1, z4, z0),
+ svmls_za16_vg1x4 (w8 - 1, z4, z0))
+
+/*
+** mls_single_0_z1_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfmls za\.h\[\1, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_0_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x4 (0, z1, z0),
+ svmls_za16_vg1x4 (0, z1, z0))
+
+/*
+** mls_single_w0_z1_z0:
+** mov (w8|w9|w10|w11), w0
+** bfmls za\.h\[\1, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w0_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x4 (w0, z1, z0),
+ svmls_za16_vg1x4 (w0, z1, z0))
+
+/*
+** mls_single_w8_z1_z0:
+** bfmls za\.h\[w8, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w8_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x4 (w8, z1, z0),
+ svmls_za16_vg1x4 (w8, z1, z0))
+
+/*
+** mls_single_w8p7_z1_z0:
+** bfmls za\.h\[w8, 7, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w8p7_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x4 (w8 + 7, z1, z0),
+ svmls_za16_vg1x4 (w8 + 7, z1, z0))
+
+/*
+** mls_single_w8p8_z1_z0:
+** add (w8|w9|w10|w11), w8, #?8
+** bfmls za\.h\[\1, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w8p8_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x4 (w8 + 8, z1, z0),
+ svmls_za16_vg1x4 (w8 + 8, z1, z0))
+
+/*
+** mls_single_w0m1_z1_z0:
+** sub (w8|w9|w10|w11), w0, #?1
+** bfmls za\.h\[\1, 0, vgx4\], {z1\.h - z4\.h}, z0\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w0m1_z1_z0, svbfloat16x4_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x4 (w0 - 1, z1, z0),
+ svmls_za16_vg1x4 (w0 - 1, z1, z0))
+
+/*
+** mls_single_w8_z0_z15:
+** str d15, \[sp, #?-16\]!
+** bfmls za\.h\[w8, 0, vgx4\], {z0\.h - z3\.h}, z15\.h
+** ldr d15, \[sp\], #?16
+** ret
+*/
+TEST_ZA_SINGLE_Z15 (mls_single_w8_z0_z15, svbfloat16x4_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x4 (w8, z0, z15),
+ svmls_za16_vg1x4 (w8, z0, z15))
+
+/*
+** mls_single_w8_z20_z16:
+** mov (z[0-7]).d, z16.d
+** bfmls za\.h\[w8, 0, vgx4\], {z20\.h - z23\.h}, \1\.h
+** ret
+*/
+TEST_ZA_SINGLE (mls_single_w8_z20_z16, svbfloat16x4_t, svbfloat16_t,
+ svmls_single_za16_bf16_vg1x4 (w8, z20, z16),
+ svmls_za16_vg1x4 (w8, z20, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mopa_za16_bf16_0_p0_p1_z0_z1:
+** bfmopa za0\.h, p0/m, p1/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZA (mopa_za16_bf16_0_p0_p1_z0_z1, svbfloat16_t,
+ svmopa_za16_bf16_m (0, p0, p1, z0, z1),
+ svmopa_za16_m (0, p0, p1, z0, z1))
+
+/*
+** mopa_za16_bf16_0_p1_p0_z1_z0:
+** bfmopa za0\.h, p1/m, p0/m, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_ZA (mopa_za16_bf16_0_p1_p0_z1_z0, svbfloat16_t,
+ svmopa_za16_bf16_m (0, p1, p0, z1, z0),
+ svmopa_za16_m (0, p1, p0, z1, z0))
+
+/*
+** mopa_za16_bf16_1_p0_p1_z0_z1:
+** bfmopa za1\.h, p0/m, p1/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZA (mopa_za16_bf16_1_p0_p1_z0_z1, svbfloat16_t,
+ svmopa_za16_bf16_m (1, p0, p1, z0, z1),
+ svmopa_za16_m (1, p0, p1, z0, z1))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** mops_za16_bf16_0_p0_p1_z0_z1:
+** bfmops za0\.h, p0/m, p1/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZA (mops_za16_bf16_0_p0_p1_z0_z1, svbfloat16_t,
+ svmops_za16_bf16_m (0, p0, p1, z0, z1),
+ svmops_za16_m (0, p0, p1, z0, z1))
+
+/*
+** mops_za16_bf16_0_p1_p0_z1_z0:
+** bfmops za0\.h, p1/m, p0/m, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_ZA (mops_za16_bf16_0_p1_p0_z1_z0, svbfloat16_t,
+ svmops_za16_bf16_m (0, p1, p0, z1, z0),
+ svmops_za16_m (0, p1, p0, z1, z0))
+
+/*
+** mops_za16_bf16_1_p0_p1_z0_z1:
+** bfmops za1\.h, p0/m, p1/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZA (mops_za16_bf16_1_p0_p1_z0_z1, svbfloat16_t,
+ svmops_za16_bf16_m (1, p0, p1, z0, z1),
+ svmops_za16_m (1, p0, p1, z0, z1))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** sub_0_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfsub za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_0_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (0, z0),
+ svsub_za16_vg1x2 (0, z0))
+
+/*
+** sub_w0_z0:
+** mov (w8|w9|w10|w11), w0
+** bfsub za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w0_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w0, z0),
+ svsub_za16_vg1x2 (w0, z0))
+
+/*
+** sub_w7_z0:
+** mov (w8|w9|w10|w11), w7
+** bfsub za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w7_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w7, z0),
+ svsub_za16_vg1x2 (w7, z0))
+
+/*
+** sub_w8_z0:
+** bfsub za\.h\[w8, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w8, z0),
+ svsub_za16_vg1x2 (w8, z0))
+
+/*
+** sub_w11_z0:
+** bfsub za\.h\[w11, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w11_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w11, z0),
+ svsub_za16_vg1x2 (w11, z0))
+
+
+/*
+** sub_w12_z0:
+** mov (w8|w9|w10|w11), w12
+** bfsub za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w12_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w12, z0),
+ svsub_za16_vg1x2 (w12, z0))
+
+/*
+** sub_w8p7_z0:
+** bfsub za\.h\[w8, 7, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8p7_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w8 + 7, z0),
+ svsub_za16_vg1x2 (w8 + 7, z0))
+
+/*
+** sub_w8p8_z0:
+** add (w8|w9|w10|w11), w8, #?8
+** bfsub za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8p8_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w8 + 8, z0),
+ svsub_za16_vg1x2 (w8 + 8, z0))
+
+/*
+** sub_w8m1_z0:
+** sub (w8|w9|w10|w11), w8, #?1
+** bfsub za\.h\[\1, 0, vgx2\], {z0\.h - z1\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8m1_z0, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w8 - 1, z0),
+ svsub_za16_vg1x2 (w8 - 1, z0))
+
+/*
+** sub_w8_z18:
+** bfsub za\.h\[w8, 0, vgx2\], {z18\.h - z19\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8_z18, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w8, z18),
+ svsub_za16_vg1x2 (w8, z18))
+
+/* Leave the assembler to check for correctness for misaligned registers. */
+
+/*
+** sub_w8_z23:
+** mov [^\n]+
+** mov [^\n]+
+** bfsub za\.h\[w8, 0, vgx2\], [^\n]+
+** ret
+*/
+TEST_ZA_XN (sub_w8_z23, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w8, z23),
+ svsub_za16_vg1x2 (w8, z23))
+
+/*
+** sub_w8_z28:
+** bfsub za\.h\[w8, 0, vgx2\], {z28\.h - z29\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8_z28, svbfloat16x2_t,
+ svsub_za16_bf16_vg1x2 (w8, z28),
+ svsub_za16_vg1x2 (w8, z28))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sme-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sme-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sme-b16b16"
+
+/*
+** sub_0_z0:
+** mov (w8|w9|w10|w11), #?0
+** bfsub za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_0_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (0, z0),
+ svsub_za16_vg1x4 (0, z0))
+
+/*
+** sub_w0_z0:
+** mov (w8|w9|w10|w11), w0
+** bfsub za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w0_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w0, z0),
+ svsub_za16_vg1x4 (w0, z0))
+
+/*
+** sub_w7_z0:
+** mov (w8|w9|w10|w11), w7
+** bfsub za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w7_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w7, z0),
+ svsub_za16_vg1x4 (w7, z0))
+
+/*
+** sub_w8_z0:
+** bfsub za\.h\[w8, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w8, z0),
+ svsub_za16_vg1x4 (w8, z0))
+
+/*
+** sub_w11_z0:
+** bfsub za\.h\[w11, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w11_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w11, z0),
+ svsub_za16_vg1x4 (w11, z0))
+
+
+/*
+** sub_w12_z0:
+** mov (w8|w9|w10|w11), w12
+** bfsub za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w12_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w12, z0),
+ svsub_za16_vg1x4 (w12, z0))
+
+/*
+** sub_w8p7_z0:
+** bfsub za\.h\[w8, 7, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8p7_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w8 + 7, z0),
+ svsub_za16_vg1x4 (w8 + 7, z0))
+
+/*
+** sub_w8p8_z0:
+** add (w8|w9|w10|w11), w8, #?8
+** bfsub za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8p8_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w8 + 8, z0),
+ svsub_za16_vg1x4 (w8 + 8, z0))
+
+/*
+** sub_w8m1_z0:
+** sub (w8|w9|w10|w11), w8, #?1
+** bfsub za\.h\[\1, 0, vgx4\], {z0\.h - z3\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8m1_z0, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w8 - 1, z0),
+ svsub_za16_vg1x4 (w8 - 1, z0))
+
+/*
+** sub_w8_z4:
+** bfsub za\.h\[w8, 0, vgx4\], {z4\.h - z7\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8_z4, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w8, z4),
+ svsub_za16_vg1x4 (w8, z4))
+
+/* Leave the assembler to check for correctness for misaligned registers. */
+
+/*
+** sub_w8_z18:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfsub za\.h\[w8, 0, vgx4\], [^\n]+
+** ret
+*/
+TEST_ZA_XN (sub_w8_z18, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w8, z18),
+ svsub_za16_vg1x4 (w8, z18))
+
+/*
+** sub_w8_z23:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfsub za\.h\[w8, 0, vgx4\], [^\n]+
+** ret
+*/
+TEST_ZA_XN (sub_w8_z23, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w8, z23),
+ svsub_za16_vg1x4 (w8, z23))
+
+/*
+** sub_w8_z28:
+** bfsub za\.h\[w8, 0, vgx4\], {z28\.h - z31\.h}
+** ret
+*/
+TEST_ZA_XN (sub_w8_z28, svbfloat16x4_t,
+ svsub_za16_bf16_vg1x4 (w8, z28),
+ svsub_za16_vg1x4 (w8, z28))
foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
"i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
"sme" "sme-i16i64" "sme2" "sve-b16b16"
- "sme-f16f16" } {
+ "sme-b16b16" "sme-f16f16" } {
eval [string map [list FUNC $aarch64_ext] {
proc check_effective_target_aarch64_asm_FUNC_ok { } {
if { [istarget aarch64*-*-*] } {