"__ARM_FEATURE_SVE_MATMUL_FP32", pfile);
aarch64_def_or_undef (TARGET_SVE_F64MM,
"__ARM_FEATURE_SVE_MATMUL_FP64", pfile);
+ aarch64_def_or_undef (AARCH64_HAVE_ISA (SVE_B16B16)
+ && (TARGET_SVE2 || TARGET_SME2),
+ "__ARM_FEATURE_SVE_B16B16", pfile);
aarch64_def_or_undef (TARGET_SVE2, "__ARM_FEATURE_SVE2", pfile);
aarch64_def_or_undef (TARGET_SVE2_AES, "__ARM_FEATURE_SVE2_AES", pfile);
aarch64_def_or_undef (TARGET_SVE2_BITPERM,
AARCH64_OPT_FMV_EXTENSION("sve", SVE, (SIMD, F16), (), (), "sve")
+/* This specifically does not imply +sve. */
+AARCH64_OPT_EXTENSION("sve-b16b16", SVE_B16B16, (), (), (), "")
+
AARCH64_OPT_EXTENSION("f32mm", F32MM, (SVE), (), (), "f32mm")
AARCH64_FMV_FEATURE("f32mm", SVE_F32MM, (F32MM))
DEF_SVE_FUNCTION (svamax, binary_opt_single_n, all_float, mxz)
DEF_SVE_FUNCTION (svamin, binary_opt_single_n, all_float, mxz)
#undef REQUIRED_EXTENSIONS
+
+#define REQUIRED_EXTENSIONS \
+ sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_SVE_B16B16, \
+ AARCH64_FL_SME2 | AARCH64_FL_SVE_B16B16)
+DEF_SVE_FUNCTION (svadd, binary_opt_n, h_bfloat, mxz)
+DEF_SVE_FUNCTION (svclamp, clamp, h_bfloat, none)
+DEF_SVE_FUNCTION (svmax, binary_opt_single_n, h_bfloat, mxz)
+DEF_SVE_FUNCTION (svmaxnm, binary_opt_single_n, h_bfloat, mxz)
+DEF_SVE_FUNCTION (svmla, ternary_opt_n, h_bfloat, mxz)
+DEF_SVE_FUNCTION (svmla_lane, ternary_lane, h_bfloat, none)
+DEF_SVE_FUNCTION (svmls, ternary_opt_n, h_bfloat, mxz)
+DEF_SVE_FUNCTION (svmls_lane, ternary_lane, h_bfloat, none)
+DEF_SVE_FUNCTION (svmin, binary_opt_single_n, h_bfloat, mxz)
+DEF_SVE_FUNCTION (svminnm, binary_opt_single_n, h_bfloat, mxz)
+DEF_SVE_FUNCTION (svmul, binary_opt_n, h_bfloat, mxz)
+DEF_SVE_FUNCTION (svmul_lane, binary_lane, h_bfloat, none)
+DEF_SVE_FUNCTION (svsub, binary_opt_n, h_bfloat, mxz)
+#undef REQUIRED_EXTENSIONS
+
+#define REQUIRED_EXTENSIONS \
+ streaming_only (AARCH64_FL_SME2 | AARCH64_FL_SVE_B16B16)
+DEF_SVE_FUNCTION_GS (svclamp, clamp, h_bfloat, x24, none)
+DEF_SVE_FUNCTION_GS (svmax, binary_opt_single_n, h_bfloat, x24, none)
+DEF_SVE_FUNCTION_GS (svmaxnm, binary_opt_single_n, h_bfloat, x24, none)
+DEF_SVE_FUNCTION_GS (svmin, binary_opt_single_n, h_bfloat, x24, none)
+DEF_SVE_FUNCTION_GS (svminnm, binary_opt_single_n, h_bfloat, x24, none)
+#undef REQUIRED_EXTENSIONS
BITS / BITS_PER_UNIT, \
TYPE_##CLASS == TYPE_signed || TYPE_##CLASS == TYPE_unsigned, \
TYPE_##CLASS == TYPE_unsigned, \
- TYPE_##CLASS == TYPE_float, \
+ TYPE_##CLASS == TYPE_float || TYPE_##CLASS == TYPE_bfloat, \
TYPE_##CLASS != TYPE_bool, \
TYPE_##CLASS == TYPE_bool, \
false, \
D (s16, s8), D (s32, s16), D (s64, s32), \
D (u16, u8), D (u32, u16), D (u64, u32)
+/* _bf16. */
+#define TYPES_h_bfloat(S, D) \
+ S (bf16)
+
/* _s16
_u16. */
#define TYPES_h_integer(S, D) \
DEF_SVE_TYPES_ARRAY (bhs_data);
DEF_SVE_TYPES_ARRAY (bhs_widen);
DEF_SVE_TYPES_ARRAY (c);
+DEF_SVE_TYPES_ARRAY (h_bfloat);
DEF_SVE_TYPES_ARRAY (h_integer);
DEF_SVE_TYPES_ARRAY (hs_signed);
DEF_SVE_TYPES_ARRAY (hs_integer);
;; ---- [FP] General binary arithmetic corresponding to rtx codes
;; -------------------------------------------------------------------------
;; Includes post-RA forms of:
+;; - BFADD (SVE_B16B16)
+;; - BFMUL (SVE_B16B16)
+;; - BFSUB (SVE_B16B16)
;; - FADD
;; - FMUL
;; - FSUB
;; Split a predicated instruction whose predicate is unused into an
;; unpredicated instruction.
(define_split
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
(match_operand:SI 4 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")]
<SVE_COND_FP>))]
"TARGET_SVE
&& reload_completed
&& INTVAL (operands[4]) == SVE_RELAXED_GP"
[(set (match_dup 0)
- (SVE_UNPRED_FP_BINARY:SVE_FULL_F (match_dup 2) (match_dup 3)))]
+ (SVE_UNPRED_FP_BINARY:SVE_FULL_F_BF (match_dup 2) (match_dup 3)))]
)
;; Unpredicated floating-point binary operations (post-RA only).
;; These are generated by the split above.
(define_insn "*post_ra_<sve_fp_op><mode>3"
- [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
- (SVE_UNPRED_FP_BINARY:SVE_FULL_F
- (match_operand:SVE_FULL_F 1 "register_operand" "w")
- (match_operand:SVE_FULL_F 2 "register_operand" "w")))]
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand" "=w")
+ (SVE_UNPRED_FP_BINARY:SVE_FULL_F_BF
+ (match_operand:SVE_FULL_F_BF 1 "register_operand" "w")
+ (match_operand:SVE_FULL_F_BF 2 "register_operand" "w")))]
"TARGET_SVE && reload_completed"
- "<sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>")
+ "<b><sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>")
;; -------------------------------------------------------------------------
;; ---- [FP] General binary arithmetic corresponding to unspecs
;; -------------------------------------------------------------------------
;; Includes merging forms of:
+;; - BFADD (SVE_B16B16)
+;; - BFMAX (SVE_B16B16)
+;; - BFMAXNM (SVE_B16B16)
+;; - BFMIN (SVE_B16B16)
+;; - BFMINNM (SVE_B16B16)
+;; - BFMUL (SVE_B16B16)
+;; - BFSUB (SVE_B16B16)
;; - FADD (constant forms handled in the "Addition" section)
;; - FDIV
;; - FDIVR
;; Unpredicated floating-point binary operations that need to be predicated
;; for SVE.
(define_expand "<optab><mode>3"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_dup 3)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 1 "<sve_pred_fp_rhs1_operand>")
- (match_operand:SVE_FULL_F 2 "<sve_pred_fp_rhs2_operand>")]
+ (match_operand:SVE_FULL_F_BF 1 "<sve_pred_fp_rhs1_operand>")
+ (match_operand:SVE_FULL_F_BF 2 "<sve_pred_fp_rhs2_operand>")]
SVE_COND_FP_BINARY_OPTAB))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
operands[3] = aarch64_ptrue_reg (<VPRED>mode);
}
;; Predicated floating-point operations with merging.
(define_expand "@cond_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "<sve_pred_fp_rhs1_operand>")
- (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_operand>")]
+ (match_operand:SVE_FULL_F_BF 2 "<sve_pred_fp_rhs1_operand>")
+ (match_operand:SVE_FULL_F_BF 3 "<sve_pred_fp_rhs2_operand>")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_FULL_F_BF 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
)
;; Predicated floating-point operations, merging with the first input.
(define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 2)]
UNSPEC_SEL))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
- [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
- [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ w , Upl , 0 , w ; * ] <b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
}
"&& !rtx_equal_p (operands[1], operands[4])"
{
)
(define_insn "*cond_<optab><mode>_2_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 2)]
UNSPEC_SEL))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
- [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
- [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ w , Upl , 0 , w ; * ] <b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
}
)
;; Predicated floating-point operations, merging with the second input.
(define_insn_and_rewrite "*cond_<optab><mode>_3_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 3)]
UNSPEC_SEL))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16_rev> || !<is_bf16>)"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
- [ w , Upl , w , 0 ; * ] <sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
- [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
+ [ w , Upl , w , 0 ; * ] <b><sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;<b><sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
}
"&& !rtx_equal_p (operands[1], operands[4])"
{
)
(define_insn "*cond_<optab><mode>_3_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")]
SVE_COND_FP_BINARY)
(match_dup 3)]
UNSPEC_SEL))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16_rev> || !<is_bf16>)"
{@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
- [ w , Upl , w , 0 ; * ] <sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
- [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
+ [ w , Upl , w , 0 ; * ] <b><sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;<b><sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
}
)
;; Predicated floating-point operations, merging with an independent value.
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_FULL_F_BF 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
+ && (<supports_bf16> || !<is_bf16>)
&& !rtx_equal_p (operands[2], operands[4])
- && !rtx_equal_p (operands[3], operands[4])"
- {@ [ cons: =0 , 1 , 2 , 3 , 4 ]
- [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
- [ &w , Upl , w , 0 , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
- [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
- [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
- [ ?&w , Upl , w , w , w ] #
+ && !((<supports_bf16_rev> || !<is_bf16>)
+ && rtx_equal_p (operands[3], operands[4]))"
+ {@ [ cons: =0 , 1 , 2 , 3 , 4 ; attrs: is_rev ]
+ [ &w , Upl , 0 , w , Dz ; * ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ &w , Upl , w , 0 , Dz ; true ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
+ [ &w , Upl , w , w , Dz ; * ] movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ &w , Upl , w , w , 0 ; * ] movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ ?&w , Upl , w , w , w ; * ] #
}
"&& 1"
{
else
FAIL;
}
- [(set_attr "movprfx" "yes")]
+ [(set_attr "movprfx" "yes")
+ (set_attr "is_bf16" "<is_bf16>")
+ (set_attr "supports_bf16_rev" "<supports_bf16_rev>")]
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")]
SVE_COND_FP_BINARY)
- (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_FULL_F_BF 4 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
+ && (<supports_bf16> || !<is_bf16>)
&& !rtx_equal_p (operands[2], operands[4])
- && !rtx_equal_p (operands[3], operands[4])"
- {@ [ cons: =0 , 1 , 2 , 3 , 4 ]
- [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
- [ &w , Upl , w , 0 , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
- [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
- [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
- [ ?&w , Upl , w , w , w ] #
+ && !((<supports_bf16_rev> || !<is_bf16>)
+ && rtx_equal_p (operands[3], operands[4]))"
+ {@ [ cons: =0 , 1 , 2 , 3 , 4 ; attrs: is_rev ]
+ [ &w , Upl , 0 , w , Dz ; * ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ &w , Upl , w , 0 , Dz ; true ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
+ [ &w , Upl , w , w , Dz ; * ] movprfx\t%0.<Vetype>, %1/z, %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ &w , Upl , w , w , 0 ; * ] movprfx\t%0.<Vetype>, %1/m, %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ ?&w , Upl , w , w , w ; * ] #
}
"&& reload_completed
&& register_operand (operands[4], <MODE>mode)
operands[4], operands[1]));
operands[4] = operands[2] = operands[0];
}
- [(set_attr "movprfx" "yes")]
+ [(set_attr "movprfx" "yes")
+ (set_attr "is_bf16" "<is_bf16>")
+ (set_attr "supports_bf16_rev" "<supports_bf16_rev>")]
)
;; Same for operations that take a 1-bit constant.
;; ---- [FP] Multiplication
;; -------------------------------------------------------------------------
;; Includes:
+;; - BFMUL (SVE_B16B16)
;; - FMUL
;; -------------------------------------------------------------------------
;; Unpredicated multiplication by selected lanes.
(define_insn "@aarch64_mul_lane_<mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
- (mult:SVE_FULL_F
- (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 2 "register_operand" "<sve_lane_con>")
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand" "=w")
+ (mult:SVE_FULL_F_BF
+ (unspec:SVE_FULL_F_BF
+ [(match_operand:SVE_FULL_F_BF 2 "register_operand" "<sve_lane_con>")
(match_operand:SI 3 "const_int_operand")]
UNSPEC_SVE_LANE_SELECT)
- (match_operand:SVE_FULL_F 1 "register_operand" "w")))]
+ (match_operand:SVE_FULL_F_BF 1 "register_operand" "w")))]
"TARGET_SVE"
- "fmul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]"
+ "<b>fmul\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]"
)
;; -------------------------------------------------------------------------
;; Unpredicated floating-point ternary operations.
(define_expand "<optab><mode>4"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_dup 4)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 1 "register_operand")
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 1 "register_operand")
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")]
SVE_COND_FP_TERNARY))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
operands[4] = aarch64_ptrue_reg (<VPRED>mode);
}
;; Predicated floating-point ternary operations.
(define_insn "@aarch64_pred_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
(match_operand:SI 5 "aarch64_sve_gp_strictness")
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")
+ (match_operand:SVE_FULL_F_BF 4 "register_operand")]
SVE_COND_FP_TERNARY))]
- "TARGET_SVE"
- {@ [ cons: =0 , 1 , 2 , 3 , 4 ; attrs: movprfx ]
- [ w , Upl , %w , w , 0 ; * ] <sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ w , Upl , 0 , w , w ; * ] <sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
- [ ?&w , Upl , w , w , w ; yes ] movprfx\t%0, %4\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
+ {@ [ cons: =0 , 1 , 2 , 3 , 4 ; attrs: movprfx , is_rev ]
+ [ w , Upl , %w , w , 0 ; * , * ] <b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ w , Upl , 0 , w , w ; * , true ] <b><sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ [ ?&w , Upl , w , w , w ; yes , * ] movprfx\t%0, %4\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
}
+ [(set_attr "is_bf16" "<is_bf16>")
+ (set_attr "supports_bf16_rev" "false")]
)
;; Predicated floating-point ternary operations with merging.
(define_expand "@cond_<optab><mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")
+ (match_operand:SVE_FULL_F_BF 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_FULL_F_BF 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{
/* Swap the multiplication operands if the fallback value is the
second of the two. */
;; Predicated floating-point ternary operations, merging with the
;; third input.
(define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")
+ (match_operand:SVE_FULL_F_BF 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 4)]
UNSPEC_SEL))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{@ [ cons: =0 , 1 , 2 , 3 , 4 ; attrs: movprfx ]
- [ w , Upl , w , w , 0 ; * ] <sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ ?&w , Upl , w , w , w ; yes ] movprfx\t%0, %4\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ w , Upl , w , w , 0 ; * ] <b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ ?&w , Upl , w , w , w ; yes ] movprfx\t%0, %4\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
}
"&& !rtx_equal_p (operands[1], operands[5])"
{
)
(define_insn "*cond_<optab><mode>_4_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")
+ (match_operand:SVE_FULL_F_BF 4 "register_operand")]
SVE_COND_FP_TERNARY)
(match_dup 4)]
UNSPEC_SEL))]
- "TARGET_SVE"
+ "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
{@ [ cons: =0 , 1 , 2 , 3 , 4 ; attrs: movprfx ]
- [ w , Upl , w , w , 0 ; * ] <sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ ?&w , Upl , w , w , w ; yes ] movprfx\t%0, %4\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ w , Upl , w , w , 0 ; * ] <b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ ?&w , Upl , w , w , w ; yes ] movprfx\t%0, %4\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
}
)
;; Predicated floating-point ternary operations, merging with an
;; independent value.
(define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_operand 6)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")
+ (match_operand:SVE_FULL_F_BF 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_FULL_F_BF 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
- && !rtx_equal_p (operands[2], operands[5])
- && !rtx_equal_p (operands[3], operands[5])
+ && (<supports_bf16> || !<is_bf16>)
+ && (<is_bf16> || !rtx_equal_p (operands[2], operands[5]))
+ && (<is_bf16> || !rtx_equal_p (operands[3], operands[5]))
&& !rtx_equal_p (operands[4], operands[5])"
- {@ [ cons: =0 , 1 , 2 , 3 , 4 , 5 ]
- [ &w , Upl , w , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ &w , Upl , w , w , 0 , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ &w , Upl , 0 , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
- [ &w , Upl , w , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
- [ &w , Upl , w , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ ?&w , Upl , w , w , w , w ] #
+ {@ [ cons: =0 , 1 , 2 , 3 , 4 , 5 ; attrs: is_rev ]
+ [ &w , Upl , w , w , w , Dz ; * ] movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ &w , Upl , w , w , 0 , Dz ; * ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ &w , Upl , 0 , w , w , Dz ; true ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ [ &w , Upl , w , 0 , w , Dz ; true ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fmad_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+ [ &w , Upl , w , w , w , 0 ; * ] movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ ?&w , Upl , w , w , w , w ; * ] #
}
"&& 1"
{
else
FAIL;
}
- [(set_attr "movprfx" "yes")]
+ [(set_attr "movprfx" "yes")
+ (set_attr "is_bf16" "<is_bf16>")
+ (set_attr "supports_bf16_rev" "false")]
)
(define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
[(match_operand:<VPRED> 1 "register_operand")
- (unspec:SVE_FULL_F
+ (unspec:SVE_FULL_F_BF
[(match_dup 1)
(const_int SVE_STRICT_GP)
- (match_operand:SVE_FULL_F 2 "register_operand")
- (match_operand:SVE_FULL_F 3 "register_operand")
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 2 "register_operand")
+ (match_operand:SVE_FULL_F_BF 3 "register_operand")
+ (match_operand:SVE_FULL_F_BF 4 "register_operand")]
SVE_COND_FP_TERNARY)
- (match_operand:SVE_FULL_F 5 "aarch64_simd_reg_or_zero")]
+ (match_operand:SVE_FULL_F_BF 5 "aarch64_simd_reg_or_zero")]
UNSPEC_SEL))]
"TARGET_SVE
- && !rtx_equal_p (operands[2], operands[5])
- && !rtx_equal_p (operands[3], operands[5])
+ && (<supports_bf16> || !<is_bf16>)
+ && (<is_bf16> || !rtx_equal_p (operands[2], operands[5]))
+ && (<is_bf16> || !rtx_equal_p (operands[3], operands[5]))
&& !rtx_equal_p (operands[4], operands[5])"
- {@ [ cons: =0 , 1 , 2 , 3 , 4 , 5 ]
- [ &w , Upl , w , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ &w , Upl , w , w , 0 , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ &w , Upl , 0 , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
- [ &w , Upl , w , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<sve_fmad_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
- [ &w , Upl , w , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;<sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
- [ ?&w , Upl , w , w , w , w ] #
+ {@ [ cons: =0 , 1 , 2 , 3 , 4 , 5 ; attrs: is_rev ]
+ [ &w , Upl , w , w , w , Dz ; * ] movprfx\t%0.<Vetype>, %1/z, %4.<Vetype>\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ &w , Upl , w , w , 0 , Dz ; * ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ &w , Upl , 0 , w , w , Dz ; true ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fmad_op>\t%0.<Vetype>, %1/m, %3.<Vetype>, %4.<Vetype>
+ [ &w , Upl , w , 0 , w , Dz ; true ] movprfx\t%0.<Vetype>, %1/z, %0.<Vetype>\;<b><sve_fmad_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %4.<Vetype>
+ [ &w , Upl , w , w , w , 0 ; * ] movprfx\t%0.<Vetype>, %1/m, %4.<Vetype>\;<b><sve_fmla_op>\t%0.<Vetype>, %1/m, %2.<Vetype>, %3.<Vetype>
+ [ ?&w , Upl , w , w , w , w ; * ] #
}
"&& reload_completed
&& register_operand (operands[5], <MODE>mode)
operands[5], operands[1]));
operands[5] = operands[4] = operands[0];
}
- [(set_attr "movprfx" "yes")]
+ [(set_attr "movprfx" "yes")
+ (set_attr "is_bf16" "<is_bf16>")
+ (set_attr "supports_bf16_rev" "false")]
)
;; Unpredicated FMLA and FMLS by selected lanes. It doesn't seem worth using
;; (fma ...) since target-independent code won't understand the indexing.
(define_insn "@aarch64_<optab>_lane_<mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand")
- (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 2 "register_operand")
+ [(set (match_operand:SVE_FULL_F_BF 0 "register_operand")
+ (unspec:SVE_FULL_F_BF
+ [(match_operand:SVE_FULL_F_BF 1 "register_operand")
+ (unspec:SVE_FULL_F_BF
+ [(match_operand:SVE_FULL_F_BF 2 "register_operand")
(match_operand:SI 3 "const_int_operand")]
UNSPEC_SVE_LANE_SELECT)
- (match_operand:SVE_FULL_F 4 "register_operand")]
+ (match_operand:SVE_FULL_F_BF 4 "register_operand")]
SVE_FP_TERNARY_LANE))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 , 4 ; attrs: movprfx ]
- [ w , w , <sve_lane_con> , 0 ; * ] <sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]
- [ ?&w , w , <sve_lane_con> , w ; yes ] movprfx\t%0, %4\;<sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]
+ [ w , w , <sve_lane_con> , 0 ; * ] <b><sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]
+ [ ?&w , w , <sve_lane_con> , w ; yes ] movprfx\t%0, %4\;<b><sve_fp_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>[%3]
}
)
;; ---- [INT] General binary arithmetic that maps to unspecs
;; ---- [INT] Saturating binary arithmetic
;; ---- [INT] Saturating left shifts
+;; ---- [FP] Non-widening bfloat16 arithmetic
;; ---- [FP] Clamp to minimum/maximum
;;
;; == Uniform ternary arithmetic
[(set_attr "movprfx" "yes")]
)
+;; -------------------------------------------------------------------------
+;; ---- [FP] Non-widening bfloat16 arithmetic
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - BFADD
+;; - BFMAX
+;; - BFMAXNM
+;; - BFMIN
+;; - BFMINNM
+;; - BFMUL
+;; -------------------------------------------------------------------------
+
+;; Predicated B16B16 binary operations.
+(define_insn "@aarch64_pred_<optab><mode>"
+ [(set (match_operand:VNx8BF_ONLY 0 "register_operand")
+ (unspec:VNx8BF_ONLY
+ [(match_operand:<VPRED> 1 "register_operand")
+ (match_operand:SI 4 "aarch64_sve_gp_strictness")
+ (match_operand:VNx8BF_ONLY 2 "register_operand")
+ (match_operand:VNx8BF_ONLY 3 "register_operand")]
+ SVE_COND_FP_BINARY_OPTAB))]
+ "TARGET_SSVE_B16B16 && <supports_bf16>"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx , is_rev ]
+ [ w , Upl , 0 , w ; * , * ] <b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ [ w , Upl , w , 0 ; * , true ] <b><sve_fp_op_rev>\t%0.<Vetype>, %1/m, %0.<Vetype>, %2.<Vetype>
+ [ ?&w , Upl , w , w ; yes , * ] movprfx\t%0, %2\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+ }
+ [(set_attr "is_bf16" "<is_bf16>")
+ (set_attr "supports_bf16_rev" "<supports_bf16_rev>")]
+)
+
;; -------------------------------------------------------------------------
;; ---- [FP] Clamp to minimum/maximum
;; -------------------------------------------------------------------------
+;; - BFCLAMP (SVE_B16B16)
;; - FCLAMP
;; -------------------------------------------------------------------------
;; The minimum is applied after the maximum, which matters if the maximum
;; bound is (unexpectedly) less than the minimum bound.
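;; For example, if the maximum bound is 1.0 and the minimum bound is 2.0,
;; FMAXNM first raises each element to at least 2.0 and the trailing FMINNM
;; then reduces it to 1.0, so every element ends up as the maximum bound.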
(define_insn "@aarch64_sve_fclamp<mode>"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
- [(unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand")
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ [(set (match_operand:SVE_CLAMP_F 0 "register_operand")
+ (unspec:SVE_CLAMP_F
+ [(unspec:SVE_CLAMP_F
+ [(match_operand:SVE_CLAMP_F 1 "register_operand")
+ (match_operand:SVE_CLAMP_F 2 "register_operand")]
UNSPEC_FMAXNM)
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_CLAMP_F 3 "register_operand")]
UNSPEC_FMINNM))]
- "TARGET_SVE2p1_OR_SME2"
+ ""
{@ [cons: =0, 1, 2, 3; attrs: movprfx]
- [ w, %0, w, w; * ] fclamp\t%0.<Vetype>, %2.<Vetype>, %3.<Vetype>
- [ ?&w, w, w, w; yes ] movprfx\t%0, %1\;fclamp\t%0.<Vetype>, %2.<Vetype>, %3.<Vetype>
+ [ w, %0, w, w; * ] <b>fclamp\t%0.<Vetype>, %2.<Vetype>, %3.<Vetype>
+ [ ?&w, w, w, w; yes ] movprfx\t%0, %1\;<b>fclamp\t%0.<Vetype>, %2.<Vetype>, %3.<Vetype>
}
)
(define_insn_and_split "*aarch64_sve_fclamp<mode>_x"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+ [(set (match_operand:SVE_CLAMP_F 0 "register_operand")
+ (unspec:SVE_CLAMP_F
[(match_operand 4)
(const_int SVE_RELAXED_GP)
- (unspec:SVE_FULL_F
+ (unspec:SVE_CLAMP_F
[(match_operand 5)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 1 "register_operand")
- (match_operand:SVE_FULL_F 2 "register_operand")]
+ (match_operand:SVE_CLAMP_F 1 "register_operand")
+ (match_operand:SVE_CLAMP_F 2 "register_operand")]
UNSPEC_COND_FMAXNM)
- (match_operand:SVE_FULL_F 3 "register_operand")]
+ (match_operand:SVE_CLAMP_F 3 "register_operand")]
UNSPEC_COND_FMINNM))]
- "TARGET_SVE2p1_OR_SME2"
+ ""
{@ [cons: =0, 1, 2, 3; attrs: movprfx]
[ w, %0, w, w; * ] #
[ ?&w, w, w, w; yes ] #
}
"&& true"
[(set (match_dup 0)
- (unspec:SVE_FULL_F
- [(unspec:SVE_FULL_F
+ (unspec:SVE_CLAMP_F
+ [(unspec:SVE_CLAMP_F
[(match_dup 1)
(match_dup 2)]
UNSPEC_FMAXNM)
(match_operand:<VSINGLE> 3 "register_operand" "w"))]
UNSPEC_FMINNM))]
"TARGET_STREAMING_SME2"
- "fclamp\t%0, %2.<Vetype>, %3.<Vetype>"
+ "<b>fclamp\t%0, %2.<Vetype>, %3.<Vetype>"
)
;; =========================================================================
(match_operand:SVE_Fx24 2 "aligned_register_operand" "Uw<vector_count>")]
SVE_FP_BINARY_MULTI))]
"TARGET_STREAMING_SME2"
- "<maxmin_uns_op>\t%0, %0, %2"
+ "<b><maxmin_uns_op>\t%0, %0, %2"
)
(define_insn "@aarch64_sve_single_<maxmin_uns_op><mode>"
(match_operand:<VSINGLE> 2 "register_operand" "x"))]
SVE_FP_BINARY_MULTI))]
"TARGET_STREAMING_SME2"
- "<maxmin_uns_op>\t%0, %0, %2.<Vetype>"
+ "<b><maxmin_uns_op>\t%0, %0, %2.<Vetype>"
)
;; -------------------------------------------------------------------------
rtx elt;
REAL_VALUE_TYPE r;
- if (!const_vec_duplicate_p (x, &elt)
+ if (GET_MODE_INNER (GET_MODE (x)) == BFmode
+ || !const_vec_duplicate_p (x, &elt)
|| !CONST_DOUBLE_P (elt))
return false;
{
rtx elt;
- return (const_vec_duplicate_p (x, &elt)
+ return (GET_MODE_INNER (GET_MODE (x)) != BFmode
+ && const_vec_duplicate_p (x, &elt)
&& CONST_DOUBLE_P (elt)
&& (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
|| real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
/* Same with streaming mode enabled. */
#define TARGET_STREAMING_SME2 (TARGET_STREAMING && TARGET_SME2)
+#define TARGET_SME_B16B16 AARCH64_HAVE_ISA (SME_B16B16)
+
/* ARMv8.3-A features. */
#define TARGET_ARMV8_3 AARCH64_HAVE_ISA (V8_3A)
/* Combinatorial tests. */
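+/* Require SVE2 for use in non-streaming mode and SME2 for use in streaming
+   mode; streaming-compatible code must satisfy both.  */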
+#define TARGET_SVE2_OR_SME2 \
+ ((TARGET_SVE2 || TARGET_STREAMING) \
+ && (TARGET_SME2 || TARGET_NON_STREAMING))
+
/* There's no need to check TARGET_SME for streaming or streaming-compatible
functions, since streaming mode itself implies SME. */
#define TARGET_SVE2p1_OR_SME (TARGET_SVE2p1 || TARGET_STREAMING)
((TARGET_SVE2p1 || TARGET_STREAMING) \
&& (TARGET_SME2 || TARGET_NON_STREAMING))
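+/* The SVE_B16B16 instructions are only usable together with SVE2
+   (non-streaming) or SME2 (streaming); see TARGET_SVE2_OR_SME2 above.  */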
+#define TARGET_SSVE_B16B16 \
+ (AARCH64_HAVE_ISA (SVE_B16B16) && TARGET_SVE2_OR_SME2)
+
/* Standard register usage. */
/* 31 64-bit general purpose registers R0-R30:
(const_string "yes")
(const_string "no")))
+;; True if this is a bfloat16 operation.  Only used for certain instructions.
+(define_attr "is_bf16" "false,true" (const_string "false"))
+
+;; True if this alternative uses an SVE instruction in which the operands
+;; are reversed. This can happen for naturally commutative operations
+;; such as FADD, or when using things like FSUBR in preference to FSUB,
+;; or similarly when using things like FMAD in preference to FMLA.
+(define_attr "is_rev" "false,true" (const_string "false"))
+
+;; True if this operation supports is_rev-style instructions for bfloat16.
+(define_attr "supports_bf16_rev" "false,true" (const_string "false"))
+
+;; Selectively enable alternatives based on the mode of the operation.
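+;; For example, SVE_B16B16 provides BFSUB but no BFSUBR and BFMLA but no
+;; BFMAD, so the is_rev alternatives of those patterns must be disabled
+;; for bfloat16 modes.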
+(define_attr "mode_enabled" "false,true"
+ (cond [(and (eq_attr "is_bf16" "true")
+ (eq_attr "is_rev" "true")
+ (eq_attr "supports_bf16_rev" "false"))
+ (const_string "false")]
+ (const_string "true")))
+
;; Attribute that controls whether an alternative is enabled or not.
-;; Currently it is only used to disable alternatives which touch fp or simd
-;; registers when -mgeneral-regs-only is specified or to require a special
-;; architecture support.
-(define_attr "enabled" "no,yes" (attr "arch_enabled"))
+(define_attr "enabled" "no,yes"
+ (if_then_else (and (eq_attr "arch_enabled" "yes")
+ (eq_attr "mode_enabled" "true"))
+ (const_string "yes")
+ (const_string "no")))
;; Attribute that specifies whether we are dealing with a branch to a
;; label that is far away, i.e. further away than the maximum/minimum
;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF])
+(define_mode_iterator SVE_FULL_F_BF [(VNx8BF "TARGET_SSVE_B16B16") SVE_FULL_F])
+
+;; Modes for which (B)FCLAMP is supported.
+(define_mode_iterator SVE_CLAMP_F [(VNx8BF "TARGET_SSVE_B16B16")
+ (VNx8HF "TARGET_SVE2p1_OR_SME2")
+ (VNx4SF "TARGET_SVE2p1_OR_SME2")
+ (VNx2DF "TARGET_SVE2p1_OR_SME2")])
+
;; Fully-packed SVE integer vector modes that have 8-bit or 16-bit elements.
(define_mode_iterator SVE_FULL_BHI [VNx16QI VNx8HI])
(define_mode_iterator SVE_Ix24 [VNx32QI VNx16HI VNx8SI VNx4DI
VNx64QI VNx32HI VNx16SI VNx8DI])
-(define_mode_iterator SVE_Fx24 [VNx16HF VNx8SF VNx4DF
+(define_mode_iterator SVE_Fx24 [(VNx16BF "TARGET_SSVE_B16B16")
+ (VNx32BF "TARGET_SSVE_B16B16")
+ VNx16HF VNx8SF VNx4DF
VNx32HF VNx16SF VNx8DF])
(define_mode_iterator SVE_SFx24 [VNx8SF VNx16SF])
;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index.
(define_mode_attr sve_lane_con [(VNx8HI "y") (VNx4SI "y") (VNx2DI "x")
(V2DI "x")
- (VNx8HF "y") (VNx4SF "y") (VNx2DF "x")])
+ (VNx8BF "y") (VNx8HF "y")
+ (VNx4SF "y") (VNx2DF "x")])
;; The constraint to use for an SVE FCMLA lane index.
(define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
(V2DI "vec") (DI "offset")])
(define_mode_attr b [(VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
- (VNx16BF "b") (VNx16HF "")
- (VNx32BF "b") (VNx32HF "")])
+ (VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "")
+ (VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")])
+
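+;; "true" for bfloat16 modes, "false" for other floating-point modes.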
+(define_mode_attr is_bf16 [(VNx8BF "true")
+ (VNx8HF "false")
+ (VNx4SF "false")
+ (VNx2DF "false")])
(define_mode_attr aligned_operand [(VNx16QI "register_operand")
(VNx8HI "register_operand")
(UNSPEC_COND_FNMLA "fnmad")
(UNSPEC_COND_FNMLS "fnmsb")])
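+;; Whether the operation has a non-widening bfloat16 (SVE_B16B16) form.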
+(define_int_attr supports_bf16 [(UNSPEC_COND_FADD "true")
+ (UNSPEC_COND_FAMAX "false")
+ (UNSPEC_COND_FAMIN "false")
+ (UNSPEC_COND_FDIV "false")
+ (UNSPEC_COND_FMAX "true")
+ (UNSPEC_COND_FMAXNM "true")
+ (UNSPEC_COND_FMIN "true")
+ (UNSPEC_COND_FMINNM "true")
+ (UNSPEC_COND_FMLA "true")
+ (UNSPEC_COND_FMLS "true")
+ (UNSPEC_COND_FMUL "true")
+ (UNSPEC_COND_FMULX "false")
+ (UNSPEC_COND_FNMLA "false")
+ (UNSPEC_COND_FNMLS "false")
+ (UNSPEC_COND_FSUB "true")
+ (UNSPEC_COND_SMAX "true")
+ (UNSPEC_COND_SMIN "true")])
+
+;; Differs from supports_bf16 only in UNSPEC_COND_FSUB.
+(define_int_attr supports_bf16_rev [(UNSPEC_COND_FADD "true")
+ (UNSPEC_COND_FAMAX "false")
+ (UNSPEC_COND_FAMIN "false")
+ (UNSPEC_COND_FDIV "false")
+ (UNSPEC_COND_FMAX "true")
+ (UNSPEC_COND_FMAXNM "true")
+ (UNSPEC_COND_FMIN "true")
+ (UNSPEC_COND_FMINNM "true")
+ (UNSPEC_COND_FMLA "true")
+ (UNSPEC_COND_FMLS "true")
+ (UNSPEC_COND_FMUL "true")
+ (UNSPEC_COND_FMULX "false")
+ (UNSPEC_COND_FNMLA "false")
+ (UNSPEC_COND_FNMLS "false")
+ (UNSPEC_COND_FSUB "false")
+ (UNSPEC_COND_SMAX "true")
+ (UNSPEC_COND_SMIN "true")])
+
;; The register constraint to use for the final operand in a binary BRK.
(define_int_attr brk_reg_con [(UNSPEC_BRKN "0")
(UNSPEC_BRKPA "Upa") (UNSPEC_BRKPB "Upa")])
(define_predicate "aarch64_sve_float_maxmin_immediate"
(and (match_code "const_vector")
+ (match_test "GET_MODE_INNER (GET_MODE (op)) != BFmode")
(ior (match_test "op == CONST0_RTX (GET_MODE (op))")
(match_test "op == CONST1_RTX (GET_MODE (op))"))))
Enable the fp8 (8-bit floating point) extension.
@item faminmax
Enable the Floating Point Absolute Maximum/Minimum extension.
+@item sve-b16b16
+Enable the SVE non-widening brain floating-point (@code{bf16}) extension.
+This only has an effect when @code{sve2} or @code{sme2} is also enabled.
@end table
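
As an editorial illustration (not part of the patch), the sketch below shows
the intended usage, assuming a compiler that includes this change and a
baseline architecture that already provides SVE; the wrapper function name is
arbitrary:

  #include <arm_sve.h>

  #pragma GCC target "+sve2+sve-b16b16"

  /* Non-widening bf16 multiply using merging predication; expected to map
     to the BFMUL patterns added above.  */
  svbfloat16_t
  mul_bf16 (svbool_t pg, svbfloat16_t x, svbfloat16_t y)
  {
    return svmul_bf16_m (pg, x, y);
  }
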
#error Foo
#endif
+#pragma GCC target "+nothing+sve-b16b16"
+#ifdef __ARM_FEATURE_SVE_B16B16
+#error Foo
+#endif
+#ifdef __ARM_FEATURE_SVE
+#error Foo
+#endif
+#ifdef __ARM_FEATURE_SME
+#error Foo
+#endif
+
+#pragma GCC target "+nothing+sve-b16b16+sve"
+#ifdef __ARM_FEATURE_SVE_B16B16
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_SVE
+#error Foo
+#endif
+#ifdef __ARM_FEATURE_SME
+#error Foo
+#endif
+
+#pragma GCC target "+nothing+sve-b16b16+sve2"
+#ifndef __ARM_FEATURE_SVE_B16B16
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_SVE
+#error Foo
+#endif
+#ifdef __ARM_FEATURE_SME
+#error Foo
+#endif
+
+#pragma GCC target "+nothing+sve-b16b16+sme2"
+#ifndef __ARM_FEATURE_SVE_B16B16
+#error Foo
+#endif
+#ifndef __ARM_FEATURE_SME
+#error Foo
+#endif
+
#pragma GCC target "branch-protection=standard"
#ifndef __ARM_FEATURE_BTI_DEFAULT
#error Foo
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** clamp_z24_z24_z0_z5:
+** bfclamp {z24\.h - z25\.h}, z0\.h, z5\.h
+** ret
+*/
+TEST_XN_SINGLE (clamp_z24_z24_z0_z5, svbfloat16x2_t, svbfloat16_t, z24,
+ svclamp_single_bf16_x2 (z24, z0, z5),
+ svclamp (z24, z0, z5))
+
+/*
+** clamp_z24_z28_z5_z7:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfclamp {z24\.h - z25\.h}, z5\.h, z7\.h
+** |
+** bfclamp {z28\.h - z29\.h}, z5\.h, z7\.h
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (clamp_z24_z28_z5_z7, svbfloat16x2_t, svbfloat16_t, z24,
+ svclamp_single_bf16_x2 (z28, z5, z7),
+ svclamp (z28, z5, z7))
+
+/*
+** clamp_z24_z1_z7_z16:
+** (
+** mov z24\.d, z1\.d
+** mov z25\.d, z2\.d
+** |
+** mov z25\.d, z2\.d
+** mov z24\.d, z1\.d
+** )
+** bfclamp {z24\.h - z25\.h}, z7\.h, z16\.h
+** ret
+*/
+TEST_XN_SINGLE (clamp_z24_z1_z7_z16, svbfloat16x2_t, svbfloat16_t, z24,
+ svclamp_single_bf16_x2 (z1, z7, z16),
+ svclamp (z1, z7, z16))
+
+/*
+** clamp_z1_z24_z16_z23:
+** bfclamp {z24\.h - z25\.h}, z16\.h, z23\.h
+** (
+** mov z1\.d, z24\.d
+** mov z2\.d, z25\.d
+** |
+** mov z2\.d, z25\.d
+** mov z1\.d, z24\.d
+** )
+** ret
+*/
+TEST_XN_SINGLE (clamp_z1_z24_z16_z23, svbfloat16x2_t, svbfloat16_t, z1,
+ svclamp_single_bf16_x2 (z24, z16, z23),
+ svclamp (z24, z16, z23))
+
+/*
+** clamp_z1_z1_z23_z0:
+** mov [^\n]+
+** mov [^\n]+
+** bfclamp {z[0-9]+\.h - z[0-9]+\.h}, z23\.h, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (clamp_z1_z1_z23_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svclamp_single_bf16_x2 (z1, z23, z0),
+ svclamp (z1, z23, z0))
+
+/*
+** clamp_z18_z18_z0_z23:
+** bfclamp {z18\.h - z19\.h}, z0\.h, z23\.h
+** ret
+*/
+TEST_XN_SINGLE (clamp_z18_z18_z0_z23, svbfloat16x2_t, svbfloat16_t, z18,
+ svclamp_single_bf16_x2 (z18, z0, z23),
+ svclamp (z18, z0, z23))
+
+/*
+** clamp_awkward:
+** ...
+** bfclamp {z[0-9]+\.h - z[0-9]+\.h}, z[0-9]+\.h, z3\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (clamp_awkward, svbfloat16x2_t, svbfloat16_t,
+ z0_res = svclamp_single_bf16_x2 (z1, z0, zn),
+ z0_res = svclamp (z1, z0, zn))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** clamp_z24_z24_z0_z5:
+** bfclamp {z24\.h - z27\.h}, z0\.h, z5\.h
+** ret
+*/
+TEST_XN_SINGLE (clamp_z24_z24_z0_z5, svbfloat16x4_t, svbfloat16_t, z24,
+ svclamp_single_bf16_x4 (z24, z0, z5),
+ svclamp (z24, z0, z5))
+
+/*
+** clamp_z24_z28_z5_z7:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfclamp {z24\.h - z27\.h}, z5\.h, z7\.h
+** |
+** bfclamp {z28\.h - z31\.h}, z5\.h, z7\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (clamp_z24_z28_z5_z7, svbfloat16x4_t, svbfloat16_t, z24,
+ svclamp_single_bf16_x4 (z28, z5, z7),
+ svclamp (z28, z5, z7))
+
+/*
+** clamp_z24_z1_z7_z16:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfclamp {z24\.h - z27\.h}, z7\.h, z16\.h
+** ret
+*/
+TEST_XN_SINGLE (clamp_z24_z1_z7_z16, svbfloat16x4_t, svbfloat16_t, z24,
+ svclamp_single_bf16_x4 (z1, z7, z16),
+ svclamp (z1, z7, z16))
+
+/*
+** clamp_z1_z24_z16_z23:
+** bfclamp {z24\.h - z27\.h}, z16\.h, z23\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (clamp_z1_z24_z16_z23, svbfloat16x4_t, svbfloat16_t, z1,
+ svclamp_single_bf16_x4 (z24, z16, z23),
+ svclamp (z24, z16, z23))
+
+/*
+** clamp_z1_z1_z23_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfclamp {z[0-9]+\.h - z[0-9]+\.h}, z23\.h, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (clamp_z1_z1_z23_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svclamp_single_bf16_x4 (z1, z23, z0),
+ svclamp (z1, z23, z0))
+
+/*
+** clamp_z18_z18_z16_z5:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfclamp {z[0-9]+\.h - z[0-9]+\.h}, z16\.h, z5\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (clamp_z18_z18_z16_z5, svbfloat16x4_t, svbfloat16_t, z18,
+ svclamp_single_bf16_x4 (z18, z16, z5),
+ svclamp (z18, z16, z5))
+
+/*
+** clamp_awkward:
+** ...
+** bfclamp {z[0-9]+\.h - z[0-9]+\.h}, z[0-9]+\.h, z5\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (clamp_awkward, svbfloat16x4_t, svbfloat16_t,
+ z0_res = svclamp_single_bf16_x4 (z1, z0, zn),
+ z0_res = svclamp (z1, z0, zn))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** max_z0_z0_z4:
+** bfmax {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (max_z0_z0_z4, svbfloat16x2_t, z0,
+ svmax_bf16_x2 (z0, z4),
+ svmax (z0, z4))
+
+/*
+** max_z0_z4_z0:
+** bfmax {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (max_z0_z4_z0, svbfloat16x2_t, z0,
+ svmax_bf16_x2 (z4, z0),
+ svmax (z4, z0))
+
+/*
+** max_z0_z4_z28:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmax [^\n]+, {z28\.h - z29\.h}
+** |
+** bfmax [^\n]+, {z28\.h - z29\.h}
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (max_z0_z4_z28, svbfloat16x2_t, z0,
+ svmax_bf16_x2 (z4, z28),
+ svmax (z4, z28))
+
+/*
+** max_z18_z18_z4:
+** bfmax {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (max_z18_z18_z4, svbfloat16x2_t, z18,
+ svmax_bf16_x2 (z18, z4),
+ svmax (z18, z4))
+
+/*
+** max_z23_z23_z18:
+** mov [^\n]+
+** mov [^\n]+
+** bfmax [^\n]+, {z18\.h - z19\.h}
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (max_z23_z23_z18, svbfloat16x2_t, z23,
+ svmax_bf16_x2 (z23, z18),
+ svmax (z23, z18))
+
+/*
+** max_z28_z28_z0:
+** bfmax {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_XN (max_z28_z28_z0, svbfloat16x2_t, z28,
+ svmax_bf16_x2 (z28, z0),
+ svmax (z28, z0))
+
+/*
+** max_z0_z0_z18:
+** bfmax {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z18\.h - z19\.h}
+** ret
+*/
+TEST_XN (max_z0_z0_z18, svbfloat16x2_t, z0,
+ svmax_bf16_x2 (z0, z18),
+ svmax (z0, z18))
+
+/*
+** max_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmax {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** |
+** bfmax {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (max_z4_z4_z23, svbfloat16x2_t, z4,
+ svmax_bf16_x2 (z4, z23),
+ svmax (z4, z23))
+
+/*
+** max_single_z24_z24_z0:
+** bfmax {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (max_single_z24_z24_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmax_single_bf16_x2 (z24, z0),
+ svmax (z24, z0))
+
+/*
+** max_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmax {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** |
+** bfmax {z28\.h - z29\.h}, {z28\.h - z29\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (max_single_z24_z28_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmax_single_bf16_x2 (z28, z0),
+ svmax (z28, z0))
+
+/*
+** max_single_z24_z1_z0:
+** (
+** mov z24\.d, z1\.d
+** mov z25\.d, z2\.d
+** |
+** mov z25\.d, z2\.d
+** mov z24\.d, z1\.d
+** )
+** bfmax {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (max_single_z24_z1_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmax_single_bf16_x2 (z1, z0),
+ svmax (z1, z0))
+
+/*
+** max_single_z1_z24_z0:
+** bfmax {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** (
+** mov z1\.d, z24\.d
+** mov z2\.d, z25\.d
+** |
+** mov z2\.d, z25\.d
+** mov z1\.d, z24\.d
+** )
+** ret
+*/
+TEST_XN_SINGLE (max_single_z1_z24_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svmax_single_bf16_x2 (z24, z0),
+ svmax (z24, z0))
+
+/*
+** max_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** bfmax ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (max_single_z1_z1_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svmax_single_bf16_x2 (z1, z0),
+ svmax (z1, z0))
+
+/*
+** max_single_z18_z18_z0:
+** bfmax {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (max_single_z18_z18_z0, svbfloat16x2_t, svbfloat16_t, z18,
+ svmax_single_bf16_x2 (z18, z0),
+ svmax (z18, z0))
+
+/*
+** max_single_awkward:
+** ...
+** bfmax ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (max_single_awkward, svbfloat16x2_t, svbfloat16_t,
+ z0_res = svmax_single_bf16_x2 (z1, z0),
+ z0_res = svmax (z1, z0))
+
+/*
+** max_single_z0_z0_z15:
+** ...
+** bfmax {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (max_single_z0_z0_z15, svbfloat16x2_t, svbfloat16_t,
+ z0 = svmax_single_bf16_x2 (z0, z15),
+ z0 = svmax (z0, z15))
+
+/*
+** max_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfmax {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (max_single_z24_z24_z16, svbfloat16x2_t, svbfloat16_t, z24,
+ svmax_single_bf16_x2 (z24, z16),
+ svmax (z24, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** max_z0_z0_z4:
+** bfmax {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (max_z0_z0_z4, svbfloat16x4_t, z0,
+ svmax_bf16_x4 (z0, z4),
+ svmax (z0, z4))
+
+/*
+** max_z0_z4_z0:
+** bfmax {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (max_z0_z4_z0, svbfloat16x4_t, z0,
+ svmax_bf16_x4 (z4, z0),
+ svmax (z4, z0))
+
+/*
+** max_z0_z4_z28:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax [^\n]+, {z28\.h - z31\.h}
+** |
+** bfmax [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (max_z0_z4_z28, svbfloat16x4_t, z0,
+ svmax_bf16_x4 (z4, z28),
+ svmax (z4, z28))
+
+/*
+** max_z18_z18_z4:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax [^\n]+, {z4\.h - z7\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (max_z18_z18_z4, svbfloat16x4_t, z18,
+ svmax_bf16_x4 (z18, z4),
+ svmax (z18, z4))
+
+/*
+** max_z23_z23_z28:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (max_z23_z23_z28, svbfloat16x4_t, z23,
+ svmax_bf16_x4 (z23, z28),
+ svmax (z23, z28))
+
+/*
+** max_z28_z28_z0:
+** bfmax {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_XN (max_z28_z28_z0, svbfloat16x4_t, z28,
+ svmax_bf16_x4 (z28, z0),
+ svmax (z28, z0))
+
+/*
+** max_z0_z0_z18:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** |
+** bfmax {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (max_z0_z0_z18, svbfloat16x4_t, z0,
+ svmax_bf16_x4 (z0, z18),
+ svmax (z0, z18))
+
+/*
+** max_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** |
+** bfmax {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (max_z4_z4_z23, svbfloat16x4_t, z4,
+ svmax_bf16_x4 (z4, z23),
+ svmax (z4, z23))
+
+/*
+** max_single_z24_z24_z0:
+** bfmax {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (max_single_z24_z24_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmax_single_bf16_x4 (z24, z0),
+ svmax (z24, z0))
+
+/*
+** max_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** |
+** bfmax {z28\.h - z31\.h}, {z28\.h - z31\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (max_single_z24_z28_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmax_single_bf16_x4 (z28, z0),
+ svmax (z28, z0))
+
+/*
+** max_single_z24_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (max_single_z24_z1_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmax_single_bf16_x4 (z1, z0),
+ svmax (z1, z0))
+
+/*
+** max_single_z1_z24_z0:
+** bfmax {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (max_single_z1_z24_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svmax_single_bf16_x4 (z24, z0),
+ svmax (z24, z0))
+
+/*
+** max_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (max_single_z1_z1_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svmax_single_bf16_x4 (z1, z0),
+ svmax (z1, z0))
+
+/*
+** max_single_z18_z18_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmax [^\n]+, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (max_single_z18_z18_z0, svbfloat16x4_t, svbfloat16_t, z18,
+ svmax_single_bf16_x4 (z18, z0),
+ svmax (z18, z0))
+
+/*
+** max_single_awkward:
+** ...
+** bfmax ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (max_single_awkward, svbfloat16x4_t, svbfloat16_t,
+ z0_res = svmax_single_bf16_x4 (z1, z0),
+ z0_res = svmax (z1, z0))
+
+/*
+** max_single_z0_z0_z15:
+** ...
+** bfmax {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (max_single_z0_z0_z15, svbfloat16x4_t, svbfloat16_t,
+ z0 = svmax_single_bf16_x4 (z0, z15),
+ z0 = svmax (z0, z15))
+
+/*
+** max_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfmax {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (max_single_z24_z24_z16, svbfloat16x4_t, svbfloat16_t, z24,
+ svmax_single_bf16_x4 (z24, z16),
+ svmax (z24, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** maxnm_z0_z0_z4:
+** bfmaxnm {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (maxnm_z0_z0_z4, svbfloat16x2_t, z0,
+ svmaxnm_bf16_x2 (z0, z4),
+ svmaxnm (z0, z4))
+
+/*
+** maxnm_z0_z4_z0:
+** bfmaxnm {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (maxnm_z0_z4_z0, svbfloat16x2_t, z0,
+ svmaxnm_bf16_x2 (z4, z0),
+ svmaxnm (z4, z0))
+
+/*
+** maxnm_z0_z4_z28:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm [^\n]+, {z28\.h - z29\.h}
+** |
+** bfmaxnm [^\n]+, {z28\.h - z29\.h}
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (maxnm_z0_z4_z28, svbfloat16x2_t, z0,
+ svmaxnm_bf16_x2 (z4, z28),
+ svmaxnm (z4, z28))
+
+/*
+** maxnm_z18_z18_z4:
+** bfmaxnm {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (maxnm_z18_z18_z4, svbfloat16x2_t, z18,
+ svmaxnm_bf16_x2 (z18, z4),
+ svmaxnm (z18, z4))
+
+/*
+** maxnm_z23_z23_z18:
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm [^\n]+, {z18\.h - z19\.h}
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (maxnm_z23_z23_z18, svbfloat16x2_t, z23,
+ svmaxnm_bf16_x2 (z23, z18),
+ svmaxnm (z23, z18))
+
+/*
+** maxnm_z28_z28_z0:
+** bfmaxnm {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_XN (maxnm_z28_z28_z0, svbfloat16x2_t, z28,
+ svmaxnm_bf16_x2 (z28, z0),
+ svmaxnm (z28, z0))
+
+/*
+** maxnm_z0_z0_z18:
+** bfmaxnm {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z18\.h - z19\.h}
+** ret
+*/
+TEST_XN (maxnm_z0_z0_z18, svbfloat16x2_t, z0,
+ svmaxnm_bf16_x2 (z0, z18),
+ svmaxnm (z0, z18))
+
+/*
+** maxnm_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** |
+** bfmaxnm {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (maxnm_z4_z4_z23, svbfloat16x2_t, z4,
+ svmaxnm_bf16_x2 (z4, z23),
+ svmaxnm (z4, z23))
+
+/*
+** maxnm_single_z24_z24_z0:
+** bfmaxnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z24_z24_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmaxnm_single_bf16_x2 (z24, z0),
+ svmaxnm (z24, z0))
+
+/*
+** maxnm_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** |
+** bfmaxnm {z28\.h - z29\.h}, {z28\.h - z29\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z24_z28_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmaxnm_single_bf16_x2 (z28, z0),
+ svmaxnm (z28, z0))
+
+/*
+** maxnm_single_z24_z1_z0:
+** (
+** mov z24\.d, z1\.d
+** mov z25\.d, z2\.d
+** |
+** mov z25\.d, z2\.d
+** mov z24\.d, z1\.d
+** )
+** bfmaxnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z24_z1_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmaxnm_single_bf16_x2 (z1, z0),
+ svmaxnm (z1, z0))
+
+/*
+** maxnm_single_z1_z24_z0:
+** bfmaxnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** (
+** mov z1\.d, z24\.d
+** mov z2\.d, z25\.d
+** |
+** mov z2\.d, z25\.d
+** mov z1\.d, z24\.d
+** )
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z1_z24_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svmaxnm_single_bf16_x2 (z24, z0),
+ svmaxnm (z24, z0))
+
+/*
+** maxnm_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z1_z1_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svmaxnm_single_bf16_x2 (z1, z0),
+ svmaxnm (z1, z0))
+
+/*
+** maxnm_single_z18_z18_z0:
+** bfmaxnm {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z18_z18_z0, svbfloat16x2_t, svbfloat16_t, z18,
+ svmaxnm_single_bf16_x2 (z18, z0),
+ svmaxnm (z18, z0))
+
+/*
+** maxnm_single_awkward:
+** ...
+** bfmaxnm ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (maxnm_single_awkward, svbfloat16x2_t, svbfloat16_t,
+ z0_res = svmaxnm_single_bf16_x2 (z1, z0),
+ z0_res = svmaxnm (z1, z0))
+
+/*
+** maxnm_single_z0_z0_z15:
+** ...
+** bfmaxnm {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (maxnm_single_z0_z0_z15, svbfloat16x2_t, svbfloat16_t,
+ z0 = svmaxnm_single_bf16_x2 (z0, z15),
+ z0 = svmaxnm (z0, z15))
+
+/*
+** maxnm_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfmaxnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z24_z24_z16, svbfloat16x2_t, svbfloat16_t, z24,
+ svmaxnm_single_bf16_x2 (z24, z16),
+ svmaxnm (z24, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** maxnm_z0_z0_z4:
+** bfmaxnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (maxnm_z0_z0_z4, svbfloat16x4_t, z0,
+ svmaxnm_bf16_x4 (z0, z4),
+ svmaxnm (z0, z4))
+
+/*
+** maxnm_z0_z4_z0:
+** bfmaxnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (maxnm_z0_z4_z0, svbfloat16x4_t, z0,
+ svmaxnm_bf16_x4 (z4, z0),
+ svmaxnm (z4, z0))
+
+/*
+** maxnm_z0_z4_z28:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm [^\n]+, {z28\.h - z31\.h}
+** |
+** bfmaxnm [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (maxnm_z0_z4_z28, svbfloat16x4_t, z0,
+ svmaxnm_bf16_x4 (z4, z28),
+ svmaxnm (z4, z28))
+
+/*
+** maxnm_z18_z18_z4:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm [^\n]+, {z4\.h - z7\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (maxnm_z18_z18_z4, svbfloat16x4_t, z18,
+ svmaxnm_bf16_x4 (z18, z4),
+ svmaxnm (z18, z4))
+
+/*
+** maxnm_z23_z23_z28:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (maxnm_z23_z23_z28, svbfloat16x4_t, z23,
+ svmaxnm_bf16_x4 (z23, z28),
+ svmaxnm (z23, z28))
+
+/*
+** maxnm_z28_z28_z0:
+** bfmaxnm {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_XN (maxnm_z28_z28_z0, svbfloat16x4_t, z28,
+ svmaxnm_bf16_x4 (z28, z0),
+ svmaxnm (z28, z0))
+
+/*
+** maxnm_z0_z0_z18:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** |
+** bfmaxnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (maxnm_z0_z0_z18, svbfloat16x4_t, z0,
+ svmaxnm_bf16_x4 (z0, z18),
+ svmaxnm (z0, z18))
+
+/*
+** maxnm_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** |
+** bfmaxnm {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (maxnm_z4_z4_z23, svbfloat16x4_t, z4,
+ svmaxnm_bf16_x4 (z4, z23),
+ svmaxnm (z4, z23))
+
+/*
+** maxnm_single_z24_z24_z0:
+** bfmaxnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z24_z24_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmaxnm_single_bf16_x4 (z24, z0),
+ svmaxnm (z24, z0))
+
+/*
+** maxnm_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** |
+** bfmaxnm {z28\.h - z31\.h}, {z28\.h - z31\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z24_z28_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmaxnm_single_bf16_x4 (z28, z0),
+ svmaxnm (z28, z0))
+
+/*
+** maxnm_single_z24_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z24_z1_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmaxnm_single_bf16_x4 (z1, z0),
+ svmaxnm (z1, z0))
+
+/*
+** maxnm_single_z1_z24_z0:
+** bfmaxnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z1_z24_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svmaxnm_single_bf16_x4 (z24, z0),
+ svmaxnm (z24, z0))
+
+/*
+** maxnm_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z1_z1_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svmaxnm_single_bf16_x4 (z1, z0),
+ svmaxnm (z1, z0))
+
+/*
+** maxnm_single_z18_z18_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmaxnm [^\n]+, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z18_z18_z0, svbfloat16x4_t, svbfloat16_t, z18,
+ svmaxnm_single_bf16_x4 (z18, z0),
+ svmaxnm (z18, z0))
+
+/*
+** maxnm_single_awkward:
+** ...
+** bfmaxnm ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (maxnm_single_awkward, svbfloat16x4_t, svbfloat16_t,
+ z0_res = svmaxnm_single_bf16_x4 (z1, z0),
+ z0_res = svmaxnm (z1, z0))
+
+/*
+** maxnm_single_z0_z0_z15:
+** ...
+** bfmaxnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (maxnm_single_z0_z0_z15, svbfloat16x4_t, svbfloat16_t,
+ z0 = svmaxnm_single_bf16_x4 (z0, z15),
+ z0 = svmaxnm (z0, z15))
+
+/*
+** maxnm_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfmaxnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (maxnm_single_z24_z24_z16, svbfloat16x4_t, svbfloat16_t, z24,
+ svmaxnm_single_bf16_x4 (z24, z16),
+ svmaxnm (z24, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** min_z0_z0_z4:
+** bfmin {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (min_z0_z0_z4, svbfloat16x2_t, z0,
+ svmin_bf16_x2 (z0, z4),
+ svmin (z0, z4))
+
+/*
+** min_z0_z4_z0:
+** bfmin {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (min_z0_z4_z0, svbfloat16x2_t, z0,
+ svmin_bf16_x2 (z4, z0),
+ svmin (z4, z0))
+
+/*
+** min_z0_z4_z28:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmin [^\n]+, {z28\.h - z29\.h}
+** |
+** bfmin [^\n]+, {z28\.h - z29\.h}
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (min_z0_z4_z28, svbfloat16x2_t, z0,
+ svmin_bf16_x2 (z4, z28),
+ svmin (z4, z28))
+
+/*
+** min_z18_z18_z4:
+** bfmin {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (min_z18_z18_z4, svbfloat16x2_t, z18,
+ svmin_bf16_x2 (z18, z4),
+ svmin (z18, z4))
+
+/*
+** min_z23_z23_z18:
+** mov [^\n]+
+** mov [^\n]+
+** bfmin [^\n]+, {z18\.h - z19\.h}
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (min_z23_z23_z18, svbfloat16x2_t, z23,
+ svmin_bf16_x2 (z23, z18),
+ svmin (z23, z18))
+
+/*
+** min_z28_z28_z0:
+** bfmin {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_XN (min_z28_z28_z0, svbfloat16x2_t, z28,
+ svmin_bf16_x2 (z28, z0),
+ svmin (z28, z0))
+
+/*
+** min_z0_z0_z18:
+** bfmin {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z18\.h - z19\.h}
+** ret
+*/
+TEST_XN (min_z0_z0_z18, svbfloat16x2_t, z0,
+ svmin_bf16_x2 (z0, z18),
+ svmin (z0, z18))
+
+/*
+** min_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmin {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** |
+** bfmin {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (min_z4_z4_z23, svbfloat16x2_t, z4,
+ svmin_bf16_x2 (z4, z23),
+ svmin (z4, z23))
+
+/*
+** min_single_z24_z24_z0:
+** bfmin {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (min_single_z24_z24_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmin_single_bf16_x2 (z24, z0),
+ svmin (z24, z0))
+
+/*
+** min_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmin {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** |
+** bfmin {z28\.h - z29\.h}, {z28\.h - z29\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (min_single_z24_z28_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmin_single_bf16_x2 (z28, z0),
+ svmin (z28, z0))
+
+/*
+** min_single_z24_z1_z0:
+** (
+** mov z24\.d, z1\.d
+** mov z25\.d, z2\.d
+** |
+** mov z25\.d, z2\.d
+** mov z24\.d, z1\.d
+** )
+** bfmin {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (min_single_z24_z1_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmin_single_bf16_x2 (z1, z0),
+ svmin (z1, z0))
+
+/*
+** min_single_z1_z24_z0:
+** bfmin {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** (
+** mov z1\.d, z24\.d
+** mov z2\.d, z25\.d
+** |
+** mov z2\.d, z25\.d
+** mov z1\.d, z24\.d
+** )
+** ret
+*/
+TEST_XN_SINGLE (min_single_z1_z24_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svmin_single_bf16_x2 (z24, z0),
+ svmin (z24, z0))
+
+/*
+** min_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** bfmin ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (min_single_z1_z1_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svmin_single_bf16_x2 (z1, z0),
+ svmin (z1, z0))
+
+/*
+** min_single_z18_z18_z0:
+** bfmin {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (min_single_z18_z18_z0, svbfloat16x2_t, svbfloat16_t, z18,
+ svmin_single_bf16_x2 (z18, z0),
+ svmin (z18, z0))
+
+/*
+** min_single_awkward:
+** ...
+** bfmin ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (min_single_awkward, svbfloat16x2_t, svbfloat16_t,
+ z0_res = svmin_single_bf16_x2 (z1, z0),
+ z0_res = svmin (z1, z0))
+
+/*
+** min_single_z0_z0_z15:
+** ...
+** bfmin {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (min_single_z0_z0_z15, svbfloat16x2_t, svbfloat16_t,
+ z0 = svmin_single_bf16_x2 (z0, z15),
+ z0 = svmin (z0, z15))
+
+/*
+** min_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfmin {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (min_single_z24_z24_z16, svbfloat16x2_t, svbfloat16_t, z24,
+ svmin_single_bf16_x2 (z24, z16),
+ svmin (z24, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** min_z0_z0_z4:
+** bfmin {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (min_z0_z0_z4, svbfloat16x4_t, z0,
+ svmin_bf16_x4 (z0, z4),
+ svmin (z0, z4))
+
+/*
+** min_z0_z4_z0:
+** bfmin {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (min_z0_z4_z0, svbfloat16x4_t, z0,
+ svmin_bf16_x4 (z4, z0),
+ svmin (z4, z0))
+
+/*
+** min_z0_z4_z28:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin [^\n]+, {z28\.h - z31\.h}
+** |
+** bfmin [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (min_z0_z4_z28, svbfloat16x4_t, z0,
+ svmin_bf16_x4 (z4, z28),
+ svmin (z4, z28))
+
+/*
+** min_z18_z18_z4:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin [^\n]+, {z4\.h - z7\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (min_z18_z18_z4, svbfloat16x4_t, z18,
+ svmin_bf16_x4 (z18, z4),
+ svmin (z18, z4))
+
+/*
+** min_z23_z23_z28:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (min_z23_z23_z28, svbfloat16x4_t, z23,
+ svmin_bf16_x4 (z23, z28),
+ svmin (z23, z28))
+
+/*
+** min_z28_z28_z0:
+** bfmin {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_XN (min_z28_z28_z0, svbfloat16x4_t, z28,
+ svmin_bf16_x4 (z28, z0),
+ svmin (z28, z0))
+
+/*
+** min_z0_z0_z18:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** |
+** bfmin {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (min_z0_z0_z18, svbfloat16x4_t, z0,
+ svmin_bf16_x4 (z0, z18),
+ svmin (z0, z18))
+
+/*
+** min_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** |
+** bfmin {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (min_z4_z4_z23, svbfloat16x4_t, z4,
+ svmin_bf16_x4 (z4, z23),
+ svmin (z4, z23))
+
+/*
+** min_single_z24_z24_z0:
+** bfmin {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (min_single_z24_z24_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmin_single_bf16_x4 (z24, z0),
+ svmin (z24, z0))
+
+/*
+** min_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** |
+** bfmin {z28\.h - z31\.h}, {z28\.h - z31\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (min_single_z24_z28_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmin_single_bf16_x4 (z28, z0),
+ svmin (z28, z0))
+
+/*
+** min_single_z24_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (min_single_z24_z1_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmin_single_bf16_x4 (z1, z0),
+ svmin (z1, z0))
+
+/*
+** min_single_z1_z24_z0:
+** bfmin {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (min_single_z1_z24_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svmin_single_bf16_x4 (z24, z0),
+ svmin (z24, z0))
+
+/*
+** min_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (min_single_z1_z1_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svmin_single_bf16_x4 (z1, z0),
+ svmin (z1, z0))
+
+/*
+** min_single_z18_z18_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmin [^\n]+, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (min_single_z18_z18_z0, svbfloat16x4_t, svbfloat16_t, z18,
+ svmin_single_bf16_x4 (z18, z0),
+ svmin (z18, z0))
+
+/*
+** min_single_awkward:
+** ...
+** bfmin ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (min_single_awkward, svbfloat16x4_t, svbfloat16_t,
+ z0_res = svmin_single_bf16_x4 (z1, z0),
+ z0_res = svmin (z1, z0))
+
+/*
+** min_single_z0_z0_z15:
+** ...
+** bfmin {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (min_single_z0_z0_z15, svbfloat16x4_t, svbfloat16_t,
+ z0 = svmin_single_bf16_x4 (z0, z15),
+ z0 = svmin (z0, z15))
+
+/*
+** min_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfmin {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (min_single_z24_z24_z16, svbfloat16x4_t, svbfloat16_t, z24,
+ svmin_single_bf16_x4 (z24, z16),
+ svmin (z24, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** minnm_z0_z0_z4:
+** bfminnm {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (minnm_z0_z0_z4, svbfloat16x2_t, z0,
+ svminnm_bf16_x2 (z0, z4),
+ svminnm (z0, z4))
+
+/*
+** minnm_z0_z4_z0:
+** bfminnm {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (minnm_z0_z4_z0, svbfloat16x2_t, z0,
+ svminnm_bf16_x2 (z4, z0),
+ svminnm (z4, z0))
+
+/*
+** minnm_z0_z4_z28:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm [^\n]+, {z28\.h - z29\.h}
+** |
+** bfminnm [^\n]+, {z28\.h - z29\.h}
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (minnm_z0_z4_z28, svbfloat16x2_t, z0,
+ svminnm_bf16_x2 (z4, z28),
+ svminnm (z4, z28))
+
+/*
+** minnm_z18_z18_z4:
+** bfminnm {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (minnm_z18_z18_z4, svbfloat16x2_t, z18,
+ svminnm_bf16_x2 (z18, z4),
+ svminnm (z18, z4))
+
+/*
+** minnm_z23_z23_z18:
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm [^\n]+, {z18\.h - z19\.h}
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (minnm_z23_z23_z18, svbfloat16x2_t, z23,
+ svminnm_bf16_x2 (z23, z18),
+ svminnm (z23, z18))
+
+/*
+** minnm_z28_z28_z0:
+** bfminnm {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_XN (minnm_z28_z28_z0, svbfloat16x2_t, z28,
+ svminnm_bf16_x2 (z28, z0),
+ svminnm (z28, z0))
+
+/*
+** minnm_z0_z0_z18:
+** bfminnm {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z18\.h - z19\.h}
+** ret
+*/
+TEST_XN (minnm_z0_z0_z18, svbfloat16x2_t, z0,
+ svminnm_bf16_x2 (z0, z18),
+ svminnm (z0, z18))
+
+/*
+** minnm_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** |
+** bfminnm {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (minnm_z4_z4_z23, svbfloat16x2_t, z4,
+ svminnm_bf16_x2 (z4, z23),
+ svminnm (z4, z23))
+
+/*
+** minnm_single_z24_z24_z0:
+** bfminnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z24_z24_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svminnm_single_bf16_x2 (z24, z0),
+ svminnm (z24, z0))
+
+/*
+** minnm_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** |
+** bfminnm {z28\.h - z29\.h}, {z28\.h - z29\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z24_z28_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svminnm_single_bf16_x2 (z28, z0),
+ svminnm (z28, z0))
+
+/*
+** minnm_single_z24_z1_z0:
+** (
+** mov z24\.d, z1\.d
+** mov z25\.d, z2\.d
+** |
+** mov z25\.d, z2\.d
+** mov z24\.d, z1\.d
+** )
+** bfminnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z24_z1_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svminnm_single_bf16_x2 (z1, z0),
+ svminnm (z1, z0))
+
+/*
+** minnm_single_z1_z24_z0:
+** bfminnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** (
+** mov z1\.d, z24\.d
+** mov z2\.d, z25\.d
+** |
+** mov z2\.d, z25\.d
+** mov z1\.d, z24\.d
+** )
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z1_z24_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svminnm_single_bf16_x2 (z24, z0),
+ svminnm (z24, z0))
+
+/*
+** minnm_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z1_z1_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svminnm_single_bf16_x2 (z1, z0),
+ svminnm (z1, z0))
+
+/*
+** minnm_single_z18_z18_z0:
+** bfminnm {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z18_z18_z0, svbfloat16x2_t, svbfloat16_t, z18,
+ svminnm_single_bf16_x2 (z18, z0),
+ svminnm (z18, z0))
+
+/*
+** minnm_single_awkward:
+** ...
+** bfminnm ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (minnm_single_awkward, svbfloat16x2_t, svbfloat16_t,
+ z0_res = svminnm_single_bf16_x2 (z1, z0),
+ z0_res = svminnm (z1, z0))
+
+/*
+** minnm_single_z0_z0_z15:
+** ...
+** bfminnm {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (minnm_single_z0_z0_z15, svbfloat16x2_t, svbfloat16_t,
+ z0 = svminnm_single_bf16_x2 (z0, z15),
+ z0 = svminnm (z0, z15))
+
+/*
+** minnm_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfminnm {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z24_z24_z16, svbfloat16x2_t, svbfloat16_t, z24,
+ svminnm_single_bf16_x2 (z24, z16),
+ svminnm (z24, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+
+/*
+** minnm_z0_z0_z4:
+** bfminnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (minnm_z0_z0_z4, svbfloat16x4_t, z0,
+ svminnm_bf16_x4 (z0, z4),
+ svminnm (z0, z4))
+
+/*
+** minnm_z0_z4_z0:
+** bfminnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (minnm_z0_z4_z0, svbfloat16x4_t, z0,
+ svminnm_bf16_x4 (z4, z0),
+ svminnm (z4, z0))
+
+/*
+** minnm_z0_z4_z28:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm [^\n]+, {z28\.h - z31\.h}
+** |
+** bfminnm [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (minnm_z0_z4_z28, svbfloat16x4_t, z0,
+ svminnm_bf16_x4 (z4, z28),
+ svminnm (z4, z28))
+
+/*
+** minnm_z18_z18_z4:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm [^\n]+, {z4\.h - z7\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (minnm_z18_z18_z4, svbfloat16x4_t, z18,
+ svminnm_bf16_x4 (z18, z4),
+ svminnm (z18, z4))
+
+/*
+** minnm_z23_z23_z28:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (minnm_z23_z23_z28, svbfloat16x4_t, z23,
+ svminnm_bf16_x4 (z23, z28),
+ svminnm (z23, z28))
+
+/*
+** minnm_z28_z28_z0:
+** bfminnm {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_XN (minnm_z28_z28_z0, svbfloat16x4_t, z28,
+ svminnm_bf16_x4 (z28, z0),
+ svminnm (z28, z0))
+
+/*
+** minnm_z0_z0_z18:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** |
+** bfminnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (minnm_z0_z0_z18, svbfloat16x4_t, z0,
+ svminnm_bf16_x4 (z0, z18),
+ svminnm (z0, z18))
+
+/*
+** minnm_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** |
+** bfminnm {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (minnm_z4_z4_z23, svbfloat16x4_t, z4,
+ svminnm_bf16_x4 (z4, z23),
+ svminnm (z4, z23))
+
+/*
+** minnm_single_z24_z24_z0:
+** bfminnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z24_z24_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svminnm_single_bf16_x4 (z24, z0),
+ svminnm (z24, z0))
+
+/*
+** minnm_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** |
+** bfminnm {z28\.h - z31\.h}, {z28\.h - z31\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z24_z28_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svminnm_single_bf16_x4 (z28, z0),
+ svminnm (z28, z0))
+
+/*
+** minnm_single_z24_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z24_z1_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svminnm_single_bf16_x4 (z1, z0),
+ svminnm (z1, z0))
+
+/*
+** minnm_single_z1_z24_z0:
+** bfminnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z1_z24_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svminnm_single_bf16_x4 (z24, z0),
+ svminnm (z24, z0))
+
+/*
+** minnm_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z1_z1_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svminnm_single_bf16_x4 (z1, z0),
+ svminnm (z1, z0))
+
+/*
+** minnm_single_z18_z18_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfminnm [^\n]+, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z18_z18_z0, svbfloat16x4_t, svbfloat16_t, z18,
+ svminnm_single_bf16_x4 (z18, z0),
+ svminnm (z18, z0))
+
+/*
+** minnm_single_awkward:
+** ...
+** bfminnm ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (minnm_single_awkward, svbfloat16x4_t, svbfloat16_t,
+ z0_res = svminnm_single_bf16_x4 (z1, z0),
+ z0_res = svminnm (z1, z0))
+
+/*
+** minnm_single_z0_z0_z15:
+** ...
+** bfminnm {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (minnm_single_z0_z0_z15, svbfloat16x4_t, svbfloat16_t,
+ z0 = svminnm_single_bf16_x4 (z0, z15),
+ z0 = svminnm (z0, z15))
+
+/*
+** minnm_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfminnm {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (minnm_single_z24_z24_z16, svbfloat16x4_t, svbfloat16_t, z24,
+ svminnm_single_bf16_x4 (z24, z16),
+ svminnm (z24, z16))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-options "-O3 --save-temps" } */
+
+#pragma GCC target "+sve2+sve-b16b16"
+#include "bf16_arith_1.h"
+
+/* { dg-final { scan-assembler-times {\tbfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tbfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tbfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 5 } } */
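+
+/* Expected counts, assuming the usual canonicalization of x - CST to
+   x + (-CST) for exactly-negatable constants: each DO_ARITH_OPS
+   expansion in bf16_arith_1.h gives one register-register loop plus
+   four immediate loops, so "add" contributes 5 BFADDs, "minus"
+   contributes 1 BFSUB plus 4 BFADDs, and "mult" contributes 5 BFMULs.  */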
--- /dev/null
+#define DO_REGREG_OPS(OP, NAME) \
+void varith_##NAME (__bf16 *dst, __bf16 *src, int count) \
+{ \
+ for (int i = 0; i < count; ++i) \
+ dst[i] = dst[i] OP src[i]; \
+}
+
+#define DO_IMMEDIATE_OPS(VALUE, OP, NAME) \
+void varithimm_##NAME (__bf16 *dst, int count) \
+{ \
+ for (int i = 0; i < count; ++i) \
+ dst[i] = dst[i] OP (__bf16) VALUE; \
+}
+
+#define DO_ARITH_OPS(OP, NAME) \
+ DO_REGREG_OPS (OP, NAME); \
+ DO_IMMEDIATE_OPS (0.5, OP, NAME ## pointfive); \
+ DO_IMMEDIATE_OPS (2, OP, NAME ## 2); \
+ DO_IMMEDIATE_OPS (2.5, OP, NAME ## twopoint5); \
+ DO_IMMEDIATE_OPS (-0.5, OP, NAME ## minuspointfive);
+
+DO_ARITH_OPS (+, add)
+DO_ARITH_OPS (-, minus)
+DO_ARITH_OPS (*, mult)
--- /dev/null
+/* { dg-options "-O3 --save-temps" } */
+
+#pragma GCC target "arch=armv9-a+sve2"
+#include "bf16_arith_1.h"
+
+/* { dg-final { scan-assembler-not {\tbfadd\t} } } */
+/* { dg-final { scan-assembler-not {\tbfsub\t} } } */
+/* { dg-final { scan-assembler-not {\tbfmul\t} } } */
--- /dev/null
+/* { dg-options "-O3 --save-temps" } */
+
+#pragma GCC target "arch=armv8-a+sve-b16b16"
+#include "bf16_arith_1.h"
+
+/* { dg-final { scan-assembler-not {\tbfadd\t} } } */
+/* { dg-final { scan-assembler-not {\tbfsub\t} } } */
+/* { dg-final { scan-assembler-not {\tbfmul\t} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#pragma GCC target "+sve2+sve-b16b16"
+
+#define DEF_LOOP(TYPE, NAME, OP) \
+ void __attribute__ ((noipa)) \
+ test_##TYPE##_##NAME (TYPE *__restrict r, \
+ TYPE *__restrict a, \
+ TYPE *__restrict b, TYPE c, \
+ short *__restrict pred, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ r[i] = pred[i] != 1 ? a[i] OP b[i] * c : a[i]; \
+ }
+
+DEF_LOOP (__bf16, add, +)
+DEF_LOOP (__bf16, sub, -)
+
+/* { dg-final { scan-assembler-times {\tbfmla\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tbfmls\tz[0-9]+\.h, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tmov\tz[^,]*z} } } */
+/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
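+
+/* The false arm of the conditional reuses a[i], so each loop should map
+   onto a single merging-predicated BFMLA/BFMLS with a[i] as the
+   accumulator; hence the checks above that no SEL, MOVPRFX or
+   vector-to-vector MOV is needed.  */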
-/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
/* { dg-options "-O3 -msve-vector-bits=256 --save-temps" } */
+typedef __bf16 vnx8bf __attribute__((vector_size(32)));
typedef _Float16 vnx8hf __attribute__((vector_size(32)));
typedef float vnx4sf __attribute__((vector_size(32)));
typedef double vnx2df __attribute__((vector_size(32)));
DO_OP (vnx4sf)
DO_OP (vnx2df)
+#pragma GCC target "+sve2+sve-b16b16"
+
+DO_OP (vnx8bf)
+
+/* { dg-final { scan-assembler-times {\tbfmla\t} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz0\.h, p[0-7]/m, z2\.h, z4\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz0\.s, p[0-7]/m, z2\.s, z4\.s\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmad\tz0\.d, p[0-7]/m, z2\.d, z4\.d\n} 1 } } */
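+
+/* The BF16 check above is deliberately looser than the FMAD checks,
+   presumably because SVE_B16B16 provides BFMLA/BFMLS but no
+   BFMAD/BFMSB form, so the bf16 case cannot be matched against a
+   fixed accumulator-tied operand order.  */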
-/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
/* { dg-options "-O3 -msve-vector-bits=256 --save-temps" } */
+typedef __bf16 vnx8bf __attribute__((vector_size(32)));
typedef _Float16 vnx8hf __attribute__((vector_size(32)));
typedef float vnx4sf __attribute__((vector_size(32)));
typedef double vnx2df __attribute__((vector_size(32)));
DO_OP (vnx4sf)
DO_OP (vnx2df)
+#pragma GCC target "+sve2+sve-b16b16"
+
+DO_OP (vnx8bf)
+
+/* { dg-final { scan-assembler-times {\tbfmla\tz0\.h, p[0-7]/m, z2\.h, z4\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz0\.h, p[0-7]/m, z2\.h, z4\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz0\.s, p[0-7]/m, z2\.s, z4\.s\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmla\tz0\.d, p[0-7]/m, z2\.d, z4\.d\n} 1 } } */
-/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
/* { dg-options "-O3 -msve-vector-bits=256 --save-temps" } */
+typedef __bf16 vnx8bf __attribute__((vector_size(32)));
typedef _Float16 vnx8hf __attribute__((vector_size(32)));
typedef float vnx4sf __attribute__((vector_size(32)));
typedef double vnx2df __attribute__((vector_size(32)));
DO_OP (vnx4sf)
DO_OP (vnx2df)
+#pragma GCC target "+sve2+sve-b16b16"
+
+DO_OP (vnx8bf)
+
+/* { dg-final { scan-assembler-times {\tbfmls\tz0\.h, p[0-7]/m, z2\.h, z4\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz0\.h, p[0-7]/m, z2\.h, z4\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz0\.s, p[0-7]/m, z2\.s, z4\.s\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmls\tz0\.d, p[0-7]/m, z2\.d, z4\.d\n} 1 } } */
-/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
/* { dg-options "-O3 -msve-vector-bits=256 --save-temps" } */
+typedef __bf16 vnx8bf __attribute__((vector_size(32)));
typedef _Float16 vnx8hf __attribute__((vector_size(32)));
typedef float vnx4sf __attribute__((vector_size(32)));
typedef double vnx2df __attribute__((vector_size(32)));
DO_OP (vnx4sf)
DO_OP (vnx2df)
+#pragma GCC target "+sve2+sve-b16b16"
+
+DO_OP (vnx8bf)
+
+/* { dg-final { scan-assembler-times {\tbfmls\t} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz0\.h, p[0-7]/m, z2\.h, z4\.h\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz0\.s, p[0-7]/m, z2\.s, z4\.s\n} 1 } } */
/* { dg-final { scan-assembler-times {\tfmsb\tz0\.d, p[0-7]/m, z2\.d, z4\.d\n} 1 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** add_bf16_m_tied1:
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_m_tied1, svbfloat16_t,
+ z0 = svadd_bf16_m (p0, z0, z1),
+ z0 = svadd_m (p0, z0, z1))
+
+/*
+** add_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfadd z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_m_tied2, svbfloat16_t,
+ z0 = svadd_bf16_m (p0, z1, z0),
+ z0 = svadd_m (p0, z1, z0))
+
+/*
+** add_bf16_m_untied:
+** movprfx z0, z1
+** bfadd z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_m_untied, svbfloat16_t,
+ z0 = svadd_bf16_m (p0, z1, z2),
+ z0 = svadd_m (p0, z1, z2))
+
+/*
+** add_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (add_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svadd_n_bf16_m (p0, z0, d4),
+ z0 = svadd_m (p0, z0, d4))
+
+/*
+** add_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (add_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svadd_n_bf16_m (p0, z1, d4),
+ z0 = svadd_m (p0, z1, d4))
+
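+/* Note on the FMOV immediates below: bfloat16 constants appear to be
+   materialized by an FMOV of the equivalent 16-bit pattern, which the
+   ".h" assembly syntax prints as an IEEE half-precision value, so
+   1.0bf16 (0x3f80) shows up as 1.875, 0.5bf16 (0x3f00) as 1.75 and
+   -2.0bf16 (0xc000) as -2.0.  */
+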
+/*
+** add_1_bf16_m:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (add_1_bf16_m, svbfloat16_t,
+ z0 = svadd_n_bf16_m (p0, z0, 1),
+ z0 = svadd_m (p0, z0, 1))
+
+/*
+** add_0p5_bf16_m:
+** fmov (z[0-9]+\.h), #1\.75(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (add_0p5_bf16_m, svbfloat16_t,
+ z0 = svadd_n_bf16_m (p0, z0, (bfloat16_t) (0.5)),
+ z0 = svadd_m (p0, z0, (bfloat16_t) (0.5)))
+
+/*
+** add_m1_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (add_m1_bf16_m, svbfloat16_t,
+ z0 = svadd_n_bf16_m (p0, z0, -1),
+ z0 = svadd_m (p0, z0, -1))
+
+/*
+** add_m0p5_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.75(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (add_m0p5_bf16_m, svbfloat16_t,
+ z0 = svadd_n_bf16_m (p0, z0, (bfloat16_t) (-0.5)),
+ z0 = svadd_m (p0, z0, (bfloat16_t) (-0.5)))
+
+/*
+** add_m2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (add_m2_bf16_m_tied1, svbfloat16_t,
+ z0 = svadd_n_bf16_m (p0, z0, -2),
+ z0 = svadd_m (p0, z0, -2))
+
+/*
+** add_m2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (add_m2_bf16_m_untied, svbfloat16_t,
+ z0 = svadd_n_bf16_m (p0, z1, -2),
+ z0 = svadd_m (p0, z1, -2))
+
+/*
+** add_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_z_tied1, svbfloat16_t,
+ z0 = svadd_bf16_z (p0, z0, z1),
+ z0 = svadd_z (p0, z0, z1))
+
+/*
+** add_bf16_z_tied2:
+** movprfx z0\.h, p0/z, z0\.h
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_z_tied2, svbfloat16_t,
+ z0 = svadd_bf16_z (p0, z1, z0),
+ z0 = svadd_z (p0, z1, z0))
+
+/*
+** add_bf16_z_untied:
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfadd z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0\.h, p0/z, z2\.h
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_z_untied, svbfloat16_t,
+ z0 = svadd_bf16_z (p0, z1, z2),
+ z0 = svadd_z (p0, z1, z2))
+
+/*
+** add_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (add_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svadd_n_bf16_z (p0, z0, d4),
+ z0 = svadd_z (p0, z0, d4))
+
+/*
+** add_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfadd z0\.h, p0/m, z0\.h, \1
+** |
+** movprfx z0\.h, p0/z, \1
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_ZD (add_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svadd_n_bf16_z (p0, z1, d4),
+ z0 = svadd_z (p0, z1, d4))
+
+/*
+** add_1_bf16_z:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (add_1_bf16_z, svbfloat16_t,
+ z0 = svadd_n_bf16_z (p0, z0, 1),
+ z0 = svadd_z (p0, z0, 1))
+
+/*
+** add_bf16_x_tied1:
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_x_tied1, svbfloat16_t,
+ z0 = svadd_bf16_x (p0, z0, z1),
+ z0 = svadd_x (p0, z0, z1))
+
+/*
+** add_bf16_x_tied2:
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_x_tied2, svbfloat16_t,
+ z0 = svadd_bf16_x (p0, z1, z0),
+ z0 = svadd_x (p0, z1, z0))
+
+/*
+** add_bf16_x_untied:
+** (
+** movprfx z0, z1
+** bfadd z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0, z2
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (add_bf16_x_untied, svbfloat16_t,
+ z0 = svadd_bf16_x (p0, z1, z2),
+ z0 = svadd_x (p0, z1, z2))
+
+/*
+** add_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (add_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svadd_n_bf16_x (p0, z0, d4),
+ z0 = svadd_x (p0, z0, d4))
+
+/*
+** add_h4_bf16_x_untied:
+** mov z0\.h, h4
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZD (add_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svadd_n_bf16_x (p0, z1, d4),
+ z0 = svadd_x (p0, z1, d4))
+
+/*
+** add_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (add_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svadd_n_bf16_x (p0, z0, 1),
+ z0 = svadd_x (p0, z0, 1))
+
+/*
+** add_1_bf16_x_untied:
+** fmov z0\.h, #1\.875(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (add_1_bf16_x_untied, svbfloat16_t,
+ z0 = svadd_n_bf16_x (p0, z1, 1),
+ z0 = svadd_x (p0, z1, 1))
+
+/*
+** ptrue_add_bf16_x_tied1:
+** bfadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_add_bf16_x_tied1, svbfloat16_t,
+ z0 = svadd_bf16_x (svptrue_b16 (), z0, z1),
+ z0 = svadd_x (svptrue_b16 (), z0, z1))
+
+/*
+** ptrue_add_bf16_x_tied2:
+** bfadd z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_add_bf16_x_tied2, svbfloat16_t,
+ z0 = svadd_bf16_x (svptrue_b16 (), z1, z0),
+ z0 = svadd_x (svptrue_b16 (), z1, z0))
+
+/*
+** ptrue_add_bf16_x_untied:
+** bfadd z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_add_bf16_x_untied, svbfloat16_t,
+ z0 = svadd_bf16_x (svptrue_b16 (), z1, z2),
+ z0 = svadd_x (svptrue_b16 (), z1, z2))
+
+/*
+** ptrue_add_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfadd z0\.h, (z0\.h, \1|\1, z0\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_add_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svadd_n_bf16_x (svptrue_b16 (), z0, 1),
+ z0 = svadd_x (svptrue_b16 (), z0, 1))
+
+/*
+** ptrue_add_1_bf16_x_untied:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfadd z0\.h, (z1\.h, \1|\1, z1\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_add_1_bf16_x_untied, svbfloat16_t,
+ z0 = svadd_n_bf16_x (svptrue_b16 (), z1, 1),
+ z0 = svadd_x (svptrue_b16 (), z1, 1))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** clamp_bf16_tied1:
+** bfclamp z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (clamp_bf16_tied1, svbfloat16_t,
+ z0 = svclamp_bf16 (z0, z1, z2),
+ z0 = svclamp (z0, z1, z2))
+
+/*
+** clamp_bf16_tied2:
+** bfclamp z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (clamp_bf16_tied2, svbfloat16_t,
+ z0 = svclamp_bf16 (z1, z0, z2),
+ z0 = svclamp (z1, z0, z2))
+
+/*
+** clamp_bf16_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfclamp z0\.h, z2\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clamp_bf16_tied3, svbfloat16_t,
+ z0 = svclamp_bf16 (z1, z2, z0),
+ z0 = svclamp (z1, z2, z0))
+
+/*
+** clamp_bf16_untied:
+** movprfx z0, z1
+** bfclamp z0\.h, z2\.h, z3\.h
+** ret
+*/
+TEST_UNIFORM_Z (clamp_bf16_untied, svbfloat16_t,
+ z0 = svclamp_bf16 (z1, z2, z3),
+ z0 = svclamp (z1, z2, z3))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** max_bf16_m_tied1:
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_m_tied1, svbfloat16_t,
+ z0 = svmax_bf16_m (p0, z0, z1),
+ z0 = svmax_m (p0, z0, z1))
+
+/*
+** max_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmax z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_m_tied2, svbfloat16_t,
+ z0 = svmax_bf16_m (p0, z1, z0),
+ z0 = svmax_m (p0, z1, z0))
+
+/*
+** max_bf16_m_untied:
+** movprfx z0, z1
+** bfmax z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_m_untied, svbfloat16_t,
+ z0 = svmax_bf16_m (p0, z1, z2),
+ z0 = svmax_m (p0, z1, z2))
+
+/*
+** max_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (max_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svmax_n_bf16_m (p0, z0, d4),
+ z0 = svmax_m (p0, z0, d4))
+
+/*
+** max_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (max_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svmax_n_bf16_m (p0, z1, d4),
+ z0 = svmax_m (p0, z1, d4))
+
+/*
+** max_1_bf16_m:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (max_1_bf16_m, svbfloat16_t,
+ z0 = svmax_n_bf16_m (p0, z0, 1),
+ z0 = svmax_m (p0, z0, 1))
+
+/*
+** max_0p5_bf16_m:
+** fmov (z[0-9]+\.h), #1\.75(?:e\+0)?
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (max_0p5_bf16_m, svbfloat16_t,
+ z0 = svmax_n_bf16_m (p0, z0, (bfloat16_t) (0.5)),
+ z0 = svmax_m (p0, z0, (bfloat16_t) (0.5)))
+
+/*
+** max_m1_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (max_m1_bf16_m, svbfloat16_t,
+ z0 = svmax_n_bf16_m (p0, z0, -1),
+ z0 = svmax_m (p0, z0, -1))
+
+/*
+** max_m0p5_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.75(?:e\+0)?
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (max_m0p5_bf16_m, svbfloat16_t,
+ z0 = svmax_n_bf16_m (p0, z0, (bfloat16_t) (-0.5)),
+ z0 = svmax_m (p0, z0, (bfloat16_t) (-0.5)))
+
+/*
+** max_m2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (max_m2_bf16_m_tied1, svbfloat16_t,
+ z0 = svmax_n_bf16_m (p0, z0, -2),
+ z0 = svmax_m (p0, z0, -2))
+
+/*
+** max_m2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (max_m2_bf16_m_untied, svbfloat16_t,
+ z0 = svmax_n_bf16_m (p0, z1, -2),
+ z0 = svmax_m (p0, z1, -2))
+
+/*
+** max_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_z_tied1, svbfloat16_t,
+ z0 = svmax_bf16_z (p0, z0, z1),
+ z0 = svmax_z (p0, z0, z1))
+
+/*
+** max_bf16_z_tied2:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_z_tied2, svbfloat16_t,
+ z0 = svmax_bf16_z (p0, z1, z0),
+ z0 = svmax_z (p0, z1, z0))
+
+/*
+** max_bf16_z_untied:
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfmax z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0\.h, p0/z, z2\.h
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_z_untied, svbfloat16_t,
+ z0 = svmax_bf16_z (p0, z1, z2),
+ z0 = svmax_z (p0, z1, z2))
+
+/*
+** max_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (max_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svmax_n_bf16_z (p0, z0, d4),
+ z0 = svmax_z (p0, z0, d4))
+
+/*
+** max_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfmax z0\.h, p0/m, z0\.h, \1
+** |
+** movprfx z0\.h, p0/z, \1
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_ZD (max_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svmax_n_bf16_z (p0, z1, d4),
+ z0 = svmax_z (p0, z1, d4))
+
+/*
+** max_1_bf16_z:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (max_1_bf16_z, svbfloat16_t,
+ z0 = svmax_n_bf16_z (p0, z0, 1),
+ z0 = svmax_z (p0, z0, 1))
+
+/*
+** max_bf16_x_tied1:
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_x_tied1, svbfloat16_t,
+ z0 = svmax_bf16_x (p0, z0, z1),
+ z0 = svmax_x (p0, z0, z1))
+
+/*
+** max_bf16_x_tied2:
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_x_tied2, svbfloat16_t,
+ z0 = svmax_bf16_x (p0, z1, z0),
+ z0 = svmax_x (p0, z1, z0))
+
+/*
+** max_bf16_x_untied:
+** (
+** movprfx z0, z1
+** bfmax z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0, z2
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (max_bf16_x_untied, svbfloat16_t,
+ z0 = svmax_bf16_x (p0, z1, z2),
+ z0 = svmax_x (p0, z1, z2))
+
+/*
+** max_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (max_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svmax_n_bf16_x (p0, z0, d4),
+ z0 = svmax_x (p0, z0, d4))
+
+/*
+** max_h4_bf16_x_untied:
+** mov z0\.h, h4
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZD (max_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svmax_n_bf16_x (p0, z1, d4),
+ z0 = svmax_x (p0, z1, d4))
+
+/*
+** max_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmax z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (max_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svmax_n_bf16_x (p0, z0, 1),
+ z0 = svmax_x (p0, z0, 1))
+
+/*
+** max_1_bf16_x_untied:
+** fmov z0\.h, #1\.875(?:e\+0)?
+** bfmax z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (max_1_bf16_x_untied, svbfloat16_t,
+ z0 = svmax_n_bf16_x (p0, z1, 1),
+ z0 = svmax_x (p0, z1, 1))
+
+/*
+** ptrue_max_bf16_x_tied1:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_max_bf16_x_tied1, svbfloat16_t,
+ z0 = svmax_bf16_x (svptrue_b16 (), z0, z1),
+ z0 = svmax_x (svptrue_b16 (), z0, z1))
+
+/*
+** ptrue_max_bf16_x_tied2:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_max_bf16_x_tied2, svbfloat16_t,
+ z0 = svmax_bf16_x (svptrue_b16 (), z1, z0),
+ z0 = svmax_x (svptrue_b16 (), z1, z0))
+
+/*
+** ptrue_max_bf16_x_untied:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_max_bf16_x_untied, svbfloat16_t,
+ z0 = svmax_bf16_x (svptrue_b16 (), z1, z2),
+ z0 = svmax_x (svptrue_b16 (), z1, z2))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** maxnm_bf16_m_tied1:
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_m_tied1, svbfloat16_t,
+ z0 = svmaxnm_bf16_m (p0, z0, z1),
+ z0 = svmaxnm_m (p0, z0, z1))
+
+/*
+** maxnm_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmaxnm z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_m_tied2, svbfloat16_t,
+ z0 = svmaxnm_bf16_m (p0, z1, z0),
+ z0 = svmaxnm_m (p0, z1, z0))
+
+/*
+** maxnm_bf16_m_untied:
+** movprfx z0, z1
+** bfmaxnm z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_m_untied, svbfloat16_t,
+ z0 = svmaxnm_bf16_m (p0, z1, z2),
+ z0 = svmaxnm_m (p0, z1, z2))
+
+/*
+** maxnm_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (maxnm_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svmaxnm_n_bf16_m (p0, z0, d4),
+ z0 = svmaxnm_m (p0, z0, d4))
+
+/*
+** maxnm_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (maxnm_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svmaxnm_n_bf16_m (p0, z1, d4),
+ z0 = svmaxnm_m (p0, z1, d4))
+
+/*
+** maxnm_1_bf16_m:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_1_bf16_m, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_m (p0, z0, 1),
+ z0 = svmaxnm_m (p0, z0, 1))
+
+/*
+** maxnm_0p5_bf16_m:
+** fmov (z[0-9]+\.h), #1\.75(?:e\+0)?
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_0p5_bf16_m, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_m (p0, z0, (bfloat16_t) (0.5)),
+ z0 = svmaxnm_m (p0, z0, (bfloat16_t) (0.5)))
+
+/*
+** maxnm_m1_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_m1_bf16_m, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_m (p0, z0, -1),
+ z0 = svmaxnm_m (p0, z0, -1))
+
+/*
+** maxnm_m0p5_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.75(?:e\+0)?
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_m0p5_bf16_m, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_m (p0, z0, (bfloat16_t) (-0.5)),
+ z0 = svmaxnm_m (p0, z0, (bfloat16_t) (-0.5)))
+
+/*
+** maxnm_m2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_m2_bf16_m_tied1, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_m (p0, z0, -2),
+ z0 = svmaxnm_m (p0, z0, -2))
+
+/*
+** maxnm_m2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_m2_bf16_m_untied, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_m (p0, z1, -2),
+ z0 = svmaxnm_m (p0, z1, -2))
+
+/*
+** maxnm_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_z_tied1, svbfloat16_t,
+ z0 = svmaxnm_bf16_z (p0, z0, z1),
+ z0 = svmaxnm_z (p0, z0, z1))
+
+/*
+** maxnm_bf16_z_tied2:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_z_tied2, svbfloat16_t,
+ z0 = svmaxnm_bf16_z (p0, z1, z0),
+ z0 = svmaxnm_z (p0, z1, z0))
+
+/*
+** maxnm_bf16_z_untied:
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfmaxnm z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0\.h, p0/z, z2\.h
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_z_untied, svbfloat16_t,
+ z0 = svmaxnm_bf16_z (p0, z1, z2),
+ z0 = svmaxnm_z (p0, z1, z2))
+
+/*
+** maxnm_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (maxnm_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svmaxnm_n_bf16_z (p0, z0, d4),
+ z0 = svmaxnm_z (p0, z0, d4))
+
+/*
+** maxnm_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** |
+** movprfx z0\.h, p0/z, \1
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_ZD (maxnm_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svmaxnm_n_bf16_z (p0, z1, d4),
+ z0 = svmaxnm_z (p0, z1, d4))
+
+/*
+** maxnm_1_bf16_z:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_1_bf16_z, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_z (p0, z0, 1),
+ z0 = svmaxnm_z (p0, z0, 1))
+
+/*
+** maxnm_bf16_x_tied1:
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_x_tied1, svbfloat16_t,
+ z0 = svmaxnm_bf16_x (p0, z0, z1),
+ z0 = svmaxnm_x (p0, z0, z1))
+
+/*
+** maxnm_bf16_x_tied2:
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_x_tied2, svbfloat16_t,
+ z0 = svmaxnm_bf16_x (p0, z1, z0),
+ z0 = svmaxnm_x (p0, z1, z0))
+
+/*
+** maxnm_bf16_x_untied:
+** (
+** movprfx z0, z1
+** bfmaxnm z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0, z2
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_bf16_x_untied, svbfloat16_t,
+ z0 = svmaxnm_bf16_x (p0, z1, z2),
+ z0 = svmaxnm_x (p0, z1, z2))
+
+/*
+** maxnm_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (maxnm_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svmaxnm_n_bf16_x (p0, z0, d4),
+ z0 = svmaxnm_x (p0, z0, d4))
+
+/*
+** maxnm_h4_bf16_x_untied:
+** mov z0\.h, h4
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZD (maxnm_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svmaxnm_n_bf16_x (p0, z1, d4),
+ z0 = svmaxnm_x (p0, z1, d4))
+
+/*
+** maxnm_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmaxnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_x (p0, z0, 1),
+ z0 = svmaxnm_x (p0, z0, 1))
+
+/*
+** maxnm_1_bf16_x_untied:
+** fmov z0\.h, #1\.875(?:e\+0)?
+** bfmaxnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (maxnm_1_bf16_x_untied, svbfloat16_t,
+ z0 = svmaxnm_n_bf16_x (p0, z1, 1),
+ z0 = svmaxnm_x (p0, z1, 1))
+
+/*
+** ptrue_maxnm_bf16_x_tied1:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_maxnm_bf16_x_tied1, svbfloat16_t,
+ z0 = svmaxnm_bf16_x (svptrue_b16 (), z0, z1),
+ z0 = svmaxnm_x (svptrue_b16 (), z0, z1))
+
+/*
+** ptrue_maxnm_bf16_x_tied2:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_maxnm_bf16_x_tied2, svbfloat16_t,
+ z0 = svmaxnm_bf16_x (svptrue_b16 (), z1, z0),
+ z0 = svmaxnm_x (svptrue_b16 (), z1, z0))
+
+/*
+** ptrue_maxnm_bf16_x_untied:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_maxnm_bf16_x_untied, svbfloat16_t,
+ z0 = svmaxnm_bf16_x (svptrue_b16 (), z1, z2),
+ z0 = svmaxnm_x (svptrue_b16 (), z1, z2))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** min_bf16_m_tied1:
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_m_tied1, svbfloat16_t,
+ z0 = svmin_bf16_m (p0, z0, z1),
+ z0 = svmin_m (p0, z0, z1))
+
+/*
+** min_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmin z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_m_tied2, svbfloat16_t,
+ z0 = svmin_bf16_m (p0, z1, z0),
+ z0 = svmin_m (p0, z1, z0))
+
+/*
+** min_bf16_m_untied:
+** movprfx z0, z1
+** bfmin z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_m_untied, svbfloat16_t,
+ z0 = svmin_bf16_m (p0, z1, z2),
+ z0 = svmin_m (p0, z1, z2))
+
+/*
+** min_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (min_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svmin_n_bf16_m (p0, z0, d4),
+ z0 = svmin_m (p0, z0, d4))
+
+/*
+** min_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (min_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svmin_n_bf16_m (p0, z1, d4),
+ z0 = svmin_m (p0, z1, d4))
+
+/*
+** min_1_bf16_m:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (min_1_bf16_m, svbfloat16_t,
+ z0 = svmin_n_bf16_m (p0, z0, 1),
+ z0 = svmin_m (p0, z0, 1))
+
+/*
+** min_0p5_bf16_m:
+** fmov (z[0-9]+\.h), #1\.75(?:e\+0)?
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (min_0p5_bf16_m, svbfloat16_t,
+ z0 = svmin_n_bf16_m (p0, z0, (bfloat16_t) (0.5)),
+ z0 = svmin_m (p0, z0, (bfloat16_t) (0.5)))
+
+/*
+** min_m1_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (min_m1_bf16_m, svbfloat16_t,
+ z0 = svmin_n_bf16_m (p0, z0, -1),
+ z0 = svmin_m (p0, z0, -1))
+
+/*
+** min_m0p5_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.75(?:e\+0)?
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (min_m0p5_bf16_m, svbfloat16_t,
+ z0 = svmin_n_bf16_m (p0, z0, (bfloat16_t) (-0.5)),
+ z0 = svmin_m (p0, z0, (bfloat16_t) (-0.5)))
+
+/*
+** min_m2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (min_m2_bf16_m_tied1, svbfloat16_t,
+ z0 = svmin_n_bf16_m (p0, z0, -2),
+ z0 = svmin_m (p0, z0, -2))
+
+/*
+** min_m2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (min_m2_bf16_m_untied, svbfloat16_t,
+ z0 = svmin_n_bf16_m (p0, z1, -2),
+ z0 = svmin_m (p0, z1, -2))
+
+/*
+** min_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_z_tied1, svbfloat16_t,
+ z0 = svmin_bf16_z (p0, z0, z1),
+ z0 = svmin_z (p0, z0, z1))
+
+/*
+** min_bf16_z_tied2:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_z_tied2, svbfloat16_t,
+ z0 = svmin_bf16_z (p0, z1, z0),
+ z0 = svmin_z (p0, z1, z0))
+
+/*
+** min_bf16_z_untied:
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfmin z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0\.h, p0/z, z2\.h
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_z_untied, svbfloat16_t,
+ z0 = svmin_bf16_z (p0, z1, z2),
+ z0 = svmin_z (p0, z1, z2))
+
+/*
+** min_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (min_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svmin_n_bf16_z (p0, z0, d4),
+ z0 = svmin_z (p0, z0, d4))
+
+/*
+** min_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfmin z0\.h, p0/m, z0\.h, \1
+** |
+** movprfx z0\.h, p0/z, \1
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_ZD (min_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svmin_n_bf16_z (p0, z1, d4),
+ z0 = svmin_z (p0, z1, d4))
+
+/*
+** min_1_bf16_z:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (min_1_bf16_z, svbfloat16_t,
+ z0 = svmin_n_bf16_z (p0, z0, 1),
+ z0 = svmin_z (p0, z0, 1))
+
+/*
+** min_bf16_x_tied1:
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_x_tied1, svbfloat16_t,
+ z0 = svmin_bf16_x (p0, z0, z1),
+ z0 = svmin_x (p0, z0, z1))
+
+/*
+** min_bf16_x_tied2:
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_x_tied2, svbfloat16_t,
+ z0 = svmin_bf16_x (p0, z1, z0),
+ z0 = svmin_x (p0, z1, z0))
+
+/*
+** min_bf16_x_untied:
+** (
+** movprfx z0, z1
+** bfmin z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0, z2
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (min_bf16_x_untied, svbfloat16_t,
+ z0 = svmin_bf16_x (p0, z1, z2),
+ z0 = svmin_x (p0, z1, z2))
+
+/*
+** min_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (min_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svmin_n_bf16_x (p0, z0, d4),
+ z0 = svmin_x (p0, z0, d4))
+
+/*
+** min_h4_bf16_x_untied:
+** mov z0\.h, h4
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZD (min_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svmin_n_bf16_x (p0, z1, d4),
+ z0 = svmin_x (p0, z1, d4))
+
+/*
+** min_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmin z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (min_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svmin_n_bf16_x (p0, z0, 1),
+ z0 = svmin_x (p0, z0, 1))
+
+/*
+** min_1_bf16_x_untied:
+** fmov z0\.h, #1\.875(?:e\+0)?
+** bfmin z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (min_1_bf16_x_untied, svbfloat16_t,
+ z0 = svmin_n_bf16_x (p0, z1, 1),
+ z0 = svmin_x (p0, z1, 1))
+
+/*
+** ptrue_min_bf16_x_tied1:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_min_bf16_x_tied1, svbfloat16_t,
+ z0 = svmin_bf16_x (svptrue_b16 (), z0, z1),
+ z0 = svmin_x (svptrue_b16 (), z0, z1))
+
+/*
+** ptrue_min_bf16_x_tied2:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_min_bf16_x_tied2, svbfloat16_t,
+ z0 = svmin_bf16_x (svptrue_b16 (), z1, z0),
+ z0 = svmin_x (svptrue_b16 (), z1, z0))
+
+/*
+** ptrue_min_bf16_x_untied:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_min_bf16_x_untied, svbfloat16_t,
+ z0 = svmin_bf16_x (svptrue_b16 (), z1, z2),
+ z0 = svmin_x (svptrue_b16 (), z1, z2))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** minnm_bf16_m_tied1:
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_m_tied1, svbfloat16_t,
+ z0 = svminnm_bf16_m (p0, z0, z1),
+ z0 = svminnm_m (p0, z0, z1))
+
+/*
+** minnm_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfminnm z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_m_tied2, svbfloat16_t,
+ z0 = svminnm_bf16_m (p0, z1, z0),
+ z0 = svminnm_m (p0, z1, z0))
+
+/*
+** minnm_bf16_m_untied:
+** movprfx z0, z1
+** bfminnm z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_m_untied, svbfloat16_t,
+ z0 = svminnm_bf16_m (p0, z1, z2),
+ z0 = svminnm_m (p0, z1, z2))
+
+/*
+** minnm_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (minnm_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svminnm_n_bf16_m (p0, z0, d4),
+ z0 = svminnm_m (p0, z0, d4))
+
+/*
+** minnm_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (minnm_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svminnm_n_bf16_m (p0, z1, d4),
+ z0 = svminnm_m (p0, z1, d4))
+
+/*
+** minnm_1_bf16_m:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (minnm_1_bf16_m, svbfloat16_t,
+ z0 = svminnm_n_bf16_m (p0, z0, 1),
+ z0 = svminnm_m (p0, z0, 1))
+
+/*
+** minnm_0p5_bf16_m:
+** fmov (z[0-9]+\.h), #1\.75(?:e\+0)?
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (minnm_0p5_bf16_m, svbfloat16_t,
+ z0 = svminnm_n_bf16_m (p0, z0, (bfloat16_t) (0.5)),
+ z0 = svminnm_m (p0, z0, (bfloat16_t) (0.5)))
+
+/*
+** minnm_m1_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (minnm_m1_bf16_m, svbfloat16_t,
+ z0 = svminnm_n_bf16_m (p0, z0, -1),
+ z0 = svminnm_m (p0, z0, -1))
+
+/*
+** minnm_m0p5_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.75(?:e\+0)?
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (minnm_m0p5_bf16_m, svbfloat16_t,
+ z0 = svminnm_n_bf16_m (p0, z0, (bfloat16_t) (-0.5)),
+ z0 = svminnm_m (p0, z0, (bfloat16_t) (-0.5)))
+
+/*
+** minnm_m2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (minnm_m2_bf16_m_tied1, svbfloat16_t,
+ z0 = svminnm_n_bf16_m (p0, z0, -2),
+ z0 = svminnm_m (p0, z0, -2))
+
+/*
+** minnm_m2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (minnm_m2_bf16_m_untied, svbfloat16_t,
+ z0 = svminnm_n_bf16_m (p0, z1, -2),
+ z0 = svminnm_m (p0, z1, -2))
+
+/*
+** minnm_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_z_tied1, svbfloat16_t,
+ z0 = svminnm_bf16_z (p0, z0, z1),
+ z0 = svminnm_z (p0, z0, z1))
+
+/*
+** minnm_bf16_z_tied2:
+** movprfx z0\.h, p0/z, z0\.h
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_z_tied2, svbfloat16_t,
+ z0 = svminnm_bf16_z (p0, z1, z0),
+ z0 = svminnm_z (p0, z1, z0))
+
+/*
+** minnm_bf16_z_untied:
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfminnm z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0\.h, p0/z, z2\.h
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_z_untied, svbfloat16_t,
+ z0 = svminnm_bf16_z (p0, z1, z2),
+ z0 = svminnm_z (p0, z1, z2))
+
+/*
+** minnm_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (minnm_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svminnm_n_bf16_z (p0, z0, d4),
+ z0 = svminnm_z (p0, z0, d4))
+
+/*
+** minnm_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** |
+** movprfx z0\.h, p0/z, \1
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_ZD (minnm_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svminnm_n_bf16_z (p0, z1, d4),
+ z0 = svminnm_z (p0, z1, d4))
+
+/*
+** minnm_1_bf16_z:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (minnm_1_bf16_z, svbfloat16_t,
+ z0 = svminnm_n_bf16_z (p0, z0, 1),
+ z0 = svminnm_z (p0, z0, 1))
+
+/*
+** minnm_bf16_x_tied1:
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_x_tied1, svbfloat16_t,
+ z0 = svminnm_bf16_x (p0, z0, z1),
+ z0 = svminnm_x (p0, z0, z1))
+
+/*
+** minnm_bf16_x_tied2:
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_x_tied2, svbfloat16_t,
+ z0 = svminnm_bf16_x (p0, z1, z0),
+ z0 = svminnm_x (p0, z1, z0))
+
+/*
+** minnm_bf16_x_untied:
+** (
+** movprfx z0, z1
+** bfminnm z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0, z2
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (minnm_bf16_x_untied, svbfloat16_t,
+ z0 = svminnm_bf16_x (p0, z1, z2),
+ z0 = svminnm_x (p0, z1, z2))
+
+/*
+** minnm_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (minnm_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svminnm_n_bf16_x (p0, z0, d4),
+ z0 = svminnm_x (p0, z0, d4))
+
+/*
+** minnm_h4_bf16_x_untied:
+** mov z0\.h, h4
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZD (minnm_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svminnm_n_bf16_x (p0, z1, d4),
+ z0 = svminnm_x (p0, z1, d4))
+
+/*
+** minnm_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfminnm z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (minnm_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svminnm_n_bf16_x (p0, z0, 1),
+ z0 = svminnm_x (p0, z0, 1))
+
+/*
+** minnm_1_bf16_x_untied:
+** fmov z0\.h, #1\.875(?:e\+0)?
+** bfminnm z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (minnm_1_bf16_x_untied, svbfloat16_t,
+ z0 = svminnm_n_bf16_x (p0, z1, 1),
+ z0 = svminnm_x (p0, z1, 1))
+
+/*
+** ptrue_minnm_bf16_x_tied1:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_minnm_bf16_x_tied1, svbfloat16_t,
+ z0 = svminnm_bf16_x (svptrue_b16 (), z0, z1),
+ z0 = svminnm_x (svptrue_b16 (), z0, z1))
+
+/*
+** ptrue_minnm_bf16_x_tied2:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_minnm_bf16_x_tied2, svbfloat16_t,
+ z0 = svminnm_bf16_x (svptrue_b16 (), z1, z0),
+ z0 = svminnm_x (svptrue_b16 (), z1, z0))
+
+/*
+** ptrue_minnm_bf16_x_untied:
+** ...
+** ptrue [^\n]+
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_minnm_bf16_x_untied, svbfloat16_t,
+ z0 = svminnm_bf16_x (svptrue_b16 (), z1, z2),
+ z0 = svminnm_x (svptrue_b16 (), z1, z2))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** mla_bf16_m_tied1:
+** bfmla z0\.h, p0/m, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_m_tied1, svbfloat16_t,
+ z0 = svmla_bf16_m (p0, z0, z1, z2),
+ z0 = svmla_m (p0, z0, z1, z2))
+
+/*
+** mla_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, \1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_m_tied2, svbfloat16_t,
+ z0 = svmla_bf16_m (p0, z1, z0, z2),
+ z0 = svmla_m (p0, z1, z0, z2))
+
+/*
+** mla_bf16_m_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, z2\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_m_tied3, svbfloat16_t,
+ z0 = svmla_bf16_m (p0, z1, z2, z0),
+ z0 = svmla_m (p0, z1, z2, z0))
+
+/*
+** mla_bf16_m_untied:
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, z2\.h, z3\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_m_untied, svbfloat16_t,
+ z0 = svmla_bf16_m (p0, z1, z2, z3),
+ z0 = svmla_m (p0, z1, z2, z3))
+
+/*
+** mla_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmla z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mla_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svmla_n_bf16_m (p0, z0, z1, d4),
+ z0 = svmla_m (p0, z0, z1, d4))
+
+/*
+** mla_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, z2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mla_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svmla_n_bf16_m (p0, z1, z2, d4),
+ z0 = svmla_m (p0, z1, z2, d4))
+
+/*
+** mla_2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** bfmla z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mla_2_bf16_m_tied1, svbfloat16_t,
+ z0 = svmla_n_bf16_m (p0, z0, z1, 2),
+ z0 = svmla_m (p0, z0, z1, 2))
+
+/*
+** mla_2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, z2\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mla_2_bf16_m_untied, svbfloat16_t,
+ z0 = svmla_n_bf16_m (p0, z1, z2, 2),
+ z0 = svmla_m (p0, z1, z2, 2))
+
+/*
+** mla_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmla z0\.h, p0/m, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_z_tied1, svbfloat16_t,
+ z0 = svmla_bf16_z (p0, z0, z1, z2),
+ z0 = svmla_z (p0, z0, z1, z2))
+
+/*
+** mla_bf16_z_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0\.h, p0/z, z1\.h
+** bfmla z0\.h, p0/m, \1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_z_tied2, svbfloat16_t,
+ z0 = svmla_bf16_z (p0, z1, z0, z2),
+ z0 = svmla_z (p0, z1, z0, z2))
+
+/*
+** mla_bf16_z_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0\.h, p0/z, z1\.h
+** bfmla z0\.h, p0/m, z2\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_z_tied3, svbfloat16_t,
+ z0 = svmla_bf16_z (p0, z1, z2, z0),
+ z0 = svmla_z (p0, z1, z2, z0))
+
+/*
+** mla_bf16_z_untied:
+** movprfx z0\.h, p0/z, z1\.h
+** bfmla z0\.h, p0/m, z2\.h, z3\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_z_untied, svbfloat16_t,
+ z0 = svmla_bf16_z (p0, z1, z2, z3),
+ z0 = svmla_z (p0, z1, z2, z3))
+
+/*
+** mla_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfmla z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mla_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svmla_n_bf16_z (p0, z0, z1, d4),
+ z0 = svmla_z (p0, z0, z1, d4))
+
+/*
+** mla_h4_bf16_z_tied2:
+** mov (z[0-9]+\.h), h4
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0\.h, p0/z, z1\.h
+** bfmla z0\.h, p0/m, \2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mla_h4_bf16_z_tied2, svbfloat16_t, __bf16,
+ z0 = svmla_n_bf16_z (p0, z1, z0, d4),
+ z0 = svmla_z (p0, z1, z0, d4))
+
+/*
+** mla_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z1\.h
+** bfmla z0\.h, p0/m, z2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mla_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svmla_n_bf16_z (p0, z1, z2, d4),
+ z0 = svmla_z (p0, z1, z2, d4))
+
+/*
+** mla_2_bf16_z_tied1:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfmla z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mla_2_bf16_z_tied1, svbfloat16_t,
+ z0 = svmla_n_bf16_z (p0, z0, z1, 2),
+ z0 = svmla_z (p0, z0, z1, 2))
+
+/*
+** mla_bf16_x_tied1:
+** bfmla z0\.h, p0/m, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_x_tied1, svbfloat16_t,
+ z0 = svmla_bf16_x (p0, z0, z1, z2),
+ z0 = svmla_x (p0, z0, z1, z2))
+
+/*
+** mla_bf16_x_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, \1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_x_tied2, svbfloat16_t,
+ z0 = svmla_bf16_x (p0, z1, z0, z2),
+ z0 = svmla_x (p0, z1, z0, z2))
+
+/*
+** mla_bf16_x_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, z2\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_x_tied3, svbfloat16_t,
+ z0 = svmla_bf16_x (p0, z1, z2, z0),
+ z0 = svmla_x (p0, z1, z2, z0))
+
+/*
+** mla_bf16_x_untied:
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, z2\.h, z3\.h
+** ret
+*/
+TEST_UNIFORM_Z (mla_bf16_x_untied, svbfloat16_t,
+ z0 = svmla_bf16_x (p0, z1, z2, z3),
+ z0 = svmla_x (p0, z1, z2, z3))
+
+/*
+** mla_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmla z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mla_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svmla_n_bf16_x (p0, z0, z1, d4),
+ z0 = svmla_x (p0, z0, z1, d4))
+
+/*
+** mla_h4_bf16_x_tied2:
+** mov (z[0-9]+\.h), h4
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, \2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mla_h4_bf16_x_tied2, svbfloat16_t, __bf16,
+ z0 = svmla_n_bf16_x (p0, z1, z0, d4),
+ z0 = svmla_x (p0, z1, z0, d4))
+
+/*
+** mla_h4_bf16_x_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfmla z0\.h, p0/m, z2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mla_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svmla_n_bf16_x (p0, z1, z2, d4),
+ z0 = svmla_x (p0, z1, z2, d4))
+
+/*
+** mla_2_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** bfmla z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mla_2_bf16_x_tied1, svbfloat16_t,
+ z0 = svmla_n_bf16_x (p0, z0, z1, 2),
+ z0 = svmla_x (p0, z0, z1, 2))
+
+/*
+** ptrue_mla_bf16_x_tied1:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mla_bf16_x_tied1, svbfloat16_t,
+ z0 = svmla_bf16_x (svptrue_b16 (), z0, z1, z2),
+ z0 = svmla_x (svptrue_b16 (), z0, z1, z2))
+
+/*
+** ptrue_mla_bf16_x_tied2:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mla_bf16_x_tied2, svbfloat16_t,
+ z0 = svmla_bf16_x (svptrue_b16 (), z1, z0, z2),
+ z0 = svmla_x (svptrue_b16 (), z1, z0, z2))
+
+/*
+** ptrue_mla_bf16_x_tied3:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mla_bf16_x_tied3, svbfloat16_t,
+ z0 = svmla_bf16_x (svptrue_b16 (), z1, z2, z0),
+ z0 = svmla_x (svptrue_b16 (), z1, z2, z0))
+
+/*
+** ptrue_mla_bf16_x_untied:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mla_bf16_x_untied, svbfloat16_t,
+ z0 = svmla_bf16_x (svptrue_b16 (), z1, z2, z3),
+ z0 = svmla_x (svptrue_b16 (), z1, z2, z3))
+
+/*
+** ptrue_mla_2_bf16_x_tied1:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mla_2_bf16_x_tied1, svbfloat16_t,
+ z0 = svmla_n_bf16_x (svptrue_b16 (), z0, z1, 2),
+ z0 = svmla_x (svptrue_b16 (), z0, z1, 2))
+
+/*
+** ptrue_mla_2_bf16_x_tied2:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mla_2_bf16_x_tied2, svbfloat16_t,
+ z0 = svmla_n_bf16_x (svptrue_b16 (), z1, z0, 2),
+ z0 = svmla_x (svptrue_b16 (), z1, z0, 2))
+
+/*
+** ptrue_mla_2_bf16_x_untied:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mla_2_bf16_x_untied, svbfloat16_t,
+ z0 = svmla_n_bf16_x (svptrue_b16 (), z1, z2, 2),
+ z0 = svmla_x (svptrue_b16 (), z1, z2, 2))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** mla_lane_0_bf16_tied1:
+** bfmla z0\.h, z1\.h, z2\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_0_bf16_tied1, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z2, 0),
+ z0 = svmla_lane (z0, z1, z2, 0))
+
+/*
+** mla_lane_0_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmla z0\.h, \1\.h, z2\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_0_bf16_tied2, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z1, z0, z2, 0),
+ z0 = svmla_lane (z1, z0, z2, 0))
+
+/*
+** mla_lane_0_bf16_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmla z0\.h, z2\.h, \1\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_0_bf16_tied3, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z1, z2, z0, 0),
+ z0 = svmla_lane (z1, z2, z0, 0))
+
+/*
+** mla_lane_0_bf16_untied:
+** movprfx z0, z1
+** bfmla z0\.h, z2\.h, z3\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_0_bf16_untied, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z1, z2, z3, 0),
+ z0 = svmla_lane (z1, z2, z3, 0))
+
+/*
+** mla_lane_1_bf16:
+** bfmla z0\.h, z1\.h, z2\.h\[1\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_1_bf16, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z2, 1),
+ z0 = svmla_lane (z0, z1, z2, 1))
+
+/*
+** mla_lane_2_bf16:
+** bfmla z0\.h, z1\.h, z2\.h\[2\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_2_bf16, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z2, 2),
+ z0 = svmla_lane (z0, z1, z2, 2))
+
+/*
+** mla_lane_3_bf16:
+** bfmla z0\.h, z1\.h, z2\.h\[3\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_3_bf16, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z2, 3),
+ z0 = svmla_lane (z0, z1, z2, 3))
+
+/*
+** mla_lane_4_bf16:
+** bfmla z0\.h, z1\.h, z2\.h\[4\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_4_bf16, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z2, 4),
+ z0 = svmla_lane (z0, z1, z2, 4))
+
+/*
+** mla_lane_5_bf16:
+** bfmla z0\.h, z1\.h, z2\.h\[5\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_5_bf16, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z2, 5),
+ z0 = svmla_lane (z0, z1, z2, 5))
+
+/*
+** mla_lane_6_bf16:
+** bfmla z0\.h, z1\.h, z2\.h\[6\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_6_bf16, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z2, 6),
+ z0 = svmla_lane (z0, z1, z2, 6))
+
+/*
+** mla_lane_7_bf16:
+** bfmla z0\.h, z1\.h, z2\.h\[7\]
+** ret
+*/
+TEST_UNIFORM_Z (mla_lane_7_bf16, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z2, 7),
+ z0 = svmla_lane (z0, z1, z2, 7))
+
+/*
+** mla_lane_z7_bf16:
+** bfmla z0\.h, z1\.h, z7\.h\[7\]
+** ret
+*/
+TEST_DUAL_Z (mla_lane_z7_bf16, svbfloat16_t, svbfloat16_t,
+ z0 = svmla_lane_bf16 (z0, z1, z7, 7),
+ z0 = svmla_lane (z0, z1, z7, 7))
+
+/*
+** mla_lane_z8_bf16:
+** str d8, \[sp, -16\]!
+** mov (z[0-7])\.d, z8\.d
+** bfmla z0\.h, z1\.h, \1\.h\[7\]
+** ldr d8, \[sp\], 16
+** ret
+*/
+TEST_DUAL_LANE_REG (mla_lane_z8_bf16, svbfloat16_t, svbfloat16_t, z8,
+ z0 = svmla_lane_bf16 (z0, z1, z8, 7),
+ z0 = svmla_lane (z0, z1, z8, 7))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** mls_bf16_m_tied1:
+** bfmls z0\.h, p0/m, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_m_tied1, svbfloat16_t,
+ z0 = svmls_bf16_m (p0, z0, z1, z2),
+ z0 = svmls_m (p0, z0, z1, z2))
+
+/*
+** mls_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, \1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_m_tied2, svbfloat16_t,
+ z0 = svmls_bf16_m (p0, z1, z0, z2),
+ z0 = svmls_m (p0, z1, z0, z2))
+
+/*
+** mls_bf16_m_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, z2\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_m_tied3, svbfloat16_t,
+ z0 = svmls_bf16_m (p0, z1, z2, z0),
+ z0 = svmls_m (p0, z1, z2, z0))
+
+/*
+** mls_bf16_m_untied:
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, z2\.h, z3\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_m_untied, svbfloat16_t,
+ z0 = svmls_bf16_m (p0, z1, z2, z3),
+ z0 = svmls_m (p0, z1, z2, z3))
+
+/*
+** mls_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmls z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mls_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svmls_n_bf16_m (p0, z0, z1, d4),
+ z0 = svmls_m (p0, z0, z1, d4))
+
+/*
+** mls_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, z2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mls_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svmls_n_bf16_m (p0, z1, z2, d4),
+ z0 = svmls_m (p0, z1, z2, d4))
+
+/*
+** mls_2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** bfmls z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mls_2_bf16_m_tied1, svbfloat16_t,
+ z0 = svmls_n_bf16_m (p0, z0, z1, 2),
+ z0 = svmls_m (p0, z0, z1, 2))
+
+/*
+** mls_2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, z2\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mls_2_bf16_m_untied, svbfloat16_t,
+ z0 = svmls_n_bf16_m (p0, z1, z2, 2),
+ z0 = svmls_m (p0, z1, z2, 2))
+
+/*
+** mls_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmls z0\.h, p0/m, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_z_tied1, svbfloat16_t,
+ z0 = svmls_bf16_z (p0, z0, z1, z2),
+ z0 = svmls_z (p0, z0, z1, z2))
+
+/*
+** mls_bf16_z_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0\.h, p0/z, z1\.h
+** bfmls z0\.h, p0/m, \1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_z_tied2, svbfloat16_t,
+ z0 = svmls_bf16_z (p0, z1, z0, z2),
+ z0 = svmls_z (p0, z1, z0, z2))
+
+/*
+** mls_bf16_z_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0\.h, p0/z, z1\.h
+** bfmls z0\.h, p0/m, z2\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_z_tied3, svbfloat16_t,
+ z0 = svmls_bf16_z (p0, z1, z2, z0),
+ z0 = svmls_z (p0, z1, z2, z0))
+
+/*
+** mls_bf16_z_untied:
+** movprfx z0\.h, p0/z, z1\.h
+** bfmls z0\.h, p0/m, z2\.h, z3\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_z_untied, svbfloat16_t,
+ z0 = svmls_bf16_z (p0, z1, z2, z3),
+ z0 = svmls_z (p0, z1, z2, z3))
+
+/*
+** mls_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfmls z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mls_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svmls_n_bf16_z (p0, z0, z1, d4),
+ z0 = svmls_z (p0, z0, z1, d4))
+
+/*
+** mls_h4_bf16_z_tied2:
+** mov (z[0-9]+\.h), h4
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0\.h, p0/z, z1\.h
+** bfmls z0\.h, p0/m, \2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mls_h4_bf16_z_tied2, svbfloat16_t, __bf16,
+ z0 = svmls_n_bf16_z (p0, z1, z0, d4),
+ z0 = svmls_z (p0, z1, z0, d4))
+
+/*
+** mls_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z1\.h
+** bfmls z0\.h, p0/m, z2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mls_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svmls_n_bf16_z (p0, z1, z2, d4),
+ z0 = svmls_z (p0, z1, z2, d4))
+
+/*
+** mls_2_bf16_z_tied1:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfmls z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mls_2_bf16_z_tied1, svbfloat16_t,
+ z0 = svmls_n_bf16_z (p0, z0, z1, 2),
+ z0 = svmls_z (p0, z0, z1, 2))
+
+/*
+** mls_bf16_x_tied1:
+** bfmls z0\.h, p0/m, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_x_tied1, svbfloat16_t,
+ z0 = svmls_bf16_x (p0, z0, z1, z2),
+ z0 = svmls_x (p0, z0, z1, z2))
+
+/*
+** mls_bf16_x_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, \1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_x_tied2, svbfloat16_t,
+ z0 = svmls_bf16_x (p0, z1, z0, z2),
+ z0 = svmls_x (p0, z1, z0, z2))
+
+/*
+** mls_bf16_x_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, z2\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_x_tied3, svbfloat16_t,
+ z0 = svmls_bf16_x (p0, z1, z2, z0),
+ z0 = svmls_x (p0, z1, z2, z0))
+
+/*
+** mls_bf16_x_untied:
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, z2\.h, z3\.h
+** ret
+*/
+TEST_UNIFORM_Z (mls_bf16_x_untied, svbfloat16_t,
+ z0 = svmls_bf16_x (p0, z1, z2, z3),
+ z0 = svmls_x (p0, z1, z2, z3))
+
+/*
+** mls_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmls z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mls_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svmls_n_bf16_x (p0, z0, z1, d4),
+ z0 = svmls_x (p0, z0, z1, d4))
+
+/*
+** mls_h4_bf16_x_tied2:
+** mov (z[0-9]+\.h), h4
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, \2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mls_h4_bf16_x_tied2, svbfloat16_t, __bf16,
+ z0 = svmls_n_bf16_x (p0, z1, z0, d4),
+ z0 = svmls_x (p0, z1, z0, d4))
+
+/*
+** mls_h4_bf16_x_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfmls z0\.h, p0/m, z2\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mls_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svmls_n_bf16_x (p0, z1, z2, d4),
+ z0 = svmls_x (p0, z1, z2, d4))
+
+/*
+** mls_2_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** bfmls z0\.h, p0/m, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mls_2_bf16_x_tied1, svbfloat16_t,
+ z0 = svmls_n_bf16_x (p0, z0, z1, 2),
+ z0 = svmls_x (p0, z0, z1, 2))
+
+/*
+** ptrue_mls_bf16_x_tied1:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mls_bf16_x_tied1, svbfloat16_t,
+ z0 = svmls_bf16_x (svptrue_b16 (), z0, z1, z2),
+ z0 = svmls_x (svptrue_b16 (), z0, z1, z2))
+
+/*
+** ptrue_mls_bf16_x_tied2:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mls_bf16_x_tied2, svbfloat16_t,
+ z0 = svmls_bf16_x (svptrue_b16 (), z1, z0, z2),
+ z0 = svmls_x (svptrue_b16 (), z1, z0, z2))
+
+/*
+** ptrue_mls_bf16_x_tied3:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mls_bf16_x_tied3, svbfloat16_t,
+ z0 = svmls_bf16_x (svptrue_b16 (), z1, z2, z0),
+ z0 = svmls_x (svptrue_b16 (), z1, z2, z0))
+
+/*
+** ptrue_mls_bf16_x_untied:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mls_bf16_x_untied, svbfloat16_t,
+ z0 = svmls_bf16_x (svptrue_b16 (), z1, z2, z3),
+ z0 = svmls_x (svptrue_b16 (), z1, z2, z3))
+
+/*
+** ptrue_mls_2_bf16_x_tied1:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mls_2_bf16_x_tied1, svbfloat16_t,
+ z0 = svmls_n_bf16_x (svptrue_b16 (), z0, z1, 2),
+ z0 = svmls_x (svptrue_b16 (), z0, z1, 2))
+
+/*
+** ptrue_mls_2_bf16_x_tied2:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mls_2_bf16_x_tied2, svbfloat16_t,
+ z0 = svmls_n_bf16_x (svptrue_b16 (), z1, z0, 2),
+ z0 = svmls_x (svptrue_b16 (), z1, z0, 2))
+
+/*
+** ptrue_mls_2_bf16_x_untied:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mls_2_bf16_x_untied, svbfloat16_t,
+ z0 = svmls_n_bf16_x (svptrue_b16 (), z1, z2, 2),
+ z0 = svmls_x (svptrue_b16 (), z1, z2, 2))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** mls_lane_0_bf16_tied1:
+** bfmls z0\.h, z1\.h, z2\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_0_bf16_tied1, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z2, 0),
+ z0 = svmls_lane (z0, z1, z2, 0))
+
+/*
+** mls_lane_0_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmls z0\.h, \1\.h, z2\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_0_bf16_tied2, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z1, z0, z2, 0),
+ z0 = svmls_lane (z1, z0, z2, 0))
+
+/*
+** mls_lane_0_bf16_tied3:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmls z0\.h, z2\.h, \1\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_0_bf16_tied3, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z1, z2, z0, 0),
+ z0 = svmls_lane (z1, z2, z0, 0))
+
+/*
+** mls_lane_0_bf16_untied:
+** movprfx z0, z1
+** bfmls z0\.h, z2\.h, z3\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_0_bf16_untied, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z1, z2, z3, 0),
+ z0 = svmls_lane (z1, z2, z3, 0))
+
+/*
+** mls_lane_1_bf16:
+** bfmls z0\.h, z1\.h, z2\.h\[1\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_1_bf16, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z2, 1),
+ z0 = svmls_lane (z0, z1, z2, 1))
+
+/*
+** mls_lane_2_bf16:
+** bfmls z0\.h, z1\.h, z2\.h\[2\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_2_bf16, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z2, 2),
+ z0 = svmls_lane (z0, z1, z2, 2))
+
+/*
+** mls_lane_3_bf16:
+** bfmls z0\.h, z1\.h, z2\.h\[3\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_3_bf16, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z2, 3),
+ z0 = svmls_lane (z0, z1, z2, 3))
+
+/*
+** mls_lane_4_bf16:
+** bfmls z0\.h, z1\.h, z2\.h\[4\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_4_bf16, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z2, 4),
+ z0 = svmls_lane (z0, z1, z2, 4))
+
+/*
+** mls_lane_5_bf16:
+** bfmls z0\.h, z1\.h, z2\.h\[5\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_5_bf16, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z2, 5),
+ z0 = svmls_lane (z0, z1, z2, 5))
+
+/*
+** mls_lane_6_bf16:
+** bfmls z0\.h, z1\.h, z2\.h\[6\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_6_bf16, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z2, 6),
+ z0 = svmls_lane (z0, z1, z2, 6))
+
+/*
+** mls_lane_7_bf16:
+** bfmls z0\.h, z1\.h, z2\.h\[7\]
+** ret
+*/
+TEST_UNIFORM_Z (mls_lane_7_bf16, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z2, 7),
+ z0 = svmls_lane (z0, z1, z2, 7))
+
+/*
+** mls_lane_z7_bf16:
+** bfmls z0\.h, z1\.h, z7\.h\[7\]
+** ret
+*/
+TEST_DUAL_Z (mls_lane_z7_bf16, svbfloat16_t, svbfloat16_t,
+ z0 = svmls_lane_bf16 (z0, z1, z7, 7),
+ z0 = svmls_lane (z0, z1, z7, 7))
+
+/*
+** mls_lane_z8_bf16:
+** str d8, \[sp, -16\]!
+** mov (z[0-7])\.d, z8\.d
+** bfmls z0\.h, z1\.h, \1\.h\[7\]
+** ldr d8, \[sp\], 16
+** ret
+*/
+TEST_DUAL_LANE_REG (mls_lane_z8_bf16, svbfloat16_t, svbfloat16_t, z8,
+ z0 = svmls_lane_bf16 (z0, z1, z8, 7),
+ z0 = svmls_lane (z0, z1, z8, 7))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** mul_bf16_m_tied1:
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_m_tied1, svbfloat16_t,
+ z0 = svmul_bf16_m (p0, z0, z1),
+ z0 = svmul_m (p0, z0, z1))
+
+/*
+** mul_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfmul z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_m_tied2, svbfloat16_t,
+ z0 = svmul_bf16_m (p0, z1, z0),
+ z0 = svmul_m (p0, z1, z0))
+
+/*
+** mul_bf16_m_untied:
+** movprfx z0, z1
+** bfmul z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_m_untied, svbfloat16_t,
+ z0 = svmul_bf16_m (p0, z1, z2),
+ z0 = svmul_m (p0, z1, z2))
+
+/*
+** mul_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mul_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svmul_n_bf16_m (p0, z0, d4),
+ z0 = svmul_m (p0, z0, d4))
+
+/*
+** mul_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mul_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svmul_n_bf16_m (p0, z1, d4),
+ z0 = svmul_m (p0, z1, d4))
+
+/*
+** mul_1_bf16_m:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mul_1_bf16_m, svbfloat16_t,
+ z0 = svmul_n_bf16_m (p0, z0, 1),
+ z0 = svmul_m (p0, z0, 1))
+
+/*
+** mul_0p5_bf16_m:
+** fmov (z[0-9]+\.h), #1\.75(?:e\+0)?
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mul_0p5_bf16_m, svbfloat16_t,
+ z0 = svmul_n_bf16_m (p0, z0, (bfloat16_t) (0.5)),
+ z0 = svmul_m (p0, z0, (bfloat16_t) (0.5)))
+
+/*
+** mul_m1_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mul_m1_bf16_m, svbfloat16_t,
+ z0 = svmul_n_bf16_m (p0, z0, -1),
+ z0 = svmul_m (p0, z0, -1))
+
+/*
+** mul_m0p5_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.75(?:e\+0)?
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mul_m0p5_bf16_m, svbfloat16_t,
+ z0 = svmul_n_bf16_m (p0, z0, (bfloat16_t) (-0.5)),
+ z0 = svmul_m (p0, z0, (bfloat16_t) (-0.5)))
+
+/*
+** mul_m2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mul_m2_bf16_m_tied1, svbfloat16_t,
+ z0 = svmul_n_bf16_m (p0, z0, -2),
+ z0 = svmul_m (p0, z0, -2))
+
+/*
+** mul_m2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #-2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mul_m2_bf16_m_untied, svbfloat16_t,
+ z0 = svmul_n_bf16_m (p0, z1, -2),
+ z0 = svmul_m (p0, z1, -2))
+
+/*
+** mul_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_z_tied1, svbfloat16_t,
+ z0 = svmul_bf16_z (p0, z0, z1),
+ z0 = svmul_z (p0, z0, z1))
+
+/*
+** mul_bf16_z_tied2:
+** movprfx z0\.h, p0/z, z0\.h
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_z_tied2, svbfloat16_t,
+ z0 = svmul_bf16_z (p0, z1, z0),
+ z0 = svmul_z (p0, z1, z0))
+
+/*
+** mul_bf16_z_untied:
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfmul z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0\.h, p0/z, z2\.h
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_z_untied, svbfloat16_t,
+ z0 = svmul_bf16_z (p0, z1, z2),
+ z0 = svmul_z (p0, z1, z2))
+
+/*
+** mul_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mul_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svmul_n_bf16_z (p0, z0, d4),
+ z0 = svmul_z (p0, z0, d4))
+
+/*
+** mul_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** (
+** movprfx z0\.h, p0/z, z1\.h
+** bfmul z0\.h, p0/m, z0\.h, \1
+** |
+** movprfx z0\.h, p0/z, \1
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_ZD (mul_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svmul_n_bf16_z (p0, z1, d4),
+ z0 = svmul_z (p0, z1, d4))
+
+/*
+** mul_1_bf16_z:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mul_1_bf16_z, svbfloat16_t,
+ z0 = svmul_n_bf16_z (p0, z0, 1),
+ z0 = svmul_z (p0, z0, 1))
+
+/*
+** mul_bf16_x_tied1:
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_x_tied1, svbfloat16_t,
+ z0 = svmul_bf16_x (p0, z0, z1),
+ z0 = svmul_x (p0, z0, z1))
+
+/*
+** mul_bf16_x_tied2:
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_x_tied2, svbfloat16_t,
+ z0 = svmul_bf16_x (p0, z1, z0),
+ z0 = svmul_x (p0, z1, z0))
+
+/*
+** mul_bf16_x_untied:
+** (
+** movprfx z0, z1
+** bfmul z0\.h, p0/m, z0\.h, z2\.h
+** |
+** movprfx z0, z2
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** )
+** ret
+*/
+TEST_UNIFORM_Z (mul_bf16_x_untied, svbfloat16_t,
+ z0 = svmul_bf16_x (p0, z1, z2),
+ z0 = svmul_x (p0, z1, z2))
+
+/*
+** mul_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (mul_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svmul_n_bf16_x (p0, z0, d4),
+ z0 = svmul_x (p0, z0, d4))
+
+/*
+** mul_h4_bf16_x_untied:
+** mov z0\.h, h4
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_ZD (mul_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svmul_n_bf16_x (p0, z1, d4),
+ z0 = svmul_x (p0, z1, d4))
+
+/*
+** mul_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmul z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (mul_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svmul_n_bf16_x (p0, z0, 1),
+ z0 = svmul_x (p0, z0, 1))
+
+/*
+** mul_1_bf16_x_untied:
+** fmov z0\.h, #1\.875(?:e\+0)?
+** bfmul z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (mul_1_bf16_x_untied, svbfloat16_t,
+ z0 = svmul_n_bf16_x (p0, z1, 1),
+ z0 = svmul_x (p0, z1, 1))
+
+/*
+** ptrue_mul_bf16_x_tied1:
+** bfmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mul_bf16_x_tied1, svbfloat16_t,
+ z0 = svmul_bf16_x (svptrue_b16 (), z0, z1),
+ z0 = svmul_x (svptrue_b16 (), z0, z1))
+
+/*
+** ptrue_mul_bf16_x_tied2:
+** bfmul z0\.h, (z0\.h, z1\.h|z1\.h, z0\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mul_bf16_x_tied2, svbfloat16_t,
+ z0 = svmul_bf16_x (svptrue_b16 (), z1, z0),
+ z0 = svmul_x (svptrue_b16 (), z1, z0))
+
+/*
+** ptrue_mul_bf16_x_untied:
+** bfmul z0\.h, (z1\.h, z2\.h|z2\.h, z1\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mul_bf16_x_untied, svbfloat16_t,
+ z0 = svmul_bf16_x (svptrue_b16 (), z1, z2),
+ z0 = svmul_x (svptrue_b16 (), z1, z2))
+
+/*
+** ptrue_mul_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmul z0\.h, (z0\.h, \1|\1, z0\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mul_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svmul_n_bf16_x (svptrue_b16 (), z0, 1),
+ z0 = svmul_x (svptrue_b16 (), z0, 1))
+
+/*
+** ptrue_mul_1_bf16_x_untied:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfmul z0\.h, (z1\.h, \1|\1, z1\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_mul_1_bf16_x_untied, svbfloat16_t,
+ z0 = svmul_n_bf16_x (svptrue_b16 (), z1, 1),
+ z0 = svmul_x (svptrue_b16 (), z1, 1))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** mul_lane_0_bf16_tied1:
+** bfmul z0\.h, z0\.h, z1\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_0_bf16_tied1, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z0, z1, 0),
+ z0 = svmul_lane (z0, z1, 0))
+
+/*
+** mul_lane_0_bf16_tied2:
+** bfmul z0\.h, z1\.h, z0\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_0_bf16_tied2, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z0, 0),
+ z0 = svmul_lane (z1, z0, 0))
+
+/*
+** mul_lane_0_bf16_untied:
+** bfmul z0\.h, z1\.h, z2\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_0_bf16_untied, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z2, 0),
+ z0 = svmul_lane (z1, z2, 0))
+
+/*
+** mul_lane_1_bf16:
+** bfmul z0\.h, z1\.h, z2\.h\[1\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_1_bf16, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z2, 1),
+ z0 = svmul_lane (z1, z2, 1))
+
+/*
+** mul_lane_2_bf16:
+** bfmul z0\.h, z1\.h, z2\.h\[2\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_2_bf16, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z2, 2),
+ z0 = svmul_lane (z1, z2, 2))
+
+/*
+** mul_lane_3_bf16:
+** bfmul z0\.h, z1\.h, z2\.h\[3\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_3_bf16, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z2, 3),
+ z0 = svmul_lane (z1, z2, 3))
+
+/*
+** mul_lane_4_bf16:
+** bfmul z0\.h, z1\.h, z2\.h\[4\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_4_bf16, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z2, 4),
+ z0 = svmul_lane (z1, z2, 4))
+
+/*
+** mul_lane_5_bf16:
+** bfmul z0\.h, z1\.h, z2\.h\[5\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_5_bf16, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z2, 5),
+ z0 = svmul_lane (z1, z2, 5))
+
+/*
+** mul_lane_6_bf16:
+** bfmul z0\.h, z1\.h, z2\.h\[6\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_6_bf16, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z2, 6),
+ z0 = svmul_lane (z1, z2, 6))
+
+/*
+** mul_lane_7_bf16:
+** bfmul z0\.h, z1\.h, z2\.h\[7\]
+** ret
+*/
+TEST_UNIFORM_Z (mul_lane_7_bf16, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z2, 7),
+ z0 = svmul_lane (z1, z2, 7))
+
+/*
+** mul_lane_z7_bf16:
+** bfmul z0\.h, z1\.h, z7\.h\[7\]
+** ret
+*/
+TEST_DUAL_Z (mul_lane_z7_bf16, svbfloat16_t, svbfloat16_t,
+ z0 = svmul_lane_bf16 (z1, z7, 7),
+ z0 = svmul_lane (z1, z7, 7))
+
+/*
+** mul_lane_z8_bf16:
+** str d8, \[sp, -16\]!
+** mov (z[0-7])\.d, z8\.d
+** bfmul z0\.h, z1\.h, \1\.h\[7\]
+** ldr d8, \[sp\], 16
+** ret
+*/
+TEST_DUAL_LANE_REG (mul_lane_z8_bf16, svbfloat16_t, svbfloat16_t, z8,
+ z0 = svmul_lane_bf16 (z1, z8, 7),
+ z0 = svmul_lane (z1, z8, 7))
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve-b16b16_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-b16b16_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+#pragma GCC target "+sve-b16b16"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** sub_bf16_m_tied1:
+** bfsub z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_m_tied1, svbfloat16_t,
+ z0 = svsub_bf16_m (p0, z0, z1),
+ z0 = svsub_m (p0, z0, z1))
+
+/*
+** sub_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfsub z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_m_tied2, svbfloat16_t,
+ z0 = svsub_bf16_m (p0, z1, z0),
+ z0 = svsub_m (p0, z1, z0))
+
+/*
+** sub_bf16_m_untied:
+** movprfx z0, z1
+** bfsub z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_m_untied, svbfloat16_t,
+ z0 = svsub_bf16_m (p0, z1, z2),
+ z0 = svsub_m (p0, z1, z2))
+
+/*
+** sub_h4_bf16_m_tied1:
+** mov (z[0-9]+\.h), h4
+** bfsub z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (sub_h4_bf16_m_tied1, svbfloat16_t, __bf16,
+ z0 = svsub_n_bf16_m (p0, z0, d4),
+ z0 = svsub_m (p0, z0, d4))
+
+/*
+** sub_h4_bf16_m_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfsub z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (sub_h4_bf16_m_untied, svbfloat16_t, __bf16,
+ z0 = svsub_n_bf16_m (p0, z1, d4),
+ z0 = svsub_m (p0, z1, d4))
+
+/*
+** sub_1_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (sub_1_bf16_m, svbfloat16_t,
+ z0 = svsub_n_bf16_m (p0, z0, 1),
+ z0 = svsub_m (p0, z0, 1))
+
+/*
+** sub_0p5_bf16_m:
+** fmov (z[0-9]+\.h), #-1\.75(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (sub_0p5_bf16_m, svbfloat16_t,
+ z0 = svsub_n_bf16_m (p0, z0, (bfloat16_t) (0.5)),
+ z0 = svsub_m (p0, z0, (bfloat16_t) (0.5)))
+
+/*
+** sub_m1_bf16_m:
+** fmov (z[0-9]+\.h), #1\.875(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (sub_m1_bf16_m, svbfloat16_t,
+ z0 = svsub_n_bf16_m (p0, z0, -1),
+ z0 = svsub_m (p0, z0, -1))
+
+/*
+** sub_m0p5_bf16_m:
+** fmov (z[0-9]+\.h), #1\.75(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (sub_m0p5_bf16_m, svbfloat16_t,
+ z0 = svsub_n_bf16_m (p0, z0, (bfloat16_t) (-0.5)),
+ z0 = svsub_m (p0, z0, (bfloat16_t) (-0.5)))
+
+/*
+** sub_m2_bf16_m_tied1:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (sub_m2_bf16_m_tied1, svbfloat16_t,
+ z0 = svsub_n_bf16_m (p0, z0, -2),
+ z0 = svsub_m (p0, z0, -2))
+
+/*
+** sub_m2_bf16_m_untied:
+** fmov (z[0-9]+\.h), #2\.0(?:e\+0)?
+** movprfx z0, z1
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (sub_m2_bf16_m_untied, svbfloat16_t,
+ z0 = svsub_n_bf16_m (p0, z1, -2),
+ z0 = svsub_m (p0, z1, -2))
+
+/*
+** sub_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfsub z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_z_tied1, svbfloat16_t,
+ z0 = svsub_bf16_z (p0, z0, z1),
+ z0 = svsub_z (p0, z0, z1))
+
+/*
+** sub_bf16_z_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0\.h, p0/z, z1\.h
+** bfsub z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_z_tied2, svbfloat16_t,
+ z0 = svsub_bf16_z (p0, z1, z0),
+ z0 = svsub_z (p0, z1, z0))
+
+/*
+** sub_bf16_z_untied:
+** movprfx z0\.h, p0/z, z1\.h
+** bfsub z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_z_untied, svbfloat16_t,
+ z0 = svsub_bf16_z (p0, z1, z2),
+ z0 = svsub_z (p0, z1, z2))
+
+/*
+** sub_h4_bf16_z_tied1:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z0\.h
+** bfsub z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (sub_h4_bf16_z_tied1, svbfloat16_t, __bf16,
+ z0 = svsub_n_bf16_z (p0, z0, d4),
+ z0 = svsub_z (p0, z0, d4))
+
+/*
+** sub_h4_bf16_z_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0\.h, p0/z, z1\.h
+** bfsub z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (sub_h4_bf16_z_untied, svbfloat16_t, __bf16,
+ z0 = svsub_n_bf16_z (p0, z1, d4),
+ z0 = svsub_z (p0, z1, d4))
+
+/*
+** sub_1_bf16_z:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** movprfx z0\.h, p0/z, z0\.h
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (sub_1_bf16_z, svbfloat16_t,
+ z0 = svsub_n_bf16_z (p0, z0, 1),
+ z0 = svsub_z (p0, z0, 1))
+
+/*
+** sub_bf16_x_tied1:
+** bfsub z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_x_tied1, svbfloat16_t,
+ z0 = svsub_bf16_x (p0, z0, z1),
+ z0 = svsub_x (p0, z0, z1))
+
+/*
+** sub_bf16_x_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** bfsub z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_x_tied2, svbfloat16_t,
+ z0 = svsub_bf16_x (p0, z1, z0),
+ z0 = svsub_x (p0, z1, z0))
+
+/*
+** sub_bf16_x_untied:
+** movprfx z0, z1
+** bfsub z0\.h, p0/m, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_bf16_x_untied, svbfloat16_t,
+ z0 = svsub_bf16_x (p0, z1, z2),
+ z0 = svsub_x (p0, z1, z2))
+
+/*
+** sub_h4_bf16_x_tied1:
+** mov (z[0-9]+\.h), h4
+** bfsub z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (sub_h4_bf16_x_tied1, svbfloat16_t, __bf16,
+ z0 = svsub_n_bf16_x (p0, z0, d4),
+ z0 = svsub_x (p0, z0, d4))
+
+/*
+** sub_h4_bf16_x_untied:
+** mov (z[0-9]+\.h), h4
+** movprfx z0, z1
+** bfsub z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZD (sub_h4_bf16_x_untied, svbfloat16_t, __bf16,
+ z0 = svsub_n_bf16_x (p0, z1, d4),
+ z0 = svsub_x (p0, z1, d4))
+
+/*
+** sub_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (sub_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svsub_n_bf16_x (p0, z0, 1),
+ z0 = svsub_x (p0, z0, 1))
+
+/*
+** sub_1_bf16_x_untied:
+** fmov z0\.h, #-1\.875(?:e\+0)?
+** bfadd z0\.h, p0/m, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sub_1_bf16_x_untied, svbfloat16_t,
+ z0 = svsub_n_bf16_x (p0, z1, 1),
+ z0 = svsub_x (p0, z1, 1))
+
+/*
+** ptrue_sub_bf16_x_tied1:
+** bfsub z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_sub_bf16_x_tied1, svbfloat16_t,
+ z0 = svsub_bf16_x (svptrue_b16 (), z0, z1),
+ z0 = svsub_x (svptrue_b16 (), z0, z1))
+
+/*
+** ptrue_sub_bf16_x_tied2:
+** bfsub z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_sub_bf16_x_tied2, svbfloat16_t,
+ z0 = svsub_bf16_x (svptrue_b16 (), z1, z0),
+ z0 = svsub_x (svptrue_b16 (), z1, z0))
+
+/*
+** ptrue_sub_bf16_x_untied:
+** bfsub z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_sub_bf16_x_untied, svbfloat16_t,
+ z0 = svsub_bf16_x (svptrue_b16 (), z1, z2),
+ z0 = svsub_x (svptrue_b16 (), z1, z2))
+
+/*
+** ptrue_sub_1_bf16_x_tied1:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfadd z0\.h, (z0\.h, \1|\1, z0\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_sub_1_bf16_x_tied1, svbfloat16_t,
+ z0 = svsub_n_bf16_x (svptrue_b16 (), z0, 1),
+ z0 = svsub_x (svptrue_b16 (), z0, 1))
+
+/*
+** ptrue_sub_1_bf16_x_untied:
+** fmov (z[0-9]+\.h), #-1\.875(?:e\+0)?
+** bfadd z0\.h, (z1\.h, \1|\1, z1\.h)
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_sub_1_bf16_x_untied, svbfloat16_t,
+ z0 = svsub_n_bf16_x (svptrue_b16 (), z1, 1),
+ z0 = svsub_x (svptrue_b16 (), z1, 1))
foreach { aarch64_ext } { "fp" "simd" "crypto" "crc" "lse" "dotprod" "sve"
"i8mm" "f32mm" "f64mm" "bf16" "sb" "sve2" "ls64"
- "sme" "sme-i16i64" "sme2" } {
+ "sme" "sme-i16i64" "sme2" "sve-b16b16" } {
eval [string map [list FUNC $aarch64_ext] {
proc check_effective_target_aarch64_asm_FUNC_ok { } {
if { [istarget aarch64*-*-*] } {