const unsigned int FLAG_READ_MEMORY = 1U << 2;
const unsigned int FLAG_PREFETCH_MEMORY = 1U << 3;
const unsigned int FLAG_WRITE_MEMORY = 1U << 4;
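+/* Indicates that the builtin reads the FPMR register; its trailing fpm_t
+   argument is moved into FPMR before the instruction executes.  */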
+const unsigned int FLAG_USES_FPMR = 1U << 5;
/* Indicates that READ_FPCR and RAISE_FP_EXCEPTIONS should be set for
floating-point modes but not for integer modes. */
-const unsigned int FLAG_AUTO_FP = 1U << 5;
+const unsigned int FLAG_AUTO_FP = 1U << 6;
const unsigned int FLAG_QUIET = 0;
const unsigned int FLAG_DEFAULT = FLAG_AUTO_FP;
const unsigned int FLAG_FP = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS;
const unsigned int FLAG_ALL = FLAG_READ_FPCR | FLAG_RAISE_FP_EXCEPTIONS
  | FLAG_READ_MEMORY | FLAG_PREFETCH_MEMORY | FLAG_WRITE_MEMORY;
const unsigned int FLAG_STORE = FLAG_WRITE_MEMORY;
const unsigned int FLAG_LOAD = FLAG_READ_MEMORY;
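+/* FP8 builtins need the usual floating-point flags and also read FPMR.  */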
+const unsigned int FLAG_FP8 = FLAG_FP | FLAG_USES_FPMR;
typedef struct
{
AARCH64_SIMD_BUILTIN_##T##_##N##A,
#undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U, F) \
+#define ENTRY(N, S, T0, T1, T2, T3, U, F) \
AARCH64_##N,
enum aarch64_builtins
{
binary,
binary_lane,
+ ternary,
+ unary,
};
namespace {
};
namespace simd_types {
+ constexpr simd_type f8 { V8QImode, qualifier_modal_float };
+ constexpr simd_type f8q { V16QImode, qualifier_modal_float };
constexpr simd_type p8 { V8QImode, qualifier_poly };
constexpr simd_type p8q { V16QImode, qualifier_poly };
constexpr simd_type s8 { V8QImode, qualifier_none };
constexpr simd_type f32 { V2SFmode, qualifier_none };
constexpr simd_type f32q { V4SFmode, qualifier_none };
+ constexpr simd_type s32 { V2SImode, qualifier_none };
+ constexpr simd_type s32q { V4SImode, qualifier_none };
+
constexpr simd_type f64q { V2DFmode, qualifier_none };
+ constexpr simd_type s64q { V2DImode, qualifier_none };
constexpr simd_type none { VOIDmode, qualifier_none };
}
}
#undef ENTRY
-#define ENTRY(N, S, T0, T1, T2, U, F) \
+#define ENTRY(N, S, T0, T1, T2, T3, U, F) \
{#N, aarch64_builtin_signatures::S, simd_types::T0, simd_types::T1, \
- simd_types::T2, U, aarch64_required_extensions::REQUIRED_EXTENSIONS, \
- FLAG_##F},
+ simd_types::T2, simd_types::T3, U, \
+ aarch64_required_extensions::REQUIRED_EXTENSIONS, FLAG_##F},
/* Initialize pragma builtins. */
{
const char *name;
aarch64_builtin_signatures signature;
- simd_type types[3];
+ simd_type types[4];
int unspec;
aarch64_required_extensions required_extensions;
unsigned int flags;
for (int i = 1; i <= 2; ++i)
arg_types.quick_push (builtin_data.types[i].type ());
break;
+
+ case aarch64_builtin_signatures::ternary:
+ return_type = builtin_data.types[0].type ();
+ for (int i = 1; i <= 3; ++i)
+ arg_types.quick_push (builtin_data.types[i].type ());
+ break;
+
+ case aarch64_builtin_signatures::unary:
+ return_type = builtin_data.types[0].type ();
+ arg_types.quick_push (builtin_data.types[1].type ());
+ break;
}
switch (builtin_data.signature)
{
default:
break;
}
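+  /* Builtins that read FPMR take the fpm_t value as a trailing 64-bit
+     argument.  */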
+ if (builtin_data.flags & FLAG_USES_FPMR)
+ arg_types.quick_push (uint64_type_node);
return build_function_type_array (return_type, arg_types.length (),
arg_types.address ());
}
return ops[0].value;
}
+/* If OP is a 128-bit vector, convert it to the equivalent 64-bit vector.
+ Do nothing otherwise. */
+static void
+aarch64_convert_to_v64 (expand_operand *op)
+{
+ if (known_eq (GET_MODE_BITSIZE (op->mode), 128u))
+ {
+ op->mode = aarch64_v64_mode (GET_MODE_INNER (op->mode)).require ();
+ op->value = gen_lowpart (op->mode, op->value);
+ }
+}
+
+/* UNSPEC is a high unspec, indicated by "2" in mnemonics and "_high" in
+ intrinsic names. Return the equivalent low unspec. */
+static int
+aarch64_get_low_unspec (int unspec)
+{
+ switch (unspec)
+ {
+ case UNSPEC_FCVTN2_FP8:
+ return UNSPEC_FCVTN_FP8;
+ case UNSPEC_F1CVTL2_FP8:
+ return UNSPEC_F1CVTL_FP8;
+ case UNSPEC_F2CVTL2_FP8:
+ return UNSPEC_F2CVTL_FP8;
+ default:
+ gcc_unreachable ();
+ }
+}
+
/* Expand CALL_EXPR EXP, given that it is a call to the function described
by BUILTIN_DATA, and return the function's return value. Put the result
in TARGET if convenient. */
TYPE_MODE (TREE_TYPE (arg)));
}
- /* LUTI2 treats the first argument as a vector of 4 elements. The forms
- with 128-bit inputs are only provided as a convenience; the upper halves
- don't actually matter. */
- if (builtin_data.unspec == UNSPEC_LUTI2
- && known_eq (GET_MODE_BITSIZE (ops[1].mode), 128u))
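+  /* The fpm_t argument is not an operand of the instruction itself; instead,
+     move its value into the FPMR register, which the FP8 patterns read.  */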
+ if (builtin_data.flags & FLAG_USES_FPMR)
+ {
+ auto fpm_input = ops.pop ().value;
+ auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
+ emit_move_insn (fpmr, fpm_input);
+ }
+
+ switch (builtin_data.unspec)
{
- ops[1].mode = aarch64_v64_mode (GET_MODE_INNER (ops[1].mode)).require ();
- ops[1].value = gen_lowpart (ops[1].mode, ops[1].value);
+ case UNSPEC_F1CVTL_FP8:
+ case UNSPEC_F2CVTL_FP8:
+ /* Convert _low forms (which take 128-bit vectors) to the base
+ 64-bit forms. */
+ aarch64_convert_to_v64 (&ops[1]);
+ break;
+
+ case UNSPEC_LUTI2:
+ /* LUTI2 treats the first argument as a vector of 4 elements. The forms
+ with 128-bit inputs are only provided as a convenience; the upper
+ halves don't actually matter. */
+ aarch64_convert_to_v64 (&ops[1]);
+ break;
}
insn_code icode;
{
case UNSPEC_FAMAX:
case UNSPEC_FAMIN:
- icode = code_for_aarch64 (builtin_data.unspec,
- builtin_data.types[0].mode);
+ case UNSPEC_F1CVTL_FP8:
+ case UNSPEC_F2CVTL_FP8:
+ case UNSPEC_FSCALE:
+ icode = code_for_aarch64 (builtin_data.unspec, ops[0].mode);
+ break;
+
+ case UNSPEC_F1CVTL2_FP8:
+ case UNSPEC_F2CVTL2_FP8:
+ {
+ /* Add a high-part selector for the vec_merge. */
+ auto src_mode = ops.last ().mode;
+ auto nunits = GET_MODE_NUNITS (src_mode).to_constant ();
+ rtx par = aarch64_simd_vect_par_cnst_half (src_mode, nunits, true);
+ create_fixed_operand (ops.safe_push ({}), par);
+
+ auto unspec = aarch64_get_low_unspec (builtin_data.unspec);
+ icode = code_for_aarch64_high (unspec, ops[0].mode);
+ break;
+ }
+
+ case UNSPEC_FCVTN_FP8:
+ icode = code_for_aarch64 (builtin_data.unspec, ops[1].mode);
break;
+ case UNSPEC_FCVTN2_FP8:
+ {
+ auto unspec = aarch64_get_low_unspec (builtin_data.unspec);
+ auto mode = ops.last ().mode;
+ if (BYTES_BIG_ENDIAN)
+ icode = code_for_aarch64_high_be (unspec, mode);
+ else
+ icode = code_for_aarch64_high_le (unspec, mode);
+ break;
+ }
+
case UNSPEC_LUTI2:
case UNSPEC_LUTI4:
create_integer_operand (ops.safe_push ({}),
aarch64_def_or_undef (TARGET_SVE_BF16,
"__ARM_FEATURE_SVE_BF16", pfile);
+ aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
+
aarch64_def_or_undef (TARGET_LS64,
"__ARM_FEATURE_LS64", pfile);
aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
#undef ENTRY_BINARY
#define ENTRY_BINARY(N, T0, T1, T2, U, F) \
- ENTRY (N, binary, T0, T1, T2, U, F)
+ ENTRY (N, binary, T0, T1, T2, none, U, F)
#undef ENTRY_BINARY_LANE
#define ENTRY_BINARY_LANE(N, T0, T1, T2, U, F) \
- ENTRY (N, binary_lane, T0, T1, T2, U, F)
+ ENTRY (N, binary_lane, T0, T1, T2, none, U, F)
+
+#undef ENTRY_TERNARY
+#define ENTRY_TERNARY(N, T0, T1, T2, T3, U, F) \
+ ENTRY (N, ternary, T0, T1, T2, T3, U, F)
+
+#undef ENTRY_UNARY
+#define ENTRY_UNARY(N, T0, T1, U, F) \
+ ENTRY (N, unary, T0, T1, none, none, U, F)
#undef ENTRY_BINARY_VHSDF
#define ENTRY_BINARY_VHSDF(NAME, UNSPEC, FLAGS) \
ENTRY_BINARY (NAME##q_f32, f32q, f32q, f32q, UNSPEC, FLAGS) \
ENTRY_BINARY (NAME##q_f64, f64q, f64q, f64q, UNSPEC, FLAGS)
+#undef ENTRY_BINARY_VHSDF_SIGNED
+#define ENTRY_BINARY_VHSDF_SIGNED(NAME, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##_f16, f16, f16, s16, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f16, f16q, f16q, s16q, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##_f32, f32, f32, s32, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f32, f32q, f32q, s32q, UNSPEC, FLAGS) \
+ ENTRY_BINARY (NAME##q_f64, f64q, f64q, s64q, UNSPEC, FLAGS)
+
#undef ENTRY_TERNARY_VLUT8
#define ENTRY_TERNARY_VLUT8(T) \
ENTRY_BINARY_LANE (vluti2_lane_##T##8, T##8q, T##8, u8, \
ENTRY_BINARY_LANE (vluti4q_laneq_##T##16_x2, T##16q, T##16qx2, u8q, \
UNSPEC_LUTI4, QUIET)
+#undef ENTRY_UNARY_VQ_BHF
+#define ENTRY_UNARY_VQ_BHF(N, T1, UNSPEC, FLAGS) \
+ ENTRY_UNARY (N##_bf16_mf8_fpm, bf16q, T1, UNSPEC, FLAGS) \
+ ENTRY_UNARY (N##_f16_mf8_fpm, f16q, T1, UNSPEC, FLAGS)
+
// faminmax
#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
ENTRY_BINARY_VHSDF (vamax, UNSPEC_FAMAX, FP)
ENTRY_TERNARY_VLUT16 (s)
ENTRY_TERNARY_VLUT16 (u)
#undef REQUIRED_EXTENSIONS
+
+// fpm conversion
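+// (FCVTN, F1CVTL and F2CVTL, with a "b" prefix for the bfloat16 forms)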
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1, f8, UNSPEC_F1CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1_high, f8q, UNSPEC_F1CVTL2_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt1_low, f8q, UNSPEC_F1CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2, f8, UNSPEC_F2CVTL_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2_high, f8q, UNSPEC_F2CVTL2_FP8, FP8)
+ENTRY_UNARY_VQ_BHF (vcvt2_low, f8q, UNSPEC_F2CVTL_FP8, FP8)
+
+ENTRY_BINARY (vcvt_mf8_f16_fpm, f8, f16, f16, UNSPEC_FCVTN_FP8, FP8)
+ENTRY_BINARY (vcvtq_mf8_f16_fpm, f8q, f16q, f16q, UNSPEC_FCVTN_FP8, FP8)
+ENTRY_BINARY (vcvt_mf8_f32_fpm, f8, f32q, f32q, UNSPEC_FCVTN_FP8, FP8)
+
+ENTRY_TERNARY (vcvt_high_mf8_f32_fpm, f8q, f8, f32q, f32q,
+ UNSPEC_FCVTN2_FP8, FP8)
+#undef REQUIRED_EXTENSIONS
+
+// fpm scaling
+#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
+ENTRY_BINARY_VHSDF_SIGNED (vscale, UNSPEC_FSCALE, FP)
+#undef REQUIRED_EXTENSIONS
"TARGET_LUT && INTVAL (operands[4]) == 4"
"luti%4\t%0.8h, {%S1.8h, %T1.8h}, %2[%3]"
)
+
+;; fpm unary instructions (low part).
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:VQ_BHF 0 "register_operand" "=w")
+ (unspec:VQ_BHF
+ [(match_operand:V8QI 1 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_UNARY_UNS))]
+ "TARGET_FP8"
+ "<b><insn>\t%0.<Vtype>, %1.8b"
+)
+
+;; fpm unary instructions (high part).
+(define_insn "@aarch64_<insn><mode>_high"
+ [(set (match_operand:VQ_BHF 0 "register_operand" "=w")
+ (unspec:VQ_BHF
+ [(vec_select:V8QI
+ (match_operand:V16QI 1 "register_operand" "w")
+ (match_operand:V16QI 2 "vect_par_cnst_hi_half"))
+ (reg:DI FPM_REGNUM)]
+ FPM_UNARY_UNS))]
+ "TARGET_FP8"
+ "<b><insn>2\t%0.<Vtype>, %1.16b"
+)
+
+;; fpm binary instructions.
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:<VPACKB> 0 "register_operand" "=w")
+ (unspec:<VPACKB>
+ [(match_operand:VCVTFPM 1 "register_operand" "w")
+ (match_operand:VCVTFPM 2 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS))]
+ "TARGET_FP8"
+ "<insn>\t%0.<VPACKBtype>, %1.<Vtype>, %2.<Vtype>"
+)
+
+;; fpm binary instructions & merge with low.
+(define_insn "@aarch64_<insn><mode>_high_le"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "register_operand" "0")
+ (unspec:V8QI
+ [(match_operand:V4SF_ONLY 2 "register_operand" "w")
+ (match_operand:V4SF_ONLY 3 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS)))]
+ "TARGET_FP8 && !BYTES_BIG_ENDIAN"
+ "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
+)
+
+(define_insn "@aarch64_<insn><mode>_high_be"
+ [(set (match_operand:V16QI 0 "register_operand" "=w")
+ (vec_concat:V16QI
+ (unspec:V8QI
+ [(match_operand:V4SF_ONLY 2 "register_operand" "w")
+ (match_operand:V4SF_ONLY 3 "register_operand" "w")
+ (reg:DI FPM_REGNUM)]
+ FPM_BINARY_UNS)
+ (match_operand:V8QI 1 "register_operand" "0")))]
+ "TARGET_FP8 && BYTES_BIG_ENDIAN"
+ "<insn>2\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
+)
+
+;; fscale instructions
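+;; Operand 2 holds the integer exponent adjustments; <FCVT_TARGET> gives the
+;; integer vector mode with the same element width as the FP mode.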
+(define_insn "@aarch64_<insn><mode>"
+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
+ (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
+ (match_operand:<FCVT_TARGET> 2 "register_operand" "w")]
+ FSCALE_UNS))]
+ "TARGET_FP8"
+ "<insn>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+)
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator SI_ONLY [SI])
(define_mode_iterator DI_ONLY [DI])
+(define_mode_iterator V4SF_ONLY [V4SF])
;; Iterator for all integer modes (up to 64-bit)
(define_mode_iterator ALLI [QI HI SI DI])
;; Advanced SIMD single Float modes.
(define_mode_iterator VDQSF [V2SF V4SF])
+;; Quad vector float modes with half/bfloat elements.
+(define_mode_iterator VQ_BHF [V8HF V8BF])
+
;; Quad vector Float modes with half/single elements.
(define_mode_iterator VQ_HSF [V8HF V4SF])
(define_mode_iterator VLUT [V8QI V16QI V4HI V4HF V4BF])
(define_mode_iterator VLUTx2 [V2x8HI V2x8HF V2x8BF])
+;; Modes available for Advanced SIMD FP8 conversion operations.
+(define_mode_iterator VCVTFPM [V4HF V8HF V4SF])
+
;; Iterators for single modes, for "@" patterns.
(define_mode_iterator VNx16QI_ONLY [VNx16QI])
(define_mode_iterator VNx16SI_ONLY [VNx16SI])
UNSPEC_ASHIFT_SIGNED ; Used in aarch64-simd.md.
UNSPEC_ASHIFT_UNSIGNED ; Used in aarch64-simd.md.
UNSPEC_ABS ; Used in aarch64-simd.md.
+ UNSPEC_FCVTN_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_FCVTN2_FP8 ; Used in aarch64-builtins.cc.
+ UNSPEC_F1CVTL_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_F1CVTL2_FP8 ; Used in aarch64-builtins.cc.
+ UNSPEC_F2CVTL_FP8 ; Used in aarch64-simd.md.
+ UNSPEC_F2CVTL2_FP8 ; Used in aarch64-builtins.cc.
UNSPEC_FMAX ; Used in aarch64-simd.md.
UNSPEC_FMAXNMV ; Used in aarch64-simd.md.
UNSPEC_FMAXV ; Used in aarch64-simd.md.
UNSPEC_FMINV ; Used in aarch64-simd.md.
UNSPEC_FADDV ; Used in aarch64-simd.md.
UNSPEC_FNEG ; Used in aarch64-simd.md.
+ UNSPEC_FSCALE ; Used in aarch64-simd.md.
UNSPEC_ADDV ; Used in aarch64-simd.md.
UNSPEC_SMAXV ; Used in aarch64-simd.md.
UNSPEC_SMINV ; Used in aarch64-simd.md.
(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
(V2DI "4s")])
+;; The result of FCVTN on two vectors of the given mode. The result has
+;; twice as many QI elements as the input.
+(define_mode_attr VPACKB [(V4HF "V8QI") (V8HF "V16QI") (V4SF "V8QI")])
+(define_mode_attr VPACKBtype [(V4HF "8b") (V8HF "16b") (V4SF "8b")])
+
;; Widened modes of vector modes.
(define_mode_attr VWIDE [(V8QI "V8HI") (V4HI "V4SI")
(V2SI "V2DI") (V16QI "V8HI")
(V8HI "vec") (V2SI "vec") (V4SI "vec")
(V2DI "vec") (DI "offset")])
-(define_mode_attr b [(VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
+(define_mode_attr b [(V4BF "b") (V4HF "") (V8BF "b") (V8HF "")
+ (VNx8BF "b") (VNx8HF "") (VNx4SF "") (VNx2DF "")
(VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "")
(VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")])
UNSPEC_FMLALLTB_FP8
UNSPEC_FMLALLTT_FP8])
+;; Iterators for fpm instructions
+
+(define_int_iterator FPM_UNARY_UNS [UNSPEC_F1CVTL_FP8 UNSPEC_F2CVTL_FP8])
+
+(define_int_iterator FPM_BINARY_UNS [UNSPEC_FCVTN_FP8])
+
+(define_int_iterator FSCALE_UNS [UNSPEC_FSCALE])
+
;; -------------------------------------------------------------------
;; Int Iterators Attributes.
;; -------------------------------------------------------------------
+;; The AArch64 insn mnemonic associated with an unspec.
+(define_int_attr insn
+ [(UNSPEC_F1CVTL_FP8 "f1cvtl")
+ (UNSPEC_F2CVTL_FP8 "f2cvtl")
+ (UNSPEC_FCVTN_FP8 "fcvtn")
+ (UNSPEC_FSCALE "fscale")])
+
;; The optab associated with an operation. Note that for ANDF, IORF
;; and XORF, the optab pattern is not actually defined; we just use this
;; name for consistency with the integer patterns.
#include <arm_acle.h>
-#ifdef __ARM_FEATURE_FP8
-#error "__ARM_FEATURE_FP8 feature macro defined."
-#endif
-
#pragma GCC push_options
#pragma GCC target("arch=armv9.4-a+fp8")
-/* We do not define __ARM_FEATURE_FP8 until all
- relevant features have been added. */
-#ifdef __ARM_FEATURE_FP8
-#error "__ARM_FEATURE_FP8 feature macro defined."
-#endif
-
/*
**test_write_fpmr_sysreg_asm_64:
** msr fpmr, x0
#ifdef __ARM_FEATURE_GCS
#error Foo
#endif
+
+#pragma GCC target "arch=armv9-a"
+#ifdef __ARM_FEATURE_FP8
+#error Foo
+#endif
+
+#pragma GCC target "arch=armv9-a+fp8"
+#ifndef __ARM_FEATURE_FP8
+#error Foo
+#endif
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vscale_f16:
+** fscale v0.4h, v0.4h, v1.4h
+** ret
+*/
+float16x4_t
+test_vscale_f16 (float16x4_t a, int16x4_t b)
+{
+ return vscale_f16 (a, b);
+}
+
+/*
+** test_vscaleq_f16:
+** fscale v0.8h, v0.8h, v1.8h
+** ret
+*/
+float16x8_t
+test_vscaleq_f16 (float16x8_t a, int16x8_t b)
+{
+ return vscaleq_f16 (a, b);
+}
+
+/*
+** test_vscale_f32:
+** fscale v0.2s, v0.2s, v1.2s
+** ret
+*/
+float32x2_t
+test_vscale_f32 (float32x2_t a, int32x2_t b)
+{
+ return vscale_f32 (a, b);
+}
+
+/*
+** test_vscaleq_f32:
+** fscale v0.4s, v0.4s, v1.4s
+** ret
+*/
+float32x4_t
+test_vscaleq_f32 (float32x4_t a, int32x4_t b)
+{
+ return vscaleq_f32 (a, b);
+}
+
+/*
+** test_vscaleq_f64:
+** fscale v0.2d, v0.2d, v1.2d
+** ret
+*/
+float64x2_t
+test_vscaleq_f64 (float64x2_t a, int64x2_t b)
+{
+ return vscaleq_f64 (a, b);
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+** test_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_vcvt1_bf16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt1_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl2 v0.8h, v0.16b
+** ret
+*/
+bfloat16x8_t
+test_high_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_high_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt1_bf16:
+** msr fpmr, x0
+** bf1cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_low_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_low_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_vcvt1_f16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt1_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl2 v0.8h, v0.16b
+** ret
+*/
+float16x8_t
+test_high_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_high_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt1_f16:
+** msr fpmr, x0
+** f1cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_low_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt1_low_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_vcvt2_bf16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt2_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl2 v0.8h, v0.16b
+** ret
+*/
+bfloat16x8_t
+test_high_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt2_high_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt2_bf16:
+** msr fpmr, x0
+** bf2cvtl v0.8h, v0.8b
+** ret
+*/
+bfloat16x8_t
+test_low_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt2_low_bf16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_vcvt2_f16 (mfloat8x8_t a, fpm_t b)
+{
+ return vcvt2_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_high_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl2 v0.8h, v0.16b
+** ret
+*/
+float16x8_t
+test_high_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
+{
+ return vcvt2_high_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_low_vcvt2_f16:
+** msr fpmr, x0
+** f2cvtl v0.8h, v0.8b
+** ret
+*/
+float16x8_t
+test_low_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
+{
+  return vcvt2_low_f16_mf8_fpm(a, b);
+}
+
+/*
+** test_vcvt_f16:
+** msr fpmr, x0
+** fcvtn v0.8b, v0.4h, v1.4h
+** ret
+*/
+mfloat8x8_t
+test_vcvt_f16 (float16x4_t a, float16x4_t b, fpm_t c)
+{
+ return vcvt_mf8_f16_fpm(a, b, c);
+}
+
+/*
+** test_vcvtq_f16:
+** msr fpmr, x0
+** fcvtn v0.16b, v0.8h, v1.8h
+** ret
+*/
+mfloat8x16_t
+test_vcvtq_f16 (float16x8_t a, float16x8_t b, fpm_t c)
+{
+ return vcvtq_mf8_f16_fpm(a, b, c);
+}
+
+/*
+** test_vcvt_f32:
+** msr fpmr, x0
+** fcvtn v0.8b, v0.4s, v1.4s
+** ret
+*/
+mfloat8x8_t
+test_vcvt_f32 (float32x4_t a, float32x4_t b, fpm_t c)
+{
+ return vcvt_mf8_f32_fpm(a, b, c);
+}
+
+/*
+** test_vcvt_high_f32:
+** msr fpmr, x0
+** fcvtn2 v0.16b, v1.4s, v2.4s
+** ret
+*/
+mfloat8x16_t
+test_vcvt_high_f32 (mfloat8x8_t a, float32x4_t b, float32x4_t c, fpm_t d)
+{
+ return vcvt_high_mf8_f32_fpm(a, b, c, d);
+}