bool aarch64_pars_overlap_p (rtx, rtx);
bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
+bool aarch64_sve_valid_pred_p (rtx, machine_mode);
bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
bool aarch64_simd_valid_and_imm (rtx);
bool aarch64_simd_valid_and_imm_fmov (rtx, unsigned int * = NULL);
rtx aarch64_ptrue_reg (machine_mode, machine_mode);
rtx aarch64_pfalse_reg (machine_mode);
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
+rtx aarch64_sve_packed_pred (machine_mode);
+rtx aarch64_sve_fp_pred (machine_mode, rtx *);
void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
bool aarch64_expand_maskloadstore (rtx *, machine_mode);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
;; ---- [FP<-INT] Packs
;; ---- [FP<-INT] Unpacks
;; ---- [FP<-FP] Packs
+;; ---- [FP<-FP] Truncating conversions
;; ---- [FP<-FP] Packs (bfloat16)
;; ---- [FP<-FP] Unpacks
+;; ---- [FP<-FP] Extending conversions
;; ---- [PRED<-PRED] Packs
;; ---- [PRED<-PRED] Unpacks
;;
;; - FCVTZU
;; -------------------------------------------------------------------------
-;; Unpredicated conversion of floats to integers of the same size (HF to HI,
-;; SF to SI or DF to DI).
-(define_expand "<optab><mode><v_int_equiv>2"
- [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
- (unspec:<V_INT_EQUIV>
+;; Unpredicated conversion of floats to integers of the same size or wider,
+;; excluding conversions from DF (see below).
+(define_expand "<optab><SVE_HSF:mode><SVE_HSDI:mode>2"
+ [(set (match_operand:SVE_HSDI 0 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_dup 2)
+ (match_dup 3)
+ (match_operand:SVE_HSF 1 "register_operand")]
+ SVE_COND_FCVTI))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_HSF:self_mask>) == 0"
+ {
+ operands[2] = aarch64_sve_fp_pred (<SVE_HSDI:MODE>mode, &operands[3]);
+ }
+)
+
+;; SI <- DF can't use SI <- trunc (DI <- DF) without -ffast-math, so this
+;; truncating variant of FCVTZ{S,U} is useful for auto-vectorization.
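+;;
+;; For example, a loop such as (an illustrative sketch, not from the
+;; testsuite)
+;;   void f (int32_t *x, double *y, int n)
+;;   { for (int i = 0; i < n; ++i) x[i] = (int32_t) y[i]; }
+;; can then be vectorized using FCVTZS .s, .d directly.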
+;;
+;; DF is the only source mode for which the mask used above doesn't apply,
+;; so we define a separate pattern for it here.
+(define_expand "<optab><VNx2DF_ONLY:mode><SVE_2SDI:mode>2"
+ [(set (match_operand:SVE_2SDI 0 "register_operand")
+ (unspec:SVE_2SDI
[(match_dup 2)
(const_int SVE_RELAXED_GP)
- (match_operand:SVE_FULL_F 1 "register_operand")]
+ (match_operand:VNx2DF_ONLY 1 "register_operand")]
SVE_COND_FCVTI))]
"TARGET_SVE"
{
- operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[2] = aarch64_ptrue_reg (VNx2BImode);
}
)
}
)
-;; Predicated narrowing float-to-integer conversion.
-(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><VNx4SI_ONLY:mode>"
- [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
- (unspec:VNx4SI_ONLY
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn "*aarch64_sve_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>"
+ [(set (match_operand:SVE_HSDI 0 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_operand:<SVE_HSDI:VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 3 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_PARTIAL_F 2 "register_operand")]
+ SVE_COND_FCVTI))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
+ [ w , Upl , 0 ; * ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ }
+)
+
+;; Predicated narrowing float-to-integer conversion. The VNx2DF->VNx4SI
+;; variant is provided for the ACLE, where the zeroed odd-indexed lanes are
+;; significant. The VNx2DF->VNx2SI variant is provided for auto-vectorization,
+;; where the upper 32 bits of each container are ignored.
+(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><SVE_SI:mode>"
+ [(set (match_operand:SVE_SI 0 "register_operand")
+ (unspec:SVE_SI
[(match_operand:VNx2BI 1 "register_operand")
(match_operand:SI 3 "aarch64_sve_gp_strictness")
(match_operand:VNx2DF_ONLY 2 "register_operand")]
SVE_COND_FCVTI))]
"TARGET_SVE"
{@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
- [ w , Upl , 0 ; * ] fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
- [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ [ w , Upl , 0 ; * ] fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
}
)
;; - UCVTF
;; -------------------------------------------------------------------------
-;; Unpredicated conversion of integers to floats of the same size
-;; (HI to HF, SI to SF or DI to DF).
-(define_expand "<optab><v_int_equiv><mode>2"
- [(set (match_operand:SVE_FULL_F 0 "register_operand")
- (unspec:SVE_FULL_F
+;; Unpredicated conversion of integers to floats of the same size or
+;; narrower.
+(define_expand "<optab><SVE_HSDI:mode><SVE_F:mode>2"
+ [(set (match_operand:SVE_F 0 "register_operand")
+ (unspec:SVE_F
[(match_dup 2)
- (const_int SVE_RELAXED_GP)
- (match_operand:<V_INT_EQUIV> 1 "register_operand")]
+ (match_dup 3)
+ (match_operand:SVE_HSDI 1 "register_operand")]
SVE_COND_ICVTF))]
- "TARGET_SVE"
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_F:self_mask>) == 0"
{
- operands[2] = aarch64_ptrue_reg (<VPRED>mode);
+ operands[2] = aarch64_sve_fp_pred (<SVE_HSDI:MODE>mode, &operands[3]);
}
)
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn "*aarch64_sve_<optab>_nonextend<SVE_HSDI:mode><SVE_PARTIAL_F:mode>"
+ [(set (match_operand:SVE_PARTIAL_F 0 "register_operand")
+ (unspec:SVE_PARTIAL_F
+ [(match_operand:<SVE_HSDI:VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 3 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_HSDI 2 "register_operand")]
+ SVE_COND_ICVTF))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
+ [ w , Upl , 0 ; * ] <su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ }
+)
+
;; Predicated widening integer-to-float conversion.
(define_insn "@aarch64_sve_<optab>_extend<VNx4SI_ONLY:mode><VNx2DF_ONLY:mode>"
[(set (match_operand:VNx2DF_ONLY 0 "register_operand")
}
)
+;; -------------------------------------------------------------------------
+;; ---- [FP<-FP] Truncating conversions
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - FCVT
+;; -------------------------------------------------------------------------
+
+;; Unpredicated float-to-float truncation.
+(define_expand "trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>2"
+ [(set (match_operand:SVE_PARTIAL_HSF 0 "register_operand")
+ (unspec:SVE_PARTIAL_HSF
+ [(match_dup 2)
+ (match_dup 3)
+ (match_operand:SVE_SDF 1 "register_operand")]
+ SVE_COND_FCVT))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {
+ operands[2] = aarch64_sve_fp_pred (<SVE_SDF:MODE>mode, &operands[3]);
+ }
+)
+
;; Predicated float-to-float truncation.
(define_insn "@aarch64_sve_<optab>_trunc<SVE_FULL_SDF:mode><SVE_FULL_HSF:mode>"
[(set (match_operand:SVE_FULL_HSF 0 "register_operand")
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn "*aarch64_sve_<optab>_trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>"
+ [(set (match_operand:SVE_PARTIAL_HSF 0 "register_operand")
+ (unspec:SVE_PARTIAL_HSF
+ [(match_operand:<SVE_SDF:VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 3 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_SDF 2 "register_operand")]
+ SVE_COND_FCVT))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
+ [ w , Upl , 0 ; * ] fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ }
+)
+
;; Predicated float-to-float truncation with merging.
(define_expand "@cond_<optab>_trunc<SVE_FULL_SDF:mode><SVE_FULL_HSF:mode>"
[(set (match_operand:SVE_FULL_HSF 0 "register_operand")
}
)
+;; -------------------------------------------------------------------------
+;; ---- [FP<-FP] Extending conversions
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - FCVT
+;; -------------------------------------------------------------------------
+
+;; Unpredicated float-to-float extension.
+(define_expand "extend<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>2"
+ [(set (match_operand:SVE_SDF 0 "register_operand")
+ (unspec:SVE_SDF
+ [(match_dup 2)
+ (match_dup 3)
+ (match_operand:SVE_PARTIAL_HSF 1 "register_operand")]
+ SVE_COND_FCVT))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {
+ operands[2] = aarch64_sve_fp_pred (<SVE_SDF:MODE>mode, &operands[3]);
+ }
+)
+
;; Predicated float-to-float extension.
(define_insn "@aarch64_sve_<optab>_nontrunc<SVE_FULL_HSF:mode><SVE_FULL_SDF:mode>"
[(set (match_operand:SVE_FULL_SDF 0 "register_operand")
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn "*aarch64_sve_<optab>_nontrunc<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>"
+ [(set (match_operand:SVE_SDF 0 "register_operand")
+ (unspec:SVE_SDF
+ [(match_operand:<SVE_SDF:VPRED> 1 "aarch64_predicate_operand")
+ (match_operand:SI 3 "aarch64_sve_gp_strictness")
+ (match_operand:SVE_PARTIAL_HSF 2 "register_operand")]
+ SVE_COND_FCVT))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
+ [ w , Upl , 0 ; * ] fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ [ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ }
+)
+
;; Predicated float-to-float extension with merging.
(define_expand "@cond_<optab>_nontrunc<SVE_FULL_HSF:mode><SVE_FULL_SDF:mode>"
[(set (match_operand:SVE_FULL_SDF 0 "register_operand")
return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
}
+
+/* Generate a predicate to control partial SVE mode DATA_MODE as if it
+ were fully packed, enabling the defined elements only. */
+rtx
+aarch64_sve_packed_pred (machine_mode data_mode)
+{
+ unsigned int container_bytes
+ = aarch64_sve_container_bits (data_mode) / BITS_PER_UNIT;
+  /* Enable only the lowest-numbered bit of each container.  */
+ rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all (container_bytes));
+ /* Predicate at the element size. */
+ machine_mode pmode
+ = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (data_mode)).require ();
+ return gen_lowpart (pmode, ptrue);
+}
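+
+/* For example, aarch64_sve_packed_pred (VNx2SFmode) builds a PTRUE with one
+   active bit per 8-byte container and returns its VNx4BI lowpart, in which
+   every other element is active.  */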
+
+/* Generate a predicate and strictness value to govern a floating-point
+ operation with SVE mode DATA_MODE.
+
+   If DATA_MODE is a partial vector mode, this pair prevents the operation
+   from interpreting the undefined elements, unless we do not need to
+   suppress their trapping behavior (e.g. with -fno-trapping-math).  */
+rtx
+aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
+{
+ unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
+ if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
+ {
+ if (strictness)
+ *strictness = gen_int_mode (SVE_STRICT_GP, SImode);
+ return aarch64_sve_packed_pred (data_mode);
+ }
+ if (strictness)
+ *strictness = gen_int_mode (SVE_RELAXED_GP, SImode);
+ /* Use the VPRED mode. */
+ return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
+}
+
/* Emit a comparison CMP between OP0 and OP1, both of which have mode
DATA_MODE, and return the result in a predicate of mode PRED_MODE.
Use TARGET as the target register if nonnull and convenient. */
return IN_RANGE (INTVAL (x), 1, bit_width);
}
+
+/* Check whether X can control SVE mode MODE. */
+bool
+aarch64_sve_valid_pred_p (rtx x, machine_mode mode)
+{
+ machine_mode pred_mode = GET_MODE (x);
+ if (!aarch64_sve_pred_mode_p (pred_mode))
+ return false;
+
+ return known_ge (GET_MODE_NUNITS (pred_mode),
+ GET_MODE_NUNITS (mode));
+}
+
/* Return the bitmask CONST_INT to select the bits required by a zero extract
operation of width WIDTH at bit position POS. */
;; elements.
(define_mode_iterator SVE_FULL_HSF [VNx8HF VNx4SF])
+;; Partial SVE floating-point vector modes that have 16-bit or 32-bit
+;; elements.
+(define_mode_iterator SVE_PARTIAL_HSF [VNx2HF VNx4HF VNx2SF])
+
+;; SVE floating-point vector modes that have 16-bit or 32-bit elements.
+(define_mode_iterator SVE_HSF [SVE_PARTIAL_HSF SVE_FULL_HSF])
+
;; Fully-packed SVE integer vector modes that have 16-bit or 64-bit elements.
(define_mode_iterator SVE_FULL_HDI [VNx8HI VNx2DI])
(define_mode_iterator SVE_MATMULF [(VNx4SF "TARGET_SVE_F32MM")
(VNx2DF "TARGET_SVE_F64MM")])
+;; SVE floating-point vector modes that have 32-bit or 64-bit elements.
+(define_mode_iterator SVE_SDF [VNx2SF SVE_FULL_SDF])
+
;; Fully-packed SVE vector modes that have 32-bit or smaller elements.
(define_mode_iterator SVE_FULL_BHS [VNx16QI VNx8HI VNx4SI
VNx8BF VNx8HF VNx4SF])
VNx4SI VNx2SI
VNx2DI])
+;; SVE integer vector modes with 32-bit elements.
+(define_mode_iterator SVE_SI [VNx2SI VNx4SI])
+
(define_mode_iterator SVE_DIx24 [VNx4DI VNx8DI])
;; SVE modes with 2 or 4 elements.
(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2BF
VNx2SI VNx2SF VNx2DI VNx2DF])
+;; SVE SI and DI modes with 2 elements.
+(define_mode_iterator SVE_2SDI [VNx2SI VNx2DI])
+
;; SVE integer modes with 2 elements, excluding the widest element.
(define_mode_iterator SVE_2BHSI [VNx2QI VNx2HI VNx2SI])
(define_mode_attr data_bytes [(VNx16BI "1") (VNx8BI "2")
(VNx4BI "4") (VNx2BI "8")])
-;; Two-nybble mask for partial vector modes: nunits, byte size.
-(define_mode_attr self_mask [(VNx8QI "0x81")
- (VNx4QI "0x41")
- (VNx2QI "0x21")
- (VNx4HI "0x42")
- (VNx2HI "0x22")
- (VNx2SI "0x24")])
-
-;; For SVE_HSDI vector modes, the mask of narrower modes, encoded as above.
-(define_mode_attr narrower_mask [(VNx8HI "0x81") (VNx4HI "0x41")
- (VNx2HI "0x21")
- (VNx4SI "0x43") (VNx2SI "0x23")
- (VNx2DI "0x27")])
+;; Two-nybble mask for vector modes: nunits, byte size.
+(define_mode_attr self_mask [(VNx2HI "0x22") (VNx2HF "0x22")
+ (VNx4HI "0x42") (VNx4HF "0x42")
+ (VNx8HI "0x82") (VNx8HF "0x82")
+ (VNx2SI "0x24") (VNx2SF "0x24")
+ (VNx4SI "0x44") (VNx4SF "0x44")
+ (VNx2DI "0x28") (VNx2DF "0x28")
+ (VNx8QI "0x81") (VNx4QI "0x41") (VNx2QI "0x21")])
+
+;; The mask of narrower vector modes, encoded as above.
+(define_mode_attr narrower_mask [(VNx8HI "0x81") (VNx8HF "0x81")
+ (VNx4HI "0x41") (VNx4HF "0x41")
+ (VNx2HI "0x21") (VNx2HF "0x21")
+ (VNx4SI "0x43") (VNx4SF "0x43")
+ (VNx2SI "0x23") (VNx2SF "0x23")
+ (VNx2DI "0x27") (VNx2DF "0x27")])
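+
+;; As a worked example, the unpredicated FP<->INT conversion expands in
+;; aarch64-sve.md test
+;;   (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>)
+;;    & <SVE_HSF:self_mask>) == 0
+;; which accepts VNx4HF->VNx4SI, since ~(0x44 | 0x43) & 0x42 == 0 (same
+;; nunits nybble, and the 16-bit element size is covered), but rejects
+;; VNx8HF->VNx4SI, since ~(0x44 | 0x43) & 0x82 == 0x80 (the nunits
+;; nybbles differ).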
;; The constraint to use for an SVE [SU]DOT, FMUL, FMLA or FMLS lane index.
(define_mode_attr sve_lane_con [(VNx8HI "y") (VNx4SI "y") (VNx2DI "x")
return aarch64_simd_shift_imm_p (op, mode, false);
})
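+
+;; A governing predicate whose mode may have more elements than the mode
+;; the context requires.  For example, the VNx4BI predicate created by
+;; aarch64_sve_fp_pred for a partial VNx2SI operation is accepted where a
+;; VNx2BI predicate operand is expected.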
+(define_special_predicate "aarch64_predicate_operand"
+ (and (match_code "reg,subreg")
+ (match_test "register_operand (op, GET_MODE (op))")
+ (match_test "aarch64_sve_valid_pred_p (op, mode)")))
+
(define_predicate "aarch64_simd_imm_zero"
(and (match_code "const,const_vector")
(match_test "op == CONST0_RTX (GET_MODE (op))")))
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
#include <stdint.h>
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
#include <stdint.h>
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
void __attribute__ ((noinline, noclone))
pack_float_plus_1point1 (float *d, double *s, int size)
/* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-options "-O2 -ftree-vectorize --param aarch64-vect-compare-costs=0" } */
void __attribute__ ((noinline, noclone))
unpack_float_plus_7point9 (double *d, float *s, int size)
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint.h>
+
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+
+typedef float v32sf __attribute__((vector_size(128)));
+typedef float v64sf __attribute__((vector_size(256)));
+
+typedef double v32df __attribute__((vector_size(256)));
+
+typedef int16_t v32hi __attribute__((vector_size(64)));
+typedef int16_t v64hi __attribute__((vector_size(128)));
+typedef uint16_t v32uhi __attribute__((vector_size(64)));
+typedef uint16_t v64uhi __attribute__((vector_size(128)));
+
+typedef int32_t v32si __attribute__((vector_size(128)));
+typedef int32_t v64si __attribute__((vector_size(256)));
+typedef uint32_t v32usi __attribute__((vector_size(128)));
+typedef uint32_t v64usi __attribute__((vector_size(256)));
+
+typedef int64_t v32di __attribute__((vector_size(256)));
+typedef uint64_t v32udi __attribute__((vector_size(256)));
+
+/*
+** float_2hf2hi:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** scvtf (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v32hf
+float_2hf2hi (v32hi x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2uhi:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** ucvtf (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v32hf
+float_2hf2uhi (v32uhi x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2si:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** scvtf (z[0-9]+)\.h, \2/m, \1\.s
+** ...
+*/
+v32hf
+float_2hf2si (v32si x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2usi:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** ucvtf (z[0-9]+)\.h, \2/m, \1\.s
+** ...
+*/
+v32hf
+float_2hf2usi (v32usi x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2di:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** scvtf (z[0-9]+)\.h, \1/m, \2\.d
+** ...
+*/
+v32hf
+float_2hf2di (v32di x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_2hf2udi:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** ucvtf (z[0-9]+)\.h, \1/m, \2\.d
+** ...
+*/
+v32hf
+float_2hf2udi (v32udi x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** float_4hf4hi:
+** ...
+** ld1h (z[0-9]+)\.s, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.s, vl64
+** scvtf (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v64hf
+float_4hf4hi (v64hi x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** float_4hf4uhi:
+** ...
+** ld1h (z[0-9]+)\.s, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.s, vl64
+** ucvtf (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v64hf
+float_4hf4uhi (v64uhi x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** float_4hf4si:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.s, \1/z, \[x0\]
+** scvtf (z[0-9]+)\.h, \1/m, \2\.s
+** ...
+*/
+v64hf
+float_4hf4si (v64si x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** float_4hf4usi:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.s, \1/z, \[x0\]
+** ucvtf (z[0-9]+)\.h, \1/m, \2\.s
+** ...
+*/
+v64hf
+float_4hf4usi (v64usi x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** float_2sf2si:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** scvtf (z[0-9]+)\.s, \2/m, \1\.s
+** ...
+*/
+v32sf
+float_2sf2si (v32si x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/*
+** float_2sf2usi:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** ucvtf (z[0-9]+)\.s, \2/m, \1\.s
+** ...
+*/
+v32sf
+float_2sf2usi (v32usi x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/*
+** float_2sf2di:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** scvtf (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32sf
+float_2sf2di (v32di x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/*
+** float_2sf2udi:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** ucvtf (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32sf
+float_2sf2udi (v32udi x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/* { dg-final { check-function-bodies "**" "" ""} } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#include "unpacked_cvtf_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void f64_i32 (double *restrict x, int32_t *restrict y, int n)
+{
+ for (int i = 0; i < n; i++)
+ x[i] = (double)y[i];
+}
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.[sd], p[0-7]/m, z[0-9]+\.d\n} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns -fno-schedule-insns2" } */
+
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+
+typedef float v32sf __attribute__((vector_size(128)));
+typedef float v64sf __attribute__((vector_size(256)));
+
+typedef double v32df __attribute__((vector_size(256)));
+
+/*
+** trunc_2sf2df:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32sf
+trunc_2sf2df (v32df x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/*
+** trunc_2hf2df:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.h, \1/m, \2\.d
+** ...
+*/
+v32hf
+trunc_2hf2df (v32df x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** trunc_4hf4sf:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.s, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.h, \1/m, \2\.s
+** ...
+*/
+v64hf
+trunc_4hf4sf (v64sf x)
+{
+ return __builtin_convertvector (x, v64hf);
+}
+
+/*
+** trunc_2hf2sf:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvt (z[0-9]+)\.h, \2/m, \1\.s
+** ...
+*/
+v32hf
+trunc_2hf2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32hf);
+}
+
+/*
+** extend_2df2hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.d, \1/m, \2\.h
+** ...
+*/
+v32df
+extend_2df2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32df);
+}
+
+/*
+** extend_2df2sf:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.d, \1/m, \2\.s
+** ...
+*/
+v32df
+extend_2df2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32df);
+}
+
+/*
+** extend_4sf4hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** fcvt (z[0-9]+)\.s, \1/m, \2\.h
+** ...
+*/
+v64sf
+extend_4sf4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64sf);
+}
+
+/*
+** extend_2sf2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvt (z[0-9]+)\.s, \2/m, \1\.h
+** ...
+*/
+v32sf
+extend_2sf2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32sf);
+}
+
+/* { dg-final { check-function-bodies "**" "" ""} } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#include "unpacked_fcvt_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns -fno-schedule-insns2" } */
+
+#include <stdint.h>
+
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+
+typedef float v32sf __attribute__((vector_size(128)));
+typedef float v64sf __attribute__((vector_size(256)));
+
+typedef double v32df __attribute__((vector_size(256)));
+
+typedef int16_t v32hi __attribute__((vector_size(64)));
+typedef int16_t v64hi __attribute__((vector_size(128)));
+typedef uint16_t v32uhi __attribute__((vector_size(64)));
+typedef uint16_t v64uhi __attribute__((vector_size(128)));
+
+typedef int32_t v32si __attribute__((vector_size(128)));
+typedef int32_t v64si __attribute__((vector_size(256)));
+typedef uint32_t v32usi __attribute__((vector_size(128)));
+typedef uint32_t v64usi __attribute__((vector_size(256)));
+
+typedef int64_t v32di __attribute__((vector_size(256)));
+typedef uint64_t v32udi __attribute__((vector_size(256)));
+
+
+/*
+** fix_trunc_2hi2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzs (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v32hi
+fix_trunc_2hi2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32hi);
+}
+
+/*
+** fix_trunc_2uhi2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzu (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v32uhi
+fix_trunc_2uhi2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32uhi);
+}
+
+/*
+** fix_trunc_2si2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzs (z[0-9]+)\.s, \2/m, \1\.h
+** ...
+*/
+v32si
+fix_trunc_2si2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32si);
+}
+
+/*
+** fix_trunc_2usi2hf:
+** ...
+** ld1h (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzu (z[0-9]+)\.s, \2/m, \1\.h
+** ...
+*/
+v32usi
+fix_trunc_2usi2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32usi);
+}
+
+/*
+** fix_trunc_2di2hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzs (z[0-9]+)\.d, \1/m, \2\.h
+** ...
+*/
+v32di
+fix_trunc_2di2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32di);
+}
+
+/*
+** fix_trunc_2udi2hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzu (z[0-9]+)\.d, \1/m, \2\.h
+** ...
+*/
+v32udi
+fix_trunc_2udi2hf (v32hf x)
+{
+ return __builtin_convertvector (x, v32udi);
+}
+
+/*
+** fix_trunc_4hi4hf:
+** ...
+** ld1h (z[0-9]+)\.s, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.s, vl64
+** fcvtzs (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v64hi
+fix_trunc_4hi4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64hi);
+}
+
+/*
+** fix_trunc_4uhi4hf:
+** ...
+** ld1h (z[0-9]+)\.s, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.s, vl64
+** fcvtzu (z[0-9]+)\.h, \2/m, \1\.h
+** ...
+*/
+v64uhi
+fix_trunc_4uhi4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64uhi);
+}
+
+/*
+** fix_trunc_4si4hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** fcvtzs (z[0-9]+)\.s, \1/m, \2\.h
+** ...
+*/
+v64si
+fix_trunc_4si4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64si);
+}
+
+/*
+** fix_trunc_4usi4hf:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** fcvtzu (z[0-9]+)\.s, \1/m, \2\.h
+** ...
+*/
+v64usi
+fix_trunc_4usi4hf (v64hf x)
+{
+ return __builtin_convertvector (x, v64usi);
+}
+
+/*
+** fix_trunc_2si2sf:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzs (z[0-9]+)\.s, \2/m, \1\.s
+** ...
+*/
+v32si
+fix_trunc_2si2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32si);
+}
+
+/*
+** fix_trunc_2usi2sf:
+** ...
+** ld1w (z[0-9]+)\.d, p[0-7]/z, \[x0\]
+** ptrue (p[0-7])\.d, vl32
+** fcvtzu (z[0-9]+)\.s, \2/m, \1\.s
+** ...
+*/
+v32usi
+fix_trunc_2usi2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32usi);
+}
+
+/*
+** fix_trunc_2di2sf:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzs (z[0-9]+)\.d, \1/m, \2\.s
+** ...
+*/
+v32di
+fix_trunc_2di2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32di);
+}
+
+/*
+** fix_trunc_2udi2sf:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzu (z[0-9]+)\.d, \1/m, \2\.s
+** ...
+*/
+v32udi
+fix_trunc_2udi2sf (v32sf x)
+{
+ return __builtin_convertvector (x, v32udi);
+}
+
+/*
+** fix_trunc_2si2df:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzs (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32si
+fix_trunc_2si2df (v32df x)
+{
+ return __builtin_convertvector (x, v32si);
+}
+
+/*
+** fix_trunc_2usi2df:
+** ptrue (p[0-7])\.b, vl256
+** ld1d (z[0-9]+)\.d, \1/z, \[x0\]
+** fcvtzu (z[0-9]+)\.s, \1/m, \2\.d
+** ...
+*/
+v32usi
+fix_trunc_2usi2df (v32df x)
+{
+ return __builtin_convertvector (x, v32usi);
+}
+
+/* { dg-final { check-function-bodies "**" "" ""} } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -msve-vector-bits=2048 -fno-trapping-math" } */
+
+#include "unpacked_fcvtz_1.c"
+
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.d} } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-7]\.s} } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b} 16 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */