}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>_relaxed"
+ [(set (match_operand:SVE_HSDI 0 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand")
+ (unspec:SVE_HSDI
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_PARTIAL_F 2 "register_operand")]
+ SVE_COND_FCVTI)
+ (match_operand:SVE_HSDI 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ &w , Upl , w , 0 ; * ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_HSDI:Vetype>, %1/z, %2.<SVE_HSDI:Vetype>\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
(define_insn "*cond_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>_strict"
[(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
(unspec:SVE_FULL_HSDI
}
)
+(define_insn_and_rewrite "*cond_<optab>_trunc<VNx2DF_ONLY:mode><VNx2SI_ONLY:mode>_relaxed"
+ [(set (match_operand:VNx2SI_ONLY 0 "register_operand")
+ (unspec:VNx2SI_ONLY
+ [(match_operand:VNx2BI 1 "register_operand")
+ (unspec:VNx2SI_ONLY
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:VNx2DF_ONLY 2 "register_operand")]
+ SVE_COND_FCVTI)
+ (match_operand:VNx2SI_ONLY 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ &w , Upl , w , 0 ; * ] fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<VNx2DF_ONLY:Vetype>, %1/z, %2.<VNx2DF_ONLY:Vetype>\;fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvtz<su>\t%0.<VNx2SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT<-FP] Packs
;; -------------------------------------------------------------------------
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond_<optab>_nonextend<SVE_HSDI:mode><SVE_PARTIAL_F:mode>_relaxed"
+ [(set (match_operand:SVE_PARTIAL_F 0 "register_operand")
+ (unspec:SVE_PARTIAL_F
+ [(match_operand:<SVE_HSDI:VPRED> 1 "register_operand")
+ (unspec:SVE_PARTIAL_F
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_HSDI 2 "register_operand")]
+ SVE_COND_ICVTF)
+ (match_operand:SVE_PARTIAL_F 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE
+ && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ &w , Upl , w , 0 ; * ] <su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ [ &w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_HSDI:Vetype>, %1/z, %2.<SVE_HSDI:Vetype>\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;<su>cvtf\t%0.<SVE_PARTIAL_F:Vetype>, %1/m, %2.<SVE_HSDI:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
(define_insn "*cond_<optab>_nonextend<SVE_FULL_HSDI:mode><SVE_FULL_F:mode>_strict"
[(set (match_operand:SVE_FULL_F 0 "register_operand")
(unspec:SVE_FULL_F
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond_<optab>_trunc<SVE_SDF:mode><SVE_PARTIAL_HSF:mode>"
+ [(set (match_operand:SVE_PARTIAL_HSF 0 "register_operand")
+ (unspec:SVE_PARTIAL_HSF
+ [(match_operand:<SVE_SDF:VPRED> 1 "register_operand")
+ (unspec:SVE_PARTIAL_HSF
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_SDF 2 "register_operand")]
+ SVE_COND_FCVT)
+ (match_operand:SVE_PARTIAL_HSF 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , Upl , w , 0 ; * ] fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ [ ?&w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_SDF:Vetype>, %1/z, %2.<SVE_SDF:Vetype>\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvt\t%0.<SVE_PARTIAL_HSF:Vetype>, %1/m, %2.<SVE_SDF:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- [FP<-FP] Packs (bfloat16)
;; -------------------------------------------------------------------------
}
)
+;; As above, for pairs that are used by the auto-vectorizer only.
+(define_insn_and_rewrite "*cond_<optab>_nontrunc<SVE_PARTIAL_HSF:mode><SVE_SDF:mode>_relaxed"
+ [(set (match_operand:SVE_SDF 0 "register_operand")
+ (unspec:SVE_SDF
+ [(match_operand:<SVE_SDF:VPRED> 1 "register_operand")
+ (unspec:SVE_SDF
+ [(match_operand 4)
+ (const_int SVE_RELAXED_GP)
+ (match_operand:SVE_PARTIAL_HSF 2 "register_operand")]
+ SVE_COND_FCVT)
+ (match_operand:SVE_SDF 3 "aarch64_simd_reg_or_zero")]
+ UNSPEC_SEL))]
+ "TARGET_SVE && (~<SVE_SDF:narrower_mask> & <SVE_PARTIAL_HSF:self_mask>) == 0"
+ {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
+ [ w , Upl , w , 0 ; * ] fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ [ ?&w , Upl , w , Dz ; yes ] movprfx\t%0.<SVE_SDF:Vetype>, %1/z, %2.<SVE_SDF:Vetype>\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %3\;fcvt\t%0.<SVE_SDF:Vetype>, %1/m, %2.<SVE_PARTIAL_HSF:Vetype>
+ }
+ "&& !rtx_equal_p (operands[1], operands[4])"
+ {
+ operands[4] = copy_rtx (operands[1]);
+ }
+)
+
;; -------------------------------------------------------------------------
;; ---- [PRED<-PRED] Packs
;; -------------------------------------------------------------------------
(define_mode_iterator VNx8SI_ONLY [VNx8SI])
(define_mode_iterator VNx8SF_ONLY [VNx8SF])
(define_mode_iterator VNx8DI_ONLY [VNx8DI])
+(define_mode_iterator VNx2SI_ONLY [VNx2SI])
(define_mode_iterator VNx4SI_ONLY [VNx4SI])
(define_mode_iterator VNx4SF_ONLY [VNx4SF])
(define_mode_iterator VNx2DI_ONLY [VNx2DI])
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */
+
+#include <stdint.h>
+
+#define COND_CVT(TYPE0, TYPE1, TYPE2, COUNT) \
+ void \
+ test_##TYPE0##_##TYPE1##_##TYPE2 (TYPE0 *__restrict out, \
+ TYPE1 *__restrict a, \
+ TYPE0 *__restrict b, \
+ TYPE2 *__restrict p) \
+ { \
+ for (unsigned int i = 0; i < COUNT; i++) \
+ out[i] = p[i] ? (TYPE0)a[i] : b[i]; \
+ }
+
+#define TEST_CVTF(PFX, T) \
+ T (_Float16, PFX##int16_t, uint64_t, 32) \
+ T (_Float16, PFX##int16_t, uint32_t, 64) \
+ T (_Float16, PFX##int32_t, uint64_t, 32) \
+ T (_Float16, PFX##int32_t, uint32_t, 64) \
+ T (_Float16, PFX##int64_t, uint64_t, 32) \
+ T (float, PFX##int32_t, uint64_t, 32) \
+ T (float, PFX##int64_t, uint64_t, 32)
+
+#define TEST_ALL(T) \
+ TEST_CVTF (, T) \
+ TEST_CVTF (u, T)
+
+TEST_ALL (COND_CVT)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 8 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */
+
+#include <stdint.h>
+
+#define COND_CVT(TYPE0, TYPE1, TYPE2, COUNT) \
+ void \
+ test_##TYPE0##_##TYPE1##_##TYPE2 (TYPE0 *__restrict out, \
+ TYPE1 *__restrict a, \
+ TYPE0 *__restrict b, \
+ TYPE2 *__restrict p) \
+ { \
+ for (unsigned int i = 0; i < COUNT; i++) \
+ out[i] = p[i] ? (TYPE0)a[i] : b[i]; \
+ }
+
+#define TEST_FCVT(T) \
+ T (_Float16, float, uint64_t, 32) \
+ T (_Float16, float, uint32_t, 64) \
+ T (_Float16, double, uint64_t, 32) \
+ T (float, double, uint64_t, 32) \
+ T (float, _Float16, uint64_t, 32) \
+ T (float, _Float16, uint32_t, 64) \
+ T (double, _Float16, uint64_t,32) \
+ T (double, float, uint64_t, 32)
+
+TEST_FCVT (COND_CVT)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 4 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.s\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvt\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */
+
+#include <stdint.h>
+
+#define COND_CVT(TYPE0, TYPE1, TYPE2, COUNT) \
+ void \
+ test_##TYPE0##_##TYPE1##_##TYPE2 (TYPE0 *__restrict out, \
+ TYPE1 *__restrict a, \
+ TYPE0 *__restrict b, \
+ TYPE2 *__restrict p) \
+ { \
+ for (unsigned int i = 0; i < COUNT; i++) \
+ out[i] = p[i] ? (TYPE0)a[i] : b[i]; \
+ }
+
+#define TEST_FCVTZ(PFX, T) \
+ T (PFX##int16_t, _Float16, uint64_t, 32) \
+ T (PFX##int16_t, _Float16, uint32_t, 64) \
+ T (PFX##int32_t, _Float16, uint64_t, 32) \
+ T (PFX##int32_t, _Float16, uint32_t, 64) \
+ T (PFX##int64_t, _Float16, uint64_t, 32) \
+ T (PFX##int32_t, float, uint64_t, 32) \
+ T (PFX##int64_t, float, uint64_t, 32) \
+ T (PFX##int32_t, double, uint64_t, 32)
+
+#define TEST_ALL(T) \
+ TEST_FCVTZ (, T) \
+ TEST_FCVTZ (u, T)
+
+TEST_ALL (COND_CVT)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 10 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 6 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 8 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.h\n} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfcvtzs\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcvtzu\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */