From: Spencer Abson Date: Tue, 8 Jul 2025 12:49:42 +0000 (+0000) Subject: aarch64: Relaxed SEL combiner patterns for unpacked SVE FP binary arithmetic X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=317c6a4e09614ec20a6290a8154c5359bc10bd5f;p=thirdparty%2Fgcc.git aarch64: Relaxed SEL combiner patterns for unpacked SVE FP binary arithmetic Extend the binary op/UNSPEC_SEL combiner patterns from SVE_FULL_F/ SVE_FULL_F_B16B16 to SVE_F/SVE_F_B16B16, where the strictness value is SVE_RELAXED_GP. gcc/ChangeLog: * config/aarch64/aarch64-sve.md (*cond__2_relaxed): Extend from SVE_FULL_F_B16B16 to SVE_F_B16B16. (*cond__3_relaxed): Likewise. (*cond__any_relaxed): Likewise. (*cond__any_const_relaxed): Extend from SVE_FULL_F to SVE_F. (*cond_add_2_const_relaxed): Likewise. (*cond_add_any_const_relaxed): Likewise. (*cond_sub_3_const_relaxed): Likewise. (*cond_sub_const_relaxed): Likewise. gcc/testsuite/ChangeLog: * g++.target/aarch64/sve/unpacked_cond_binary_bf16_1.C: New test. * gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fadd_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fdiv_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fmaxnm_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fminnm_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fmul_1.c: Likewise. * gcc.target/aarch64/sve/unpacked_cond_fsubr_1.c: Likewise. --- diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index fc0f51e73a3..b252eef411c 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -5621,14 +5621,14 @@ ;; Predicated floating-point operations, merging with the first input. 
(define_insn_and_rewrite "*cond__2_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) (match_dup 2)] UNSPEC_SEL))] @@ -5664,14 +5664,14 @@ ;; Same for operations that take a 1-bit constant. (define_insn_and_rewrite "*cond__2_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "")] SVE_COND_FP_BINARY_I1) (match_dup 2)] UNSPEC_SEL))] @@ -5707,14 +5707,14 @@ ;; Predicated floating-point operations, merging with the second input. 
(define_insn_and_rewrite "*cond__3_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) (match_dup 3)] UNSPEC_SEL))] @@ -5750,16 +5750,16 @@ ;; Predicated floating-point operations, merging with an independent value. (define_insn_and_rewrite "*cond__any_relaxed" - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") - (unspec:SVE_FULL_F_B16B16 + [(set (match_operand:SVE_F_B16B16 0 "register_operand") + (unspec:SVE_F_B16B16 [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F_B16B16 + (unspec:SVE_F_B16B16 [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")] + (match_operand:SVE_F_B16B16 2 "register_operand") + (match_operand:SVE_F_B16B16 3 "register_operand")] SVE_COND_FP_BINARY) - (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && ( || !) @@ -5832,16 +5832,16 @@ ;; Same for operations that take a 1-bit constant. 
(define_insn_and_rewrite "*cond__any_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "")] SVE_COND_FP_BINARY_I1) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" {@ [ cons: =0 , 1 , 2 , 4 ] @@ -5928,14 +5928,14 @@ ;; Predicated floating-point addition of a constant, merging with the ;; first input. (define_insn_and_rewrite "*cond_add_2_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")] UNSPEC_COND_FADD) (match_dup 2)] UNSPEC_SEL))] @@ -5976,16 +5976,16 @@ ;; Predicated floating-point addition of a constant, merging with an ;; independent value. 
(define_insn_and_rewrite "*cond_add_any_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "register_operand") - (match_operand:SVE_FULL_F 3 "aarch64_sve_float_arith_with_sub_immediate")] + (match_operand:SVE_F 2 "register_operand") + (match_operand:SVE_F 3 "aarch64_sve_float_arith_with_sub_immediate")] UNSPEC_COND_FADD) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" {@ [ cons: =0 , 1 , 2 , 3 , 4 ] @@ -6243,14 +6243,14 @@ ;; Predicated floating-point subtraction from a constant, merging with the ;; second input. (define_insn_and_rewrite "*cond_sub_3_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 4) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_F 3 "register_operand")] UNSPEC_COND_FSUB) (match_dup 3)] UNSPEC_SEL))] @@ -6287,16 +6287,16 @@ ;; Predicated floating-point subtraction from a constant, merging with an ;; independent value. 
(define_insn_and_rewrite "*cond_sub_const_relaxed" - [(set (match_operand:SVE_FULL_F 0 "register_operand") - (unspec:SVE_FULL_F + [(set (match_operand:SVE_F 0 "register_operand") + (unspec:SVE_F [(match_operand: 1 "register_operand") - (unspec:SVE_FULL_F + (unspec:SVE_F [(match_operand 5) (const_int SVE_RELAXED_GP) - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate") - (match_operand:SVE_FULL_F 3 "register_operand")] + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate") + (match_operand:SVE_F 3 "register_operand")] UNSPEC_COND_FSUB) - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[3], operands[4])" {@ [ cons: =0 , 1 , 3 , 4 ] diff --git a/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_1.C b/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_1.C new file mode 100644 index 00000000000..560d874cff7 --- /dev/null +++ b/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_1.C @@ -0,0 +1,46 @@ +/* { dg-do compile }*/ +/* { dg-options "-O -ffinite-math-only -fno-signed-zeros -fno-trapping-math -msve-vector-bits=2048 " } */ + +#include +#pragma GCC target "arch=armv9-a+sve-b16b16" + +#define ADD(a, b) a + b +#define SUB(a, b) a - b +#define MUL(a, b) a * b +#define MAX(a, b) (a > b) ? a : b +#define MIN(a, b) (a > b) ? b : a + +#define COND_OP(OP, TYPE, PRED_TYPE, ARG2, MERGE) \ + TYPE test_##OP##_##TYPE##_##ARG2##_##MERGE (TYPE a, TYPE b, TYPE c, PRED_TYPE p) \ + {return p ? 
OP (a, ARG2) : MERGE; } + +#define TEST_OP(OP, TYPE, PRED_TYPE, T) \ + T (OP, TYPE, PRED_TYPE, b, a) \ + T (OP, TYPE, PRED_TYPE, b, b) \ + T (OP, TYPE, PRED_TYPE, b, c) + +#define TEST_ALL(TYPE, PRED_TYPE, T) \ + TEST_OP (ADD, TYPE, PRED_TYPE, T) \ + TEST_OP (SUB, TYPE, PRED_TYPE, T) \ + TEST_OP (MUL, TYPE, PRED_TYPE, T) \ + TEST_OP (MAX, TYPE, PRED_TYPE, T) \ + TEST_OP (MIN, TYPE, PRED_TYPE, T) + +#define TEST(TYPE, PTYPE, SIZE) \ + typedef TYPE TYPE##SIZE __attribute__ ((vector_size (SIZE))); \ + typedef PTYPE PTYPE##SIZE __attribute__ ((vector_size (SIZE))); \ + TEST_ALL (TYPE##SIZE, PTYPE##SIZE, COND_OP) + +TEST (__bf16, uint16_t, 128) + +TEST (__bf16, uint16_t, 64) + +/* { dg-final { scan-assembler-times {\tbfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tbfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tbfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ + +/* { dg-final { scan-assembler-times {\tbfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tbfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ + +// There's no BFSUBR. 
+/* { dg-final { scan-assembler-times {\tsel\t} 2 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_1.c new file mode 100644 index 00000000000..d328b371342 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */ + +#include + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, RHS, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##RHS##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN (a[i], (TYPE0)RHS) : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 0, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 0, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, b_i) + +TEST_ALL (__builtin_fmaxf16, _Float16, uint64_t, 32) + +TEST_ALL (__builtin_fmaxf16, _Float16, uint32_t, 64) + +TEST_ALL (__builtin_fmaxf32, float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.0\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, 
p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.0\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_1.c new file mode 100644 index 00000000000..1821f0370f0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_1.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */ + +#include + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, RHS, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##RHS##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? 
FN (a[i], (TYPE0)RHS) : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 0, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 0, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, b_i) + +TEST_ALL (__builtin_fminf16, _Float16, uint64_t, 32) + +TEST_ALL (__builtin_fminf16, _Float16, uint32_t, 64) + +TEST_ALL (__builtin_fminf32, float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.0\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.0\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_1.c new file mode 100644 index 00000000000..666cf89df43 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_1.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize 
-moverride=sve_width=2048 -fno-trapping-math" } */ + +#include + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] +#define imm_p5 0.5 + +#define ADD(A, B) A + B + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, NAME, RHS, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##NAME##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN (a[i], (TYPE0)RHS) : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b[i], a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b[i], b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b[i], c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, one, 1, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, one, 1, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, none, -1, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, none, -1, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, p5, 0.5, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, p5, 0.5, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, np5, -0.5, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, np5, -0.5, b_i) + +TEST_ALL (ADD, _Float16, uint64_t, 32) + +TEST_ALL (ADD, _Float16, uint32_t, 64) + +TEST_ALL (ADD, float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 19 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 19 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 19 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 5 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 2 } } */ +/* { dg-final { 
scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 10 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_1.c new file mode 100644 index 00000000000..ec5653e7a47 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */ + +#include + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define DIV(A, B) A / B + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, RHS, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##RHS##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? 
FN (a[i], (TYPE0)RHS) : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, c_i) + +TEST_ALL (DIV, _Float16, uint64_t, 32) + +TEST_ALL (DIV, _Float16, uint32_t, 64) + +TEST_ALL (DIV, float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 7 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 7 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 7 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfdivr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfdivr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_1.c new file mode 100644 index 00000000000..d34872f9d89 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_1.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-signed-zeros -ffinite-math-only -fno-trapping-math" } */ + +#include + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define MAX(A, B) (A > B) ? 
A : B + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, RHS, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##RHS##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN (a[i], (TYPE0)RHS) : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 0, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 0, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, b_i) + +TEST_ALL (MAX, _Float16, uint64_t, 32) + +TEST_ALL (MAX, _Float16, uint32_t, 64) + +TEST_ALL (MAX, float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.0\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.0\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_1.c new file mode 100644 index 00000000000..d6c3c38fc1a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_1.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-signed-zeros -ffinite-math-only -fno-trapping-math" } */ + +#include + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] + +#define MIN(A, B) (A < B) ? A : B + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, RHS, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##RHS##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN (a[i], (TYPE0)RHS) : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 0, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 0, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, b_i) + +TEST_ALL (MIN, _Float16, uint64_t, 32) + +TEST_ALL (MIN, _Float16, uint32_t, 64) + +TEST_ALL (MIN, float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.0\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.0\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_1.c new file mode 100644 index 00000000000..1ae7678d22d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_1.c @@ -0,0 +1,50 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */ + +#include + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] +#define imm_p5 0.5 + +#define MUL(A, B) A * B + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, RHS, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##RHS##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 *__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? 
FN (a[i], (TYPE0)RHS) : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, imm_p5, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, imm_p5, b_i) + +TEST_ALL (MUL, _Float16, uint64_t, 32) + +TEST_ALL (MUL, _Float16, uint32_t, 64) + +TEST_ALL (MUL, float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 10 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 10 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 10 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_1.c new file mode 100644 index 00000000000..eafd1690eeb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_1.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 -fno-trapping-math" } */ + +#include + +#define a_i a[i] +#define b_i b[i] +#define c_i c[i] +#define imm_p5 0.5 + +#define SUBR(A, B) B - A + +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, RHS, MERGE) \ + void \ + f_##TYPE0##_##TYPE1##_##RHS##_##MERGE (TYPE0 *__restrict out, \ + TYPE0 *__restrict a, \ + TYPE0 
*__restrict b, \ + TYPE0 *__restrict c, \ + TYPE1 *__restrict p) \ + { \ + for (unsigned int i = 0; i < COUNT; i++) \ + out[i] = p[i] ? FN (a[i], (TYPE0)RHS) : MERGE; \ + } + +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, b_i, c_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, 1, b_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, imm_p5, a_i) \ + TEST_FN (FN, TYPE0, TYPE1, COUNT, imm_p5, b_i) + +TEST_ALL (SUBR, _Float16, uint64_t, 32) + +TEST_ALL (SUBR, _Float16, uint32_t, 64) + +TEST_ALL (SUBR, float, uint64_t, 32) + +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 3 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #0.5\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, #1.0\n} 2 } } */ + +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 6 } } */ +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #0.5\n} 4 } } */ +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, #1.0\n} 4 } } */ + +/* { dg-final { scan-assembler-not {\tsel\t} } } */