From 93c26deab98fc80b616a1c53c324a88f61036f53 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu, 4 May 2023 09:42:37 +0100
Subject: [PATCH] aarch64: PR target/99195 annotate simple ternary ops for
 vec-concat with zero

We're now moving onto various simple ternary instructions, including some
lane forms.  These include intrinsics that map down to mla, mls, fma, aba,
bsl instructions.
Tests are added for lane 0 and lane 1 as for some of these instructions the
lane 0 variants use separate simpler patterns that need a separate
annotation.

Bootstrapped and tested on aarch64-none-linux-gnu.

gcc/ChangeLog:

	PR target/99195
	* config/aarch64/aarch64-simd.md (aarch64_<su>aba<mode>): Rename to...
	(aarch64_<su>aba<mode><vczle><vczbe>): ... This.
	(aarch64_mla<mode>): Rename to...
	(aarch64_mla<mode><vczle><vczbe>): ... This.
	(*aarch64_mla_elt<mode>): Rename to...
	(*aarch64_mla_elt<mode><vczle><vczbe>): ... This.
	(*aarch64_mla_elt_<vswap_width_name><mode>): Rename to...
	(*aarch64_mla_elt_<vswap_width_name><mode><vczle><vczbe>): ... This.
	(aarch64_mla_n<mode>): Rename to...
	(aarch64_mla_n<mode><vczle><vczbe>): ... This.
	(aarch64_mls<mode>): Rename to...
	(aarch64_mls<mode><vczle><vczbe>): ... This.
	(*aarch64_mls_elt<mode>): Rename to...
	(*aarch64_mls_elt<mode><vczle><vczbe>): ... This.
	(*aarch64_mls_elt_<vswap_width_name><mode>): Rename to...
	(*aarch64_mls_elt_<vswap_width_name><mode><vczle><vczbe>): ... This.
	(aarch64_mls_n<mode>): Rename to...
	(aarch64_mls_n<mode><vczle><vczbe>): ... This.
	(fma<mode>4): Rename to...
	(fma<mode>4<vczle><vczbe>): ... This.
	(*aarch64_fma4_elt<mode>): Rename to...
	(*aarch64_fma4_elt<mode><vczle><vczbe>): ... This.
	(*aarch64_fma4_elt_<vswap_width_name><mode>): Rename to...
	(*aarch64_fma4_elt_<vswap_width_name><mode><vczle><vczbe>): ... This.
	(*aarch64_fma4_elt_from_dup<mode>): Rename to...
	(*aarch64_fma4_elt_from_dup<mode><vczle><vczbe>): ... This.
	(fnma<mode>4): Rename to...
	(fnma<mode>4<vczle><vczbe>): ... This.
	(*aarch64_fnma4_elt<mode>): Rename to...
	(*aarch64_fnma4_elt<mode><vczle><vczbe>): ... This.
	(*aarch64_fnma4_elt_<vswap_width_name><mode>): Rename to...
	(*aarch64_fnma4_elt_<vswap_width_name><mode><vczle><vczbe>): ... This.
	(*aarch64_fnma4_elt_from_dup<mode>): Rename to...
	(*aarch64_fnma4_elt_from_dup<mode><vczle><vczbe>): ... This.
	(aarch64_simd_bsl<mode>_internal): Rename to...
	(aarch64_simd_bsl<mode>_internal<vczle><vczbe>): ... This.
	(*aarch64_simd_bsl<mode>_alt): Rename to...
	(*aarch64_simd_bsl<mode>_alt<vczle><vczbe>): ... This.

gcc/testsuite/ChangeLog:

	PR target/99195
	* gcc.target/aarch64/simd/pr99195_3.c: New test.
--- gcc/config/aarch64/aarch64-simd.md | 38 +++++------ .../gcc.target/aarch64/simd/pr99195_3.c | 68 +++++++++++++++++++ 2 files changed, 87 insertions(+), 19 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 511d1e78809d..705c4b0b4b40 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1097,7 +1097,7 @@ } ) -(define_insn "aarch64_aba" +(define_insn "aarch64_aba" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (plus:VDQ_BHSI (minus:VDQ_BHSI (USMAX:VDQ_BHSI @@ -1551,7 +1551,7 @@ ) -(define_insn "aarch64_mla" +(define_insn "aarch64_mla" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (plus:VDQ_BHSI (mult:VDQ_BHSI (match_operand:VDQ_BHSI 2 "register_operand" "w") @@ -1562,7 +1562,7 @@ [(set_attr "type" "neon_mla_")] ) -(define_insn "*aarch64_mla_elt" +(define_insn "*aarch64_mla_elt" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1580,7 +1580,7 @@ [(set_attr "type" "neon_mla__scalar")] ) -(define_insn "*aarch64_mla_elt_" +(define_insn "*aarch64_mla_elt_" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1598,7 +1598,7 @@ [(set_attr "type" "neon_mla__scalar")] ) -(define_insn "aarch64_mla_n" +(define_insn "aarch64_mla_n" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1611,7 +1611,7 @@ [(set_attr "type" "neon_mla__scalar")] ) -(define_insn "aarch64_mls" +(define_insn "aarch64_mls" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (minus:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "0") (mult:VDQ_BHSI (match_operand:VDQ_BHSI 2 "register_operand" "w") @@ -1621,7 +1621,7 @@ [(set_attr "type" "neon_mla_")] ) -(define_insn "*aarch64_mls_elt" +(define_insn "*aarch64_mls_elt" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 4 "register_operand" 
"0") @@ -1639,7 +1639,7 @@ [(set_attr "type" "neon_mla__scalar")] ) -(define_insn "*aarch64_mls_elt_" +(define_insn "*aarch64_mls_elt_" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 4 "register_operand" "0") @@ -1657,7 +1657,7 @@ [(set_attr "type" "neon_mla__scalar")] ) -(define_insn "aarch64_mls_n" +(define_insn "aarch64_mls_n" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 1 "register_operand" "0") @@ -3077,7 +3077,7 @@ } ) -(define_insn "fma4" +(define_insn "fma4" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (match_operand:VHSDF 1 "register_operand" "w") (match_operand:VHSDF 2 "register_operand" "w") @@ -3087,7 +3087,7 @@ [(set_attr "type" "neon_fp_mla_")] ) -(define_insn "*aarch64_fma4_elt" +(define_insn "*aarch64_fma4_elt" [(set (match_operand:VDQF 0 "register_operand" "=w") (fma:VDQF (vec_duplicate:VDQF @@ -3104,7 +3104,7 @@ [(set_attr "type" "neon_fp_mla__scalar")] ) -(define_insn "*aarch64_fma4_elt_" +(define_insn "*aarch64_fma4_elt_" [(set (match_operand:VDQSF 0 "register_operand" "=w") (fma:VDQSF (vec_duplicate:VDQSF @@ -3121,7 +3121,7 @@ [(set_attr "type" "neon_fp_mla__scalar")] ) -(define_insn "*aarch64_fma4_elt_from_dup" +(define_insn "*aarch64_fma4_elt_from_dup" [(set (match_operand:VMUL 0 "register_operand" "=w") (fma:VMUL (vec_duplicate:VMUL @@ -3149,7 +3149,7 @@ [(set_attr "type" "neon_fp_mla_d_scalar_q")] ) -(define_insn "fnma4" +(define_insn "fnma4" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) @@ -3160,7 +3160,7 @@ [(set_attr "type" "neon_fp_mla_")] ) -(define_insn "*aarch64_fnma4_elt" +(define_insn "*aarch64_fnma4_elt" [(set (match_operand:VDQF 0 "register_operand" "=w") (fma:VDQF (neg:VDQF @@ -3178,7 +3178,7 @@ [(set_attr "type" "neon_fp_mla__scalar")] ) -(define_insn "*aarch64_fnma4_elt_" +(define_insn "*aarch64_fnma4_elt_" [(set (match_operand:VDQSF 0 
"register_operand" "=w") (fma:VDQSF (neg:VDQSF @@ -3196,7 +3196,7 @@ [(set_attr "type" "neon_fp_mla__scalar")] ) -(define_insn "*aarch64_fnma4_elt_from_dup" +(define_insn "*aarch64_fnma4_elt_from_dup" [(set (match_operand:VMUL 0 "register_operand" "=w") (fma:VMUL (neg:VMUL @@ -3808,7 +3808,7 @@ ;; Some forms of straight-line code may generate the equivalent form ;; in *aarch64_simd_bsl_alt. -(define_insn "aarch64_simd_bsl_internal" +(define_insn "aarch64_simd_bsl_internal" [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") (xor:VDQ_I (and:VDQ_I @@ -3832,7 +3832,7 @@ ;; the first. The two are equivalent but since recog doesn't try all ;; permutations of commutative operations, we have to have a separate pattern. -(define_insn "*aarch64_simd_bsl_alt" +(define_insn "*aarch64_simd_bsl_alt" [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") (xor:VDQ_I (and:VDQ_I diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c new file mode 100644 index 000000000000..c751924e8d1b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c @@ -0,0 +1,68 @@ +/* PR target/99195. */ +/* Check that we take advantage of 64-bit Advanced SIMD operations clearing + the top half of the vector register and no explicit zeroing instructions + are emitted. 
*/
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+#define TERNARY(OT,IT,OP,S) \
+OT \
+foo_##OP##_##S (IT a, IT b, IT c) \
+{ \
+  IT zeros = vcreate_##S (0); \
+  return vcombine_##S (v##OP##_##S (a, b, c), zeros); \
+}
+
+#define FUNC(T,IS,OS,OP,S) TERNARY (T##x##OS##_t, T##x##IS##_t, OP, S)
+
+#define OPTWO(T,IS,OS,S,OP1,OP2) \
+FUNC (T, IS, OS, OP1, S) \
+FUNC (T, IS, OS, OP2, S)
+
+#define OPTHREE(T, IS, OS, S, OP1, OP2, OP3) \
+FUNC (T, IS, OS, OP1, S) \
+OPTWO (T, IS, OS, S, OP2, OP3)
+
+#define OPFOUR(T,IS,OS,S,OP1,OP2,OP3,OP4) \
+FUNC (T, IS, OS, OP1, S) \
+OPTHREE (T, IS, OS, S, OP2, OP3, OP4)
+
+OPTHREE (int8, 8, 16, s8, mla, mls, aba)
+OPTHREE (int16, 4, 8, s16, mla, mls, aba)
+OPTHREE (int32, 2, 4, s32, mla, mls, aba)
+
+OPFOUR (uint8, 8, 16, u8, mla, mls, aba, bsl)
+OPFOUR (uint16, 4, 8, u16, mla, mls, aba, bsl)
+OPFOUR (uint32, 2, 4, u32, mla, mls, aba, bsl)
+
+OPTHREE (float32, 2, 4, f32, mla, fma, fms)
+
+#undef FUNC
+#define TERNARY_LANE(OT,IT,OP,S) \
+OT \
+foo_##OP##_##S (IT a, IT b, IT c) \
+{ \
+  IT zeros = vcreate_##S (0); \
+  return vcombine_##S (v##OP##_##S (a, b, c, 0), zeros); \
+} \
+OT \
+foo_##OP##_##S##_lane1 (IT a, IT b, IT c) \
+{ \
+  IT zeros = vcreate_##S (0); \
+  return vcombine_##S (v##OP##_##S (a, b, c, 1), zeros); \
+}
+
+#define FUNC(T,IS,OS,OP,S) TERNARY_LANE (T##x##OS##_t, T##x##IS##_t, OP, S)
+OPTWO (int16, 4, 8, s16, mla_lane, mls_lane)
+OPTWO (int32, 2, 4, s32, mla_lane, mls_lane)
+
+OPTWO (uint16, 4, 8, u16, mla_lane, mls_lane)
+OPTWO (uint32, 2, 4, u32, mla_lane, mls_lane)
+
+OPTHREE (float32, 2, 4, f32, mla_lane, fma_lane, fms_lane)
+
+/* { dg-final { scan-assembler-not {\tfmov\t} } } */
+/* { dg-final { scan-assembler-not {\tmov\t} } } */
+
-- 
2.47.2