From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu, 25 May 2023 14:00:16 +0000 (+0100)
Subject: aarch64: PR target/99195 Annotate complex FP patterns for vec-concat-zero
X-Git-Tag: basepoints/gcc-15~8898
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=560bb845321f5ad039a318a081b0e88d9900f5cb;p=thirdparty%2Fgcc.git

aarch64: PR target/99195 Annotate complex FP patterns for vec-concat-zero

This patch annotates the complex add and mla patterns for vec-concat-zero.

Testing showed an interesting bug in our MD patterns, which were defined to
match:

(plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0")
	    (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w")
			   (match_operand:VHSDF 3 "register_operand" "w")
			   (match_operand:SI 4 "const_int_operand" "n")]
			   FCMLA))

but the canonicalisation rules for PLUS require the more "complex" operand
to come first, so during combine, when the newly substituted patterns were
formed, combine/recog would instead try to match:

(plus:V2SF (unspec:V2SF [
		(reg:V2SF 100)
		(reg:V2SF 101)
		(const_int 0 [0])
	    ] UNSPEC_FCMLA270)
	   (reg:V2SF 99))

This patch fixes the order of the PLUS operands in these patterns.
Similar patterns for the dot-product instructions already use the right
order.

Bootstrapped and tested on aarch64-none-linux-gnu and aarch64_be-none-elf.

gcc/ChangeLog:

	PR target/99195
	* config/aarch64/aarch64-simd.md (aarch64_fcadd<rot><mode>):
	Rename to...
	(aarch64_fcadd<rot><mode><vczle><vczbe>): ... This.
	(aarch64_fcmla<rot><mode>): Rename to...
	(aarch64_fcmla<rot><mode><vczle><vczbe>): ... This.
	Fix canonicalization of PLUS operands.
	(aarch64_fcmla_lane<rot><mode>): Rename to...
	(aarch64_fcmla_lane<rot><mode><vczle><vczbe>): ... This.
	Fix canonicalization of PLUS operands.
	(aarch64_fcmla_laneq<rot>v4hf): Rename to...
	(aarch64_fcmla_laneq<rot>v4hf<vczle><vczbe>): ... This.
	Fix canonicalization of PLUS operands.
	(aarch64_fcmlaq_lane<rot><mode>): Fix canonicalization of PLUS
	operands.

gcc/testsuite/ChangeLog:

	PR target/99195
	* gcc.target/aarch64/simd/pr99195_9.c: New test.

---
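As a standalone illustration (not part of the committed patch; the function
name is made up, the intrinsics are the usual arm_neon.h ones, compiled with
-O -march=armv8.3-a as in the new test), this is the kind of source that
benefits:

#include <arm_neon.h>

/* The 64-bit FCMLA writes a D register, which zeroes the upper half of
   the full vector register, so with the <vczle><vczbe> annotations the
   vcombine with zeros should fold away with no explicit fmov/mov.  */
float32x4_t
foo_cmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
{
  float32x2_t zeros = vcreate_f32 (0);
  return vcombine_f32 (vcmla_f32 (a, b, c), zeros);
}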
-(define_insn "aarch64_fcadd" +(define_insn "aarch64_fcadd" [(set (match_operand:VHSDF 0 "register_operand" "=w") (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w") (match_operand:VHSDF 2 "register_operand" "w")] @@ -572,25 +572,25 @@ "TARGET_COMPLEX && !BYTES_BIG_ENDIAN" ) -(define_insn "aarch64_fcmla" +(define_insn "aarch64_fcmla" [(set (match_operand:VHSDF 0 "register_operand" "=w") - (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0") - (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w") + (plus:VHSDF (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w") (match_operand:VHSDF 3 "register_operand" "w")] - FCMLA)))] + FCMLA) + (match_operand:VHSDF 1 "register_operand" "0")))] "TARGET_COMPLEX" "fcmla\t%0., %2., %3., #" [(set_attr "type" "neon_fcmla")] ) -(define_insn "aarch64_fcmla_lane" +(define_insn "aarch64_fcmla_lane" [(set (match_operand:VHSDF 0 "register_operand" "=w") - (plus:VHSDF (match_operand:VHSDF 1 "register_operand" "0") - (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w") + (plus:VHSDF (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand" "w") (match_operand:VHSDF 3 "register_operand" "w") (match_operand:SI 4 "const_int_operand" "n")] - FCMLA)))] + FCMLA) + (match_operand:VHSDF 1 "register_operand" "0")))] "TARGET_COMPLEX" { operands[4] = aarch64_endian_lane_rtx (mode, INTVAL (operands[4])); @@ -599,13 +599,13 @@ [(set_attr "type" "neon_fcmla")] ) -(define_insn "aarch64_fcmla_laneqv4hf" +(define_insn "aarch64_fcmla_laneqv4hf" [(set (match_operand:V4HF 0 "register_operand" "=w") - (plus:V4HF (match_operand:V4HF 1 "register_operand" "0") - (unspec:V4HF [(match_operand:V4HF 2 "register_operand" "w") + (plus:V4HF (unspec:V4HF [(match_operand:V4HF 2 "register_operand" "w") (match_operand:V8HF 3 "register_operand" "w") (match_operand:SI 4 "const_int_operand" "n")] - FCMLA)))] + FCMLA) + (match_operand:V4HF 1 "register_operand" "0")))] "TARGET_COMPLEX" { operands[4] = aarch64_endian_lane_rtx (V4HFmode, INTVAL (operands[4])); @@ -616,11 +616,11 @@ (define_insn "aarch64_fcmlaq_lane" [(set (match_operand:VQ_HSF 0 "register_operand" "=w") - (plus:VQ_HSF (match_operand:VQ_HSF 1 "register_operand" "0") - (unspec:VQ_HSF [(match_operand:VQ_HSF 2 "register_operand" "w") + (plus:VQ_HSF (unspec:VQ_HSF [(match_operand:VQ_HSF 2 "register_operand" "w") (match_operand: 3 "register_operand" "w") (match_operand:SI 4 "const_int_operand" "n")] - FCMLA)))] + FCMLA) + (match_operand:VQ_HSF 1 "register_operand" "0")))] "TARGET_COMPLEX" { int nunits = GET_MODE_NUNITS (mode).to_constant (); diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_9.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_9.c new file mode 100644 index 000000000000..bb86735b3407 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_9.c @@ -0,0 +1,64 @@ +/* PR target/99195. */ +/* Check that we take advantage of 64-bit Advanced SIMD operations clearing + the top half of the vector register and no explicit zeroing instructions + are emitted. 
+/* { dg-do compile } */
+/* { dg-options "-O -march=armv8.3-a+fp16" } */
+
+#include <arm_neon.h>
+
+#define BINARY(OT,IT,OP,S)                         \
+OT                                                 \
+foo_##OP##_##S (IT a, IT b, IT c)                  \
+{                                                  \
+  IT zeros = vcreate_##S (0);                      \
+  return vcombine_##S (v##OP##_##S (a, b), zeros); \
+}
+
+#define FUNC(T,IS,OS,OP,S) BINARY (T##x##OS##_t, T##x##IS##_t, OP, S)
+
+#define OPTWO(T,IS,OS,S,OP1,OP2)	\
+FUNC (T, IS, OS, OP1, S)		\
+FUNC (T, IS, OS, OP2, S)
+
+#define OPTHREE(T, IS, OS, S, OP1, OP2, OP3)	\
+FUNC (T, IS, OS, OP1, S)			\
+OPTWO (T, IS, OS, S, OP2, OP3)
+
+#define OPFOUR(T,IS,OS,S,OP1,OP2,OP3,OP4)	\
+FUNC (T, IS, OS, OP1, S)			\
+OPTHREE (T, IS, OS, S, OP2, OP3, OP4)
+
+OPTWO (float16, 4, 8, f16, cadd_rot90, cadd_rot270)
+OPTWO (float32, 2, 4, f32, cadd_rot90, cadd_rot270)
+
+#define TERNARY(OT,IT,OP,S)                           \
+OT                                                    \
+foo_##OP##_##S (IT a, IT b, IT c)                     \
+{                                                     \
+  IT zeros = vcreate_##S (0);                         \
+  return vcombine_##S (v##OP##_##S (a, b, c), zeros); \
+}
+
+#undef FUNC
+#define FUNC(T,IS,OS,OP,S) TERNARY (T##x##OS##_t, T##x##IS##_t, OP, S)
+
+OPFOUR (float16, 4, 8, f16, cmla, cmla_rot90, cmla_rot180, cmla_rot270)
+OPFOUR (float32, 2, 4, f32, cmla, cmla_rot90, cmla_rot180, cmla_rot270)
+
+#define TERNARY_IDX(OT,IT,OP,S)                          \
+OT                                                       \
+foo_##OP##_##S (IT a, IT b, IT c)                        \
+{                                                        \
+  IT zeros = vcreate_##S (0);                            \
+  return vcombine_##S (v##OP##_##S (a, b, c, 0), zeros); \
+}
+
+#undef FUNC
+#define FUNC(T,IS,OS,OP,S) TERNARY_IDX (T##x##OS##_t, T##x##IS##_t, OP, S)
+OPFOUR (float16, 4, 8, f16, cmla_lane, cmla_rot90_lane, cmla_rot180_lane, cmla_rot270_lane)
+OPFOUR (float32, 2, 4, f32, cmla_lane, cmla_rot90_lane, cmla_rot180_lane, cmla_rot270_lane)
+
+/* { dg-final { scan-assembler-not {\tfmov\t} } } */
+/* { dg-final { scan-assembler-not {\tmov\t} } } */