From: Kyrylo Tkachov <kyrylo.tkachov@arm.com> Date: Thu, 4 May 2023 08:42:37 +0000 (+0100) Subject: aarch64: PR target/99195 annotate simple ternary ops for vec-concat with zero X-Git-Tag: basepoints/gcc-15~9666 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=93c26deab98fc80b616a1c53c324a88f61036f53;p=thirdparty%2Fgcc.git aarch64: PR target/99195 annotate simple ternary ops for vec-concat with zero We're now moving onto various simple ternary instructions, including some lane forms. These include intrinsics that map down to mla, mls, fma, aba, bsl instructions. Tests are added for lane 0 and lane 1 as for some of these instructions the lane 0 variants use separate simpler patterns that need a separate annotation. Bootstrapped and tested on aarch64-none-linux-gnu. gcc/ChangeLog: PR target/99195 * config/aarch64/aarch64-simd.md (aarch64_<sur>aba<mode>): Rename to... (aarch64_<sur>aba<mode><vczle><vczbe>): ... This. (aarch64_mla<mode>): Rename to... (aarch64_mla<mode><vczle><vczbe>): ... This. (*aarch64_mla_elt<mode>): Rename to... (*aarch64_mla_elt<mode><vczle><vczbe>): ... This. (*aarch64_mla_elt_<vswap_width_name><mode>): Rename to... (*aarch64_mla_elt_<vswap_width_name><mode><vczle><vczbe>): ... This. (aarch64_mla_n<mode>): Rename to... (aarch64_mla_n<mode><vczle><vczbe>): ... This. (aarch64_mls<mode>): Rename to... (aarch64_mls<mode><vczle><vczbe>): ... This. (*aarch64_mls_elt<mode>): Rename to... (*aarch64_mls_elt<mode><vczle><vczbe>): ... This. (*aarch64_mls_elt_<vswap_width_name><mode>): Rename to... (*aarch64_mls_elt_<vswap_width_name><mode><vczle><vczbe>): ... This. (aarch64_mls_n<mode>): Rename to... (aarch64_mls_n<mode><vczle><vczbe>): ... This. (fma<mode>4): Rename to... (fma<mode>4<vczle><vczbe>): ... This. (*aarch64_fma4_elt<mode>): Rename to... (*aarch64_fma4_elt<mode><vczle><vczbe>): ... This. (*aarch64_fma4_elt_<vswap_width_name><mode>): Rename to... (*aarch64_fma4_elt_<vswap_width_name><mode><vczle><vczbe>): ... This. (*aarch64_fma4_elt_from_dup<mode>): Rename to... (*aarch64_fma4_elt_from_dup<mode><vczle><vczbe>): ... This. (fnma<mode>4): Rename to... (fnma<mode>4<vczle><vczbe>): ... This. (*aarch64_fnma4_elt<mode>): Rename to... (*aarch64_fnma4_elt<mode><vczle><vczbe>): ... This. (*aarch64_fnma4_elt_<vswap_width_name><mode>): Rename to... (*aarch64_fnma4_elt_<vswap_width_name><mode><vczle><vczbe>): ... This. (*aarch64_fnma4_elt_from_dup<mode>): Rename to... (*aarch64_fnma4_elt_from_dup<mode><vczle><vczbe>): ... This. (aarch64_simd_bsl<mode>_internal): Rename to... (aarch64_simd_bsl<mode>_internal<vczle><vczbe>): ... This. 
(*aarch64_simd_bsl<mode>_alt): Rename to... (*aarch64_simd_bsl<mode>_alt<vczle><vczbe>): ... This. gcc/testsuite/ChangeLog: PR target/99195 * gcc.target/aarch64/simd/pr99195_3.c: New test. --- diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 511d1e78809d..705c4b0b4b40 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1097,7 +1097,7 @@ } ) -(define_insn "aarch64_<sur>aba<mode>" +(define_insn "aarch64_<sur>aba<mode><vczle><vczbe>" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (plus:VDQ_BHSI (minus:VDQ_BHSI (USMAX:VDQ_BHSI @@ -1551,7 +1551,7 @@ ) -(define_insn "aarch64_mla<mode>" +(define_insn "aarch64_mla<mode><vczle><vczbe>" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (plus:VDQ_BHSI (mult:VDQ_BHSI (match_operand:VDQ_BHSI 2 "register_operand" "w") @@ -1562,7 +1562,7 @@ [(set_attr "type" "neon_mla_<Vetype><q>")] ) -(define_insn "*aarch64_mla_elt<mode>" +(define_insn "*aarch64_mla_elt<mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1580,7 +1580,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_mla_elt_<vswap_width_name><mode>" +(define_insn "*aarch64_mla_elt_<vswap_width_name><mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1598,7 +1598,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "aarch64_mla_n<mode>" +(define_insn "aarch64_mla_n<mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (plus:VDQHS (mult:VDQHS @@ -1611,7 +1611,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "aarch64_mls<mode>" +(define_insn "aarch64_mls<mode><vczle><vczbe>" [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w") (minus:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "0") (mult:VDQ_BHSI (match_operand:VDQ_BHSI 2 "register_operand" "w") @@ -1621,7 +1621,7 @@ [(set_attr "type" "neon_mla_<Vetype><q>")] ) -(define_insn "*aarch64_mls_elt<mode>" +(define_insn "*aarch64_mls_elt<mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 4 "register_operand" "0") @@ -1639,7 +1639,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) 
-(define_insn "*aarch64_mls_elt_<vswap_width_name><mode>" +(define_insn "*aarch64_mls_elt_<vswap_width_name><mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 4 "register_operand" "0") @@ -1657,7 +1657,7 @@ [(set_attr "type" "neon_mla_<Vetype>_scalar<q>")] ) -(define_insn "aarch64_mls_n<mode>" +(define_insn "aarch64_mls_n<mode><vczle><vczbe>" [(set (match_operand:VDQHS 0 "register_operand" "=w") (minus:VDQHS (match_operand:VDQHS 1 "register_operand" "0") @@ -3077,7 +3077,7 @@ } ) -(define_insn "fma<mode>4" +(define_insn "fma<mode>4<vczle><vczbe>" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (match_operand:VHSDF 1 "register_operand" "w") (match_operand:VHSDF 2 "register_operand" "w") @@ -3087,7 +3087,7 @@ [(set_attr "type" "neon_fp_mla_<stype><q>")] ) -(define_insn "*aarch64_fma4_elt<mode>" +(define_insn "*aarch64_fma4_elt<mode><vczle><vczbe>" [(set (match_operand:VDQF 0 "register_operand" "=w") (fma:VDQF (vec_duplicate:VDQF @@ -3104,7 +3104,7 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode>" +(define_insn "*aarch64_fma4_elt_<vswap_width_name><mode><vczle><vczbe>" [(set (match_operand:VDQSF 0 "register_operand" "=w") (fma:VDQSF (vec_duplicate:VDQSF @@ -3121,7 +3121,7 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fma4_elt_from_dup<mode>" +(define_insn "*aarch64_fma4_elt_from_dup<mode><vczle><vczbe>" [(set (match_operand:VMUL 0 "register_operand" "=w") (fma:VMUL (vec_duplicate:VMUL @@ -3149,7 +3149,7 @@ [(set_attr "type" "neon_fp_mla_d_scalar_q")] ) -(define_insn "fnma<mode>4" +(define_insn "fnma<mode>4<vczle><vczbe>" [(set (match_operand:VHSDF 0 "register_operand" "=w") (fma:VHSDF (neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) @@ -3160,7 +3160,7 @@ [(set_attr "type" "neon_fp_mla_<stype><q>")] ) -(define_insn "*aarch64_fnma4_elt<mode>" +(define_insn "*aarch64_fnma4_elt<mode><vczle><vczbe>" [(set (match_operand:VDQF 0 "register_operand" "=w") (fma:VDQF (neg:VDQF @@ -3178,7 +3178,7 @@ [(set_attr "type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode>" +(define_insn "*aarch64_fnma4_elt_<vswap_width_name><mode><vczle><vczbe>" [(set (match_operand:VDQSF 0 "register_operand" "=w") (fma:VDQSF (neg:VDQSF @@ -3196,7 +3196,7 @@ [(set_attr 
"type" "neon_fp_mla_<Vetype>_scalar<q>")] ) -(define_insn "*aarch64_fnma4_elt_from_dup<mode>" +(define_insn "*aarch64_fnma4_elt_from_dup<mode><vczle><vczbe>" [(set (match_operand:VMUL 0 "register_operand" "=w") (fma:VMUL (neg:VMUL @@ -3808,7 +3808,7 @@ ;; Some forms of straight-line code may generate the equivalent form ;; in *aarch64_simd_bsl<mode>_alt. -(define_insn "aarch64_simd_bsl<mode>_internal" +(define_insn "aarch64_simd_bsl<mode>_internal<vczle><vczbe>" [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") (xor:VDQ_I (and:VDQ_I @@ -3832,7 +3832,7 @@ ;; the first. The two are equivalent but since recog doesn't try all ;; permutations of commutative operations, we have to have a separate pattern. -(define_insn "*aarch64_simd_bsl<mode>_alt" +(define_insn "*aarch64_simd_bsl<mode>_alt<vczle><vczbe>" [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w") (xor:VDQ_I (and:VDQ_I diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c new file mode 100644 index 000000000000..c751924e8d1b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_3.c @@ -0,0 +1,68 @@ +/* PR target/99195. */ +/* Check that we take advantage of 64-bit Advanced SIMD operations clearing + the top half of the vector register and no explicit zeroing instructions + are emitted. 
*/ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +#include <arm_neon.h> + +#define TERNARY(OT,IT,OP,S) \ +OT \ +foo_##OP##_##S (IT a, IT b, IT c) \ +{ \ + IT zeros = vcreate_##S (0); \ + return vcombine_##S (v##OP##_##S (a, b, c), zeros); \ +} + +#define FUNC(T,IS,OS,OP,S) TERNARY (T##x##OS##_t, T##x##IS##_t, OP, S) + +#define OPTWO(T,IS,OS,S,OP1,OP2) \ +FUNC (T, IS, OS, OP1, S) \ +FUNC (T, IS, OS, OP2, S) + +#define OPTHREE(T, IS, OS, S, OP1, OP2, OP3) \ +FUNC (T, IS, OS, OP1, S) \ +OPTWO (T, IS, OS, S, OP2, OP3) + +#define OPFOUR(T,IS,OS,S,OP1,OP2,OP3,OP4) \ +FUNC (T, IS, OS, OP1, S) \ +OPTHREE (T, IS, OS, S, OP2, OP3, OP4) + +OPTHREE (int8, 8, 16, s8, mla, mls, aba) +OPTHREE (int16, 4, 8, s16, mla, mls, aba) +OPTHREE (int32, 2, 4, s32, mla, mls, aba) + +OPFOUR (uint8, 8, 16, u8, mla, mls, aba, bsl) +OPFOUR (uint16, 4, 8, u16, mla, mls, aba, bsl) +OPFOUR (uint32, 2, 4, u32, mla, mls, aba, bsl) + +OPTHREE (float32, 2, 4, f32, mla, fma, fms) + +#undef FUNC +#define TERNARY_LANE(OT,IT,OP,S) \ +OT \ +foo_##OP##_##S (IT a, IT b, IT c) \ +{ \ + IT zeros = vcreate_##S (0); \ + return vcombine_##S (v##OP##_##S (a, b, c, 0), zeros); \ +} \ +OT \ +foo_##OP##_##S##_lane1 (IT a, IT b, IT c) \ +{ \ + IT zeros = vcreate_##S (0); \ + return vcombine_##S (v##OP##_##S (a, b, c, 1), zeros); \ +} + +#define FUNC(T,IS,OS,OP,S) TERNARY_LANE (T##x##OS##_t, T##x##IS##_t, OP, S) +OPTWO (int16, 4, 8, s16, mla_lane, mls_lane) +OPTWO (int32, 2, 4, s32, mla_lane, mls_lane) + +OPTWO (uint16, 4, 8, u16, mla_lane, mls_lane) +OPTWO (uint32, 2, 4, u32, mla_lane, mls_lane) + +OPTHREE (float32, 2, 4, f32, mla_lane, fma_lane, fms_lane) + +/* { dg-final { scan-assembler-not {\tfmov\t} } } */ +/* { dg-final { scan-assembler-not {\tmov\t} } } */ 