aarch64: PR target/99195 Add scheme to optimise away vec_concat with zeroes on 64-bit Advanced SIMD ops

author Kyrylo Tkachov <kyrylo.tkachov@arm.com>

Fri, 21 Apr 2023 17:56:21 +0000 (18:56 +0100)

committer Kyrylo Tkachov <kyrylo.tkachov@arm.com>

Fri, 21 Apr 2023 17:56:21 +0000 (18:56 +0100)
author Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Fri, 21 Apr 2023 17:56:21 +0000 (18:56 +0100)
committer Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Fri, 21 Apr 2023 17:56:21 +0000 (18:56 +0100)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md

index 1bed24477fb182c23a3a52291dc4dc3709b7ee12..adcad56cf553c3e9afebb7adf2731fbc48c9a8af 100644 (file)
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -18,6 +18,34 @@
  ;; along with GCC; see the file COPYING3.  If not see
  ;; <http://www.gnu.org/licenses/>.
  
+;; The following define_subst rules are used to produce patterns representing
+;; the implicit zeroing effect of 64-bit Advanced SIMD operations, in effect
+;; a vec_concat with zeroes.  The order of the vec_concat operands differs
+;; for big-endian so we have a separate define_subst rule for each endianness.
+(define_subst "add_vec_concat_subst_le"
+  [(set (match_operand:VDZ 0)
+        (match_operand:VDZ 1))]
+  "!BYTES_BIG_ENDIAN"
+  [(set (match_operand:<VDBL> 0)
+        (vec_concat:<VDBL>
+         (match_dup 1)
+         (match_operand:VDZ 2 "aarch64_simd_or_scalar_imm_zero")))])
+
+(define_subst "add_vec_concat_subst_be"
+  [(set (match_operand:VDZ 0)
+        (match_operand:VDZ 1))]
+  "BYTES_BIG_ENDIAN"
+  [(set (match_operand:<VDBL> 0)
+        (vec_concat:<VDBL>
+         (match_operand:VDZ 2 "aarch64_simd_or_scalar_imm_zero")
+         (match_dup 1)))])
+
+;; These define_subst_attr definitions are used to annotate patterns later in
+;; the file.  Patterns that need to have the above substitutions added to them
+;; should have <vczle><vczbe> appended to their name.
+(define_subst_attr "vczle" "add_vec_concat_subst_le" "" "_vec_concatz_le")
+(define_subst_attr "vczbe" "add_vec_concat_subst_be" "" "_vec_concatz_be")
+
  (define_expand "mov<mode>"
    [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
         (match_operand:VALL_F16 1 "general_operand"))]
@@ -403,7 +431,7 @@
    [(set_attr "type" "neon_logic<q>")]
  )
  
-(define_insn "add<mode>3"
+(define_insn "add<mode>3<vczle><vczbe>"
    [(set (match_operand:VDQ_I 0 "register_operand" "=w")
          (plus:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
                   (match_operand:VDQ_I 2 "register_operand" "w")))]
@@ -412,7 +440,7 @@
    [(set_attr "type" "neon_add<q>")]
  )
  
-(define_insn "sub<mode>3"
+(define_insn "sub<mode>3<vczle><vczbe>"
    [(set (match_operand:VDQ_I 0 "register_operand" "=w")
          (minus:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
                    (match_operand:VDQ_I 2 "register_operand" "w")))]
@@ -421,7 +449,7 @@
    [(set_attr "type" "neon_sub<q>")]
  )
  
-(define_insn "mul<mode>3"
+(define_insn "mul<mode>3<vczle><vczbe>"
    [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
          (mult:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")
                    (match_operand:VDQ_BHSI 2 "register_operand" "w")))]
@@ -999,7 +1027,7 @@
  )
  
  ;; For AND (vector, register) and BIC (vector, immediate)
-(define_insn "and<mode>3"
+(define_insn "and<mode>3<vczle><vczbe>"
    [(set (match_operand:VDQ_I 0 "register_operand" "=w,w")
         (and:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w,0")
                    (match_operand:VDQ_I 2 "aarch64_reg_or_bic_imm" "w,Db")))]
@@ -1020,7 +1048,7 @@
  )
  
  ;; For ORR (vector, register) and ORR (vector, immediate)
-(define_insn "ior<mode>3"
+(define_insn "ior<mode>3<vczle><vczbe>"
    [(set (match_operand:VDQ_I 0 "register_operand" "=w,w")
         (ior:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w,0")
                    (match_operand:VDQ_I 2 "aarch64_reg_or_orr_imm" "w,Do")))]
@@ -1040,7 +1068,7 @@
    [(set_attr "type" "neon_logic<q>")]
  )
  
-(define_insn "xor<mode>3"
+(define_insn "xor<mode>3<vczle><vczbe>"
    [(set (match_operand:VDQ_I 0 "register_operand" "=w")
          (xor:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
                  (match_operand:VDQ_I 2 "register_operand" "w")))]
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md

index 6cbc97cc82c06a68259bdf4dec8a0eab230081e5..d3c43a212a158268f5c0ac67a447b78433e6cc76 100644 (file)
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -99,6 +99,9 @@
  ;; Double vector modes suitable for moving.  Includes BFmode.
  (define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
  
+;; 64-bit modes for operations that implicitly clear the top bits of a Q reg.
+(define_mode_iterator VDZ [V8QI V4HI V4HF V4BF V2SI V2SF DI DF])
+
  ;; All modes stored in registers d0-d31.
  (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
  
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/pr99195_1.c b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_1.c

new file mode 100644 (file)

index 0000000..3ddd5a3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/pr99195_1.c
@@ -0,0 +1,50 @@
+/* PR target/99195.  */
+/* Check that we take advantage of 64-bit Advanced SIMD operations clearing
+   the top half of the vector register and that no explicit zeroing
+   instructions are emitted.  */
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+#define ONE(OT,IT,OP,S)                         \
+OT                                              \
+foo_##OP##_##S (IT a, IT b)                     \
+{                                               \
+  IT zeros = vcreate_##S (0);                   \
+  return vcombine_##S (v##OP##_##S (a, b), zeros);      \
+}
+
+#define FUNC(T,IS,OS,OP,S) ONE (T##x##OS##_t, T##x##IS##_t, OP, S)
+
+#define OPTWO(T,IS,OS,S,OP1,OP2)        \
+FUNC (T, IS, OS, OP1, S)                \
+FUNC (T, IS, OS, OP2, S)
+
+#define OPTHREE(T, IS, OS, S, OP1, OP2, OP3)    \
+FUNC (T, IS, OS, OP1, S)        \
+OPTWO (T, IS, OS, S, OP2, OP3)
+
+#define OPFOUR(T,IS,OS,S,OP1,OP2,OP3,OP4)       \
+FUNC (T, IS, OS, OP1, S)                \
+OPTHREE (T, IS, OS, S, OP2, OP3, OP4)
+
+#define OPFIVE(T,IS,OS,S,OP1,OP2,OP3,OP4, OP5)  \
+FUNC (T, IS, OS, OP1, S)                \
+OPFOUR (T, IS, OS, S, OP2, OP3, OP4, OP5)
+
+#define OPSIX(T,IS,OS,S,OP1,OP2,OP3,OP4,OP5,OP6)        \
+FUNC (T, IS, OS, OP1, S)                \
+OPFIVE (T, IS, OS, S, OP2, OP3, OP4, OP5, OP6)
+
+OPSIX (int8, 8, 16, s8, add, sub, mul, and, orr, eor)
+OPSIX (int16, 4, 8, s16, add, sub, mul, and, orr, eor)
+OPSIX (int32, 2, 4, s32, add, sub, mul, and, orr, eor)
+
+OPSIX (uint8, 8, 16, u8, add, sub, mul, and, orr, eor)
+OPSIX (uint16, 4, 8, u16, add, sub, mul, and, orr, eor)
+OPSIX (uint32, 2, 4, u32, add, sub, mul, and, orr, eor)
+
+/* { dg-final { scan-assembler-not {\tfmov\t} } }  */
+/* { dg-final { scan-assembler-not {\tmov\t} } }  */
+
author	Kyrylo Tkachov <kyrylo.tkachov@arm.com>
	Fri, 21 Apr 2023 17:56:21 +0000 (18:56 +0100)
committer	Kyrylo Tkachov <kyrylo.tkachov@arm.com>
	Fri, 21 Apr 2023 17:56:21 +0000 (18:56 +0100)
gcc/config/aarch64/aarch64-simd.md		patch \| blob \| blame \| history
gcc/config/aarch64/iterators.md		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/simd/pr99195_1.c	[new file with mode: 0644]	patch \| blob