From dbfc149b639342a9555c60aa9ee787fb3d009316 Mon Sep 17 00:00:00 2001
From: Jonathan Wright <jonathan.wright@arm.com>
Date: Mon, 14 Jun 2021 16:18:44 +0100
Subject: [PATCH] aarch64: Model zero-high-half semantics of ADDHN/SUBHN
 instructions

Model the zero-high-half semantics of the narrowing arithmetic Neon
instructions in the aarch64_<sur><addsub>hn<mode> RTL pattern. Modeling
these semantics allows for better RTL combinations while also removing
some register allocation issues as the compiler now knows that the
operation is totally destructive.

Add new tests to narrow_zero_high_half.c to verify the benefit of this
change.

gcc/ChangeLog:

2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd.md (aarch64_<sur><addsub>hn<mode>):
	Change to an expander that emits the correct instruction
	depending on endianness.
	(aarch64_<sur><addsub>hn<mode>_insn_le): Define.
	(aarch64_<sur><addsub>hn<mode>_insn_be): Define.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.
---
 gcc/config/aarch64/aarch64-simd.md            | 49 ++++++++++++++++---
 .../aarch64/narrow_zero_high_half.c           | 40 +++++++++++++++
 2 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2b75e57eb77a..540244cf0a91 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4661,16 +4661,53 @@
 
 ;; <sur><addsub>hn<mode>.
 
-(define_insn "aarch64_<sur><addsub>hn<mode>"
-  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
-	(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
-			    (match_operand:VQN 2 "register_operand" "w")]
-			   ADDSUBHN))]
-  "TARGET_SIMD"
+(define_insn "aarch64_<sur><addsub>hn<mode>_insn_le"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+	(vec_concat:<VNARROWQ2>
+	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
+			      (match_operand:VQN 2 "register_operand" "w")]
+			     ADDSUBHN)
+	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")))]
+  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
+  "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
+)
+
+(define_insn "aarch64_<sur><addsub>hn<mode>_insn_be"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+	(vec_concat:<VNARROWQ2>
+	  (match_operand:<VNARROWQ> 3 "aarch64_simd_or_scalar_imm_zero")
+	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")
+			      (match_operand:VQN 2 "register_operand" "w")]
+			     ADDSUBHN)))]
+  "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "<sur><addsub>hn\\t%0.<Vntype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_<addsub>_halve_narrow_q")]
 )
 
+(define_expand "aarch64_<sur><addsub>hn<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand")
+	(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")
+			    (match_operand:VQN 2 "register_operand")]
+			   ADDSUBHN))]
+  "TARGET_SIMD"
+  {
+    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
+    if (BYTES_BIG_ENDIAN)
+      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_be (tmp, operands[1],
+				operands[2], CONST0_RTX (<VNARROWQ>mode)));
+    else
+      emit_insn (gen_aarch64_<sur><addsub>hn<mode>_insn_le (tmp, operands[1],
+				operands[2], CONST0_RTX (<VNARROWQ>mode)));
+
+    /* The intrinsic expects a narrow result, so emit a subreg that will get
+       optimized away as appropriate.  */
+    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
+						 <VNARROWQ2>mode));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_<sur><addsub>hn2<mode>_insn_le"
   [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
 	(vec_concat:<VNARROWQ2>
diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
index aa6c7ef389dd..dd5ddf83b993 100644
--- a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
+++ b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
@@ -74,6 +74,42 @@ TEST_UNARY (vqmovn, uint8x16_t, uint16x8_t, u16, u8)
 TEST_UNARY (vqmovn, uint16x8_t, uint32x4_t, u32, u16)
 TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
 
+#define TEST_ARITH(name, rettype, intype, fs, rs) \
+  rettype test_ ## name ## _ ## fs ## _zero_high \
+		(intype a, intype b) \
+	{ \
+		return vcombine_ ## rs (name ## _ ## fs (a, b), \
+					vdup_n_ ## rs (0)); \
+	}
+
+TEST_ARITH (vaddhn, int8x16_t, int16x8_t, s16, s8)
+TEST_ARITH (vaddhn, int16x8_t, int32x4_t, s32, s16)
+TEST_ARITH (vaddhn, int32x4_t, int64x2_t, s64, s32)
+TEST_ARITH (vaddhn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_ARITH (vaddhn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_ARITH (vaddhn, uint32x4_t, uint64x2_t, u64, u32)
+
+TEST_ARITH (vraddhn, int8x16_t, int16x8_t, s16, s8)
+TEST_ARITH (vraddhn, int16x8_t, int32x4_t, s32, s16)
+TEST_ARITH (vraddhn, int32x4_t, int64x2_t, s64, s32)
+TEST_ARITH (vraddhn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_ARITH (vraddhn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_ARITH (vraddhn, uint32x4_t, uint64x2_t, u64, u32)
+
+TEST_ARITH (vsubhn, int8x16_t, int16x8_t, s16, s8)
+TEST_ARITH (vsubhn, int16x8_t, int32x4_t, s32, s16)
+TEST_ARITH (vsubhn, int32x4_t, int64x2_t, s64, s32)
+TEST_ARITH (vsubhn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_ARITH (vsubhn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_ARITH (vsubhn, uint32x4_t, uint64x2_t, u64, u32)
+
+TEST_ARITH (vrsubhn, int8x16_t, int16x8_t, s16, s8)
+TEST_ARITH (vrsubhn, int16x8_t, int32x4_t, s32, s16)
+TEST_ARITH (vrsubhn, int32x4_t, int64x2_t, s64, s32)
+TEST_ARITH (vrsubhn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_ARITH (vrsubhn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_ARITH (vrsubhn, uint32x4_t, uint64x2_t, u64, u32)
+
 /* { dg-final { scan-assembler-not "dup\\t" } } */
 
 /* { dg-final { scan-assembler-times "\\tshrn\\tv" 6} } */
@@ -88,3 +124,7 @@ TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
 /* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} } */
 /* { dg-final { scan-assembler-times "\\tuqxtn\\tv" 3} } */
 /* { dg-final { scan-assembler-times "\\tsqxtn\\tv" 3} } */
+/* { dg-final { scan-assembler-times "\\taddhn\\tv" 6} } */
+/* { dg-final { scan-assembler-times "\\tsubhn\\tv" 6} } */
+/* { dg-final { scan-assembler-times "\\trsubhn\\tv" 6} } */
+/* { dg-final { scan-assembler-times "\\traddhn\\tv" 6} } */
-- 
2.47.2