From: Jonathan Wright <jonathan.wright@arm.com>
Date: Mon, 14 Jun 2021 12:16:35 +0000 (+0100)
Subject: aarch64: Model zero-high-half semantics of SQXTUN instruction in RTL
X-Git-Tag: basepoints/gcc-13~6781
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c86a3039683a8d2bb1006c1a0277678de3786ceb;p=thirdparty%2Fgcc.git

aarch64: Model zero-high-half semantics of SQXTUN instruction in RTL

Split the aarch64_sqmovun<mode> pattern into separate scalar and vector
variants. Further split the vector pattern into big/little endian
variants that model the zero-high-half semantics of the underlying
instruction. Modeling these semantics allows for better RTL combinations
while also removing some register allocation issues as the compiler now
knows that the operation is totally destructive.

Add new tests to narrow_zero_high_half.c to verify the benefit of this
change.

gcc/ChangeLog:

2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64-simd-builtins.def: Split generator
	for aarch64_sqmovun builtins into scalar and vector variants.
	* config/aarch64/aarch64-simd.md (aarch64_sqmovun<mode>):
	Split into scalar and vector variants. Change vector variant
	to an expander that emits the correct instruction depending
	on endianness.
	(aarch64_sqmovun<mode>_insn_le): Define.
	(aarch64_sqmovun<mode>_insn_be): Define.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.
---

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 18baa6720b09..2adb4b127527 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -263,7 +263,9 @@
   BUILTIN_VQ_HSI (TERNOP, smlal_hi_n, 0, NONE)
   BUILTIN_VQ_HSI (TERNOPU, umlal_hi_n, 0, NONE)
 
-  BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, NONE)
+  /* Implemented by aarch64_sqmovun<mode>.  */
+  BUILTIN_VQN (UNOPUS, sqmovun, 0, NONE)
+  BUILTIN_SD_HSDI (UNOPUS, sqmovun, 0, NONE)
 
   /* Implemented by aarch64_sqxtun2<mode>.  */
   BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b23556b551cb..59779b851fbe 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4870,17 +4870,6 @@
   [(set_attr "type" "neon_qadd")]
 )
 
-;; sqmovun
-
-(define_insn "aarch64_sqmovun<mode>"
-  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
-	(unspec:<VNARROWQ> [(match_operand:VSQN_HSDI 1 "register_operand" "w")]
-			   UNSPEC_SQXTUN))]
-  "TARGET_SIMD"
-  "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
-  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
-)
-
 ;; sqmovn and uqmovn
 
 (define_insn "aarch64_<su>qmovn<mode>"
@@ -4931,6 +4920,61 @@
   }
 )
 
+;; sqmovun
+
+(define_insn "aarch64_sqmovun<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+	(unspec:<VNARROWQ> [(match_operand:SD_HSDI 1 "register_operand" "w")]
+			   UNSPEC_SQXTUN))]
+  "TARGET_SIMD"
+  "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_sqmovun<mode>_insn_le"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+	(vec_concat:<VNARROWQ2>
+	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")]
+			     UNSPEC_SQXTUN)
+	  (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")))]
+  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
+  "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_sqmovun<mode>_insn_be"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+	(vec_concat:<VNARROWQ2>
+	  (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")
+	  (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")]
+			     UNSPEC_SQXTUN)))]
+  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+  "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_expand "aarch64_sqmovun<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand")
+	(unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")]
+			   UNSPEC_SQXTUN))]
+  "TARGET_SIMD"
+  {
+    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
+    if (BYTES_BIG_ENDIAN)
+      emit_insn (gen_aarch64_sqmovun<mode>_insn_be (tmp, operands[1],
+				CONST0_RTX (<VNARROWQ>mode)));
+    else
+      emit_insn (gen_aarch64_sqmovun<mode>_insn_le (tmp, operands[1],
+				CONST0_RTX (<VNARROWQ>mode)));
+
+    /* The intrinsic expects a narrow result, so emit a subreg that will get
+       optimized away as appropriate.  */
+    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
+						 <VNARROWQ2>mode));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_sqxtun2<mode>_le"
   [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
	(vec_concat:<VNARROWQ2>
diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
index 451b0116e5ea..53e03d3594d4 100644
--- a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
+++ b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
@@ -63,6 +63,10 @@ TEST_UNARY (vmovn, uint8x16_t, uint16x8_t, u16, u8)
 TEST_UNARY (vmovn, uint16x8_t, uint32x4_t, u32, u16)
 TEST_UNARY (vmovn, uint32x4_t, uint64x2_t, u64, u32)
 
+TEST_UNARY (vqmovun, uint8x16_t, int16x8_t, s16, u8)
+TEST_UNARY (vqmovun, uint16x8_t, int32x4_t, s32, u16)
+TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
+
 /* { dg-final { scan-assembler-not "dup\\t" } } */
 
 /* { dg-final { scan-assembler-times "\\tshrn\\tv" 6} } */
@@ -74,3 +78,4 @@ TEST_UNARY (vmovn, uint32x4_t, uint64x2_t, u64, u32)
 /* { dg-final { scan-assembler-times "\\tsqshrun\\tv" 3} } */
 /* { dg-final { scan-assembler-times "\\tsqrshrun\\tv" 3} } */
 /* { dg-final { scan-assembler-times "\\txtn\\tv" 6} } */
+/* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} } */