aarch64: Model zero-high-half semantics of [SU]QXTN instructions
author    Jonathan Wright <jonathan.wright@arm.com>
Mon, 14 Jun 2021 14:09:18 +0000 (15:09 +0100)
committer Jonathan Wright <jonathan.wright@arm.com>
Wed, 16 Jun 2021 13:22:22 +0000 (14:22 +0100)
Split the aarch64_<su>qmovn<mode> pattern into separate scalar and
vector variants. Further split the vector RTL pattern into big/
little endian variants that model the zero-high-half semantics of the
underlying instruction. Modeling these semantics allows for better
RTL combinations while also removing some register allocation issues
as the compiler now knows that the operation is totally destructive.

Add new tests to narrow_zero_high_half.c to verify the benefit of
this change.
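
For illustration only (not part of the patch), the kind of source-level
pattern that benefits is a saturating narrow whose high half is
explicitly zeroed, as the new tests do via vcombine/vdup. The function
name below is hypothetical; the intrinsics are the standard arm_neon.h
ones exercised in narrow_zero_high_half.c. Before this change the
zeroed high half typically cost an extra dup/mov; with the
zero-high-half semantics modeled, combine can fold the whole
expression into a single sqxtn.

    #include <arm_neon.h>

    /* Illustrative sketch: saturating narrow, then zero the high half.
       With the new _insn_le/_insn_be patterns the compiler knows sqxtn
       already zeroes the upper 64 bits of the destination, so no
       separate dup/mov is needed for the zero half.  */
    int8x16_t
    narrow_zero_high (int16x8_t a)
    {
      return vcombine_s8 (vqmovn_s16 (a), vdup_n_s8 (0));
    }

    /* Expected codegen at -O2 after this patch (illustrative):
         sqxtn   v0.8b, v0.8h
         ret  */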

gcc/ChangeLog:

2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>

* config/aarch64/aarch64-simd-builtins.def: Split generator
for aarch64_<su>qmovn builtins into scalar and vector
variants.
* config/aarch64/aarch64-simd.md (aarch64_<su>qmovn<mode>_insn_le):
Define.
(aarch64_<su>qmovn<mode>_insn_be): Define.
(aarch64_<su>qmovn<mode>): Split into scalar and vector
variants. Change vector variant to an expander that emits the
correct instruction depending on endianness.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.

gcc/config/aarch64/aarch64-simd-builtins.def
gcc/config/aarch64/aarch64-simd.md
gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 2adb4b127527794d19b2bbd4859f089d3da47763..ac5d4fc7ff1e61d404e66193b629986382ee4ffd 100644
   BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE)
 
   /* Implemented by aarch64_<su>qmovn<mode>.  */
-  BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0, NONE)
-  BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0, NONE)
+  BUILTIN_VQN (UNOP, sqmovn, 0, NONE)
+  BUILTIN_SD_HSDI (UNOP, sqmovn, 0, NONE)
+  BUILTIN_VQN (UNOP, uqmovn, 0, NONE)
+  BUILTIN_SD_HSDI (UNOP, uqmovn, 0, NONE)
 
   /* Implemented by aarch64_<su>qxtn2<mode>.  */
   BUILTIN_VQN (BINOP, sqxtn2, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 59779b851fbeecb17cd2cddbb0ed8770a22762b5..2b75e57eb77a0dea449f2c13bd77a88f48c4cea5 100644
 (define_insn "aarch64_<su>qmovn<mode>"
   [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
        (SAT_TRUNC:<VNARROWQ>
-    (match_operand:VSQN_HSDI 1 "register_operand" "w")))]
+         (match_operand:SD_HSDI 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
-   [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_<su>qmovn<mode>_insn_le"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+       (vec_concat:<VNARROWQ2>
+         (SAT_TRUNC:<VNARROWQ>
+           (match_operand:VQN 1 "register_operand" "w"))
+         (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")))]
+  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
+  "<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_<su>qmovn<mode>_insn_be"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+       (vec_concat:<VNARROWQ2>
+         (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")
+         (SAT_TRUNC:<VNARROWQ>
+           (match_operand:VQN 1 "register_operand" "w"))))]
+  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+  "<su>qxtn\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_expand "aarch64_<su>qmovn<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand")
+       (SAT_TRUNC:<VNARROWQ>
+         (match_operand:VQN 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
+    if (BYTES_BIG_ENDIAN)
+      emit_insn (gen_aarch64_<su>qmovn<mode>_insn_be (tmp, operands[1],
+                               CONST0_RTX (<VNARROWQ>mode)));
+    else
+      emit_insn (gen_aarch64_<su>qmovn<mode>_insn_le (tmp, operands[1],
+                               CONST0_RTX (<VNARROWQ>mode)));
+
+    /* The intrinsic expects a narrow result, so emit a subreg that will get
+       optimized away as appropriate.  */
+    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
+                                                <VNARROWQ2>mode));
+    DONE;
+  }
 )
 
 (define_insn "aarch64_<su>qxtn2<mode>_le"
diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c b/gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c
index 53e03d3594d4a55f0e316fe56332feff28855c7d..aa6c7ef389ddaf6be09414a6f09a0dc25949b628 100644
@@ -67,6 +67,13 @@ TEST_UNARY (vqmovun, uint8x16_t, int16x8_t, s16, u8)
 TEST_UNARY (vqmovun, uint16x8_t, int32x4_t, s32, u16)
 TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
 
+TEST_UNARY (vqmovn, int8x16_t, int16x8_t, s16, s8)
+TEST_UNARY (vqmovn, int16x8_t, int32x4_t, s32, s16)
+TEST_UNARY (vqmovn, int32x4_t, int64x2_t, s64, s32)
+TEST_UNARY (vqmovn, uint8x16_t, uint16x8_t, u16, u8)
+TEST_UNARY (vqmovn, uint16x8_t, uint32x4_t, u32, u16)
+TEST_UNARY (vqmovn, uint32x4_t, uint64x2_t, u64, u32)
+
 /* { dg-final { scan-assembler-not "dup\\t" } } */
 
 /* { dg-final { scan-assembler-times "\\tshrn\\tv" 6} }  */
@@ -79,3 +86,5 @@ TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
 /* { dg-final { scan-assembler-times "\\tsqrshrun\\tv" 3} }  */
 /* { dg-final { scan-assembler-times "\\txtn\\tv" 6} }  */
 /* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} }  */
+/* { dg-final { scan-assembler-times "\\tuqxtn\\tv" 3} }  */
+/* { dg-final { scan-assembler-times "\\tsqxtn\\tv" 3} }  */