git.ipfire.org Git - thirdparty/gcc.git/commitdiff
aarch64: Model zero-high-half semantics of SQXTUN instruction in RTL
author: Jonathan Wright <jonathan.wright@arm.com>
Mon, 14 Jun 2021 12:16:35 +0000 (13:16 +0100)
committer: Jonathan Wright <jonathan.wright@arm.com>
Wed, 16 Jun 2021 13:22:08 +0000 (14:22 +0100)
Split the aarch64_sqmovun<mode> pattern into separate scalar and
vector variants. Further split the vector pattern into big/little
endian variants that model the zero-high-half semantics of the
underlying instruction. Modeling these semantics allows for better
RTL combinations while also removing some register allocation issues
as the compiler now knows that the operation is totally destructive.

Add new tests to narrow_zero_high_half.c to verify the benefit of
this change.

gcc/ChangeLog:

2021-06-14  Jonathan Wright  <jonathan.wright@arm.com>

* config/aarch64/aarch64-simd-builtins.def: Split generator
for aarch64_sqmovun builtins into scalar and vector variants.
* config/aarch64/aarch64-simd.md (aarch64_sqmovun<mode>):
Split into scalar and vector variants. Change vector variant
to an expander that emits the correct instruction depending
on endianness.
(aarch64_sqmovun<mode>_insn_le): Define.
(aarch64_sqmovun<mode>_insn_be): Define.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/narrow_zero_high_half.c: Add new tests.

gcc/config/aarch64/aarch64-simd-builtins.def
gcc/config/aarch64/aarch64-simd.md
gcc/testsuite/gcc.target/aarch64/narrow_zero_high_half.c

index 18baa6720b09b2ebda8577b809f8a8683f8b44f0..2adb4b127527794d19b2bbd4859f089d3da47763 100644 (file)
   BUILTIN_VQ_HSI (TERNOP, smlal_hi_n, 0, NONE)
   BUILTIN_VQ_HSI (TERNOPU, umlal_hi_n, 0, NONE)
 
-  BUILTIN_VSQN_HSDI (UNOPUS, sqmovun, 0, NONE)
+  /* Implemented by aarch64_sqmovun<mode>.  */
+  BUILTIN_VQN (UNOPUS, sqmovun, 0, NONE)
+  BUILTIN_SD_HSDI (UNOPUS, sqmovun, 0, NONE)
 
   /* Implemented by aarch64_sqxtun2<mode>.  */
   BUILTIN_VQN (BINOP_UUS, sqxtun2, 0, NONE)
index b23556b551cbbef420950007e9714acf190a534d..59779b851fbeecb17cd2cddbb0ed8770a22762b5 100644 (file)
   [(set_attr "type" "neon_qadd<q>")]
 )
 
-;; sqmovun
-
-(define_insn "aarch64_sqmovun<mode>"
-  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
-       (unspec:<VNARROWQ> [(match_operand:VSQN_HSDI 1 "register_operand" "w")]
-                            UNSPEC_SQXTUN))]
-   "TARGET_SIMD"
-   "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
-   [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
-)
-
 ;; sqmovn and uqmovn
 
 (define_insn "aarch64_<su>qmovn<mode>"
   }
 )
 
+;; sqmovun
+
+(define_insn "aarch64_sqmovun<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand" "=w")
+       (unspec:<VNARROWQ> [(match_operand:SD_HSDI 1 "register_operand" "w")]
+                          UNSPEC_SQXTUN))]
+   "TARGET_SIMD"
+   "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+   [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_sqmovun<mode>_insn_le"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+       (vec_concat:<VNARROWQ2>
+         (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")]
+                            UNSPEC_SQXTUN)
+         (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")))]
+  "TARGET_SIMD && !BYTES_BIG_ENDIAN"
+  "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_insn "aarch64_sqmovun<mode>_insn_be"
+  [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+       (vec_concat:<VNARROWQ2>
+         (match_operand:<VNARROWQ> 2 "aarch64_simd_or_scalar_imm_zero")
+         (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand" "w")]
+                            UNSPEC_SQXTUN)))]
+  "TARGET_SIMD && BYTES_BIG_ENDIAN"
+  "sqxtun\\t%<vn2>0<Vmntype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_sat_shift_imm_narrow_q")]
+)
+
+(define_expand "aarch64_sqmovun<mode>"
+  [(set (match_operand:<VNARROWQ> 0 "register_operand")
+       (unspec:<VNARROWQ> [(match_operand:VQN 1 "register_operand")]
+                          UNSPEC_SQXTUN))]
+  "TARGET_SIMD"
+  {
+    rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
+    if (BYTES_BIG_ENDIAN)
+      emit_insn (gen_aarch64_sqmovun<mode>_insn_be (tmp, operands[1],
+                               CONST0_RTX (<VNARROWQ>mode)));
+    else
+      emit_insn (gen_aarch64_sqmovun<mode>_insn_le (tmp, operands[1],
+                               CONST0_RTX (<VNARROWQ>mode)));
+
+    /* The intrinsic expects a narrow result, so emit a subreg that will get
+       optimized away as appropriate.  */
+    emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
+                                                <VNARROWQ2>mode));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_sqxtun2<mode>_le"
   [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
        (vec_concat:<VNARROWQ2>
index 451b0116e5eaffe1b50a8ccccee459c17cf4cb80..53e03d3594d4a55f0e316fe56332feff28855c7d 100644 (file)
@@ -63,6 +63,10 @@ TEST_UNARY (vmovn, uint8x16_t, uint16x8_t, u16, u8)
 TEST_UNARY (vmovn, uint16x8_t, uint32x4_t, u32, u16)
 TEST_UNARY (vmovn, uint32x4_t, uint64x2_t, u64, u32)
 
+TEST_UNARY (vqmovun, uint8x16_t, int16x8_t, s16, u8)
+TEST_UNARY (vqmovun, uint16x8_t, int32x4_t, s32, u16)
+TEST_UNARY (vqmovun, uint32x4_t, int64x2_t, s64, u32)
+
 /* { dg-final { scan-assembler-not "dup\\t" } } */
 
 /* { dg-final { scan-assembler-times "\\tshrn\\tv" 6} }  */
@@ -74,3 +78,4 @@ TEST_UNARY (vmovn, uint32x4_t, uint64x2_t, u64, u32)
 /* { dg-final { scan-assembler-times "\\tsqshrun\\tv" 3} }  */
 /* { dg-final { scan-assembler-times "\\tsqrshrun\\tv" 3} }  */
 /* { dg-final { scan-assembler-times "\\txtn\\tv" 6} }  */
+/* { dg-final { scan-assembler-times "\\tsqxtun\\tv" 3} }  */