[(set_attr "type" "neon_shift_imm_long")]
)
-(define_insn "aarch64_simd_vec_unpack<su>_hi_<mode>"
+(define_insn_and_split "aarch64_simd_vec_unpack<su>_hi_<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(ANY_EXTEND:<VWIDE> (vec_select:<VHALF>
			       (match_operand:VQW 1 "register_operand" "w")
			       (match_operand:VQW 2 "vect_par_cnst_hi_half" "")
)))]
"TARGET_SIMD"
"<su>xtl2\t%0.<Vwtype>, %1.<Vtype>"
- [(set_attr "type" "neon_shift_imm_long")]
-)
-
-(define_expand "vec_unpacku_hi_<mode>"
- [(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
- "TARGET_SIMD"
+ "&& <CODE> == ZERO_EXTEND
+ && aarch64_split_simd_shift_p (insn)"
+ [(const_int 0)]
{
- rtx res = gen_reg_rtx (<MODE>mode);
- rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_zip2<mode> (res, tmp, operands[1]));
- else
- emit_insn (gen_aarch64_zip2<mode> (res, operands[1], tmp));
- emit_move_insn (operands[0],
- simplify_gen_subreg (<VWIDE>mode, res, <MODE>mode, 0));
+ /* On many cores, it is cheaper to implement UXTL2 using a ZIP2 with zero,
+ provided that the cost of the zero can be amortized over several
+ operations. We'll later recombine the zero and zip if there are
+ not sufficient uses of the zero to make the split worthwhile. */
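+    /* For the V16QI variant, for instance, the split replaces
+	  uxtl2	v0.8h, v1.16b
+       with
+	  zip2	v0.16b, v1.16b, vZ.16b
+       where vZ stands for whichever register ends up holding the shared
+       zero (register numbers here are purely illustrative).  */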
+ rtx res = simplify_gen_subreg (<MODE>mode, operands[0], <VWIDE>mode, 0);
+ rtx zero = aarch64_gen_shareable_zero (<MODE>mode);
+ emit_insn (gen_aarch64_zip2<mode> (res, operands[1], zero));
DONE;
}
+ [(set_attr "type" "neon_shift_imm_long")]
)
-(define_expand "vec_unpacks_hi_<mode>"
+(define_expand "vec_unpack<su>_hi_<mode>"
[(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
+ (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
"TARGET_SIMD"
{
rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
- emit_insn (gen_aarch64_simd_vec_unpacks_hi_<mode> (operands[0],
- operands[1], p));
+ emit_insn (gen_aarch64_simd_vec_unpack<su>_hi_<mode> (operands[0],
+ operands[1], p));
DONE;
}
)
-(define_expand "vec_unpacku_lo_<mode>"
+(define_expand "vec_unpack<su>_lo_<mode>"
[(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
- "TARGET_SIMD"
- {
- rtx res = gen_reg_rtx (<MODE>mode);
- rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
- if (BYTES_BIG_ENDIAN)
- emit_insn (gen_aarch64_zip1<mode> (res, tmp, operands[1]));
- else
- emit_insn (gen_aarch64_zip1<mode> (res, operands[1], tmp));
- emit_move_insn (operands[0],
- simplify_gen_subreg (<VWIDE>mode, res, <MODE>mode, 0));
- DONE;
- }
-)
-
-(define_expand "vec_unpacks_lo_<mode>"
- [(match_operand:<VWIDE> 0 "register_operand")
- (match_operand:VQW 1 "register_operand")]
+ (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))]
"TARGET_SIMD"
{
rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
- emit_insn (gen_aarch64_simd_vec_unpacks_lo_<mode> (operands[0],
- operands[1], p));
+ emit_insn (gen_aarch64_simd_vec_unpack<su>_lo_<mode> (operands[0],
+ operands[1], p));
DONE;
}
)
[(set_attr "type" "neon_sub_widen")]
)
-(define_insn "aarch64_usubw<mode>_lo_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (minus:<VWIDE>
- (match_operand:<VWIDE> 1 "register_operand" "w")
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP1) 0)))]
- "TARGET_SIMD"
- "usubw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
- [(set_attr "type" "neon_sub_widen")]
-)
-
-(define_insn "aarch64_uaddw<mode>_lo_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (plus:<VWIDE>
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP1) 0)
- (match_operand:<VWIDE> 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "uaddw\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vhalftype>"
- [(set_attr "type" "neon_add_widen")]
-)
-
-(define_insn "aarch64_usubw<mode>_hi_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (minus:<VWIDE>
- (match_operand:<VWIDE> 1 "register_operand" "w")
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP2) 0)))]
- "TARGET_SIMD"
- "usubw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
- [(set_attr "type" "neon_sub_widen")]
-)
-
-(define_insn "aarch64_uaddw<mode>_hi_zip"
- [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
- (plus:<VWIDE>
- (subreg:<VWIDE>
- (unspec:<MODE> [
- (match_operand:VQW 2 "register_operand" "w")
- (match_operand:VQW 3 "aarch64_simd_imm_zero")
- ] UNSPEC_ZIP2) 0)
- (match_operand:<VWIDE> 1 "register_operand" "w")))]
- "TARGET_SIMD"
- "uaddw2\\t%0.<Vwtype>, %1.<Vwtype>, %2.<Vtype>"
- [(set_attr "type" "neon_add_widen")]
-)
-
(define_insn "aarch64_<ANY_EXTEND:su>addw<mode>"
[(set (match_operand:<VWIDE> 0 "register_operand" "=w")
(plus:<VWIDE>
)
;; Sign- or zero-extend a 64-bit integer vector to a 128-bit vector.
-(define_insn "<optab><Vnarrowq><mode>2"
+(define_insn_and_split "<optab><Vnarrowq><mode>2"
[(set (match_operand:VQN 0 "register_operand" "=w")
(ANY_EXTEND:VQN (match_operand:<VNARROWQ> 1 "register_operand" "w")))]
"TARGET_SIMD"
"<su>xtl\t%0.<Vtype>, %1.<Vntype>"
+ "&& <CODE> == ZERO_EXTEND
+ && aarch64_split_simd_shift_p (insn)"
+ [(const_int 0)]
+ {
+ /* On many cores, it is cheaper to implement UXTL using a ZIP1 with zero,
+ provided that the cost of the zero can be amortized over several
+ operations. We'll later recombine the zero and zip if there are
+ not sufficient uses of the zero to make the split worthwhile. */
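+    /* For the V8QI-to-V8HI case, for instance, the split replaces
+	  uxtl	v0.8h, v1.8b
+       with
+	  zip1	v0.16b, v1.16b, vZ.16b
+       where vZ stands for whichever register ends up holding the shared
+       zero (register numbers here are purely illustrative).  */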
+ rtx res = simplify_gen_subreg (<VNARROWQ2>mode, operands[0],
+ <MODE>mode, 0);
+ rtx zero = aarch64_gen_shareable_zero (<VNARROWQ2>mode);
+ rtx op = lowpart_subreg (<VNARROWQ2>mode, operands[1], <VNARROWQ>mode);
+ emit_insn (gen_aarch64_zip1<Vnarrowq2> (res, op, zero));
+ DONE;
+ }
[(set_attr "type" "neon_shift_imm_long")]
)
== SYMBOL_TINY_ABSOLUTE;
}
+/* Return a function-invariant register that contains VALUE. *CACHED_INSN
+ caches instructions that set up such registers, so that they can be
+ reused by future calls. */
+
+static rtx
+aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
+{
+ rtx_insn *insn = *cached_insn;
+ if (insn && INSN_P (insn) && !insn->deleted ())
+ {
+ rtx pat = PATTERN (insn);
+ if (GET_CODE (pat) == SET)
+ {
+ rtx dest = SET_DEST (pat);
+ if (REG_P (dest)
+ && !HARD_REGISTER_P (dest)
+ && rtx_equal_p (SET_SRC (pat), value))
+ return dest;
+ }
+ }
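+  /* Either no register has been cached yet or the cached insn is no longer
+     usable, so create a fresh pseudo and emit its initialization at the
+     start of the function, where it dominates (and so can be reused by)
+     any later request for the same VALUE.  */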
+ rtx reg = gen_reg_rtx (GET_MODE (value));
+ *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
+ function_beg_insn);
+ return reg;
+}
+
/* Create a 0 constant that is based on V4SI to allow CSE to optimally share
the constant creation. */
rtx
aarch64_gen_shareable_zero (machine_mode mode)
{
- machine_mode zmode = V4SImode;
- rtx tmp = gen_reg_rtx (zmode);
- emit_move_insn (tmp, CONST0_RTX (zmode));
- return lowpart_subreg (mode, tmp, zmode);
+ rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
+ CONST0_RTX (V4SImode));
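+  /* Every requested mode shares the single V4SI pseudo; asking for, say,
+     V16QImode just returns (subreg:V16QI (reg:V4SI Z) 0), so normally only
+     one zeroing instruction is emitted per function.  */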
+ return lowpart_subreg (mode, reg, GET_MODE (reg));
+}
+
+/* INSN is some form of extension or shift that can be split into a
+ permutation involving a shared zero. Return true if we should
+ perform such a split.
+
+ ??? For now, make sure that the split instruction executes more
+ frequently than the zero that feeds it. In future it would be good
+ to split without that restriction and instead recombine shared zeros
+ if they turn out not to be worthwhile. This would allow splits in
+ single-block functions and would also cope more naturally with
+ rematerialization. */
+
+bool
+aarch64_split_simd_shift_p (rtx_insn *insn)
+{
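+  /* Requiring the insn's block to be hotter than the entry block roughly
+     limits the split to code inside loops, where the cost of the hoisted
+     zero can be spread over many executions.  */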
+ return (can_create_pseudo_p ()
+ && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
+ && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
+ < BLOCK_FOR_INSN (insn)->count));
}
/* Return a const_int vector of VAL. */