aarch64: Restore vectorisation of vld1 inputs [PR109072]

author Richard Sandiford <richard.sandiford@arm.com>

Mon, 3 Apr 2023 08:57:08 +0000 (09:57 +0100)

committer Richard Sandiford <richard.sandiford@arm.com>

Mon, 3 Apr 2023 08:57:08 +0000 (09:57 +0100)
author Richard Sandiford <richard.sandiford@arm.com>
Mon, 3 Apr 2023 08:57:08 +0000 (09:57 +0100)
committer Richard Sandiford <richard.sandiford@arm.com>
Mon, 3 Apr 2023 08:57:08 +0000 (09:57 +0100)
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc

index 60966fef09977eb1419a5ee75c674638fc952d76..42276e7caf7ade0172ef27c14f47ae5d3a54ba0e 100644 (file)
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2897,6 +2897,19 @@ get_mem_type_for_load_store (unsigned int fcode)
    }
  }
  
+/* We've seen a vector load from address ADDR.  Record it in
+   vector_load_decls, if appropriate.  */
+static void
+aarch64_record_vector_load_arg (tree addr)
+{
+  tree decl = aarch64_vector_load_decl (addr);
+  if (!decl)
+    return;
+  if (!cfun->machine->vector_load_decls)
+    cfun->machine->vector_load_decls = hash_set<tree>::create_ggc (31);
+  cfun->machine->vector_load_decls->add (decl);
+}
+
  /* Try to fold STMT, given that it's a call to the built-in function with
     subcode FCODE.  Return the new statement on success and null on
     failure.  */
@@ -2932,6 +2945,11 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
       BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
       BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
       BUILTIN_VALLP_NO_DI (LOAD1_P, ld1, 0, LOAD)
+       /* Punt until after inlining, so that we stand more chance of
+          recording something meaningful in vector_load_decls.  */
+       if (!cfun->after_inlining)
+         break;
+       aarch64_record_vector_load_arg (args[0]);
         if (!BYTES_BIG_ENDIAN)
           {
             enum aarch64_simd_type mem_type
@@ -2950,6 +2968,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
                                      fold_build2 (MEM_REF,
                                                   access_type,
                                                   args[0], zero));
+           gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+           gimple_set_vdef (new_stmt, gimple_vdef (stmt));
           }
         break;
  
@@ -2973,6 +2993,8 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
               = gimple_build_assign (fold_build2 (MEM_REF, access_type,
                                                   args[0], zero),
                                      args[1]);
+           gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+           gimple_set_vdef (new_stmt, gimple_vdef (stmt));
           }
         break;
  
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h

index 82c8896c7febaa35cae520c58729386e289d5ccb..475d174dd39f4d766f203ff3f7ccb6c40d4574ed 100644 (file)
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -779,6 +779,7 @@ bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
  bool aarch64_constant_address_p (rtx);
  bool aarch64_emit_approx_div (rtx, rtx, rtx);
  bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+tree aarch64_vector_load_decl (tree);
  void aarch64_expand_call (rtx, rtx, rtx, bool);
  bool aarch64_expand_cpymem (rtx *);
  bool aarch64_expand_setmem (rtx *);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc

index e7eab1b938c6a02a98d3e7427bc214f6c6bcf016..a49618ac0ead4d2951b457748022586cfdcc1800 100644 (file)
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15468,6 +15468,33 @@ aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
  
  /* Vectorizer cost model target hooks.  */
  
+/* If a vld1 from address ADDR should be recorded in vector_load_decls,
+   return the decl that should be recorded.  Return null otherwise.  */
+tree
+aarch64_vector_load_decl (tree addr)
+{
+  if (TREE_CODE (addr) != ADDR_EXPR)
+    return NULL_TREE;
+  tree base = get_base_address (TREE_OPERAND (addr, 0));
+  if (TREE_CODE (base) != VAR_DECL)
+    return NULL_TREE;
+  return base;
+}
+
+/* Return true if STMT_INFO accesses a decl that is known to be the
+   argument to a vld1 in the same function.  */
+static bool
+aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
+{
+  if (!cfun->machine->vector_load_decls)
+    return false;
+  auto dr = STMT_VINFO_DATA_REF (stmt_info);
+  if (!dr)
+    return false;
+  tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
+  return decl && cfun->machine->vector_load_decls->contains (decl);
+}
+
  /* Information about how the CPU would issue the scalar, Advanced SIMD
     or SVE version of a vector loop, using the scheme defined by the
     aarch64_base_vec_issue_info hierarchy of structures.  */
@@ -15698,6 +15725,20 @@ private:
       supported by Advanced SIMD and SVE2.  */
    bool m_has_avg = false;
  
+  /* True if the vector body contains a store to a decl and if the
+     function is known to have a vld1 from the same decl.
+
+     In the Advanced SIMD ACLE, the recommended endian-agnostic way of
+     initializing a vector is:
+
+       float f[4] = { elts };
+       float32x4_t x = vld1q_f32(f);
+
+     We should strongly prefer vectorization of the initialization of f,
+     so that the store to f and the load back can be optimized away,
+     leaving a vectorization of { elts }.  */
+  bool m_stores_to_vector_load_decl = false;
+
    /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
       - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
         SIMD code.
@@ -16714,6 +16755,18 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
             }
         }
      }
+
+  /* If the statement stores to a decl that is known to be the argument
+     to a vld1 in the same function, ignore the store for costing purposes.
+     See the comment above m_stores_to_vector_load_decl for more details.  */
+  if (stmt_info
+      && (kind == vector_store || kind == unaligned_store)
+      && aarch64_accesses_vector_load_decl_p (stmt_info))
+    {
+      stmt_cost = 0;
+      m_stores_to_vector_load_decl = true;
+    }
+
    return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
  }
  
@@ -17003,12 +17056,21 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
  
    /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
       the scalar code in the event of a tie, since there is more chance
-     of scalar code being optimized with surrounding operations.  */
+     of scalar code being optimized with surrounding operations.
+
+     In addition, if the vector body is a simple store to a decl that
+     is elsewhere loaded using vld1, strongly prefer the vector form,
+     to the extent of giving the prologue a zero cost.  See the comment
+     above m_stores_to_vector_load_decl for details.  */
    if (!loop_vinfo
        && scalar_costs
-      && m_stp_sequence_cost != ~0U
-      && m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
-    m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+      && m_stp_sequence_cost != ~0U)
+    {
+      if (m_stores_to_vector_load_decl)
+       m_costs[vect_prologue] = 0;
+      else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
+       m_costs[vect_body] = 2 * scalar_costs->total_cost ();
+    }
  
    vector_costs::finish_cost (scalar_costs);
  }
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h

index 3bde24674dcc0f8b49219b8ce397d2f432f95f3b..6834c3e99226ffd35b401986812079e55e9dd5e5 100644 (file)
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -954,6 +954,7 @@ struct GTY (()) aarch64_frame
    bool is_scs_enabled;
  };
  
+#ifdef hash_set_h
  typedef struct GTY (()) machine_function
  {
    struct aarch64_frame frame;
@@ -962,8 +963,12 @@ typedef struct GTY (()) machine_function
    /* One entry for each general purpose register.  */
    rtx call_via[SP_REGNUM];
    bool label_is_assembled;
+  /* A set of all decls that have been passed to a vld1 intrinsic in the
+     current function.  This is used to help guide the vector cost model.  */
+  hash_set<tree> *vector_load_decls;
  } machine_function;
  #endif
+#endif
  
  /* Which ABI to use.  */
  enum aarch64_abi_type
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_1.c b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c

new file mode 100644 (file)

index 0000000..6c1d2b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_1.c
@@ -0,0 +1,281 @@
+/* { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target aarch64_little_endian } } } */
+
+#include <arm_neon.h>
+
+/*
+** s32x2_1:
+**     dup     v0\.2s, w0
+**     ret
+*/
+int32x2_t
+s32x2_1 (int32_t x)
+{
+  int32_t arr[] = { x, x };
+  return vld1_s32 (arr);
+}
+
+/*
+** s32x2_2:
+**     fmov    s0, w0
+**     ret
+*/
+int32x2_t
+s32x2_2 (int32_t x)
+{
+  int32_t arr[] = { x, 0 };
+  return vld1_s32 (arr);
+}
+
+/*
+** s32x2_3:
+**     fmov    s0, w0
+**     ins     v0\.s\[1\], w1
+**     ret
+*/
+int32x2_t
+s32x2_3 (int32_t x, int32_t y)
+{
+  int32_t arr[] = { x, y };
+  return vld1_s32 (arr);
+}
+
+/*
+** f32x2_1:
+**     dup     v0\.2s, v0.s\[0\]
+**     ret
+*/
+float32x2_t
+f32x2_1 (float32_t x)
+{
+  float32_t arr[] = { x, x };
+  return vld1_f32 (arr);
+}
+
+/*
+** f32x2_2:
+**     ins     v0\.s\[1\], v1.s\[0\]
+**     ret
+*/
+float32x2_t
+f32x2_2 (float32_t x, float32_t y)
+{
+  float32_t arr[] = { x, y };
+  return vld1_f32 (arr);
+}
+
+/*
+** s16x4_1:
+**     dup     v0\.4h, w0
+**     ret
+*/
+int16x4_t
+s16x4_1 (int16_t x)
+{
+  int16_t arr[] = { x, x, x, x };
+  return vld1_s16 (arr);
+}
+
+/*
+** s16x4_2:
+**     ...
+**     fmov    [dsh]0, [wx][0-9]+
+**     ret
+*/
+int16x4_t
+s16x4_2 (int16_t x)
+{
+  int16_t arr[] = { x, 0, 0, 0 };
+  return vld1_s16 (arr);
+}
+
+/*
+** s16x4_3:
+**     dup     v0\.4h, w1
+**     ins     v0.h\[0\], w0
+**     ret
+*/
+int16x4_t
+s16x4_3 (int16_t x, int16_t y)
+{
+  int16_t arr[] = { x, y, y, y };
+  return vld1_s16 (arr);
+}
+
+/*
+** f16x4_1:
+**     dup     v0\.4h, v0.h\[0\]
+**     ret
+*/
+float16x4_t
+f16x4_1 (float16_t x)
+{
+  float16_t arr[] = { x, x, x, x };
+  return vld1_f16 (arr);
+}
+
+/*
+** s64x2_1:
+**     dup     v0\.2d, x0
+**     ret
+*/
+int64x2_t
+s64x2_1 (int64_t x)
+{
+  int64_t arr[] = { x, x };
+  return vld1q_s64 (arr);
+}
+
+/*
+** s64x2_2: { xfail *-*-* }
+**     fmov    d0, x0
+**     ret
+*/
+int64x2_t
+s64x2_2 (int64_t x)
+{
+  int64_t arr[] = { x, 0 };
+  return vld1q_s64 (arr);
+}
+
+/*
+** s64x2_3:
+**     fmov    d0, x0
+**     ins     v0\.d\[1\], x1
+**     ret
+*/
+int64x2_t
+s64x2_3 (int64_t x, int64_t y)
+{
+  int64_t arr[] = { x, y };
+  return vld1q_s64 (arr);
+}
+
+/*
+** f64x2_1:
+**     dup     v0\.2d, v0.d\[0\]
+**     ret
+*/
+float64x2_t
+f64x2_1 (float64_t x)
+{
+  float64_t arr[] = { x, x };
+  return vld1q_f64 (arr);
+}
+
+/*
+** f64x2_2:
+**     ins     v0\.d\[1\], v1.d\[0\]
+**     ret
+*/
+float64x2_t
+f64x2_2 (float64_t x, float64_t y)
+{
+  float64_t arr[] = { x, y };
+  return vld1q_f64 (arr);
+}
+
+/*
+** s32x4_1:
+**     dup     v0\.4s, w0
+**     ret
+*/
+int32x4_t
+s32x4_1 (int32_t x)
+{
+  int32_t arr[] = { x, x, x, x };
+  return vld1q_s32 (arr);
+}
+
+/*
+** s32x4_2: { xfail *-*-* }
+**     fmov    s0, w0
+**     ret
+*/
+int32x4_t
+s32x4_2 (int32_t x)
+{
+  int32_t arr[] = { x, 0, 0, 0 };
+  return vld1q_s32 (arr);
+}
+
+/*
+** s32x4_3:
+**     dup     v0\.4s, w1
+**     ins     v0.s\[0\], w0
+**     ret
+*/
+int32x4_t
+s32x4_3 (int32_t x, int32_t y)
+{
+  int32_t arr[] = { x, y, y, y };
+  return vld1q_s32 (arr);
+}
+
+/*
+** f32x4_1:
+**     dup     v0\.4s, v0.s\[0\]
+**     ret
+*/
+float32x4_t
+f32x4_1 (float32_t x)
+{
+  float32_t arr[] = { x, x, x, x };
+  return vld1q_f32 (arr);
+}
+
+void consume (float32x4_t, float32x4_t, float32x4_t, float32x4_t);
+
+/*
+** produce_1:
+** (
+**     dup     v0\.4s, v0\.s\[0\]
+**     dup     v1\.4s, v1\.s\[0\]
+**     dup     v2\.4s, v2\.s\[0\]
+**     dup     v3\.4s, v3\.s\[0\]
+** |
+**     dup     v3\.4s, v3\.s\[0\]
+**     dup     v2\.4s, v2\.s\[0\]
+**     dup     v1\.4s, v1\.s\[0\]
+**     dup     v0\.4s, v0\.s\[0\]
+** )
+**     b       consume
+*/
+void
+produce_1 (float32_t a, float32_t b, float32_t c, float32_t d)
+{
+  float arr[4][4] = {
+    { a, a, a, a },
+    { b, b, b, b },
+    { c, c, c, c },
+    { d, d, d, d }
+  };
+  consume (vld1q_f32 (arr[0]), vld1q_f32 (arr[1]),
+          vld1q_f32 (arr[2]), vld1q_f32 (arr[3]));
+}
+
+/*
+** produce_2:
+** (
+**     dup     v0\.4s, v0\.s\[0\]
+**     dup     v1\.4s, v1\.s\[0\]
+**     dup     v2\.4s, v2\.s\[0\]
+**     dup     v3\.4s, v3\.s\[0\]
+** |
+**     dup     v3\.4s, v3\.s\[0\]
+**     dup     v2\.4s, v2\.s\[0\]
+**     dup     v1\.4s, v1\.s\[0\]
+**     dup     v0\.4s, v0\.s\[0\]
+** )
+**     b       consume
+*/
+void
+produce_2 (float32_t a, float32_t b, float32_t c, float32_t d)
+{
+  float arr0[] = { a, a, a, a };
+  float arr1[] = { b, b, b, b };
+  float arr2[] = { c, c, c, c };
+  float arr3[] = { d, d, d, d };
+  consume (vld1q_f32 (arr0), vld1q_f32 (arr1),
+          vld1q_f32 (arr2), vld1q_f32 (arr3));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/pr109072_2.c b/gcc/testsuite/gcc.target/aarch64/pr109072_2.c

new file mode 100644 (file)

index 0000000..d532f08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr109072_2.c
@@ -0,0 +1,60 @@
+/* { dg-options "-O" } */
+
+#pragma GCC target "arch=armv8.2-a+dotprod"
+
+#include <arm_neon.h>
+
+static inline uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
+  return vaddvq_u32(a);
+}
+
+static inline unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
+                                           int src_stride,
+                                           const uint8_t *ref_ptr,
+                                           int ref_stride, int w, int h,
+                                           const uint8_t *second_pred) {
+
+
+  uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  int i = h;
+  do {
+    int j = 0;
+    do {
+      uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+      s0 = vld1q_u8(src_ptr + j);
+      r0 = vld1q_u8(ref_ptr + j);
+      p0 = vld1q_u8(second_pred);
+      avg0 = vrhaddq_u8(r0, p0);
+      diff0 = vabdq_u8(s0, avg0);
+      sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+      s1 = vld1q_u8(src_ptr + j + 16);
+      r1 = vld1q_u8(ref_ptr + j + 16);
+      p1 = vld1q_u8(second_pred + 16);
+      avg1 = vrhaddq_u8(r1, p1);
+      diff1 = vabdq_u8(s1, avg1);
+      sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+      j += 32;
+      second_pred += 32;
+    } while (j < w);
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  } while (--i != 0);
+
+  return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *ref_ptr,
+                                            int ref_stride, int h,
+                                            const uint8_t *second_pred) {
+  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+                         second_pred);
+}
+
+uint32_t vpx_sad32x16_avg_neon(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { return sad32xh_avg_neon(src, src_stride, ref, ref_stride, (16), second_pred); }
author	Richard Sandiford <richard.sandiford@arm.com>
	Mon, 3 Apr 2023 08:57:08 +0000 (09:57 +0100)
committer	Richard Sandiford <richard.sandiford@arm.com>
	Mon, 3 Apr 2023 08:57:08 +0000 (09:57 +0100)
gcc/config/aarch64/aarch64-builtins.cc		patch \| blob \| blame \| history
gcc/config/aarch64/aarch64-protos.h		patch \| blob \| blame \| history
gcc/config/aarch64/aarch64.cc		patch \| blob \| blame \| history
gcc/config/aarch64/aarch64.h		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/aarch64/pr109072_1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/pr109072_2.c	[new file with mode: 0644]	patch \| blob