@item vect_masked_store
Target supports vector masked stores.

+@item vect_gather_load_ifn
+Target supports vector gather loads using internal functions
+(rather than via built-in functions or emulation).
+
@item vect_scatter_store
Target supports vector scatter stores.
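As a usage sketch (not part of the patch): on targets with this property, an
indexed load like the one below is implemented during vectorization as an
internal-function gather (.GATHER_LOAD / .MASK_GATHER_LOAD) rather than a
target builtin, and the keyword lets the new tests key their tree-dump scans
on exactly those targets.

void
f (int *restrict dst, int *restrict src, int *restrict idx, int n)
{
  for (int i = 0; i < n; ++i)
    dst[i] = src[idx[i]];
}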
--- /dev/null
+#include "tree-vect.h"
+
+#define N 16
+
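+/* Each iteration does two gathers from the same base X; the final scan
+   checks that the loop is vectorized purely with SLP.  */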
+void __attribute__((noipa))
+f (int *restrict y, int *restrict x, int *restrict indices)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ y[i * 2] = x[indices[i * 2]] + 1;
+ y[i * 2 + 1] = x[indices[i * 2 + 1]] + 2;
+ }
+}
+
+int y[N * 2];
+int x[N * 2] = {
+ 72704, 52152, 51301, 96681,
+ 57937, 60490, 34504, 60944,
+ 42225, 28333, 88336, 74300,
+ 29250, 20484, 38852, 91536,
+ 86917, 63941, 31590, 21998,
+ 22419, 26974, 28668, 13968,
+ 3451, 20247, 44089, 85521,
+ 22871, 87362, 50555, 85939
+};
+int indices[N * 2] = {
+ 15, 16, 9, 19,
+ 7, 22, 19, 1,
+ 22, 13, 15, 30,
+ 5, 12, 11, 11,
+ 10, 25, 5, 20,
+ 22, 24, 24, 28,
+ 30, 19, 6, 4,
+ 7, 12, 8, 21
+};
+int expected[N * 2] = {
+ 91537, 86919, 28334, 22000,
+ 60945, 28670, 21999, 52154,
+ 28669, 20486, 91537, 50557,
+ 60491, 29252, 74301, 74302,
+ 88337, 20249, 60491, 22421,
+ 28669, 3453, 3452, 22873,
+ 50556, 22000, 34505, 57939,
+ 60945, 29252, 42226, 26976
+};
+
+int
+main (void)
+{
+ check_vect ();
+
+ f (y, x, indices);
+  for (int i = 0; i < N * 2; ++i)
+ if (y[i] != expected[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
--- /dev/null
+/* { dg-do compile } */
+
+#define N 16
+
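+/* The two gathers in each iteration load from different base arrays.  */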
+void
+f1 (int *restrict y, int *restrict x1, int *restrict x2,
+ int *restrict indices)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ y[i * 2] = x1[indices[i * 2]] + 1;
+ y[i * 2 + 1] = x2[indices[i * 2 + 1]] + 2;
+ }
+}
+
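+/* The second gather scales its index by an extra factor of 2.  */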
+void
+f2 (int *restrict y, int *restrict x, int *restrict indices)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ y[i * 2] = x[indices[i * 2]] + 1;
+ y[i * 2 + 1] = x[indices[i * 2 + 1] * 2] + 2;
+ }
+}
+
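+/* The second gather uses an unsigned rather than a signed index.  */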
+void
+f3 (int *restrict y, int *restrict x, int *restrict indices)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ y[i * 2] = x[indices[i * 2]] + 1;
+ y[i * 2 + 1] = x[(unsigned int) indices[i * 2 + 1]] + 2;
+ }
+}
+
+/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect { target vect_gather_load_ifn } } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-vect-cost-model" } */
+
+#include <stdint.h>
+
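+/* Signed 32-bit offsets: expect [base, z.s, sxtw 2] gather addressing.  */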
+void
+f1 (int32_t *restrict y, int32_t *restrict x, int32_t *restrict index)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ y[i * 2] = x[index[i * 2]] + 1;
+ y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;
+ }
+}
+
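+/* Unsigned 32-bit offsets: expect [base, z.s, uxtw 2] gather addressing.  */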
+void
+f2 (int32_t *restrict y, int32_t *restrict x, uint32_t *restrict index)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ y[i * 2] = x[index[i * 2]] + 1;
+ y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;
+ }
+}
+
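+/* 64-bit offsets, 32-bit data: expect ld1w with [base, z.d, lsl 2].  */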
+void
+f3 (int32_t *restrict y, int32_t *restrict x, uint64_t *restrict index)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ y[i * 2] = x[index[i * 2]] + 1;
+ y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;
+ }
+}
+
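+/* 64-bit offsets, 64-bit data: expect ld1d with [base, z.d, lsl 3].  */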
+void
+f4 (int64_t *restrict y, int64_t *restrict x, uint64_t *restrict index)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ y[i * 2] = x[index[i * 2]] + 1;
+ y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, sxtw #?2\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+\.s, uxtw #?2\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?2\]} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+\.d, lsl #?3\]} 1 } } */
|| [istarget amdgcn*-*-*] }]
}
+# Return 1 if the target supports vector gather loads via internal functions.
+
+proc check_effective_target_vect_gather_load_ifn { } {
+ return [expr { [check_effective_target_aarch64_sve] }]
+}
+
# Return 1 if the target supports vector scatter stores.
proc check_effective_target_vect_scatter_store { } {
lambda_vector dist_v;
unsigned int loop_depth;
+ /* If user asserted safelen consecutive iterations can be
+ executed concurrently, assume independence. */
+ auto apply_safelen = [&]()
+ {
+ if (loop->safelen >= 2)
+ {
+ if ((unsigned int) loop->safelen < *max_vf)
+ *max_vf = loop->safelen;
+ LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
+ return true;
+ }
+ return false;
+ };
+
/* In loop analysis all data references should be vectorizable. */
if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
|| !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
get_alias_set (DR_REF (drb))))
return opt_result::success ();
+ if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
+ || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
+ {
+ if (apply_safelen ())
+ return opt_result::success ();
+
+ return opt_result::failure_at
+ (stmtinfo_a->stmt,
+ "possible alias involving gather/scatter between %T and %T\n",
+ DR_REF (dra), DR_REF (drb));
+ }
+
/* Unknown data dependence. */
if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
{
- /* If user asserted safelen consecutive iterations can be
- executed concurrently, assume independence. */
- if (loop->safelen >= 2)
- {
- if ((unsigned int) loop->safelen < *max_vf)
- *max_vf = loop->safelen;
- LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
- return opt_result::success ();
- }
-
- if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
- || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
- return opt_result::failure_at
- (stmtinfo_a->stmt,
- "versioning for alias not supported for: "
- "can't determine dependence between %T and %T\n",
- DR_REF (dra), DR_REF (drb));
+ if (apply_safelen ())
+ return opt_result::success ();
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
/* Known data dependence. */
if (DDR_NUM_DIST_VECTS (ddr) == 0)
{
- /* If user asserted safelen consecutive iterations can be
- executed concurrently, assume independence. */
- if (loop->safelen >= 2)
- {
- if ((unsigned int) loop->safelen < *max_vf)
- *max_vf = loop->safelen;
- LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
- return opt_result::success ();
- }
-
- if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
- || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
- return opt_result::failure_at
- (stmtinfo_a->stmt,
- "versioning for alias not supported for: "
- "bad dist vector for %T and %T\n",
- DR_REF (dra), DR_REF (drb));
+ if (apply_safelen ())
+ return opt_result::success ();
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
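For context on the safelen handling that the new apply_safelen lambda
centralizes: loop->safelen comes from user assertions such as
#pragma omp simd safelen(n) or #pragma GCC ivdep, so a loop like the
following sketch (not part of the patch; assumes -O3 -fopenmp-simd or
-fopenmp) can still be vectorized even though the dependence between the
gather and the store is unknown at compile time.

/* The user promises that any 8 consecutive iterations are independent,
   so the possible alias between the gather from X and the store to Y
   only caps the vectorization factor at 8 instead of blocking
   vectorization.  #pragma GCC ivdep would assert an unbounded safelen.  */
void
f (int *x, int *y, int *idx, int n)
{
#pragma omp simd safelen(8)
  for (int i = 0; i < n; ++i)
    y[i] = x[idx[i]] + 1;
}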
{ 4, -2, -1, 1, 2 },
{ 4, -1, -2, 2, 1 }
};
+static const int arg1_map[] = { 1, 1 };
static const int arg2_map[] = { 1, 2 };
/* For most SLP statements, there is a one-to-one mapping between
case IFN_MASK_LOAD:
return arg2_map;
+ case IFN_GATHER_LOAD:
+ return arg1_map;
+
default:
break;
}
if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
return false;
}
+
+ /* Check that any unvectorized arguments are equal. */
+ if (const int *map = vect_get_operand_map (call1))
+ {
+ unsigned int nkept = *map++;
+ unsigned int mapi = 0;
+ for (unsigned int i = 0; i < nargs; ++i)
+ if (mapi < nkept && map[mapi] == int (i))
+ mapi += 1;
+ else if (!operand_equal_p (gimple_call_arg (call1, i),
+ gimple_call_arg (call2, i)))
+ return false;
+ }
+
return true;
}
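To make the { 1, 1 } encoding concrete, here is a stand-alone sketch (not
part of the patch) of the walk compatible_calls_p performs above, assuming
IFN_GATHER_LOAD's scalar form takes (base, offset, scale): map[0] is the
number of arguments that become SLP children, the remaining entries list
those argument indices, and every other argument must compare equal.

#include <stdbool.h>

static bool
args_compatible (const int *map, const int *args1, const int *args2,
                 unsigned int nargs)
{
  unsigned int nkept = *map++;
  unsigned int mapi = 0;
  for (unsigned int i = 0; i < nargs; ++i)
    if (mapi < nkept && map[mapi] == (int) i)
      mapi += 1;        /* Argument I becomes an SLP child: free to differ.  */
    else if (args1[i] != args2[i])
      return false;     /* Unvectorized argument: must be identical.  */
  return true;
}

int
main (void)
{
  static const int gather_map[] = { 1, 1 };   /* Like arg1_map above.  */
  int g1[] = { 100, 7, 4 };   /* base, offset, scale */
  int g2[] = { 100, 3, 4 };   /* only the offset differs: compatible */
  int g3[] = { 200, 3, 4 };   /* different base: rejected */
  return !(args_compatible (gather_map, g1, g2, 3)
           && !args_compatible (gather_map, g1, g3, 3));
}

This is what lets the paired gathers in the runtime test above form one SLP
group while the mixed-base pairs in the compile-only test are rejected.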
else
rhs_code = CALL_EXPR;
- if (cfn == CFN_MASK_LOAD)
+ if (cfn == CFN_MASK_LOAD || cfn == CFN_GATHER_LOAD)
load_p = true;
else if ((internal_fn_p (cfn)
&& !vectorizable_internal_fn_p (as_internal_fn (cfn)))
continue;
}
- if (!load_p && call_stmt)
+ if (call_stmt && first_stmt_code != CFN_MASK_LOAD)
{
if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
call_stmt))
} /* Grouped access. */
else
{
- if (load_p)
+ if (load_p && rhs_code != CFN_GATHER_LOAD)
{
/* Not grouped load. */
if (dump_enabled_p ())
&& DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
{
if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
- gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
+ gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
+ || gimple_call_internal_p (stmt, IFN_GATHER_LOAD));
else
{
*max_nunits = this_max_nunits;
calculated by the recursive call). Otherwise it is the number of
scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
VF divided by the number of elements in a vector. */
- if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ if (!STMT_VINFO_DATA_REF (stmt_info)
&& REDUC_GROUP_FIRST_ELEMENT (stmt_info))
{
for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
int group_size,
vect_memory_access_type
memory_access_type,
+ unsigned int ncopies,
gather_scatter_info *gs_info,
tree scalar_mask)
{
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
return;
}
- unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
return;
}
LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
return;
}
- unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
return;
}
static void
vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
class loop *loop, stmt_vec_info stmt_info,
+ slp_tree slp_node, unsigned int ncopies,
gather_scatter_info *gs_info,
tree *dataref_ptr, vec<tree> *vec_offset)
{
new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
gcc_assert (!new_bb);
}
- unsigned ncopies = vect_get_num_copies (loop_vinfo, gs_info->offset_vectype);
- vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, ncopies,
- gs_info->offset, vec_offset,
- gs_info->offset_vectype);
+ if (slp_node)
+ vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], vec_offset);
+ else
+ vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, ncopies,
+ gs_info->offset, vec_offset,
+ gs_info->offset_vectype);
}
/* Prepare to implement a grouped or strided load or store using
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
check_load_store_for_partial_vectors (loop_vinfo, vectype, vls_type,
group_size, memory_access_type,
- &gs_info, mask);
+ ncopies, &gs_info, mask);
if (slp_node
&& !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
{
vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
- &gs_info, &dataref_ptr,
- &vec_offsets);
+ slp_node, ncopies, &gs_info,
+ &dataref_ptr, &vec_offsets);
vec_offset = vec_offsets[0];
}
else
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
check_load_store_for_partial_vectors (loop_vinfo, vectype, VLS_LOAD,
group_size, memory_access_type,
- &gs_info, mask);
+ ncopies, &gs_info, mask);
if (dump_enabled_p ()
&& memory_access_type != VMAT_ELEMENTWISE
else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
{
vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
- &gs_info, &dataref_ptr,
- &vec_offsets);
+ slp_node, ncopies, &gs_info,
+ &dataref_ptr, &vec_offsets);
}
else
dataref_ptr