tree-optimization/96208 - SLP of non-grouped loads

author Richard Biener <rguenther@suse.de>

Thu, 22 Jun 2023 09:40:46 +0000 (11:40 +0200)

committer Richard Biener <rguenther@suse.de>

Tue, 27 Jun 2023 07:42:27 +0000 (09:42 +0200)
author Richard Biener <rguenther@suse.de>
Thu, 22 Jun 2023 09:40:46 +0000 (11:40 +0200)
committer Richard Biener <rguenther@suse.de>
Tue, 27 Jun 2023 07:42:27 +0000 (09:42 +0200)
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c

index ee12136491071c6bfd7678c164df7a1c0a71818f..8cefa7f52afee1d316970e638997415a91a9a035 100644 (file)
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
@@ -24,11 +24,17 @@ void rephase (void)
    struct site *s;
    for(i=0,s=lattice;i<sites_on_node;i++,s++)
      for(dir=0;dir<32;dir++)
-      for(j=0;j<3;j++)for(k=0;k<3;k++)
-       {
-         s->link[dir].e[j][k].real *= s->phase[dir];
-         s->link[dir].e[j][k].imag *= s->phase[dir];
-       }
+      {
+       for(j=0;j<3;j++)
+         for(k=0;k<3;k++)
+           {
+             s->link[dir].e[j][k].real *= s->phase[dir];
+             s->link[dir].e[j][k].imag *= s->phase[dir];
+           }
+       /* Avoid loop vectorizing the outer loop after unrolling
+          the inner loops.  */
+       __asm__ volatile ("" : : : "memory");
+      }
  }
  
  int main()
diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c b/gcc/testsuite/gcc.dg/vect/slp-46.c

index 18476a43d3f61c07aede8d90ca69817b0e0b5342..79ed0bb9f6b3bddd83dd664c7675a69a7ef947ae 100644 (file)
--- a/gcc/testsuite/gcc.dg/vect/slp-46.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-46.c
@@ -94,4 +94,4 @@ main ()
    return 0;
  }
  
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_load_lanes } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_load_lanes } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc

index fee992d01715f7d08998094594c701351e5e33e0..8cb1ac1f3194b2c7356a0e3cfb5c8ee5ec4663c0 100644 (file)
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1286,15 +1286,19 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
         {
           if (load_p
               && rhs_code != CFN_GATHER_LOAD
-             && rhs_code != CFN_MASK_GATHER_LOAD)
+             && rhs_code != CFN_MASK_GATHER_LOAD
+             /* Not grouped loads are handled as externals for BB
+                vectorization.  For loop vectorization we can handle
+                splats the same way we handle single element interleaving.  */
+             && (is_a <bb_vec_info> (vinfo)
+                 || stmt_info != first_stmt_info))
             {
               /* Not grouped load.  */
               if (dump_enabled_p ())
                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                  "Build SLP failed: not grouped load %G", stmt);
  
-             /* FORNOW: Not grouped loads are not supported.  */
-             if (is_a <bb_vec_info> (vinfo) && i != 0)
+             if (i != 0)
                 continue;
               /* Fatal mismatch.  */
               matches[0] = false;
@@ -1302,7 +1306,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
             }
  
           /* Not memory operation.  */
-         if (!phi_p
+         if (!load_p
+             && !phi_p
               && rhs_code.is_tree_code ()
               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
@@ -1774,7 +1779,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
      return NULL;
  
    /* If the SLP node is a load, terminate the recursion unless masked.  */
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+  if (STMT_VINFO_DATA_REF (stmt_info)
        && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
      {
        if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
@@ -1798,8 +1803,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
             {
-             int load_place = vect_get_place_in_interleaving_chain
-                 (load_info, first_stmt_info);
+             int load_place;
+             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+               load_place = vect_get_place_in_interleaving_chain
+                               (load_info, first_stmt_info);
+             else
+               load_place = 0;
               gcc_assert (load_place != -1);
               load_permutation.safe_push (load_place);
             }
@@ -5439,6 +5448,16 @@ vect_optimize_slp_pass::remove_redundant_permutations ()
                 this_load_permuted = true;
                 break;
               }
+         /* When this isn't a grouped access we know it's single element
+            and contiguous.  */
+         if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
+           {
+             if (!this_load_permuted
+                 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
+                     || SLP_TREE_LANES (node) == 1))
+               SLP_TREE_LOAD_PERMUTATION (node).release ();
+             continue;
+           }
           stmt_vec_info first_stmt_info
             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
           if (!this_load_permuted
@@ -8129,12 +8148,16 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
    tree vectype = SLP_TREE_VECTYPE (node);
    unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
    unsigned int mask_element;
+  unsigned dr_group_size;
    machine_mode mode;
  
    if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
-  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+    dr_group_size = 1;
+  else
+    {
+      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      dr_group_size = DR_GROUP_SIZE (stmt_info);
+    }
  
    mode = TYPE_MODE (vectype);
    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -8175,7 +8198,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
    unsigned int nelts_to_build;
    unsigned int nvectors_per_build;
    unsigned int in_nlanes;
-  bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+  bool repeating_p = (group_size == dr_group_size
                       && multiple_p (nunits, group_size));
    if (repeating_p)
      {
@@ -8188,7 +8211,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
          it at least one to ensure the later computation for n_perms
          proceed.  */
        nvectors_per_build = nstmts > 0 ? nstmts : 1;
-      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
+      in_nlanes = dr_group_size * 3;
      }
    else
      {
@@ -8200,7 +8223,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
        mask.new_vector (const_nunits, const_nunits, 1);
        nelts_to_build = const_vf * group_size;
        nvectors_per_build = 1;
-      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
+      in_nlanes = const_vf * dr_group_size;
      }
    auto_sbitmap used_in_lanes (in_nlanes);
    bitmap_clear (used_in_lanes);
@@ -8214,7 +8237,7 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
      {
        unsigned int iter_num = j / group_size;
        unsigned int stmt_num = j % group_size;
-      unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info) + perm[stmt_num]);
+      unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
        bitmap_set_bit (used_in_lanes, i);
        if (repeating_p)
         {
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc

index b31971e99a4f6176a9ec290e7647372efc2d9f9f..d642d3c257f8d540a8562eedbcd40372b9550959 100644 (file)
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1150,6 +1150,8 @@ vect_model_load_cost (vec_info *vinfo,
        /* If the load is permuted then the alignment is determined by
          the first group element not by the first scalar stmt DR.  */
        stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      if (!first_stmt_info)
+       first_stmt_info = stmt_info;
        /* Record the cost for the permutation.  */
        unsigned n_perms, n_loads;
        vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
@@ -2203,12 +2205,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
  {
    loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
    class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
-  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+  stmt_vec_info first_stmt_info;
+  unsigned int group_size;
+  unsigned HOST_WIDE_INT gap;
+  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = DR_GROUP_SIZE (first_stmt_info);
+      gap = DR_GROUP_GAP (first_stmt_info);
+    }
+  else
+    {
+      first_stmt_info = stmt_info;
+      group_size = 1;
+      gap = 0;
+    }
    dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
-  unsigned int group_size = DR_GROUP_SIZE (first_stmt_info);
    bool single_element_p = (stmt_info == first_stmt_info
                            && !DR_GROUP_NEXT_ELEMENT (stmt_info));
-  unsigned HOST_WIDE_INT gap = DR_GROUP_GAP (first_stmt_info);
    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  
    /* True if the vectorized statements would access beyond the last
@@ -2311,11 +2325,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                     *memory_access_type = VMAT_ELEMENTWISE;
                 }
             }
-         else
+         else if (cmp == 0 && loop_vinfo)
             {
-             gcc_assert (!loop_vinfo || cmp > 0);
-             *memory_access_type = VMAT_CONTIGUOUS;
+             gcc_assert (vls_type == VLS_LOAD);
+             *memory_access_type = VMAT_INVARIANT;
+             /* Invariant accesses perform only component accesses, alignment
+                is irrelevant for them.  */
+             *alignment_support_scheme = dr_unaligned_supported;
             }
+         else
+           *memory_access_type = VMAT_CONTIGUOUS;
  
           /* When we have a contiguous access across loop iterations
              but the access in the loop doesn't cover the full vector
@@ -2540,7 +2559,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
          is irrelevant for them.  */
        *alignment_support_scheme = dr_unaligned_supported;
      }
-  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node)
      {
        if (!get_group_load_store_type (vinfo, stmt_info, vectype, slp_node,
                                       masked_p,
@@ -9464,46 +9483,6 @@ vectorizable_load (vec_info *vinfo,
           return false;
         }
  
-      if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
-       {
-         slp_perm = true;
-
-         if (!loop_vinfo)
-           {
-             /* In BB vectorization we may not actually use a loaded vector
-                accessing elements in excess of DR_GROUP_SIZE.  */
-             stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
-             group_info = DR_GROUP_FIRST_ELEMENT (group_info);
-             unsigned HOST_WIDE_INT nunits;
-             unsigned j, k, maxk = 0;
-             FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
-               if (k > maxk)
-                 maxk = k;
-             tree vectype = SLP_TREE_VECTYPE (slp_node);
-             if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
-                 || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "BB vectorization with gaps at the end of "
-                                    "a load is not supported\n");
-                 return false;
-               }
-           }
-
-         auto_vec<tree> tem;
-         unsigned n_perms;
-         if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
-                                            true, &n_perms))
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                vect_location,
-                                "unsupported load permutation\n");
-             return false;
-           }
-       }
-
        /* Invalidate assumptions made by dependence analysis when vectorization
          on the unrolled body effectively re-orders stmts.  */
        if (!PURE_SLP_STMT (stmt_info)
@@ -9521,6 +9500,46 @@ vectorizable_load (vec_info *vinfo,
    else
      group_size = 1;
  
+  if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+    {
+      slp_perm = true;
+
+      if (!loop_vinfo)
+       {
+         /* In BB vectorization we may not actually use a loaded vector
+            accessing elements in excess of DR_GROUP_SIZE.  */
+         stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+         group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+         unsigned HOST_WIDE_INT nunits;
+         unsigned j, k, maxk = 0;
+         FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+             if (k > maxk)
+               maxk = k;
+         tree vectype = SLP_TREE_VECTYPE (slp_node);
+         if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+             || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "BB vectorization with gaps at the end of "
+                                "a load is not supported\n");
+             return false;
+           }
+       }
+
+      auto_vec<tree> tem;
+      unsigned n_perms;
+      if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf,
+                                        true, &n_perms))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                            vect_location,
+                            "unsupported load permutation\n");
+         return false;
+       }
+    }
+
    vect_memory_access_type memory_access_type;
    enum dr_alignment_support alignment_support_scheme;
    int misalignment;
@@ -9898,10 +9917,19 @@ vectorizable_load (vec_info *vinfo,
        || (!slp && memory_access_type == VMAT_CONTIGUOUS))
      grouped_load = false;
  
-  if (grouped_load)
+  if (grouped_load
+      || (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()))
      {
-      first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
-      group_size = DR_GROUP_SIZE (first_stmt_info);
+      if (grouped_load)
+       {
+         first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         group_size = DR_GROUP_SIZE (first_stmt_info);
+       }
+      else
+       {
+         first_stmt_info = stmt_info;
+         group_size = 1;
+       }
        /* For SLP vectorization we directly vectorize a subchain
           without permutation.  */
        if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
author	Richard Biener <rguenther@suse.de>
	Thu, 22 Jun 2023 09:40:46 +0000 (11:40 +0200)
committer	Richard Biener <rguenther@suse.de>
	Tue, 27 Jun 2023 07:42:27 +0000 (09:42 +0200)
gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c		patch \| blob \| blame \| history
gcc/testsuite/gcc.dg/vect/slp-46.c		patch \| blob \| blame \| history
gcc/tree-vect-slp.cc		patch \| blob \| blame \| history
gcc/tree-vect-stmts.cc		patch \| blob \| blame \| history