tree-optimization/115777 - STLF fails with BB vectorization of loop
author Richard Biener <rguenther@suse.de>
Mon, 17 Mar 2025 14:04:28 +0000 (15:04 +0100)
committer Richard Biener <rguenth@gcc.gnu.org>
Tue, 6 May 2025 11:36:17 +0000 (13:36 +0200)
The following tries to address the case of us BB vectorizing a loop body
that swaps consecutive elements of an array, as in bubble-sort.  Doing so
causes the vector store in the previous iteration to fail to forward to
the vector load in the current iteration since the two accesses only
partially overlap.
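
To illustrate, here is a minimal sketch of the access pattern (not actual
vectorizer output; it assumes 4-byte unsigned int elements and 8-byte
two-lane vectors as in the Zen4 numbers below, with memcpy standing in
for the vector load and store):

  #include <string.h>

  typedef unsigned int T;

  /* One inner-loop iteration of the swap, written the way the BB
     vectorizer conceptually emits it: an 8-byte load and an 8-byte
     store covering v[k-1..k].  */
  static void
  swap_step (T *v, int k)
  {
    T pair[2];
    memcpy (pair, &v[k-1], sizeof pair);	/* 8-byte vector load   */
    T tmp = pair[0];
    pair[0] = pair[1];
    pair[1] = tmp;
    memcpy (&v[k-1], pair, sizeof pair);	/* 8-byte vector store  */
    /* The next iteration (k-1) loads v[k-2..k-1], which overlaps the
       store just issued by only 4 bytes, so that store cannot forward
       to it.  */
  }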

We try to detect this situation by looking for a load-to-store data
dependence and analyzing it with respect to the containing loop for a
provably problematic access.  Currently the search for a problematic pair
is limited to loads and stores in the same SLP instance, which means the
problematic load happens in the next loop iteration and larger dependence
distances are not considered.
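
For illustration, a simplified, self-contained model of the cross-iteration
overlap test added below in vect_slp_analyze_instance_dependence (not the
GCC implementation; the DR_INIT/step constants are illustrative values for
the PR testcase with 4-byte elements, group size 2 and 8-byte vectors, and
ranges_overlap/labs stand in for the tree-level helpers):

  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* Do the byte ranges [pos1, pos1+size1) and [pos2, pos2+size2) overlap?  */
  static bool
  ranges_overlap (long pos1, unsigned long size1,
		  long pos2, unsigned long size2)
  {
    return pos1 < pos2 + (long) size2 && pos2 < pos1 + (long) size1;
  }

  int
  main (void)
  {
    long step = -4;        /* the inner loop walks the array downwards    */
    long load_init = -4;   /* load group v[k-1..k], relative to a base    */
    long store_init = -4;  /* store group v[k-1..k]                       */
    unsigned long sizel = 4 * 2, sizes = 4 * 2;  /* scalar group sizes    */
    unsigned long loadsz = 8, storesz = 8;       /* vector sizes in bytes */

    /* Does the next iteration's load overlap the current store?  */
    if (ranges_overlap (load_init + step, sizel, store_init, sizes))
      {
	unsigned long dist = labs (load_init + step - store_init);
	/* Forwarding can only be expected to work when the overlap aligns
	   with whole load vectors and the store is at least as large as
	   the load; here dist == 4 is not a multiple of loadsz == 8.  */
	if (loadsz > storesz || dist % loadsz != 0)
	  printf ("mark the load avoid_stlf_fail (distance %lu bytes)\n",
		  dist);
      }
    return 0;
  }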

On x86 with generic costing this avoids vectorizing the loop body.  Once
you do core-specific tuning the cost saved on the vector store vs. the
scalar stores still makes vectorization appear profitable, but at least
the STLF issue is avoided.

For example, on my Zen4 machine with -O2 -march=znver4 the testcase in
the PR improves from
  insertion_sort  =>     2327
to
  insertion_sort  =>      997
but plain -O2 (or -fno-tree-slp-vectorize) gives
  insertion_sort  =>      183
In the end a better target-side cost model for small-vector vectorization
is needed to reject this vectorization on the cost side as well.

I'll note this is a machine-independent heuristic (similar to the
avoid-store-forwarding RTL optimization pass); I expect that uarchs
implementing vectors will suffer from this kind of issue.  I know some
aarch64 uarchs can forward from upper/lower-part stores, but this isn't
considered at the moment.  The actual vector size/overlap distance check
could be moved to a target hook if that turns out to be necessary.

There might be a chance to use a smaller vector size for the loads,
avoiding the penalty rather than falling back to elementwise accesses;
that isn't implemented either.

PR tree-optimization/115777
* tree-vectorizer.h (_slp_tree::avoid_stlf_fail): New member.
* tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize it.
(vect_print_slp_tree): Dump it.
* tree-vect-data-refs.cc (vect_slp_analyze_instance_dependence):
For dataflow dependent loads of a store check whether there's
a cross-iteration data dependence that for sure prohibits
store-to-load forwarding and mark involved loads.
* tree-vect-stmts.cc (get_group_load_store_type): For avoid_stlf_fail
marked loads use VMAT_ELEMENTWISE.

* gcc.dg/vect/bb-slp-pr115777.c: New testcase.

gcc/testsuite/gcc.dg/vect/bb-slp-pr115777.c [new file with mode: 0644]
gcc/tree-vect-data-refs.cc
gcc/tree-vect-slp.cc
gcc/tree-vect-stmts.cc
gcc/tree-vectorizer.h

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr115777.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr115777.c
new file mode 100644 (file)
index 0000000..bba0dc7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr115777.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+
+typedef unsigned int T;
+
+#define SWAP(A, B) do { T tmp = A; A = B; B = tmp; } while (0)
+
+void
+insertion_sort(T *v, int n)
+{
+  for (int i = 1; i < n; ++i)
+    for (int k = i; k > 0 && v[k-1] > v[k]; --k)
+      SWAP(v[k-1], v[k]);
+}
+
+/* { dg-final { scan-tree-dump "using element-wise load" "slp1" { target { { x86_64-*-* i?86-*-* } && { ! ia32 } } } } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index c9395e33fcdfc7deedd979c764daae93b15abace..231a3cab4f809159759de69c4e6631299331926f 100644 (file)
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -1203,6 +1203,97 @@ vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance)
     for (unsigned k = 0; k < SLP_TREE_SCALAR_STMTS (store).length (); ++k)
       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 
+  /* If this is a SLP instance with a store check if there's a dependent
+     load that cannot be forwarded from a previous iteration of a loop
+     both are in.  This is to avoid situations like that in PR115777.  */
+  if (res && store)
+    {
+      stmt_vec_info store_info
+       = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (store)[0]);
+      class loop *store_loop = gimple_bb (store_info->stmt)->loop_father;
+      if (! loop_outer (store_loop))
+       return res;
+      vec<loop_p> loop_nest;
+      loop_nest.create (1);
+      loop_nest.quick_push (store_loop);
+      data_reference *drs = nullptr;
+      for (slp_tree &load : SLP_INSTANCE_LOADS (instance))
+       {
+         if (! STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (load)[0]))
+           continue;
+         stmt_vec_info load_info
+           = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (load)[0]);
+         if (gimple_bb (load_info->stmt)->loop_father != store_loop)
+           continue;
+
+         /* For now concern ourselves with write-after-read as we also
+            only look for re-use of the store within the same SLP instance.
+            We can still get a RAW here when the instance contains a PHI
+            with a backedge though, thus this test.  */
+         if (! vect_stmt_dominates_stmt_p (STMT_VINFO_STMT (load_info),
+                                           STMT_VINFO_STMT (store_info)))
+           continue;
+
+         if (! drs)
+           {
+             drs = create_data_ref (loop_preheader_edge (store_loop),
+                                    store_loop,
+                                    DR_REF (STMT_VINFO_DATA_REF (store_info)),
+                                    store_info->stmt, false, false);
+             if (! DR_BASE_ADDRESS (drs)
+                 || TREE_CODE (DR_STEP (drs)) != INTEGER_CST)
+               break;
+           }
+         data_reference *drl
+           = create_data_ref (loop_preheader_edge (store_loop),
+                              store_loop,
+                              DR_REF (STMT_VINFO_DATA_REF (load_info)),
+                              load_info->stmt, true, false);
+
+         /* See whether the DRs have a known constant distance throughout
+            the containing loop iteration.  */
+         if (! DR_BASE_ADDRESS (drl)
+             || ! operand_equal_p (DR_STEP (drs), DR_STEP (drl))
+             || ! operand_equal_p (DR_BASE_ADDRESS (drs),
+                                   DR_BASE_ADDRESS (drl))
+             || ! operand_equal_p (DR_OFFSET (drs), DR_OFFSET (drl)))
+           {
+             free_data_ref (drl);
+             continue;
+           }
+
+         /* If the next iteration load overlaps with a non-power-of-two offset
+            we are surely failing any STLF attempt.  */
+         HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (drl));
+         unsigned HOST_WIDE_INT sizes
+           = (TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drs))))
+              * DR_GROUP_SIZE (store_info));
+         unsigned HOST_WIDE_INT sizel
+           = (TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drl))))
+              * DR_GROUP_SIZE (load_info));
+         if (ranges_overlap_p (TREE_INT_CST_LOW (DR_INIT (drl)) + step, sizel,
+                               TREE_INT_CST_LOW (DR_INIT (drs)), sizes))
+           {
+             unsigned HOST_WIDE_INT dist
+               = absu_hwi (TREE_INT_CST_LOW (DR_INIT (drl)) + step
+                           - TREE_INT_CST_LOW (DR_INIT (drs)));
+             poly_uint64 loadsz = tree_to_poly_uint64
+                                    (TYPE_SIZE_UNIT (SLP_TREE_VECTYPE (load)));
+             poly_uint64 storesz = tree_to_poly_uint64
+                                   (TYPE_SIZE_UNIT (SLP_TREE_VECTYPE (store)));
+             /* When the overlap aligns with vector sizes used for the loads
+                and the vector stores are larger or equal to the loads
+                forwarding should work.  */
+             if (maybe_gt (loadsz, storesz) || ! multiple_p (dist, loadsz))
+               load->avoid_stlf_fail = true;
+           }
+         free_data_ref (drl);
+       }
+      if (drs)
+       free_data_ref (drs);
+      loop_nest.release ();
+    }
+
   return res;
 }
 
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 9bf142d0faf571eb10d13e3f00049fb637b81471..562e2227c7c490c12df266625f7dd5d20d41a176 100644 (file)
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -122,6 +122,7 @@ _slp_tree::_slp_tree ()
   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
   SLP_TREE_CODE (this) = ERROR_MARK;
   this->ldst_lanes = false;
+  this->avoid_stlf_fail = false;
   SLP_TREE_VECTYPE (this) = NULL_TREE;
   SLP_TREE_REPRESENTATIVE (this) = NULL;
   SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
@@ -3104,7 +3105,8 @@ vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
                                         SLP_TREE_REF_COUNT (node));
   if (SLP_TREE_VECTYPE (node))
     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
-  dump_printf (metadata, "\n");
+  dump_printf (metadata, "%s\n",
+              node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
     {
       if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 537ae6c2f61484f42f09d614e8213730023a0802..ea0b42627815a562540e3ae7cc652c2b38d0ceb6 100644 (file)
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2134,6 +2134,14 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                            : vect_store_lanes_supported (vectype, group_size,
                                                          masked_p))) != IFN_LAST)
            *memory_access_type = VMAT_LOAD_STORE_LANES;
+         else if (!loop_vinfo && slp_node->avoid_stlf_fail)
+           {
+             *memory_access_type = VMAT_ELEMENTWISE;
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "using element-wise load to avoid disrupting "
+                                "cross iteration store-to-load forwarding\n");
+           }
          else
            *memory_access_type = VMAT_CONTIGUOUS;
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 63991c3d977520f974b2f673d87418f486d9fe15..3d11559fe82b83aff7fa8143d22cc3d2cd67e94a 100644 (file)
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -266,6 +266,9 @@ struct _slp_tree {
   /* Whether uses of this load or feeders of this store are suitable
      for load/store-lanes.  */
   bool ldst_lanes;
+  /* For BB vect, flag to indicate this load node should be vectorized
+     so as to avoid STLF fails because of related stores.  */
+  bool avoid_stlf_fail;
 
   int vertex;