Do single-lane SLP discovery for reductions
author    Richard Biener <rguenther@suse.de>
          Fri, 23 Feb 2024 10:45:50 +0000 (11:45 +0100)
committer Richard Biener <rguenther@suse.de>
          Tue, 4 Jun 2024 08:13:30 +0000 (10:13 +0200)
The following performs single-lane SLP discovery for reductions.
It requires a fixup for outer-loop vectorization, where a check
for multiple types needs adjustment: otherwise bogus pointer
IV increments happen when there are multiple copies of vector stmts
in the inner loop.
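
For reference, a minimal example (my own, not part of the patch) of
the kind of reduction this now covers: a plain sum has just a single
lane, so it previously did not form an SLP instance at all:

  /* Single-lane reduction: one reduction statement per iteration.  */
  double
  sum (const double *a, int n)
  {
    double s = 0.0;
    for (int i = 0; i < n; ++i)
      s += a[i];
    return s;
  }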

For the reduction epilog handling, this extends the optimized path
to cover the trivial single-lane SLP reduction case.
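
To illustrate (a conceptual sketch using GNU vector extensions, not
code GCC emits; the v4si type and helper name are made up): for such a
single-lane SLP reduction the epilog can now reduce the one vector
accumulator with the direct opcode or shift-reduction scheme instead
of extracting and summing each lane individually:

  typedef int v4si __attribute__ ((vector_size (16)));

  static int
  reduce_v4si (v4si acc)
  {
    /* Add the high half onto the low half ...  */
    v4si tmp = acc + __builtin_shuffle (acc, (v4si) { 2, 3, 2, 3 });
    /* ... then lane 1 onto lane 0; the full sum ends up in lane 0.  */
    tmp = tmp + __builtin_shuffle (tmp, (v4si) { 1, 1, 1, 1 });
    return tmp[0];
  }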

The fix for PR65518 implemented in vect_grouped_load_supported for
the non-SLP path needs an SLP counterpart, which I put in
get_group_load_store_type.
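
As an illustration of the PR65518 situation (again my own minimal
example, not the testcase from the PR): only one element out of each
group of eight is used, so contiguous vector loads would fetch far
more data than is useful and blow up memory:

  /* Single-element interleaving with group size 8: loading this
     contiguously wastes most vector lanes (see PR65518).  */
  int
  sum_strided (const int *a, int n)
  {
    int s = 0;
    for (int i = 0; i < n; ++i)
      s += a[8 * i];
    return s;
  }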

I've decided to adjust three testcases for the single-lane SLP
instances that now appear, rather than suppressing the "vectorizing
stmts using SLP" dump for single-lane instances, as that would also
require testsuite adjustments.

* tree-vect-slp.cc (vect_build_slp_tree_2): Only multi-lane
discoveries are reduction chains and need special backedge
treatment.
(vect_analyze_slp): Fall back to single-lane SLP discovery
for reductions.  Make sure to try single-lane SLP reduction
for all reductions as fallback.
* tree-vect-loop.cc (vect_create_epilog_for_reduction): Allow
direct opcode and shift reduction also for SLP reductions
with a single lane.
* tree-vect-stmts.cc (get_group_load_store_type): For SLP also
check for the PR65518 single-element interleaving case as done in
vect_grouped_load_supported.
(vectorizable_store): Avoid outer loop SLP vectorization with
multi-copy vector stmts in the inner loop.
(vectorizable_load): Likewise.

* gcc.dg/vect/slp-24.c: Expect another SLP instance for the
reduction.
* gcc.dg/vect/slp-24-big-array.c: Likewise.
* gcc.dg/vect/slp-reduc-6.c: Remove scan for zero SLP instances.

gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
gcc/testsuite/gcc.dg/vect/slp-24.c
gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
gcc/tree-vect-loop.cc
gcc/tree-vect-slp.cc
gcc/tree-vect-stmts.cc

diff --git a/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c b/gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
index 5eaea9600acb2b8ffe674730bcf9514b51ae105f..63f744338a14addc5a65ee929b864d9d6cacd2b4 100644
@@ -92,4 +92,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_no_align && ilp32 } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-24.c b/gcc/testsuite/gcc.dg/vect/slp-24.c
index 59178f2c0f28bdbf657ad68658d373e75d076f79..7814d7c324eaaf15d359c1221d8389246d2f41f5 100644
@@ -78,4 +78,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_no_align && ilp32 } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-6.c
index 1fd15aa3c87daecedca12106a98f8b1848f49c9c..5566705a70408108b6aa5bd81d7d7a6f00250257 100644
@@ -45,6 +45,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail { vect_no_int_add || { ! { vect_unpack || vect_strided2 } } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
 /* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */
 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index a08357acc11ecc6fbe11833fb411282568831652..06292ed8bbe45b40fe659200e1bb1e3089480e08 100644
@@ -6504,7 +6504,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
   /* 2.3 Create the reduction code, using one of the three schemes described
          above. In SLP we simply need to extract all the elements from the 
          vector (without reducing them), so we use scalar shifts.  */
-  else if (reduc_fn != IFN_LAST && !slp_reduc)
+  else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
     {
       tree tmp;
       tree vec_elem_type;
@@ -6674,7 +6674,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
       reduc_inputs[0] = new_temp;
 
-      if (reduce_with_shift && !slp_reduc)
+      if (reduce_with_shift && (!slp_reduc || group_size == 1))
        {
          int element_bitsize = tree_to_uhwi (bitsize);
          /* Enforced by vectorizable_reduction, which disallows SLP reductions
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 11ec82086fca5cbb697de2b20b1c83060c86285e..ba1190c715501e1e074f43f3dbc4ed3b68cf1d93 100644
@@ -1911,7 +1911,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
            /* Reduction chain backedge defs are filled manually.
               ???  Need a better way to identify a SLP reduction chain PHI.
               Or a better overall way to SLP match those.  */
-           if (all_same && def_type == vect_reduction_def)
+           if (stmts.length () > 1
+               && all_same && def_type == vect_reduction_def)
              skip_args[loop_latch_edge (loop)->dest_idx] = true;
          }
        else if (def_type != vect_internal_def)
@@ -3909,9 +3910,10 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
          }
 
       /* Find SLP sequences starting from groups of reductions.  */
-      if (loop_vinfo->reductions.length () > 1)
+      if (loop_vinfo->reductions.length () > 0)
        {
-         /* Collect reduction statements.  */
+         /* Collect reduction statements we can combine into
+            a SLP reduction.  */
          vec<stmt_vec_info> scalar_stmts;
          scalar_stmts.create (loop_vinfo->reductions.length ());
          for (auto next_info : loop_vinfo->reductions)
@@ -3924,23 +3926,58 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
                     reduction path.  In that case we'd have to reverse
                     engineer that conversion stmt following the chain using
                     reduc_idx and from the PHI using reduc_def.  */
-                 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
-                 /* Do not discover SLP reductions for lane-reducing ops, that
-                    will fail later.  */
-                 && (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
-                     || !lane_reducing_op_p (gimple_assign_rhs_code (g))))
-               scalar_stmts.quick_push (next_info);
+                 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
+               {
+                 /* Do not discover SLP reductions combining lane-reducing
+                    ops, that will fail later.  */
+                 if (!(g = dyn_cast <gassign *> (STMT_VINFO_STMT (next_info)))
+                     || !lane_reducing_op_p (gimple_assign_rhs_code (g)))
+                   scalar_stmts.quick_push (next_info);
+                 else
+                   {
+                     /* Do SLP discovery for single-lane reductions.  */
+                     vec<stmt_vec_info> stmts;
+                     vec<stmt_vec_info> roots = vNULL;
+                     vec<tree> remain = vNULL;
+                     stmts.create (1);
+                     stmts.quick_push (next_info);
+                     vect_build_slp_instance (vinfo,
+                                              slp_inst_kind_reduc_group,
+                                              stmts, roots, remain,
+                                              max_tree_size, &limit,
+                                              bst_map, NULL);
+                   }
+               }
            }
-         if (scalar_stmts.length () > 1)
+         /* Save for re-processing on failure.  */
+         vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
+         vec<stmt_vec_info> roots = vNULL;
+         vec<tree> remain = vNULL;
+         if (scalar_stmts.length () <= 1
+             || !vect_build_slp_instance (loop_vinfo,
+                                          slp_inst_kind_reduc_group,
+                                          scalar_stmts, roots, remain,
+                                          max_tree_size, &limit, bst_map,
+                                          NULL))
            {
-             vec<stmt_vec_info> roots = vNULL;
-             vec<tree> remain = vNULL;
-             vect_build_slp_instance (loop_vinfo, slp_inst_kind_reduc_group,
-                                      scalar_stmts, roots, remain,
-                                      max_tree_size, &limit, bst_map, NULL);
+             if (scalar_stmts.length () <= 1)
+               scalar_stmts.release ();
+             /* Do SLP discovery for single-lane reductions.  */
+             for (auto stmt_info : saved_stmts)
+               {
+                 vec<stmt_vec_info> stmts;
+                 vec<stmt_vec_info> roots = vNULL;
+                 vec<tree> remain = vNULL;
+                 stmts.create (1);
+                 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
+                 vect_build_slp_instance (vinfo,
+                                          slp_inst_kind_reduc_group,
+                                          stmts, roots, remain,
+                                          max_tree_size, &limit,
+                                          bst_map, NULL);
+               }
+             saved_stmts.release ();
            }
-         else
-           scalar_stmts.release ();
        }
     }
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 935d80f0e1bcaa81f561b39890be9d1a71d3f662..b26cc74f41727d9b2f2cd285aae7801070dfdf7b 100644
@@ -2160,6 +2160,23 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                }
              overrun_p = true;
            }
+
+         /* If this is single-element interleaving with an element
+            distance that leaves unused vector loads around punt - we
+            at least create very sub-optimal code in that case (and
+            blow up memory, see PR65518).  */
+         if (loop_vinfo
+             && *memory_access_type == VMAT_CONTIGUOUS
+             && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+             && single_element_p
+             && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "single-element interleaving not supported "
+                                "for not adjacent vector loads\n");
+             return false;
+           }
        }
     }
   else
@@ -8202,7 +8219,9 @@ vectorizable_store (vec_info *vinfo,
   gcc_assert (ncopies >= 1);
 
   /* FORNOW.  This restriction should be relaxed.  */
-  if (loop && nested_in_vect_loop_p (loop, stmt_info) && ncopies > 1)
+  if (loop
+      && nested_in_vect_loop_p (loop, stmt_info)
+      && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9945,7 +9964,8 @@ vectorizable_load (vec_info *vinfo,
   gcc_assert (ncopies >= 1);
 
   /* FORNOW. This restriction should be relaxed.  */
-  if (nested_in_vect_loop && ncopies > 1)
+  if (nested_in_vect_loop
+      && (ncopies > 1 || (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,