git.ipfire.org Git - thirdparty/gcc.git/commitdiff
tree-optimization/123190 - fix costing of permuted contiguous loads
author Richard Biener <rguenther@suse.de>
Wed, 14 Jan 2026 11:45:19 +0000 (12:45 +0100)
committer Richard Biener <rguenth@gcc.gnu.org>
Wed, 14 Jan 2026 13:44:00 +0000 (14:44 +0100)
The following fixes a regression from the time we split load groups
along SLP boundaries.  When we face a permuted load from an access
that is contiguous across loop iterations we emit code that loads
the whole group and then emit required permutations.  The permutations
might not need all those loads, and if we split the group we would
not have emitted them.  Fortunately when analyzing a permutation
we compute both the number of required permutes and the number of
loads that will survive the following DCE.  So make sure to use that
when costing.  This allows the previously added testcase for PR123190
to undergo epilog vectorization also at -O2 plus when using non-generic
tuning, such as tuning for Zen4 which ups the cost for XMM loads.

PR tree-optimization/123190
* tree-vectorizer.h (vect_load_store_data): Add n_loads member.
* tree-vect-stmts.cc (get_load_store_type): Record the
number of required loads for permuted loads.
(vectorizable_load): Make use of this when costing loads
for VMAT_CONTIGUOUS[_REVERSE].

* gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c: Do not
require -mtune=generic.
* gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c: Add
variant with -O2 instead of -O3, inner loop not unrolled.

gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c [new file with mode: 0644]
gcc/tree-vect-stmts.cc
gcc/tree-vectorizer.h

index 4265ac80a43df9188ef0ec4e4dfc1a1eb782ebac..098468627f05696516016a007790de3a14c95b3b 100644 (file)
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-O3 -mavx2 -mno-avx512f -mtune=generic" } */
+/* { dg-additional-options "-O3 -mavx2 -mno-avx512f" } */
 
 typedef struct {
    double real;
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c
new file mode 100644 (file)
index 0000000..abc63b2
--- /dev/null
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -mavx2 -mno-avx512f" } */
+
+#include "costmodel-pr123190-1.c"
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 32" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 16 byte vectors and unroll factor 1" "vect" } } */
index a563238c4be036da2d5dd9f6b582216bb98e9679..83983742467c2bbbff9095e4c46859e5ac9ba14d 100644 (file)
@@ -2087,6 +2087,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
   tree *ls_type = &ls->ls_type;
   bool *slp_perm = &ls->slp_perm;
   unsigned *n_perms = &ls->n_perms;
+  unsigned *n_loads = &ls->n_loads;
   tree *supported_offset_vectype = &ls->supported_offset_vectype;
   int *supported_scale = &ls->supported_scale;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
@@ -2103,6 +2104,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
   *ls_type = NULL_TREE;
   *slp_perm = false;
   *n_perms = -1U;
+  *n_loads = -1U;
   ls->subchain_p = false;
 
   bool perm_ok = true;
@@ -2110,7 +2112,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
 
   if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
     perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
-                                           vf, true, n_perms);
+                                           vf, true, n_perms, n_loads);
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
@@ -11880,18 +11882,20 @@ vectorizable_load (vec_info *vinfo,
         in PR101120 and friends.  */
       if (costing_p)
        {
-         gcc_assert (ls.n_perms != -1U);
+         gcc_assert (ls.n_perms != -1U && ls.n_loads != -1U);
          if (ls.n_perms != 0)
            inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
                                            slp_node, 0, vect_body);
+         if (n_adjacent_loads > 0)
+           n_adjacent_loads = ls.n_loads;
        }
       else
        {
-         unsigned n_perms2;
+         unsigned n_perms2, n_loads2;
          bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
                                                  gsi, vf, false, &n_perms2,
-                                                 nullptr, true);
-         gcc_assert (ok && ls.n_perms == n_perms2);
+                                                 &n_loads2, true);
+         gcc_assert (ok && ls.n_perms == n_perms2 && ls.n_loads == n_loads2);
        }
     }
 
index 7a38d4969cf2a62b8dfb1cf45c84f79bfb42bed1..2cbf752e4e769dbecb286c2047923171cc244dac 100644 (file)
@@ -307,6 +307,7 @@ struct vect_load_store_data : vect_data {
   /* True if the load requires a load permutation.  */
   bool slp_perm;    // SLP_TREE_LOAD_PERMUTATION
   unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
+  unsigned n_loads; // SLP_TREE_LOAD_PERMUTATION
   /* Whether the load permutation is consecutive and simple.  */
   bool subchain_p; // VMAT_STRIDED_SLP and VMAT_GATHER_SCATTER
 };