From: Richard Biener Date: Mon, 20 Oct 2025 12:44:27 +0000 (+0200) Subject: tree-optimization/120687 - reduction chain with UB on signed overflow X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=40d6dbf0f8dd91db5e4c13187fc341cdef40b054;p=thirdparty%2Fgcc.git tree-optimization/120687 - reduction chain with UB on signed overflow The following adds the ability to discover a reduction chain on a series of statements that invoke undefined behavior on integer overflow. This inhibits the reassoc pass from associating stmts in the way naturally leading to a reduction chain. The common mistake on the source side is to rely on the += operator to sum multiple inputs. After the refactoring of how we handle reduction chains we can easily use vect_slp_linearize_chain to do this our selves and rely on the vectorizer punning operations to unsigned given reduction vectorization always associates. PR tree-optimization/120687 * tree-vect-slp.cc (vect_analyze_slp_reduc_chain): When there's no natural reduction chain see if vect_slp_linearize_chain can recover one and built the SLP instance manually in that case. (vect_schedule_slp): Deal with NULL lanes when looking for stores to remove. * tree-vect-loop.cc (vect_transform_cycle_phi): Dump when we are successfully transforming a reduction chain. * gcc.dg/vect/vect-reduc-chain-4.c: New testcase. --- diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-4.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-4.c new file mode 100644 index 00000000000..d79676d8d82 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-chain-4.c @@ -0,0 +1,31 @@ +#include "tree-vect.h" + +int q[32]; + +int __attribute__((noipa)) +foo () +{ + int res = 0; + for (int i = 0; i < 8; ++i) + res += q[4*i] + q[4*i+1] + q[4*i+2] + q[4*i+3]; + return res; +} + +int main() +{ + check_vect (); + + int sum = 0; +#pragma GCC novector + for (int i = 0; i < 32; ++i) + { + q[i] = i; + sum += i; + } + + if (foo () != sum) + abort (); +} + +/* { dg-final { scan-tree-dump "vectorizing a reduction chain" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: loop vectorized" "vect" } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 455205069f6..be684a529db 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -8220,6 +8220,10 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo, /* Leave the scalar phi in place. */ return true; + if (reduc_info && reduc_info->is_reduc_chain && dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "vectorizing a reduction chain\n"); + vec_num = vect_get_num_copies (loop_vinfo, slp_node); /* Check whether we should use a single PHI node and accumulate diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 13a2995592b..53fe643be2c 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -4230,9 +4230,142 @@ vect_analyze_slp_reduc_chain (loop_vec_info vinfo, first = false; } while (!is_a (STMT_VINFO_STMT (next_stmt))); - if (fail || scalar_stmts.length () <= 1) + if (fail) return false; + if (scalar_stmts.length () == 1 + && code.is_tree_code () + && associative_tree_code ((tree_code)code)) + { + auto_vec chain; + auto_vec > worklist; + gimple *op_stmt = NULL, *other_op_stmt = NULL; + vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code, + scalar_stmts[0]->stmt, op_stmt, other_op_stmt, + NULL); + if (chain.length () < 3) + { + scalar_stmts.release (); + return false; + } + + scalar_stmts.truncate (0); + stmt_vec_info tail = NULL; + for (auto el : chain) + { + if (el.dt == vect_external_def + || el.dt == vect_constant_def + || el.code != (tree_code) code) + { + scalar_stmts.release (); + return false; + } + stmt_vec_info stmt = vinfo->lookup_def (el.op); + if (STMT_VINFO_REDUC_IDX (stmt) != -1 + || STMT_VINFO_REDUC_DEF (stmt)) + { + gcc_assert (tail == NULL); + tail = stmt; + continue; + } + scalar_stmts.safe_push (stmt); + } + gcc_assert (tail); + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Starting SLP discovery of reduction chain for\n"); + for (unsigned i = 0; i < scalar_stmts.length (); ++i) + dump_printf_loc (MSG_NOTE, vect_location, + " %G", scalar_stmts[i]->stmt); + } + + unsigned int group_size = scalar_stmts.length (); + bool *matches = XALLOCAVEC (bool, group_size); + poly_uint64 max_nunits = 1; + unsigned tree_size = 0; + slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, + &max_nunits, matches, limit, + &tree_size, bst_map); + if (!node) + { + scalar_stmts.release (); + return false; + } + + unsigned cycle_id = vinfo->reduc_infos.length (); + vect_reduc_info reduc_info = new vect_reduc_info_s (); + vinfo->reduc_infos.safe_push (reduc_info); + VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt); + VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt); + VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt); + VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST; + reduc_info->is_reduc_chain = true; + + /* Build the node for the PHI and possibly the conversion(s?). */ + slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK); + SLP_TREE_REPRESENTATIVE (phis) = next_stmt; + phis->cycle_info.id = cycle_id; + SLP_TREE_LANES (phis) = group_size; + SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node); + /* ??? vect_cse_slp_nodes cannot cope with cycles without any + SLP_TREE_SCALAR_STMTS. */ + SLP_TREE_SCALAR_STMTS (phis).create (group_size); + for (unsigned i = 0; i < group_size; ++i) + SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt); + + slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK); + SLP_TREE_REPRESENTATIVE (reduc) = scalar_stmt; + SLP_TREE_CHILDREN (reduc).quick_push (phis); + SLP_TREE_CHILDREN (reduc).quick_push (node); + reduc->cycle_info.id = cycle_id; + SLP_TREE_REDUC_IDX (reduc) = 0; + SLP_TREE_LANES (reduc) = group_size; + SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node); + /* ??? For the reduction epilogue we need a live lane. */ + SLP_TREE_SCALAR_STMTS (reduc).create (group_size); + SLP_TREE_SCALAR_STMTS (reduc).quick_push (scalar_stmt); + for (unsigned i = 1; i < group_size; ++i) + SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL); + + edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo)); + SLP_TREE_CHILDREN (phis).quick_push (NULL); + SLP_TREE_CHILDREN (phis).quick_push (NULL); + SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc; + SLP_TREE_REF_COUNT (reduc)++; + + /* Create a new SLP instance. */ + slp_instance new_instance = XNEW (class _slp_instance); + SLP_INSTANCE_TREE (new_instance) = reduc; + SLP_INSTANCE_LOADS (new_instance) = vNULL; + SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL; + SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL; + SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain; + new_instance->reduc_phis = NULL; + new_instance->cost_vec = vNULL; + new_instance->subgraph_entries = vNULL; + + vinfo->slp_instances.safe_push (new_instance); + + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Final SLP tree for instance %p:\n", + (void *) new_instance); + vect_print_slp_graph (MSG_NOTE, vect_location, + SLP_INSTANCE_TREE (new_instance)); + } + + return true; + } + + if (scalar_stmts.length () <= 1) + { + scalar_stmts.release (); + return false; + } + scalar_stmts.reverse (); stmt_vec_info reduc_phi_info = next_stmt; @@ -12046,7 +12179,8 @@ vect_schedule_slp (vec_info *vinfo, const vec &slp_instances) /* Remove vectorized stores original scalar stmts. */ for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++) { - if (!STMT_VINFO_DATA_REF (store_info) + if (!store_info + || !STMT_VINFO_DATA_REF (store_info) || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info))) break;