From 9f8409f2e2c28f64bf6a584bc81afcae0f084785 Mon Sep 17 00:00:00 2001 From: Zhongyao Chen Date: Wed, 3 Jun 2026 20:44:59 +0800 Subject: [PATCH] vect: Avoid external fallback while operand swap retry is possible When building operand zero of a commutative BB SLP node, a failed child discovery may build operands from scalars right away. That hides the failure from the existing retry path, even when retrying with swapped operands could still fix the current node. Track the distance from operand-zero subtree discovery to the nearest upthread swap opportunity. Skip scalar fallback only when that distance is exactly one, so discovery reaches the retry path first. My local tests show no regression for vect.exp, only a few for rvv.exp, but those are reasonable, just need update test expectations. PR tree-optimization/125567 gcc/ * tree-vect-slp.cc (least_upthread_swappable_op_distance): New. (vect_build_slp_tree_2): Compute swap checks before building operand zero. Skip external fallback while swap retry is possible. Reuse the swap checks in the retry path. gcc/testsuite/ * gcc.dg/vect/pr125567.c: New test. Signed-off-by: Zhongyao Chen --- gcc/testsuite/gcc.dg/vect/pr125567.c | 40 +++++++++ gcc/tree-vect-slp.cc | 116 +++++++++++++++++---------- 2 files changed, 115 insertions(+), 41 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/pr125567.c diff --git a/gcc/testsuite/gcc.dg/vect/pr125567.c b/gcc/testsuite/gcc.dg/vect/pr125567.c new file mode 100644 index 00000000000..6bc2268a1a9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr125567.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3 -ftree-vectorize -fdump-tree-slp1-details" } */ + +typedef short i16; + +void +foo (i16 *__restrict out, i16 *__restrict in, i16 x, unsigned n) +{ + unsigned k = 0; + i16 tmp = x + 3; + + while (k < n) + { + tmp = tmp ^ 0x3f; + + out[k + 0] = in[k + 0] + tmp; + out[k + 1] = in[k + 1] + tmp; + out[k + 2] = in[k + 2] + tmp; + out[k + 3] = in[k + 3] + tmp; + out[k + 4] = in[k + 4] + tmp; + out[k + 5] = in[k + 5] + tmp; + out[k + 6] = in[k + 6] + tmp; + out[k + 7] = in[k + 7] + tmp; + out[k + 8] = in[k + 8] + tmp; + out[k + 9] = in[k + 9] + tmp; + out[k + 10] = in[k + 10] + tmp; + out[k + 11] = in[k + 11] + tmp; + out[k + 12] = in[k + 12] + tmp; + out[k + 13] = in[k + 13] + tmp; + out[k + 14] = in[k + 14] + tmp; + out[k + 15] = in[k + 15] + tmp; + + k += 16; + } +} + +/* { dg-final { scan-tree-dump "Re-trying with swapped operands of stmts" "slp1" } } */ +/* { dg-final { scan-tree-dump "Using a splat of the uniform operand" "slp1" } } */ +/* { dg-final { scan-tree-dump "optimized: basic block part vectorized" "slp1" } } */ diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 075e93f04a9..90c183c8811 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1873,6 +1873,12 @@ vect_slp_linearize_chain (vec_info *vinfo, } } +/* Distance from the node currently being discovered to the closest upthread + commutative operation whose operand-zero discovery may still be fixed by + retrying with swapped operands, or -1U if there is none. */ + +static unsigned least_upthread_swappable_op_distance = -1U; + static slp_tree vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, vec stmts, unsigned int group_size, @@ -2815,6 +2821,10 @@ out: { slp_tree child = nullptr; unsigned int j; + unsigned old_swap_distance; + bool can_swap; + bool can_swap_nonmatching; + bool *stmt_can_swap; /* We're skipping certain operands from processing, for example outer loop reduction initial defs. */ @@ -2965,10 +2975,57 @@ out: def_stmts2.release (); } - if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, - group_size, &this_max_nunits, - matches, limit, - &this_tree_size, bst_map)) != NULL) + can_swap = (i == 0 + && (nops == 2 || nops == 3) + && oprnds_info.length () > 1 + && oprnds_info[1]->first_dt == vect_internal_def + && (is_gimple_assign (stmt_info->stmt) + || is_gimple_call (stmt_info->stmt)) + /* Swapping operands for reductions breaks assumptions + later on. */ + && STMT_VINFO_REDUC_IDX (stmt_info) == -1); + can_swap_nonmatching = can_swap; + stmt_can_swap = NULL; + if (can_swap) + { + stmt_can_swap = XALLOCAVEC (bool, group_size); + for (j = 0; j < group_size; ++j) + { + stmt_can_swap[j] = false; + if (!stmts[j]) + /* NULL lanes are gaps and have no stmt to swap. */ + stmt_can_swap[j] = true; + else if (gassign *stmt = dyn_cast (stmts[j]->stmt)) + { + tree_code code = gimple_assign_rhs_code (stmt); + stmt_can_swap[j] = (commutative_tree_code (code) + || commutative_ternary_tree_code (code)); + } + else if (gcall *call = dyn_cast (stmts[j]->stmt)) + { + internal_fn fn = (gimple_call_internal_p (call) + ? gimple_call_internal_fn (call) : IFN_LAST); + stmt_can_swap[j] = ((commutative_binary_fn_p (fn) + || commutative_ternary_fn_p (fn)) + && first_commutative_argument (fn) == 0); + } + + if (j != 0 && !stmt_can_swap[j]) + can_swap_nonmatching = false; + } + } + + old_swap_distance = least_upthread_swappable_op_distance; + if (can_swap_nonmatching) + least_upthread_swappable_op_distance = 1; + else if (least_upthread_swappable_op_distance != -1U) + least_upthread_swappable_op_distance++; + child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, + group_size, &this_max_nunits, + matches, limit, + &this_tree_size, bst_map); + least_upthread_swappable_op_distance = old_swap_distance; + if (child != NULL) { oprnd_info->def_stmts = vNULL; children.safe_push (child); @@ -2978,17 +3035,9 @@ out: /* If the SLP build for operand zero failed and operand zero and one can be commuted try that for the scalar stmts that failed the match. */ - if (i == 0 - /* A first scalar stmt mismatch signals a fatal mismatch. */ - && matches[0] - /* ??? For COND_EXPRs we can swap the comparison operands - as well as the arms under some constraints. */ - && (nops == 2 || nops == 3) - && oprnds_info[1]->first_dt == vect_internal_def - && (is_gimple_assign (stmt_info->stmt) - || is_gimple_call (stmt_info->stmt)) - /* Swapping operands for reductions breaks assumptions later on. */ - && STMT_VINFO_REDUC_IDX (stmt_info) == -1) + if (/* A first scalar stmt mismatch signals a fatal mismatch. */ + matches[0] + && can_swap) { /* See whether we can swap the matching or the non-matching stmt operands. */ @@ -2999,34 +3048,13 @@ out: { if (matches[j] != !swap_not_matching) continue; - stmt_vec_info stmt_info = stmts[j]; /* Verify if we can swap operands of this stmt. */ - if (gassign *stmt = dyn_cast (stmt_info->stmt)) - { - tree_code code = gimple_assign_rhs_code (stmt); - if (! commutative_tree_code (code) - && ! commutative_ternary_tree_code (code)) - { - if (!swap_not_matching) - goto fail; - swap_not_matching = false; - break; - } - } - else if (gcall *call = dyn_cast (stmt_info->stmt)) + if (!stmt_can_swap[j]) { - internal_fn fn = (gimple_call_internal_p (call) - ? gimple_call_internal_fn (call) - : IFN_LAST); - if ((! commutative_binary_fn_p (fn) - && ! commutative_ternary_fn_p (fn)) - || first_commutative_argument (fn) != 0) - { - if (!swap_not_matching) - goto fail; - swap_not_matching = false; - break; - } + if (!swap_not_matching) + goto fail; + swap_not_matching = false; + break; } } } @@ -3078,6 +3106,12 @@ fail: /* ??? Rejecting patterns this way doesn't work. We'd have to do extra work to cancel the pattern so the uses see the scalar version. */ + /* Skip building vector operands from scalars while operand + discovery may still be fixed by retrying with swapped operands. */ + && (least_upthread_swappable_op_distance != 1 + /* A first scalar stmt mismatch signals a fatal mismatch + that the parent commutative retry cannot recover. */ + || !matches[0]) && !is_pattern_stmt_p (stmt_info) && !oprnd_info->any_pattern) { -- 2.47.3