From: Lili Cui Date: Tue, 12 May 2026 17:00:00 +0000 (-0700) Subject: [PATCH 1/2] tree-optimization/vect: Allow commutative operand swap for IFN in SLP... X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8b97248edd8f2716d451bfeb369540c0d5338a5d;p=thirdparty%2Fgcc.git [PATCH 1/2] tree-optimization/vect: Allow commutative operand swap for IFN in SLP reduction In vect_build_slp_tree_1, when checking whether reduction operands at different positions can be swapped, only tree_code operations (e.g. PLUS_EXPR) were recognized as commutative. Internal functions produced by if-conversion (e.g. .COND_ADD, .COND_MUL) were not handled, causing "different reduc_idx" failures when the reduction operand appeared at different commutative positions across SLP lanes. This patch extends the commutative swap recognition to internal functions using the unified first_commutative_argument(code_helper, tree) interface to identify the swappable operand pair for both tree codes and internal functions. This improves 731.astcenc_r (-Ofast) by 7.1% on EMR and 2.48% on Znver5. gcc/ChangeLog: * tree-vect-slp.cc (vect_build_slp_tree_1): Use unified first_commutative_argument interface to allow commutative operand swap for both tree codes and internal functions (e.g. .COND_ADD) in SLP reduction matching. gcc/testsuite/ChangeLog: * gcc.dg/vect/slp-reduc-15.c: New test. Co-authored-by: Hongtao Liu --- diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c new file mode 100644 index 00000000000..4745f85511b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c @@ -0,0 +1,81 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_float } */ +/* { dg-additional-options "-fgimple -march=x86-64-v3" { target x86_64-*-* } } */ + +/* Test that SLP reduction vectorization handles commutative operand swap + for .COND_ADD in multi-lane SLP where the reduction operand appears + at arg1 in some lanes and arg2 in others. If-conversion can generate + COND_ADD with mixed reduction operand positions. + + Without the IFN commutative swap fix, the multi-lane SLP group + would be rejected with "different reduc_idx N instead of M". */ + +float cond_arr[1024]; +float data_a[1024]; +float data_b[1024]; +float data_c[1024]; +float data_d[1024]; + +void __GIMPLE (ssa, guessed_local(16535624), startwith("loop")) +foo (float * restrict p0, float * restrict p1, + float * restrict p2, float * restrict p3) +{ + float s0; + float s1; + float s2; + float s3; + int i; + float _a; + float _b; + float _c; + float _d; + float _5; + _Bool _cond; + unsigned int _30; + unsigned int _31; + + __BB(2, guessed_local(16535624)): + goto __BB3(precise(134217728)); + + __BB(3, loop_header(1), guessed_local(1057206200)): + s0_22 = __PHI (__BB5: s0_10, __BB2: 0.0f); + s1_24 = __PHI (__BB5: s1_11, __BB2: 0.0f); + s2_26 = __PHI (__BB5: s2_12, __BB2: 0.0f); + s3_28 = __PHI (__BB5: s3_13, __BB2: 0.0f); + i_27 = __PHI (__BB5: i_19, __BB2: 0); + _31 = __PHI (__BB5: _30, __BB2: 1024u); + _5 = cond_arr[i_27]; + _cond = _5 > 0.0f; + _a = data_a[i_27]; + _b = data_b[i_27]; + _c = data_c[i_27]; + _d = data_d[i_27]; + /* s0, s1: reduction operand at position 1 (arg1) → reduc_idx=1 */ + s0_10 = .COND_ADD (_cond, s0_22, _a, s0_22); + s1_11 = .COND_ADD (_cond, s1_24, _b, s1_24); + /* s2, s3: reduction operand at position 2 (arg2) → reduc_idx=2 */ + s2_12 = .COND_ADD (_cond, _c, s2_26, s2_26); + s3_13 = .COND_ADD (_cond, _d, s3_28, s3_28); + i_19 = i_27 + 1; + _30 = _31 - 1u; + if (_30 != 0u) + goto __BB5(guessed(132118446)); + else + goto __BB4(guessed(2099282)); + + __BB(5, guessed_local(1040670576)): + goto __BB3(precise(134217728)); + + __BB(4, guessed_local(16535624)): + *p0_40(D) = s0_10; + *p1_41(D) = s1_11; + *p2_42(D) = s2_12; + *p3_43(D) = s3_13; + return; +} + +/* With the IFN commutative swap fix, these 4 reductions should be + vectorized using SLP despite different reduc_idx values (1 vs 2). */ +/* { dg-final { scan-tree-dump "swapped operands to match def types in" "vect" { target x86_64-*-* } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target x86_64-*-* } } } */ + diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index 7be8ca03763..cbcb08f6694 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -1391,6 +1391,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, } else { + int comm_arg; if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info) /* For SLP reduction groups the index isn't necessarily uniform but only that of the first stmt matters. */ @@ -1399,9 +1400,10 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) && !(first_reduc_idx != -1 && STMT_VINFO_REDUC_IDX (stmt_info) != -1 - && rhs_code.is_tree_code () - && commutative_tree_code (tree_code (rhs_code)) - && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))) + && (comm_arg = first_commutative_argument + (rhs_code, TREE_TYPE (lhs))) >= 0 + && (first_reduc_idx + == 2 * comm_arg + 1 - STMT_VINFO_REDUC_IDX (stmt_info)))) { if (dump_enabled_p ()) {