From: Lili Cui <lili.cui@intel.com>
Date: Tue, 12 May 2026 17:00:00 +0000 (-0700)
Subject: [PATCH 1/2] tree-optimization/vect: Allow commutative operand swap for IFN in SLP... 
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8b97248edd8f2716d451bfeb369540c0d5338a5d;p=thirdparty%2Fgcc.git

[PATCH 1/2] tree-optimization/vect: Allow commutative operand swap for IFN in SLP reduction

In vect_build_slp_tree_1, when checking whether reduction operands at
different positions can be swapped, only tree_code operations (e.g.
PLUS_EXPR) were recognized as commutative.  Internal functions produced
by if-conversion (e.g. .COND_ADD, .COND_MUL) were not handled, causing
"different reduc_idx" failures when the reduction operand appeared at
different commutative positions across SLP lanes.

This patch extends the commutative swap recognition to internal
functions using the unified first_commutative_argument(code_helper, tree)
interface to identify the swappable operand pair for both tree codes and
internal functions.

This improves 731.astcenc_r (-Ofast) by 7.1% on EMR and 2.48% on Znver5.

gcc/ChangeLog:

	* tree-vect-slp.cc (vect_build_slp_tree_1): Use unified
	first_commutative_argument interface to allow commutative
	operand swap for both tree codes and internal functions
	(e.g. .COND_ADD) in SLP reduction matching.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/slp-reduc-15.c: New test.

Co-authored-by: Hongtao Liu <hongtao.liu@intel.com>
---

diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c
new file mode 100644
index 00000000000..4745f85511b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-15.c
@@ -0,0 +1,81 @@
+/* { dg-do compile { target x86_64-*-* } } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-fgimple -march=x86-64-v3" } */
+
+/* Test that SLP reduction vectorization handles commutative operand swap
+   for .COND_ADD in multi-lane SLP where the reduction operand appears
+   at arg1 in some lanes and arg2 in others, as if-conversion can produce.
+   Without the IFN commutative swap fix, the multi-lane SLP group
+   would be rejected with "different reduc_idx N instead of M".
+   The test is restricted to x86_64: -fgimple is required to parse the
+   __GIMPLE body and the dump scans assume -march=x86-64-v3.  */
+
+float cond_arr[1024];
+float data_a[1024];
+float data_b[1024];
+float data_c[1024];
+float data_d[1024];
+
+void __GIMPLE (ssa, guessed_local(16535624), startwith("loop"))
+foo (float * restrict p0, float * restrict p1,
+     float * restrict p2, float * restrict p3)
+{
+  float s0;
+  float s1;
+  float s2;
+  float s3;
+  int i;
+  float _a;
+  float _b;
+  float _c;
+  float _d;
+  float _5;
+  _Bool _cond;
+  unsigned int _30;
+  unsigned int _31;
+
+  __BB(2, guessed_local(16535624)):
+  goto __BB3(precise(134217728));
+
+  __BB(3, loop_header(1), guessed_local(1057206200)):
+  s0_22 = __PHI (__BB5: s0_10, __BB2: 0.0f);
+  s1_24 = __PHI (__BB5: s1_11, __BB2: 0.0f);
+  s2_26 = __PHI (__BB5: s2_12, __BB2: 0.0f);
+  s3_28 = __PHI (__BB5: s3_13, __BB2: 0.0f);
+  i_27 = __PHI (__BB5: i_19, __BB2: 0);
+  _31 = __PHI (__BB5: _30, __BB2: 1024u);  /* remaining iterations */
+  _5 = cond_arr[i_27];
+  _cond = _5 > 0.0f;  /* predicate shared by all four .COND_ADDs */
+  _a = data_a[i_27];
+  _b = data_b[i_27];
+  _c = data_c[i_27];
+  _d = data_d[i_27];
+  /* s0, s1: reduction operand at position 1 (arg1), i.e. reduc_idx=1 */
+  s0_10 = .COND_ADD (_cond, s0_22, _a, s0_22);
+  s1_11 = .COND_ADD (_cond, s1_24, _b, s1_24);
+  /* s2, s3: reduction operand at position 2 (arg2), i.e. reduc_idx=2 */
+  s2_12 = .COND_ADD (_cond, _c, s2_26, s2_26);
+  s3_13 = .COND_ADD (_cond, _d, s3_28, s3_28);
+  i_19 = i_27 + 1;
+  _30 = _31 - 1u;
+  if (_30 != 0u)
+    goto __BB5(guessed(132118446));
+  else
+    goto __BB4(guessed(2099282));
+
+  __BB(5, guessed_local(1040670576)):
+  goto __BB3(precise(134217728));
+
+  __BB(4, guessed_local(16535624)):
+  *p0_40(D) = s0_10;
+  *p1_41(D) = s1_11;
+  *p2_42(D) = s2_12;
+  *p3_43(D) = s3_13;
+  return;
+}
+
+/* With the IFN commutative swap fix, these 4 reductions should be
+   vectorized using SLP despite different reduc_idx values (1 vs 2).  */
+/* { dg-final { scan-tree-dump "swapped operands to match def types in" "vect" { target x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { target x86_64-*-* } } } */
+
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7be8ca03763..cbcb08f6694 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1391,6 +1391,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 	}
       else
 	{
+	  int comm_arg;
 	  if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
 	      /* For SLP reduction groups the index isn't necessarily
 		 uniform but only that of the first stmt matters.  */
@@ -1399,9 +1400,10 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
 		   && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 	      && !(first_reduc_idx != -1
 		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
-		   && rhs_code.is_tree_code ()
-		   && commutative_tree_code (tree_code (rhs_code))
-		   && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
+		   && (comm_arg = first_commutative_argument
+				    (rhs_code, TREE_TYPE (lhs))) >= 0
+		   && (first_reduc_idx
+		       == 2 * comm_arg + 1 - STMT_VINFO_REDUC_IDX (stmt_info))))
 	    {
 	      if (dump_enabled_p ())
 		{