Exclude fake cross-lane permutation from avx256_avoid_vec_perm.

author liuhongt <hongtao.liu@intel.com>

Wed, 20 Aug 2025 05:46:40 +0000 (22:46 -0700)

committer liuhongt <hongtao.liu@intel.com>

Tue, 9 Sep 2025 01:58:51 +0000 (18:58 -0700)
author liuhongt <hongtao.liu@intel.com>
Wed, 20 Aug 2025 05:46:40 +0000 (22:46 -0700)
committer liuhongt <hongtao.liu@intel.com>
Tue, 9 Sep 2025 01:58:51 +0000 (18:58 -0700)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc

index 471be3e8615825b789c9169d9ae4f5afb5fe0a77..d71975a42bea36428ae0d9b6fe7db30271f0b298 100644 (file)
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26392,8 +26392,63 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
      stmt_cost = ix86_default_vector_cost (kind, mode);
  
    if (kind == vec_perm && vectype
-      && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
-    m_num_avx256_vec_perm[where]++;
+      && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32
+      /* E.g. BIT_FIELD_REF <vect_**, 64, 0> is costed as 0 times vec_perm.  */
+      && count != 0)
+    {
+      bool real_perm = true;
+      unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
+
+      if (node
+         && SLP_TREE_LOAD_PERMUTATION (node).exists ()
+         /* Loop vectorization is costed as 4 times vec_perm with a
+            load permutation of {0, 0, 0, 0}, but it actually generates
+            vec_perm_expr <vect, vect, 0, 0, 0, 0>
+            vec_perm_expr <vect, vect, 1, 1, 1, 1>
+            vec_perm_expr <vect, vect, 2, 2, 2, 2>
+            so it needs to be handled separately; only basic-block
+            vectorization is analyzed here.  */
+         && is_a <bb_vec_info> (m_vinfo))
+       {
+         unsigned half = nunits / 2;
+         unsigned i = 0;
+         bool allsame = true;
+         unsigned first = SLP_TREE_LOAD_PERMUTATION (node)[0];
+         bool cross_lane_p = false;
+         for (i = 0 ; i != SLP_TREE_LANES (node); i++)
+           {
+             unsigned tmp = SLP_TREE_LOAD_PERMUTATION (node)[i];
+             /* allsame is just a broadcast.  */
+             if (tmp != first)
+               allsame = false;
+
+             /* Lane count may be a multiple of nunits; wrap into one vector.  */
+             tmp = tmp & (nunits - 1);
+             unsigned index = i & (nunits - 1);
+             if ((index < half && tmp >= half)
+                 || (index >= half && tmp < half))
+               cross_lane_p = true;
+
+             if (!allsame && cross_lane_p)
+               break;
+           }
+
+         if (i == SLP_TREE_LANES (node))
+           real_perm = false;
+       }
+
+      if (real_perm)
+       {
+         m_num_avx256_vec_perm[where] += count;
+         if (dump_file && (dump_flags & TDF_DETAILS))
+           {
+             fprintf (dump_file, "Detected avx256 cross-lane permutation: ");
+             if (stmt_info)
+               print_gimple_expr (dump_file, stmt_info->stmt, 0, TDF_SLIM);
+             fprintf (dump_file, " \n");
+           }
+       }
+    }
  
    /* Penalize DFmode vector operations for Bonnell.  */
    if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c

new file mode 100644 (file)

index 0000000..8d4e641
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sierraforest -O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)Detected avx256 cross-lane permutation} 1 "slp2" } } */
+
+void
+foo (double* a, double* __restrict b, int c, int n)
+{
+  a[0] = b[100] * b[2];
+  a[1] = b[100] * b[3];
+  a[2] = b[100] * b[0];
+  a[3] = b[100] * b[1];
+}
+
+void
+foo1 (double* a, double* __restrict b, int c, int n)
+{
+  a[0] = b[100] * b[0];
+  a[1] = b[100] * b[1];
+  a[2] = b[100] * b[3];
+  a[3] = b[100] * b[2];
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c

new file mode 100644 (file)

index 0000000..c11bea8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sierraforest -Ofast" } */
+/* { dg-final { scan-assembler-not {(?n)vpermpd.*%ymm} } } */
+
+typedef struct {
+  unsigned short m1, m2, m3, m4;
+} the_struct_t;
+typedef struct {
+  double m1, m2, m3, m4, m5;
+} the_struct2_t;
+
+double bar1 (the_struct2_t*);
+
+double foo (double* k, unsigned int n, the_struct_t* the_struct) {
+  unsigned int u;
+  the_struct2_t result;
+  for (u=0; u < n; u++, k--) {
+    result.m1 += (*k)*the_struct[u].m1;
+    result.m2 += (*k)*the_struct[u].m2;
+    result.m3 += (*k)*the_struct[u].m3;
+    result.m4 += (*k)*the_struct[u].m4;
+  }
+  return bar1 (&result);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c

index d4f00b3fb52019c11dd74bbde6519fb680b18024..e0399041ad9db4285ab9ea78400ab0fed660257c 100644 (file)
--- a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
@@ -13,7 +13,7 @@ foo (void)
        b[i*8+0] = a[i*8+0];
        b[i*8+1] = a[i*8+0];
        b[i*8+2] = a[i*8+3];
-      b[i*8+3] = a[i*8+3];
+      b[i*8+3] = a[i*8+5];
        b[i*8+4] = a[i*8+4];
        b[i*8+5] = a[i*8+6];
        b[i*8+6] = a[i*8+4];
author	liuhongt <hongtao.liu@intel.com>
	Wed, 20 Aug 2025 05:46:40 +0000 (22:46 -0700)
committer	liuhongt <hongtao.liu@intel.com>
	Tue, 9 Sep 2025 01:58:51 +0000 (18:58 -0700)
gcc/config/i386/i386.cc		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c		patch \| blob \| blame \| history