From: liuhongt Date: Wed, 20 Aug 2025 05:46:40 +0000 (-0700) Subject: Exclude fake cross-lane permutation from avx256_avoid_vec_perm. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f4154da55586ab591c1b01936ebd6ab370bc2e80;p=thirdparty%2Fgcc.git Exclude fake cross-lane permutation from avx256_avoid_vec_perm. SLP may treat a broadcast as a kind of vec_perm; this patch checks the permutation indices to exclude such false positives. gcc/ChangeLog: * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Check permutation index for vec_perm, don't count it if we know it's not a cross-lane permutation. gcc/testsuite/ChangeLog: * gcc.target/i386/avx256_avoid_vec_perm.c: Adjust testcase. * gcc.target/i386/avx256_avoid_vec_perm-2.c: New test. * gcc.target/i386/avx256_avoid_vec_perm-5.c: New test. --- diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 471be3e8615..d71975a42be 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -26392,8 +26392,63 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_cost = ix86_default_vector_cost (kind, mode); if (kind == vec_perm && vectype - && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32) - m_num_avx256_vec_perm[where]++; + && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32 + /* BIT_FIELD_REF 0 times vec_perm costs 0 in body. */ + && count != 0) + { + bool real_perm = true; + unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype); + + if (node + && SLP_TREE_LOAD_PERMUTATION (node).exists () + /* Loop vectorization will have 4 times vec_perm + with index as {0, 0, 0, 0}. + But it actually generates + vec_perm_expr + vec_perm_expr + vec_perm_expr + Need to be handled separately. 
*/ + && is_a (m_vinfo)) + { + unsigned half = nunits / 2; + unsigned i = 0; + bool allsame = true; + unsigned first = SLP_TREE_LOAD_PERMUTATION (node)[0]; + bool cross_lane_p = false; + for (i = 0 ; i != SLP_TREE_LANES (node); i++) + { + unsigned tmp = SLP_TREE_LOAD_PERMUTATION (node)[i]; + /* allsame is just a broadcast. */ + if (tmp != first) + allsame = false; + + /* 4 times vec_perm with number of lanes multiple of nunits. */ + tmp = tmp & (nunits - 1); + unsigned index = i & (nunits - 1); + if ((index < half && tmp >= half) + || (index >= half && tmp < half)) + cross_lane_p = true; + + if (!allsame && cross_lane_p) + break; + } + + if (i == SLP_TREE_LANES (node)) + real_perm = false; + } + + if (real_perm) + { + m_num_avx256_vec_perm[where] += count; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Detected avx256 cross-lane permutation: "); + if (stmt_info) + print_gimple_expr (dump_file, stmt_info->stmt, 0, TDF_SLIM); + fprintf (dump_file, " \n"); + } + } + } /* Penalize DFmode vector operations for Bonnell. 
*/ if (TARGET_CPU_P (BONNELL) && kind == vector_stmt diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c new file mode 100644 index 00000000000..8d4e641444d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-march=sierraforest -O2 -fdump-tree-slp-details" } */ +/* { dg-final { scan-tree-dump-times {(?n)Detected avx256 cross-lane permutation} 1 "slp2" } } */ + +void +foo (double* a, double* __restrict b, int c, int n) +{ + a[0] = b[100] * b[2]; + a[1] = b[100] * b[3]; + a[2] = b[100] * b[0]; + a[3] = b[100] * b[1]; +} + +void +foo1 (double* a, double* __restrict b, int c, int n) +{ + a[0] = b[100] * b[0]; + a[1] = b[100] * b[1]; + a[2] = b[100] * b[3]; + a[3] = b[100] * b[2]; +} diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c new file mode 100644 index 00000000000..c11bea8c7b3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-march=sierraforest -Ofast" } */ +/* { dg-final { scan-assembler-not {(?n)vpermpd.*%ymm} } } */ + +typedef struct { + unsigned short m1, m2, m3, m4; +} the_struct_t; +typedef struct { + double m1, m2, m3, m4, m5; +} the_struct2_t; + +double bar1 (the_struct2_t*); + +double foo (double* k, unsigned int n, the_struct_t* the_struct) { + unsigned int u; + the_struct2_t result; + for (u=0; u < n; u++, k--) { + result.m1 += (*k)*the_struct[u].m1; + result.m2 += (*k)*the_struct[u].m2; + result.m3 += (*k)*the_struct[u].m3; + result.m4 += (*k)*the_struct[u].m4; + } + return bar1 (&result); +} diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c index d4f00b3fb52..e0399041ad9 100644 --- a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c +++ 
b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c @@ -13,7 +13,7 @@ foo (void) b[i*8+0] = a[i*8+0]; b[i*8+1] = a[i*8+0]; b[i*8+2] = a[i*8+3]; - b[i*8+3] = a[i*8+3]; + b[i*8+3] = a[i*8+5]; b[i*8+4] = a[i*8+4]; b[i*8+5] = a[i*8+6]; b[i*8+6] = a[i*8+4];