Add a new tune avx256_avoid_vec_perm for SRF.

author liuhongt <hongtao.liu@intel.com>

Wed, 25 Sep 2024 05:11:11 +0000 (13:11 +0800)

committer liuhongt <hongtao.liu@intel.com>

Thu, 10 Oct 2024 02:21:29 +0000 (10:21 +0800)
author liuhongt <hongtao.liu@intel.com>
Wed, 25 Sep 2024 05:11:11 +0000 (13:11 +0800)
committer liuhongt <hongtao.liu@intel.com>
Thu, 10 Oct 2024 02:21:29 +0000 (10:21 +0800)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc

index 90a564b2ffaa1f2a580c68d61b87cc7f9e7809c9..ab0ade3790f2300bb7a66cb7238cc50515c704b1 100644 (file)
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25025,12 +25025,15 @@ private:
       where we know it's not loaded from memory.  */
    unsigned m_num_gpr_needed[3];
    unsigned m_num_sse_needed[3];
+  /* Number of 256-bit vector permutations.  */
+  unsigned m_num_avx256_vec_perm[3];
  };
  
  ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
    : vector_costs (vinfo, costing_for_scalar),
      m_num_gpr_needed (),
-    m_num_sse_needed ()
+    m_num_sse_needed (),
+    m_num_avx256_vec_perm ()
  {
  }
  
@@ -25264,6 +25267,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
    if (stmt_cost == -1)
      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
  
+  if (kind == vec_perm && vectype
+      && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
+    m_num_avx256_vec_perm[where]++;
+
    /* Penalize DFmode vector operations for Bonnell.  */
    if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
        && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
@@ -25333,6 +25340,11 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
  
    ix86_vect_estimate_reg_pressure ();
  
+  for (int i = 0; i != 3; i++)
+    if (m_num_avx256_vec_perm[i]
+       && TARGET_AVX256_AVOID_VEC_PERM)
+      m_costs[i] = INT_MAX;
+
    vector_costs::finish_cost (scalar_costs);
  }
  
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

index d5d54ee660407083e437271dfcff4f2b7a8bc60d..f5204aa1ed23ec59e4d7fbe7212bb96b645955de 100644 (file)
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -439,6 +439,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
         ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
  #define TARGET_AVX256_SPLIT_REGS \
         ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#define TARGET_AVX256_AVOID_VEC_PERM \
+       ix86_tune_features[X86_TUNE_AVX256_AVOID_VEC_PERM]
  #define TARGET_AVX512_SPLIT_REGS \
         ix86_tune_features[X86_TUNE_AVX512_SPLIT_REGS]
  #define TARGET_GENERAL_REGS_SSE_SPILL \
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def

index b815b6dc255bef509f5486fcd6b66e7b6df81c2c..6ebb2fd3414e8734ae7601a0d46eecb60e5c0a78 100644 (file)
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -558,7 +558,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal"
  
  /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops.  */
  DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2
-         | m_ZNVER1)
+         | m_ZNVER1 | m_CORE_ATOM)
  
  /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
     the auto-vectorizer.  */
@@ -569,6 +569,11 @@ DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
     instructions in the auto-vectorizer.  */
  DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
  
+/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane
+   vector permutation instructions in the auto-vectorizer.  */
+DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
+        "avx256_avoid_vec_perm", m_CORE_ATOM)
+
  /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops.  */
  DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
  
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c

new file mode 100644 (file)

index 0000000..d4f00b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sierraforest -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
+
+int a[256], b[256];
+
+void __attribute__((noinline))
+foo (void)
+{
+  int i;
+  for (i = 0; i < 32; ++i)
+    {
+      b[i*8+0] = a[i*8+0];
+      b[i*8+1] = a[i*8+0];
+      b[i*8+2] = a[i*8+3];
+      b[i*8+3] = a[i*8+3];
+      b[i*8+4] = a[i*8+4];
+      b[i*8+5] = a[i*8+6];
+      b[i*8+6] = a[i*8+4];
+      b[i*8+7] = a[i*8+6];
+    }
+}
author	liuhongt <hongtao.liu@intel.com>
	Wed, 25 Sep 2024 05:11:11 +0000 (13:11 +0800)
committer	liuhongt <hongtao.liu@intel.com>
	Thu, 10 Oct 2024 02:21:29 +0000 (10:21 +0800)
gcc/config/i386/i386.cc		patch \| blob \| blame \| history
gcc/config/i386/i386.h		patch \| blob \| blame \| history
gcc/config/i386/x86-tune.def		patch \| blob \| blame \| history
gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c	[new file with mode: 0644]	patch \| blob