where we know it's not loaded from memory. */
unsigned m_num_gpr_needed[3];
unsigned m_num_sse_needed[3];
+ /* Number of 256-bit vector permutations. */
+ unsigned m_num_avx256_vec_perm[3];
};
ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
: vector_costs (vinfo, costing_for_scalar),
m_num_gpr_needed (),
- m_num_sse_needed ()
+ m_num_sse_needed (),
+ m_num_avx256_vec_perm () /* Value-initialized, so all counts start at 0.  */
{
}
if (stmt_cost == -1)
stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+ if (kind == vec_perm && vectype
+ && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
+ m_num_avx256_vec_perm[where]++;
+
/* Penalize DFmode vector operations for Bonnell. */
if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
&& vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
ix86_vect_estimate_reg_pressure ();
+ for (int i = 0; i != 3; i++)
+ if (m_num_avx256_vec_perm[i]
+ && TARGET_AVX256_AVOID_VEC_PERM)
+ m_costs[i] = INT_MAX;
+
vector_costs::finish_cost (scalar_costs);
}
ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
#define TARGET_AVX256_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#define TARGET_AVX256_AVOID_VEC_PERM \
+ ix86_tune_features[X86_TUNE_AVX256_AVOID_VEC_PERM]
#define TARGET_AVX512_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX512_SPLIT_REGS]
#define TARGET_GENERAL_REGS_SSE_SPILL \
/* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops. */
DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2
- | m_ZNVER1)
+ | m_ZNVER1 | m_CORE_ATOM)
/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
the auto-vectorizer. */
instructions in the auto-vectorizer. */
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
+/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane
+ vector permutation instructions in the auto-vectorizer. */
+DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
+ "avx256_avoid_vec_perm", m_CORE_ATOM)
+
/* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops. */
DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=sierraforest -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
+/* Each iteration of foo's loop fills eight consecutive ints of b[] from
+   the matching group of eight in a[], using source offsets
+   {0, 0, 3, 3, 4, 6, 4, 6}, so elements are duplicated and reordered
+   rather than copied straight across.  The dg-final directive above
+   expects the vect dump to report 16-byte vectors for this loop when
+   compiled with -O2 -march=sierraforest.  */
+int a[256], b[256];
+
+void __attribute__((noinline))
+foo (void)
+{
+ int i;
+ for (i = 0; i < 32; ++i)
+ {
+ b[i*8+0] = a[i*8+0];
+ b[i*8+1] = a[i*8+0];
+ b[i*8+2] = a[i*8+3];
+ b[i*8+3] = a[i*8+3];
+ b[i*8+4] = a[i*8+4];
+ b[i*8+5] = a[i*8+6];
+ b[i*8+6] = a[i*8+4];
+ b[i*8+7] = a[i*8+6];
+ }
+}