if (GET_MODE_BITSIZE (mode) == 128
&& TARGET_SSE_SPLIT_REGS)
- return cost * 2;
- if (GET_MODE_BITSIZE (mode) > 128
+ return cost * GET_MODE_BITSIZE (mode) / 64;
+ else if (GET_MODE_BITSIZE (mode) > 128
&& TARGET_AVX256_SPLIT_REGS)
return cost * GET_MODE_BITSIZE (mode) / 128;
+ else if (GET_MODE_BITSIZE (mode) > 256
+ && TARGET_AVX512_SPLIT_REGS)
+ return cost * GET_MODE_BITSIZE (mode) / 256;
return cost;
}
return 1;
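To make the arithmetic concrete: below is a minimal standalone sketch of the scaling above, with GCC's GET_MODE_BITSIZE and the TARGET_*_SPLIT_REGS tune flags reduced to plain parameters (the names here are illustrative stand-ins, not GCC's internal API):

/* Scale a per-operation cost by the number of native pieces a wide
   vector operation is split into on the current tuning.  */
int
split_vec_cost (int cost, int bitsize,
		int sse_split, int avx256_split, int avx512_split)
{
  if (bitsize == 128 && sse_split)
    return cost * bitsize / 64;	  /* 128-bit op -> two 64-bit halves.  */
  else if (bitsize > 128 && avx256_split)
    return cost * bitsize / 128;  /* 256 -> 2 parts, 512 -> 4 parts.  */
  else if (bitsize > 256 && avx512_split)
    return cost * bitsize / 256;  /* 512-bit op -> two 256-bit halves.  */
  return cost;
}

On znver4 only the AVX512 split flag is set, so a 512-bit operation is costed at twice the base while 256-bit operations keep their base cost; the old hard-coded cost * 2 falls out as the 128-bit SSE case.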
/* Account for targets that split wide vectors into multiple parts.  */
- if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 128)
+ if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
+ div = GET_MODE_BITSIZE (mode) / 256;
+ else if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 128)
div = GET_MODE_BITSIZE (mode) / 128;
else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
div = GET_MODE_BITSIZE (mode) / 64;
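The divisor hunk picks the same piece count but checks the widest split first; since a given tuning sets at most one of the split flags in practice, the two orderings agree. A quick stand-in check of the values it yields (again with hypothetical parameter names in place of the TARGET_* macros):

/* Stand-in for the divisor selection: how many native registers one
   vector of BITSIZE bits occupies under each split flag.  */
int
split_div (int bitsize, int sse_split, int avx256_split, int avx512_split)
{
  int div = 1;
  if (avx512_split && bitsize > 256)
    div = bitsize / 256;	/* znver4: 512 -> 2.  */
  else if (avx256_split && bitsize > 128)
    div = bitsize / 128;	/* znver1-3: 256 -> 2, 512 -> 4.  */
  else if (sse_split && bitsize > 64)
    div = bitsize / 64;		/* 128-bit SSE split: 128 -> 2.  */
  return div;
}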
/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements. */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
+ ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE | m_CORE_ATOM | m_GENERIC))
/* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements. */
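A DEF_TUNE mask names the tunings for which the feature is on, so the ~(...) form enables gathers everywhere except the listed cores; adding m_ZNVER4 inside the parentheses is what turns gathers off on Zen 4. A hypothetical sketch of the membership test (the real macros live in GCC's i386.h and x86-tune.def and differ in detail):

#include <stdio.h>

typedef unsigned long long tune_mask;

#define m_ZNVER3    (1ULL << 0)
#define m_ZNVER4    (1ULL << 1)
#define m_ALDERLAKE (1ULL << 2)

/* Enabled on every tuning NOT listed in the parentheses.  */
static const tune_mask use_gather_2parts
  = ~(m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE);

int
main (void)
{
  tune_mask cpu = m_ZNVER4;
  printf ("use gathers on znver4: %s\n",
	  (use_gather_2parts & cpu) ? "yes" : "no");  /* Prints "no".  */
  return 0;
}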
/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with a tight 256-bit or
   smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 | m_ZNVER4
| m_ALDERLAKE | m_SAPPHIRERAPIDS | m_CORE_ATOM)
+/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with a tight 512-bit or
+   smaller FMA chain.  */
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER4)
+
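The "tight FMA chain" these flags describe is a reduction whose fused multiply-adds form one serial dependency chain, so FMA latency bounds the whole loop; with the flag set for the active tuning, GCC avoids fusing such chains at the given vector width or below. The canonical case:

/* Dot product: with contraction, each iteration becomes one vfmadd
   that must wait for the previous iteration's result.  */
double
dot (const double *a, const double *b, int n)
{
  double acc = 0.0;
  for (int i = 0; i < n; i++)
    acc += a[i] * b[i];
  return acc;
}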
/* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
   for v2df vector reduction.  */
DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
instructions in the auto-vectorizer. */
-DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
+DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512 | m_ZNVER4)
+
+/* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops.  */
+DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
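Since every DEF_TUNE's string name is also a -mtune-ctrl knob, the new entries can be toggled independently of -mtune when measuring their effect, e.g. -mtune-ctrl=^avx512_split_regs to drop the split assumption under a znver4 tuning (a ^ prefix disables a feature). Note that GCC documents -mtune-ctrl as a compiler-developer option, not one for production use.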
/* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
- m_ALDERLAKE | m_CORE_AVX2)
+ m_ALDERLAKE | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
/* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
- m_ALDERLAKE | m_CORE_AVX2)
+ m_ALDERLAKE | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
- m_SAPPHIRERAPIDS)
+ m_SAPPHIRERAPIDS | m_ZNVER4)
/* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
AVX instructions. */
DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
- m_SAPPHIRERAPIDS)
+ m_SAPPHIRERAPIDS | m_ZNVER4)
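The *_by_pieces tunings govern how small fixed-size block copies and clears are expanded inline, so the entries above allow 256-bit vector moves on Zen 1-3 and 512-bit moves on Zen 4 alongside Sapphire Rapids. A copy that typically takes this path:

#include <string.h>

struct blob { char bytes[64]; };

/* A fixed 64-byte memcpy is expanded by the move_by_pieces machinery;
   with avx512_move_by_pieces active it can become a single 64-byte
   (zmm) load/store pair instead of several narrower moves.  */
void
copy_blob (struct blob *dst, const struct blob *src)
{
  memcpy (dst, src, sizeof *dst);
}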
/*****************************************************************************/
/*****************************************************************************/