/* Definitions of x86 tunable features.
   Copyright (C) 2013-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Tuning for a given CPU XXXX consists of:
    - adding new CPU into:
      - adding PROCESSOR_XXX to processor_type (in i386.h)
      - possibly adding XXX into CPU attribute in i386.md
      - adding XXX to processor_alias_table (in i386.c)
    - introducing ix86_XXX_cost in i386.c
      - Stringop generation table can be built based on the test_stringop
        script (once the rest of the tuning is complete)
    - designing a scheduler model in
      - XXXX.md file
      - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
      - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
        and ix86_sched_init_global if those tricks are needed.
    - Tuning the flags below.  Those are split into sections and each
      section is very roughly ordered by importance.  */

/*****************************************************************************/
/* Scheduling flags. */
/*****************************************************************************/

/* X86_TUNE_SCHEDULE: Enable scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC)

/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example prefer MOVZBL or MOVQ to load an 8bit
   value over movb.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
          | m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT
          | m_GENERIC)

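/* A minimal illustration (hand-written sketch, not GCC's exact output):
   loading an 8bit value with
       movzbl  (%rsi), %eax     # writes the whole of %eax
   avoids the partial-register merge implied by
       movb    (%rsi), %al      # merges into the old value of %eax  */
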
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE units,
   but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over 20%
   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
   that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_GENERIC)

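/* A minimal illustration (hand-written sketch, not GCC's exact output):
   a scalar register copy such as
       movsd   %xmm1, %xmm0     # writes only the low 64 bits of %xmm0
   may instead be emitted as
       movapd  %xmm1, %xmm0     # writes all 128 bits, so it can be renamed  */
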
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in proper format, leaving the
   upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)

/* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register
   and such sequences introduce a false dependency on the previous
   instruction setting the full flags.

   This flag does not affect generation of INC and DEC, which is controlled
   by X86_TUNE_USE_INCDEC.  */

DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2)

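/* A minimal illustration (hand-written sketch, not GCC's exact output):
       shrl    $2, %eax         # updates only some EFLAGS bits
       jnz     .L1              # reading flags here can force a merge with
                                # the previous full flags writer  */
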
/* X86_TUNE_MOVX: Enable zero extending integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL
          | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE
          | m_CORE_AVX2 | m_TREMONT | m_GENERIC)

/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
   full sized loads.  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE
          | m_TREMONT | m_GENERIC)

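/* A minimal illustration (hand-written sketch, not GCC's exact output):
       movb    %al, (%rsp)      # narrow store...
       movl    (%rsp), %edx     # ...followed by a wider load of the same
                                # location defeats store forwarding  */
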
/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32-bit targets.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
   conditional jump instruction for TARGET_64BIT.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_GENERIC)

/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
   jump instruction when the alu instruction produces the CCFLAG consumed by
   the conditional jump instruction.  */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
          m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)

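/* Minimal illustrations of fusible pairs (hand-written sketch, not GCC's
   exact output):
       cmpl    $16, %edx        # compare + conditional jump
       jae     .L7              # (FUSE_CMP_AND_BRANCH_*)
       subl    $1, %ecx         # flag-producing alu + conditional jump
       jne     .L4              # (FUSE_ALU_AND_BRANCH)
   A fused pair flows through the front end as a single macro-op.  */
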
/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences. */
/*****************************************************************************/

/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable win
   for targets without a stack engine, where multiple push operations cannot
   happen in parallel.  */

DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ATHLON_K8)

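/* A minimal illustration (hand-written sketch, not GCC's exact output):
   with this flag the prologue reserves argument space once,
       subl    $16, %esp        # prologue
       movl    %eax, (%esp)     # each call site stores its arguments
       call    foo
   instead of pushing and popping arguments around every call.  */
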
/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered on the critical path.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered on the critical path.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)

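/* A minimal illustration (hand-written sketch, not GCC's exact output):
       leave                    # equivalent to: movl %ebp, %esp; popl %ebp
       ret
   replaces the two-instruction frame teardown in the epilogue.  */
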
/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like 486 and Pentium, work faster with separate load
   and push instructions.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_GENERIC)

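/* A minimal illustration (hand-written sketch, not GCC's exact output):
       pushl   4(%edx)          # "push mem" in one instruction
   versus the split form that 486/Pentium prefer:
       movl    4(%edx), %eax
       pushl   %eax  */
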
/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_K6_GEODE)

/* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
          | m_K6_GEODE)

/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_PPRO)

/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)

/*****************************************************************************/
/* Branch predictor tuning */
/*****************************************************************************/

/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)

/* X86_TUNE_PAD_RETURNS: Place a NOP before every RET that is a destination
   of a conditional jump or directly preceded by another jump instruction.
   This is important for K8-AMDFAM10 because the branch prediction
   architecture expects at most one jump per 2 byte window.  Failing to
   pad returns leads to a misaligned return stack.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10)

/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL | m_ATHLON_K8
          | m_AMDFAM10)

/*****************************************************************************/
/* Integer instruction selection tuning */
/*****************************************************************************/

/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)

/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Core i7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)

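/* A minimal illustration (hand-written sketch, not GCC's exact output):
       movw    $258, (%eax)     # 66-prefixed store with a 16-bit immediate
                                # changes the instruction length the
                                # predecoder assumed, stalling pre-decode  */
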
/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))

/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.

   Core2 and Nehalem have a stall of 7 cycles for partial flag register
   stalls.  Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell
   this extra uop is output only when the values really need to be merged,
   which is not done by GCC generated code.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
            | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC))

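/* A minimal illustration (hand-written sketch, not GCC's exact output):
       incl    %eax             # shorter, but leaves CF unchanged, so a
                                # later flags reader may need a flags merge
       addl    $1, %eax         # writes all arithmetic flags  */
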
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC))

/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
          | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL)

/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_KNL | m_KNM)

/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
   vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
   machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL
          | m_KNM | m_TREMONT | m_INTEL)

/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)

/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
          "misaligned_move_string_pro_epilogues",
          m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)

/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
          | m_BTVER | m_ZNVER | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_GENERIC)

/* X86_TUNE_USE_CLTD: Controls use of the CLTD and CQTO instructions.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
          ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
            | m_INTEL | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT))

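/* A minimal illustration (hand-written sketch, not GCC's exact output):
   before a signed 32bit division,
       cltd                     # sign-extend %eax into %edx:%eax
       idivl   %ecx
   replaces the longer "movl %eax, %edx; sarl $31, %edx" sequence.  */
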
/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
          | m_LAKEMONT | m_AMD_MULTIPLE | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_GENERIC)

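/* A minimal illustration (hand-written sketch, not GCC's exact output):
   testing a variable bit with
       btl     %ecx, %eax       # CF = bit %ecx of %eax
       jc      .L5
   avoids a shift-and-mask sequence through a scratch register.  */
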
/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
   for bit-manipulation instructions.  */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)

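/* A minimal illustration (hand-written sketch, not GCC's exact output):
   some cores treat the destination of lzcnt/tzcnt/popcnt as an input, so
       xorl    %eax, %eax       # break the false output dependency
       popcntl %ecx, %eax
   avoids serializing on the previous writer of %eax.  */
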
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)

/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
   an if-converted sequence to one.  */
DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
          m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC)

/* X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE: Use xchg instead of mov+mfence.  */
DEF_TUNE (X86_TUNE_USE_XCHG_FOR_ATOMIC_STORE, "use_xchg_for_atomic_store",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC)

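/* A minimal illustration (hand-written sketch, not GCC's exact output)
   for a seq_cst atomic store:
       xchgl   %eax, (%rdi)     # implicitly locked, acts as a full barrier
   instead of
       movl    %eax, (%rdi)
       mfence  */
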
/* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
   generating instructions for abs (x) = (((signed) x >> (W-1)) ^ x) -
   ((signed) x >> (W-1)) instead of cmove or SSE max/abs instructions.  */
DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
          m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT)

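/* A minimal illustration of the expansion for 32bit abs (hand-written
   sketch, not GCC's exact output):
       movl    %edi, %edx
       sarl    $31, %edx        # %edx = x < 0 ? -1 : 0
       xorl    %edx, %edi       # complement when x was negative...
       subl    %edx, %edi       # ...then subtract -1 (add 1), yielding -x  */
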
/*****************************************************************************/
/* 387 instruction selection tuning */
/*****************************************************************************/

/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
   integer operand.
   FIXME: Why is this disabled for modern chips?  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)

/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
   integer operand.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
            | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE
            | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC))

/* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)

/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_GENERIC)

/*****************************************************************************/
/* SSE instruction selection tuning */
/*****************************************************************************/

/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)

/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads
   instead of a sequence loading registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_GENERIC)

/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence storing registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_BDVER | m_ZNVER | m_GENERIC)

/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single precision
   instructions where possible, i.e. movups instead of movupd.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_BDVER | m_ZNVER)

/* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit stores.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)

/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load zero, as opposed to
   xorps/xorpd and other variants.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
          | m_GENERIC)

/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.
   Enable this flag for generic - the only relevant architecture preferring
   no inter-unit moves is Bulldozer.  While this causes a small regression on
   SPECfp scores (sub 0.3%), disabling inter-unit moves noticeably penalizes
   hand-written vectorized code which uses e.g. _mm_set_epi16.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))

/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)

/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))

/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
   fp converts to destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_INTEL)

/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  This form of instructions avoids partial write to the
   destination.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_AMDFAM10)

/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)

/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction.  */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
          m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL)

/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_INTEL)

/* X86_TUNE_USE_GATHER: Use gather instructions.  */
DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
          ~(m_ZNVER | m_GENERIC))

/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
   smaller FMA chain.  */
DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)

/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
   smaller FMA chain.  */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2)

/*****************************************************************************/
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
/*****************************************************************************/

/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC))

/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1 | m_GENERIC))

/* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into
   two AVX128 ops.  */
DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs", m_BDVER | m_BTVER2
          | m_ZNVER1)

/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
          | m_ZNVER1)

/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
   instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)

/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs */
/*****************************************************************************/

/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
   an integer register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)

/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
          ~(m_386 | m_486 | m_LAKEMONT))

/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
   inline strlen.  This affects only -minline-all-stringops mode.  By
   default we always dispatch to a library since our internal strlen
   is bad.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)

/* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of
   longer "sal $1, reg".  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)

/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of movzbl/movzwl.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
          m_486 | m_PENT)

/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
   and SImode multiply, but 386 and 486 do HImode multiply faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))

/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example,
   "and $-65536, reg" can become a 16bit store of 0.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
          ~(m_386 | m_486 | m_PENT | m_LAKEMONT))

/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
   such as "add $1, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
          ~(m_PENT | m_LAKEMONT))

/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
   than a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)

/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)

/* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.

   In the current implementation the partial register stalls are not
   eliminated very well - they can be introduced via subregs synthesized
   by combine and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)

/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to
   corresponding 32bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          ~m_PPRO)

/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
   partial register stalls on PentiumPro targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)

/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
   On PPro this flag is meant to avoid partial register stalls.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)

/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)

/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   an integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)

/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)

/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions on K8 targets.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_K8)

/*****************************************************************************/
/* This never worked well before. */
/*****************************************************************************/

/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
   on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  It also increases the code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0U)

/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0U)

/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
   arithmetic to 32bit via PROMOTE_MODE macro.  This code generation scheme
   is usually used for RISC targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U)

/* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
   before a transfer of control flow out of the function.  */
DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)