]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/i386/x86-tune.def
rs6000.c (rs6000_gimple_fold_builtin): Add support for folding of vector compares.
[thirdparty/gcc.git] / gcc / config / i386 / x86-tune.def
CommitLineData
3ad20bd4 1/* Definitions of x86 tunable features.
cbe34bb5 2 Copyright (C) 2013-2017 Free Software Foundation, Inc.
4b8bc035
XDL
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 3, or (at your option)
9any later version.
10
11GCC is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
4b8bc035
XDL
16You should have received a copy of the GNU General Public License and
17a copy of the GCC Runtime Library Exception along with this program;
18see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
19<http://www.gnu.org/licenses/>. */
20
d6c6ba3c
JH
21/* Tuning for a given CPU XXXX consists of:
22 - adding new CPU into:
23 - adding PROCESSOR_XXX to processor_type (in i386.h)
24 - possibly adding XXX into CPU attribute in i386.md
25 - adding XXX to processor_alias_table (in i386.c)
26 - introducing ix86_XXX_cost in i386.c
27 - Stringop generation table can be built based on test_stringop
28 - script (once rest of tuning is complete)
29 - designing a scheduler model in
30 - XXXX.md file
31 - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
32 - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
33 and ix86_sched_init_global if those tricks are needed.
34 - Tuning the flags below. Those are split into sections and each
35 section is very roughly ordered by importance. */
36
37/*****************************************************************************/
38/* Scheduling flags. */
39/*****************************************************************************/
9ac2f538 40
d6c6ba3c
JH
41/* X86_TUNE_SCHEDULE: Enable scheduling. */
42DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
2d6b2e28 43 m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
cace2309 44 | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
9ac2f538 45
d6c6ba3c
JH
46/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
47 on modern chips. Prefer stores affecting whole integer register
48 over partial stores. For example prefer MOVZBL or MOVQ to load 8bit
49 value over movb. */
50DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
4c249d97
JH
51 m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
52 | m_BONNELL | m_SILVERMONT | m_INTEL
cace2309 53 | m_KNL | m_KNM | m_AMD_MULTIPLE | m_GENERIC)
9ac2f538 54
d6c6ba3c
JH
55/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
56 destinations to be 128bit to allow register renaming on 128bit SSE units,
57 but usually results in one extra microop on 64bit SSE units.
58 Experimental results show that disabling this option on P4 brings over 20%
59 SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
60 that can be partly masked by careful scheduling of moves. */
61DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
8d99ad36 62 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
9ce29eb0 63 | m_BDVER | m_ZNVER1 | m_GENERIC)
9ac2f538 64
d6c6ba3c
JH
65/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
66 are resolved on SSE register parts instead of whole registers, so we may
67 maintain just lower part of scalar values in proper format leaving the
68 upper part undefined. */
69DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
9ac2f538
JH
70
71/* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of flags
72 set by instructions affecting just some flags (in particular shifts).
73 This is because Core2 resolves dependencies on whole flags register
74 and such sequences introduce false dependency on previous instruction
75 setting full flags.
76
77 The flag does not affect generation of INC and DEC that is controlled
78 by X86_TUNE_USE_INCDEC.
79
80 This flag may be dropped from generic once core2-corei5 machines are
81 rare enough. */
3ad20bd4 82DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
0ca6c49f 83 m_CORE2 | m_GENERIC)
9ac2f538 84
d6c6ba3c
JH
85/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
86 partial dependencies. */
87DEF_TUNE (X86_TUNE_MOVX, "movx",
4c249d97
JH
88 m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
89 | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
90 | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)
9ac2f538 91
d6c6ba3c
JH
92/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
93 full sized loads. */
94DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
9a7f94d7 95 m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
cace2309 96 | m_KNL | m_KNM | m_AMD_MULTIPLE | m_GENERIC)
9ac2f538 97
0dc41f28
WM
98/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
99 conditional jump instruction for 32 bit TARGET.
d6c6ba3c 100 FIXME: revisit for generic. */
0dc41f28 101DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
9ce29eb0 102 m_CORE_ALL | m_BDVER | m_ZNVER1)
0dc41f28
WM
103
104/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
105 conditional jump instruction for TARGET_64BIT.
106 FIXME: revisit for generic. */
107DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
9ce29eb0 108 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1)
0dc41f28
WM
109
110/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
111 subsequent conditional jump instruction when the conditional jump
112 checks the sign flag (SF) or overflow flag (OF). */
113DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
9ce29eb0 114 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1)
0dc41f28
WM
115
116/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
117 jump instruction when the alu instruction produces the CCFLAG consumed by
118 the conditional jump instruction. */
119DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
d3c11974 120 m_SANDYBRIDGE | m_HASWELL)
9ac2f538 121
9ac2f538 122
d6c6ba3c
JH
123/*****************************************************************************/
124/* Function prologue, epilogue and function calling sequences. */
125/*****************************************************************************/
9ac2f538 126
d6c6ba3c
JH
127/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
128 arguments in prologue/epilogue instead of separately for each call
129 by push/pop instructions.
130 This increases code size by about 5% in 32bit mode, less so in 64bit mode
131 because parameters are passed in registers. It is a considerable
132 win for targets without stack engine that prevents multiple push operations
133 to happen in parallel.
9ac2f538 134
d6c6ba3c
JH
135 FIXME: the flag is incorrectly enabled for amdfam10, Bulldozer,
136 Bobcat and Generic. This is because disabling it causes large
137 regression on mgrid due to IRA limitation leading to unnecessary
138 use of the frame pointer in 32bit mode. */
d3c11974 139DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
cace2309 140 m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
33b64438 141 | m_ATHLON_K8)
9ac2f538 142
d6c6ba3c
JH
143/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
144 considered on critical path. */
d3c11974 145DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
d6c6ba3c 146 m_PPRO | m_ATHLON_K8)
9ac2f538 147
d6c6ba3c
JH
148/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
149 considered on critical path. */
150DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
d3c11974 151 m_PPRO | m_ATHLON_K8)
9ac2f538 152
d6c6ba3c 153/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */
d3c11974 154DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
d6c6ba3c 155 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC)
9ac2f538 156
d6c6ba3c
JH
157/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
158 Some chips, like 486 and Pentium work faster with separate load
159 and push instructions. */
d3c11974
L
160DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
161 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
d6c6ba3c 162 | m_GENERIC)
9ac2f538 163
d6c6ba3c
JH
164/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
165 over esp subtraction. */
d3c11974 166DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
2d6b2e28 167 | m_LAKEMONT | m_K6_GEODE)
9ac2f538 168
d6c6ba3c
JH
169/* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
170 over esp subtraction. */
2d6b2e28 171DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
385e3f0c 172 | m_K6_GEODE)
9ac2f538 173
3ad20bd4
XDL
174/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
175 over esp addition. */
385e3f0c 176DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
2d6b2e28 177 | m_LAKEMONT | m_PPRO)
9ac2f538 178
3ad20bd4
XDL
179/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
180 over esp addition. */
2d6b2e28 181DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)
9ac2f538 182
d6c6ba3c
JH
183/*****************************************************************************/
184/* Branch predictor tuning */
185/*****************************************************************************/
9ac2f538 186
d6c6ba3c
JH
187/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4
188 instructions long. */
d3c11974 189DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)
d6c6ba3c
JH
190
191/* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination
192 of conditional jump or directly preceded by other jump instruction.
193 This is important for AMD K8-AMDFAM10 because the branch prediction
194 architecture expects at most one jump per 2 byte window. Failing to
195 pad returns leads to misaligned return stack. */
196DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
197 m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC)
198
199/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
200 than 4 branch instructions in the 16 byte window. */
201DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
cace2309
SP
202 m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
203 |m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
d6c6ba3c
JH
204
205/*****************************************************************************/
206/* Integer instruction selection tuning */
207/*****************************************************************************/
208
209/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
210 at -O3. For the moment, the prefetching seems badly tuned for Intel
211 chips. */
212DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
07d88205 213 m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
d6c6ba3c
JH
214
215/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
216 on 16-bit immediate moves into memory on Core2 and Corei7. */
217DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
218
219/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
220 as "add mem, reg". */
2d6b2e28 221DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))
d6c6ba3c 222
6d7e169e
JH
223/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.
224
225 Core2 and Nehalem have a stall of 7 cycles for partial flag register stalls.
226 Sandy bridge and Ivy bridge generate extra uop. On Haswell this extra uop
227 is output only when the values need to be really merged, which is not
228 done by GCC generated code. */
d6c6ba3c 229DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
6d7e169e
JH
230 ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
231 | m_BONNELL | m_SILVERMONT | m_INTEL | m_KNL | m_KNM | m_GENERIC))
9ac2f538 232
3ad20bd4
XDL
233/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
234 for DFmode copies */
235DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
d3c11974 236 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
cace2309 237 | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC))
9ac2f538 238
d6c6ba3c
JH
239/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
240 will impact LEA instruction selection. */
52747219 241DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
cace2309 242 | m_KNM | m_INTEL)
9a7f94d7
L
243
244/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */
245DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
cace2309 246 m_BONNELL | m_SILVERMONT | m_KNL | m_KNM)
9ac2f538 247
d6c6ba3c 248/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
d3c11974 249 vector path on AMD machines.
d6c6ba3c
JH
250 FIXME: Do we need to enable this for core? */
251DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
252 m_K8 | m_AMDFAM10)
253
254/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
d3c11974 255 machines.
d6c6ba3c
JH
256 FIXME: Do we need to enable this for core? */
257DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
258 m_K8 | m_AMDFAM10)
259
260/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
261 a conditional move. */
262DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
cace2309 263 m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL)
d6c6ba3c
JH
264
265/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
266 as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */
267DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
268
5783ad0e
UB
269/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
270 compact prologues and epilogues by issuing misaligned moves. This
271 requires target to handle misaligned moves and partial memory stalls
272 reasonably well.
273 FIXME: This may actually be a win on more targets than listed here. */
274DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
275 "misaligned_move_string_pro_epilogues",
561400f0
JH
276 m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC)
277
d6c6ba3c
JH
278/* X86_TUNE_USE_SAHF: Controls use of SAHF. */
279DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
9a7f94d7 280 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
cace2309 281 | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
9ce29eb0 282 | m_BTVER | m_ZNVER1 | m_GENERIC)
d6c6ba3c
JH
283
284/* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions. */
9a7f94d7 285DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
cace2309 286 ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
385e3f0c 287 | m_K6))
d6c6ba3c
JH
288
289/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */
290DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
cace2309 291 m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
d878e79b 292 | m_LAKEMONT | m_AMD_MULTIPLE | m_GENERIC)
d6c6ba3c 293
3652a4d2
JH
294/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
295 for bit-manipulation instructions. */
296DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
297 m_SANDYBRIDGE | m_HASWELL | m_GENERIC)
298
299/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
300 on hardware capabilities. Bdver3 hardware has a loop buffer which makes
301 unrolling small loops less important. For such architectures we adjust
302 the unroll factor so that the unrolled loop fits the loop buffer. */
303DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
304
305/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
306 if-converted sequence to one. */
307DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
308 m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GENERIC)
309
d6c6ba3c
JH
310/*****************************************************************************/
311/* 387 instruction selection tuning */
312/*****************************************************************************/
313
314/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
315 integer operand.
316 FIXME: Why is this disabled for modern chips? */
d3c11974 317DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
d6c6ba3c
JH
318 m_386 | m_486 | m_K6_GEODE)
319
320/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
321 integer operand. */
322DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
2d6b2e28 323 ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
cace2309 324 | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE | m_GENERIC))
d6c6ba3c
JH
325
326/* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp. */
327DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
328
329/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */
330DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
9a7f94d7 331 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
cace2309 332 | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC)
d6c6ba3c
JH
333
334/*****************************************************************************/
335/* SSE instruction selection tuning */
336/*****************************************************************************/
337
d6c6ba3c
JH
338/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
339 regs instead of memory. */
340DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
341 m_CORE_ALL)
9ac2f538
JH
342
343/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
344 of a sequence loading registers by parts. */
3ad20bd4 345DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
cace2309 346 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
9ce29eb0 347 | m_INTEL | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER1 | m_GENERIC)
9ac2f538
JH
348
349/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead
350 of a sequence loading registers by parts. */
3ad20bd4 351DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
cace2309 352 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL | m_KNM
9ce29eb0 353 | m_INTEL | m_BDVER | m_ZNVER1 | m_GENERIC)
9ac2f538
JH
354
355/* Use packed single precision instructions where possible. I.e. movups instead
356 of movupd. */
3ad20bd4 357DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
9ce29eb0 358 m_BDVER | m_ZNVER1)
9ac2f538 359
13ef00fa 360/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. */
0ca6c49f 361DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
13ef00fa 362 m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC)
9ac2f538
JH
363
364/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
13ef00fa 365 xorps/xorpd and other variants. */
0ca6c49f 366DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
9ce29eb0
VK
367 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER1
368 | m_GENERIC)
9ac2f538 369
9ac2f538
JH
370/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer
371 to SSE registers. If disabled, the moves will be done by storing
372 the value to memory and reloading. */
3ad20bd4 373DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
68d1c4bf 374 ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC))
9ac2f538
JH
375
376/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves in from SSE
377 to integer registers. If disabled, the moves will be done by storing
378 the value to memory and reloading. */
3ad20bd4
XDL
379DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
380 ~m_ATHLON_K8)
9ac2f538
JH
381
382/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
383 to use both SSE and integer registers at the same time.
384 FIXME: revisit importance of this for generic. */
3ad20bd4 385DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
9ac2f538
JH
386 ~(m_AMDFAM10 | m_BDVER))
387
d6c6ba3c
JH
388/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
389 fp converts to destination register. */
390DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
cace2309 391 m_SILVERMONT | m_KNL | m_KNM | m_INTEL)
9ac2f538 392
d6c6ba3c
JH
393/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
394 from FP to FP. This form of instructions avoids partial write to the
395 destination. */
396DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
397 m_AMDFAM10)
9ac2f538 398
d6c6ba3c
JH
399/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
400 from integer to FP. */
401DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
9ac2f538 402
a4ef7f3e
ES
403/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction. */
404DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
cace2309 405 m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL)
a4ef7f3e 406
45392c76
IE
407/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */
408DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
409 m_SILVERMONT | m_INTEL)
410
d6c6ba3c
JH
411/*****************************************************************************/
412/* AVX instruction selection tuning (some of SSE flags affects AVX, too) */
413/*****************************************************************************/
9ac2f538 414
cd3c1b1c 415/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
d6c6ba3c 416 split. */
d3c11974
L
417DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
418 ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC))
9ac2f538 419
cd3c1b1c 420/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
d6c6ba3c 421 split. */
d3c11974 422DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
9ce29eb0 423 ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1 | m_GENERIC))
9ac2f538 424
d6c6ba3c
JH
425/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
426 the auto-vectorizer. */
9ce29eb0
VK
427DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
428 | m_ZNVER1)
9ac2f538 429
d6c6ba3c
JH
430/*****************************************************************************/
431/* Historical relics: tuning flags that help specific old CPU designs */
432/*****************************************************************************/
433
434/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
435 an integer register. */
436DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
437
438/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
439 such as fsqrt, fprem, fsin, fcos, fsincos etc.
440 Should be enabled for all targets that always have a coprocessor. */
d3c11974 441DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
0d3a9fe7 442 ~(m_386 | m_486 | m_LAKEMONT))
d6c6ba3c
JH
443
444/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
445 inline strlen. This affects only -minline-all-stringops mode. By
446 default we always dispatch to a library since our internal strlen
447 is bad. */
448DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
449
450/* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of
451 longer "sal $1, reg". */
452DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
453
454/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
455 of movzbl/movzwl. */
385e3f0c 456DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
d878e79b 457 m_486 | m_PENT)
9ac2f538 458
3ad20bd4
XDL
459/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
460 and SImode multiply, but 386 and 486 do HImode multiply faster. */
461DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
462 ~(m_386 | m_486))
9ac2f538 463
d6c6ba3c
JH
464/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
465 into 16bit/8bit when resulting sequence is shorter. For example
466 for "and $-65536, reg" to 16bit store of 0. */
385e3f0c 467DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
2d6b2e28 468 ~(m_386 | m_486 | m_PENT | m_LAKEMONT))
9ac2f538 469
d6c6ba3c
JH
470/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
471 such as "add $1, mem". */
385e3f0c 472DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
2d6b2e28 473 ~(m_PENT | m_LAKEMONT))
9ac2f538 474
3ad20bd4
XDL
475/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
476 than a MOV. */
2d6b2e28 477DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)
9ac2f538 478
3ad20bd4
XDL
479/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
480 but one byte longer. */
2d6b2e28 481DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)
9ac2f538 482
d6c6ba3c
JH
483/* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
484 use of partial registers by renaming. This improved performance of 16bit
485 code where upper halves of registers are not used. It also leads to
486 a penalty whenever a 16bit store is followed by 32bit use. This flag
487 disables production of such sequences in common cases.
488 See also X86_TUNE_HIMODE_MATH.
9ac2f538 489
d6c6ba3c
JH
490 In current implementation the partial register stalls are not eliminated
491 very well - they can be introduced via subregs synthesized by combine
492 and can happen in caller/callee saving sequences. */
493DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
9ac2f538 494
d6c6ba3c
JH
495/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to
496 corresponding 32bit arithmetic. */
497DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
498 ~m_PPRO)
9ac2f538 499
d6c6ba3c
JH
500/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic. Again we avoid
501 partial register stalls on PentiumPro targets. */
502DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
9ac2f538 503
d6c6ba3c
JH
504/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
505 On PPro this flag is meant to avoid partial register stalls. */
506DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
9ac2f538 507
d6c6ba3c
JH
508/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
509 directly to memory. */
510DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
9ac2f538 511
d6c6ba3c
JH
512/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
513DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
9ac2f538 514
d6c6ba3c
JH
515/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
516 integer register. */
517DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
9ac2f538 518
d6c6ba3c
JH
519/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
520 operand that cannot be represented using a modRM byte. The XOR
521 replacement is long decoded, so this split helps here as well. */
522DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
9ac2f538 523
d6c6ba3c
JH
524/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
525 forms of instructions on K8 targets. */
526DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
527 m_K8)
9ac2f538 528
d6c6ba3c
JH
529/*****************************************************************************/
530/* This never worked well before. */
531/*****************************************************************************/
41ee845b 532
d6c6ba3c
JH
533/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
534 on simulation result. But after P4 was made, no performance benefit
535 was observed with branch hints. It also increases the code size.
536 As a result, icc never generates branch hints. */
7100c1f2 537DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0U)
41ee845b 538
d6c6ba3c 539/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */
7100c1f2 540DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0U)
41ee845b 541
d6c6ba3c
JH
542/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
543 arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme
544 is usually used for RISC targets. */
7100c1f2 545DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U)