/* Definitions of x86 tunable features.
   Copyright (C) 2013-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Tuning for a given CPU XXXX consists of:
    - adding new CPU into:
      - adding PROCESSOR_XXX to processor_type (in i386.h)
      - possibly adding XXX into CPU attribute in i386.md
      - adding XXX to processor_alias_table (in i386.cc)
      - introducing ix86_XXX_cost in i386.cc
      - stringop generation table, which can be built with the
        test_stringop script (once the rest of the tuning is complete)
    - designing a scheduler model in
      - XXXX.md file
      - updating ix86_issue_rate and ix86_adjust_cost in i386.md
      - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
        and ix86_sched_init_global if those tricks are needed
    - tuning the flags below.  These are split into sections and each
      section is very roughly ordered by importance.  */

/*****************************************************************************/
/* Scheduling flags. */
/*****************************************************************************/

/* X86_TUNE_SCHEDULE: Enable scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example, prefer MOVZBL or MOVQ over MOVB to
   load an 8bit value.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
          | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE units,
   but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over 20%
   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
   that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids a
   partial write to the destination in scalar SSE conversions from FP
   to FP.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
          "sse_partial_reg_fp_converts_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)
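
/* Illustrative example (an assumed sketch): CVTSS2SD writes only the low
   64 bits of its destination, so clearing the register first breaks the
   false dependency on its previous contents:

       xorps    %xmm0, %xmm0    ; zero the destination first
       cvtss2sd %xmm1, %xmm0    ; no dependency left on old %xmm0  */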

/* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids a partial
   write to the destination in scalar SSE conversions from integer to FP.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
          "sse_partial_reg_converts_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts a zero idiom before
   several insns to break a false dependency on the destination register
   for the GLC micro-architecture.  */
DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
          "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_CORE_HYBRID
          | m_CORE_ATOM)

/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in the proper format,
   leaving the upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)

/* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register
   and such sequences introduce a false dependency on the previous
   instruction setting the full flags.

   This flag does not affect generation of INC and DEC, which is controlled
   by X86_TUNE_USE_INCDEC.  */

DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2)

/* X86_TUNE_MOVX: Enable zero-extension of integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_INTEL
          | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_CORE_AVX2 | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
   full-sized loads.  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32-bit targets.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
   conditional jump instruction for TARGET_64BIT.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse an ALU instruction with a subsequent
   conditional jump instruction when the ALU instruction produces the flags
   consumed by that conditional jump.  */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
          m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC)
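
/* Illustrative fusion candidates (schematic, assumed): on cores with
   macro-fusion the flag-setting instruction and the conditional jump
   decode as a single fused uop:

       cmpl $16, %eax           ; compare ...
       jne  .Lloop              ; ... fuses with the conditional jump

       subl $1, %ecx            ; ALU op setting flags ...
       jnz  .Lloop              ; ... fuses on cores with ALU+branch fusion  */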

/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences. */
/*****************************************************************************/

/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in the prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable win
   for targets without a stack engine, where the lack of one prevents
   multiple push operations from executing in parallel.  */

DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_ZHAOXIN)

/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered to be on the critical path.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered to be on the critical path.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like the 486 and Pentium, work faster with separate load
   and push instructions.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SINGLE_PUSH: Enable if a single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_K6_GEODE)

/* X86_TUNE_DOUBLE_PUSH: Enable if a double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
          | m_K6_GEODE)

/* X86_TUNE_SINGLE_POP: Enable if a single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_PPRO)

/* X86_TUNE_DOUBLE_POP: Enable if a double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)

/*****************************************************************************/
/* Branch predictor tuning */
/*****************************************************************************/

/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)

/* X86_TUNE_PAD_RETURNS: Place a NOP before every RET that is a destination
   of a conditional jump or is directly preceded by another jump instruction.
   This is important for the AMD K8-AMDFAM10 chips because their branch
   prediction architecture expects at most one jump per 2 byte window.
   Failing to pad returns leads to a misaligned return stack.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10)

/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)

/*****************************************************************************/
/* Integer instruction selection tuning */
/*****************************************************************************/

/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)

/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Core i7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))

/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.

   Core2 and Nehalem have a stall of 7 cycles for partial flag register
   stalls.  Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell
   this extra uop is output only when the value actually needs to be
   merged, which is not done by GCC generated code.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
            | m_BONNELL | m_SILVERMONT | m_INTEL | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_ZHAOXIN | m_GENERIC))
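
/* Illustrative example (assumed): INC updates all arithmetic flags except
   CF, so a later CF consumer forces a flags merge, while ADD rewrites the
   whole flags register:

       incl %eax                ; partial flags write (CF preserved)
       addl $1, %eax            ; full flags write, no merge needed  */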

/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_GENERIC))

/* X86_TUNE_OPT_AGU: Optimize for the Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL | m_ZHAOXIN)

/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS)

/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is
   vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is vector path on AMD
   machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)

/* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to
   move/set sequences of bytes with known size.  */
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
          "prefer_known_rep_movsb_stosb",
          m_SKYLAKE | m_CORE_HYBRID | m_CORE_ATOM | m_TREMONT | m_CORE_AVX512
          | m_ZHAOXIN)
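
/* Illustrative example (assumed): with a size known at compile time, a
   single REP MOVSB can replace an unrolled copy sequence on the listed
   cores:

       movl $64, %ecx           ; byte count known at compile time
       rep movsb                ; copy %ecx bytes from (%rsi) to (%rdi)  */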

/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
          "misaligned_move_string_pro_epilogues",
          m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT
          | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
          | m_ZNVER | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_USE_CLTD: Controls use of the CLTD and CQTO instructions.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
          ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_INTEL
            | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))
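
/* Illustrative example (assumed): CLTD sign-extends %eax into %edx:%eax
   before a signed division, replacing an explicit move-and-shift pair:

       cltd                     ; %edx = sign of %eax replicated
       idivl %ecx               ; signed divide of %edx:%eax by %ecx  */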

/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_LAKEMONT
          | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependencies
   for bit-manipulation instructions.  */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512
          | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
          | m_ZHAOXIN | m_GENERIC)
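
/* Illustrative example (assumed): on the listed cores POPCNT/LZCNT/TZCNT
   carry a false output dependency on the destination register, which a
   zeroing XOR breaks:

       xorl   %eax, %eax        ; break dependency on old %eax
       popcnt %ecx, %eax        ; no longer waits on prior %eax writers  */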

/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)

/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in an
   if-converted sequence to one.  */
DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
          m_SILVERMONT | m_HASWELL | m_SKYLAKE | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_ZHAOXIN)

/* X86_TUNE_AVOID_MFENCE: Use lock-prefixed instructions instead of mfence.  */
DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)
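
/* Illustrative example (assumed): a dummy lock-prefixed read-modify-write
   on the stack gives full-fence semantics for ordinary memory accesses at
   lower cost than MFENCE on the listed cores:

       mfence                   ; full memory fence
       lock orl $0, (%esp)      ; cheaper fence for ordinary accesses  */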

/* X86_TUNE_EXPAND_ABS: This enables a new abs pattern that computes
   abs (x) as (((signed) x >> (W-1)) ^ x) - ((signed) x >> (W-1))
   instead of using cmove or SSE max/abs instructions.  */
DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
          m_CORE_ALL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_ZHAOXIN)
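
/* Worked example (a sketch) for W == 32: the arithmetic shift produces 0
   or -1, so the XOR and SUB together conditionally negate x:

       int mask  = x >> 31;            // 0 if x >= 0, -1 if x < 0
       int abs_x = (x ^ mask) - mask;  */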

/*****************************************************************************/
/* 387 instruction selection tuning */
/*****************************************************************************/

/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
   integer operands.
   FIXME: Why is this disabled for modern chips?  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)

/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
   integer operands.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
            | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_GENERIC))

/* X86_TUNE_USE_FFREEP: Use the ffreep instruction instead of fstp.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE | m_ZHAOXIN)

/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_ZHAOXIN | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/*****************************************************************************/
/* SSE instruction selection tuning */
/*****************************************************************************/

/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)

/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads
   instead of a sequence loading the register in parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_ZHAOXIN
          | m_GENERIC)

/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence storing the register in parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
   precision 128bit instructions instead of double where possible.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_BDVER | m_ZNVER)

/* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit stores.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_ZHAOXIN | m_CORE_ALL | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load zero, as opposed to
   xorps/xorpd and other variants.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
          | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.
   Enable this flag for generic - the only relevant architecture preferring
   no inter-unit moves is Bulldozer.  While this causes a small regression in
   SPECfp scores (under 0.3%), disabling inter-unit moves noticeably
   penalizes hand-written vectorized code that uses e.g. _mm_set_epi16.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))

/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)

/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))

/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split the memory operand
   of fp converts to the destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversions
   from FP to FP.  This form of instructions avoids a partial write to the
   destination.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_AMDFAM10)

/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversions
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)

/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with a slow pshufb instruction.  */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)
a4ef7f3e 470
45392c76
IE
471/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */
472DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
2aa97c0d
HJ
473 m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
474 | m_CORE_ATOM | m_INTEL)
45392c76 475
87126675
JH
476/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
477 elements. */
478DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
2aa97c0d 479 ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
6f6ea27d 480 | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
87126675 481
96759248
JH
482/* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
483 elements. */
484DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
b83acefb 485 ~(m_ZNVER4))

/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
   elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
          ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
   elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
          ~(m_ZNVER4))

/* X86_TUNE_USE_GATHER_8PARTS: Use gather instructions for vectors with 8
   or more elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
          ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
            | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_8PARTS: Use scatter instructions for vectors with 8
   or more elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
          ~(m_ZNVER4))

/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains",
          m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4
          | m_YONGFENG | m_SHIJIDADAO | m_GENERIC)

/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
          m_ZNVER2 | m_ZNVER3 | m_ZNVER4
          | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)

/* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
   for v2df vector reductions.  */
DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
          "v2df_reduction_prefer_haddpd", m_NONE)

/*****************************************************************************/
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
/*****************************************************************************/

/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE))

/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1))

/* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two
   AVX128 ops.  */
DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs", m_BDVER | m_BTVER2
          | m_ZNVER1)

/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
          | m_ZNVER1)

/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit
   AVX instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)

/* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two
   AVX256 ops.  */
DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)

/* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
          m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)

/* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
          m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)

/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)

/* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)

/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs */
/*****************************************************************************/

/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double a value in
   an integer register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)

/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
          ~(m_386 | m_486 | m_LAKEMONT))

/* X86_TUNE_UNROLL_STRLEN: Produce a (quite lame) unrolled sequence for
   inline strlen.  This affects only -minline-all-stringops mode.  By
   default we always dispatch to a library since our internal strlen
   is bad.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)

/* X86_TUNE_SHIFT1: Enables use of the short encoding of "sal reg" instead
   of the longer "sal $1, reg".  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)

/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of movzbl/movzwl.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
          m_486 | m_PENT)

/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
   HImode and SImode multiplies, but the 386 and 486 do HImode multiplies
   faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))

/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example,
   turn "and $-65536, reg" into a 16bit store of 0.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
          ~(m_386 | m_486 | m_PENT | m_LAKEMONT))

/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read-modify-write instructions
   such as "add $1, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
          ~(m_PENT | m_LAKEMONT))

/* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
   than via a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)
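
/* Illustrative encoding comparison (assumed): the OR form uses a
   sign-extended 8bit immediate, at the cost of writing the flags:

       orl  $-1, %eax           ; 3 bytes, clobbers flags
       movl $-1, %eax           ; 5 bytes, flags untouched  */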

/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but the XOR replacement is one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)

/* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.

   In the current implementation the partial register stalls are not
   eliminated very well - they can be introduced via subregs synthesized
   by combine and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)

/* X86_TUNE_PARTIAL_MEMORY_READ_STALL: Reading a (possibly unaligned) part
   of a memory location after a large write to the same address causes a
   store-to-load forwarding stall.  */
DEF_TUNE (X86_TUNE_PARTIAL_MEMORY_READ_STALL, "partial_memory_read_stall",
          m_386 | m_486 | m_PENT | m_LAKEMONT | m_PPRO | m_P4_NOCONA | m_CORE2
          | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10)
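
/* Illustrative example (assumed): a narrow load that reads only part of
   freshly stored wider data cannot be forwarded from the store buffer:

       movl   $0x01020304, (%edi)   ; 32bit store
       movzbl 2(%edi), %eax         ; partial reload of the same data: stall  */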

/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic into
   the corresponding 32bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          ~m_PPRO)

/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we
   avoid partial register stalls on PentiumPro targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)

/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
   On PPro this flag is meant to avoid partial register stalls.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)

/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)

/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   an integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)

/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)

/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions on K8 targets.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_K8)

/*****************************************************************************/
/* This never worked well before. */
/*****************************************************************************/

/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
   on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  They also increase code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE)

/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)

/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all
   8bit arithmetic to 32bit via the PROMOTE_MODE macro.  This code
   generation scheme is usually used for RISC targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)

/* X86_TUNE_SLOW_STC: This disables use of the stc, clc and cmc carry flag
   modifications on architectures where these operations are slow.  */
DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)

/* X86_TUNE_USE_RCR: Controls use of the rcr 1 instruction instead of
   shrd.  */
DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr", m_AMD_MULTIPLE)