gcc/config/i386/x86-tune.def

   1 /* Definitions of x86 tunable features.
   2    Copyright (C) 2013-2023 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License and
  17 a copy of the GCC Runtime Library Exception along with this program;
  18 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Tuning for a given CPU XXXX consists of:
  22     - adding new CPU into:
  23         - adding PROCESSOR_XXX to processor_type (in i386.h)
  24         - possibly adding XXX into CPU attribute in i386.md
  25         - adding XXX to processor_alias_table (in i386.cc)
  26     - introducing ix86_XXX_cost in i386.cc
  27         - Stringop generation table can be built based on test_stringop
  28         - script (once rest of tuning is complete)
  29     - designing a scheduler model in
  30         - XXXX.md file
  31         - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
  32         - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
  33           and ix86_sched_init_global if those tricks are needed.
  34     - Tuning the flags below.  Those are split into sections and each
  35       section is very roughly ordered by importance.  */
  36
  37 /*****************************************************************************/
  38 /* Scheduling flags.                                                         */
  39 /*****************************************************************************/
  40
  41 /* X86_TUNE_SCHEDULE: Enable scheduling.  */
  42 DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
  43           m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
  44           | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI
  45           | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
  46           | m_CORE_ATOM | m_GENERIC)
  47
  48 /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
  49    on modern chips.  Prefer stores affecting whole integer register
  50    over partial stores.  For example prefer MOVZBL or MOVQ to load 8bit
  51    value over movb.  */
  52 DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
  53           m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_CORE_AVX2
  54           | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
  55           | m_KNL | m_KNM | m_AMD_MULTIPLE | m_LUJIAZUI | m_TREMONT
  56           | m_ALDERLAKE | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC)
  57
  58 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
  59    destinations to be 128bit to allow register renaming on 128bit SSE units,
  60    but usually results in one extra microop on 64bit SSE units.
  61    Experimental results show that disabling this option on P4 brings over 20%
  62    SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
  63    that can be partly masked by careful scheduling of moves.  */
  64 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
  65           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  66           | m_BDVER | m_ZNVER | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE
  67           | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC)
  68
  69 /* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids
  70    partial write to the destination in scalar SSE conversion from FP
  71    to FP.  */
  72 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
  73           "sse_partial_reg_fp_converts_dependency",
  74           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  75           | m_BDVER | m_ZNVER | m_LUJIAZUI | m_ALDERLAKE | m_ARROWLAKE
  76           | m_CORE_ATOM | m_GENERIC)
  77
  78 /* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial
  79    write to the destination in scalar SSE conversion from integer to FP.  */
  80 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
  81           "sse_partial_reg_converts_dependency",
  82           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  83           | m_BDVER | m_ZNVER | m_LUJIAZUI | m_ALDERLAKE | m_ARROWLAKE
  84           | m_CORE_ATOM | m_GENERIC)
  85
  86 /* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before
  87    several insns to break false dependency on the dest register for GLC
  88    micro-architecture.  */
  89 DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
  90           "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_ALDERLAKE | m_ARROWLAKE
  91           | m_CORE_ATOM)
  92
  93 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
  94    are resolved on SSE register parts instead of whole registers, so we may
  95    maintain just lower part of scalar values in proper format leaving the
  96    upper part undefined.  */
  97 DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
  98
  99 /* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of flags
 100    set by instructions affecting just some flags (in particular shifts).
 101    This is because Core2 resolves dependencies on whole flags register
 102    and such sequences introduce false dependency on previous instruction
 103    setting full flags.
 104
 105    This flag does not affect generation of INC and DEC, which is controlled
 106    by X86_TUNE_USE_INCDEC.  */
 107
 108 DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
 109           m_CORE2)
 110
 111 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
 112    partial dependencies.  */
 113 DEF_TUNE (X86_TUNE_MOVX, "movx",
 114           m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE
 115           | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL
 116           | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI
 117           | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
 118           | m_CORE_ATOM | m_GENERIC)
 119
 120 /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
 121    full sized loads.  */
 122 DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
 123           m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
 124           | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE
 125           | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
 126           | m_CORE_ATOM | m_GENERIC)
 127
 128 /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
 129    conditional jump instruction for 32 bit TARGET.  */
 130 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
 131           m_CORE_ALL | m_BDVER | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 132
 133 /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
 134    conditional jump instruction for TARGET_64BIT.  */
 135 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
 136           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
 137           | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 138
 139 /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
 140    subsequent conditional jump instruction when the conditional jump
 141    checks the sign flag (SF) or overflow flag (OF).  */
 142 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
 143           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
 144           | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 145
 146 /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
 147    jump instruction when the alu instruction produces the CCFLAG consumed by
 148    the conditional jump instruction. */
 149 DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
 150                   m_SANDYBRIDGE | m_CORE_AVX2 | m_LUJIAZUI | m_GENERIC)
 151
 152
 153 /*****************************************************************************/
 154 /* Function prologue, epilogue and function calling sequences.               */
 155 /*****************************************************************************/
 156
 157 /* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
 158    arguments in prologue/epilogue instead of separately for each call
 159    by push/pop instructions.
 160    This increases code size by about 5% in 32bit mode, less so in 64bit mode
 161    because parameters are passed in registers.  It is a considerable
 162    win for targets without stack engine that prevents multiple push operations
 163    to happen in parallel.  */
 164
 165 DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
 166           m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 167           | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_LUJIAZUI)
 168
 169 /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
 170    considered on critical path.  */
 171 DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
 172           m_PPRO | m_ATHLON_K8)
 173
 174 /* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
 175    considered on critical path.  */
 176 DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
 177           m_PPRO | m_ATHLON_K8)
 178
 179 /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
 180 DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
 181           m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI
 182           | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC)
 183
 184 /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
 185    Some chips, like 486 and Pentium, work faster with separate load
 186    and push instructions.  */
 187 DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
 188           m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
 189           | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
 190           | m_CORE_ATOM | m_GENERIC)
 191
 192 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
 193    over esp subtraction.  */
 194 DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
 195           | m_LAKEMONT | m_K6_GEODE)
 196
 197 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
 198    over esp subtraction.  */
 199 DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
 200           | m_K6_GEODE)
 201
 202 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
 203    over esp addition.  */
 204 DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
 205           | m_LAKEMONT | m_PPRO)
 206
 207 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
 208    over esp addition.  */
 209 DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)
 210
 211 /*****************************************************************************/
 212 /* Branch predictor tuning                                                   */
 213 /*****************************************************************************/
 214
 215 /* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4
 216    instructions long.  */
 217 DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)
 218
 219 /* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination
 220    of conditional jump or directly preceded by other jump instruction.
 221    This is important for AMD K8-AMDFAM10 because the branch prediction
 222    architecture expects at most one jump per 2 byte window.  Failing to
 223    pad returns leads to misaligned return stack.  */
 224 DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
 225           m_ATHLON_K8 | m_AMDFAM10)
 226
 227 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
 228    than 4 branch instructions in the 16 byte window.  */
 229 DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
 230           m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
 231           | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
 232
 233 /*****************************************************************************/
 234 /* Integer instruction selection tuning                                      */
 235 /*****************************************************************************/
 236
 237 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
 238    at -O3.  For the moment, the prefetching seems badly tuned for Intel
 239    chips.  */
 240 DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
 241           m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
 242
 243 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
 244    on 16-bit immediate moves into memory on Core2 and Corei7.  */
 245 DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_LUJIAZUI | m_GENERIC)
 246
 247 /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
 248    as "add mem, reg".  */
 249 DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))
 250
 251 /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.
 252
 253    Core2 and Nehalem have a 7-cycle penalty for partial flag register stalls.
 254    Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell this extra
 255    uop is output only when the values really need to be merged, which is not
 256    done by GCC generated code.  */
 257 DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
 258           ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE
 259             | m_BONNELL | m_SILVERMONT | m_INTEL |  m_KNL | m_KNM | m_GOLDMONT
 260             | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
 261             | m_CORE_ATOM | m_LUJIAZUI | m_GENERIC))
 262
 263 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
 264    for DFmode copies.  */
 265 DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
 266           ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 267             | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI
 268             | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 269             | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC))
 270
 271 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
 272    will impact LEA instruction selection. */
 273 DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
 274          | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_LUJIAZUI)
 275
 276 /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
 277 DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
 278           m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS
 279           | m_KNL | m_KNM)
 280
 281 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
 282    vector path on AMD machines.
 283    FIXME: Do we need to enable this for core? */
 284 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
 285           m_K8 | m_AMDFAM10)
 286
 287 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
 288    machines.
 289    FIXME: Do we need to enable this for core? */
 290 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
 291           m_K8 | m_AMDFAM10)
 292
 293 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
 294    a conditional move.  */
 295 DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
 296           m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL
 297           | m_KNM | m_INTEL)
 298
 299 /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
 300    as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
 301 DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
 302
 303 /* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to
 304    move/set sequences of bytes with known size.  */
 305 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
 306           "prefer_known_rep_movsb_stosb",
 307           m_SKYLAKE | m_ALDERLAKE | m_ARROWLAKE | m_CORE_ATOM
 308           | m_TREMONT | m_CORE_AVX512 | m_LUJIAZUI)
 309
 310 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
 311    compact prologues and epilogues by issuing misaligned moves.  This
 312    requires the target to handle misaligned moves and partial memory stalls
 313    reasonably well.
 314    FIXME: This may actually be a win on more targets than listed here.  */
 315 DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
 316           "misaligned_move_string_pro_epilogues",
 317           m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_LUJIAZUI | m_TREMONT
 318           | m_ALDERLAKE | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC)
 319
 320 /* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
 321 DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
 322           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 323           | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
 324           | m_BTVER | m_ZNVER | m_LUJIAZUI | m_GOLDMONT | m_GOLDMONT_PLUS
 325           | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE | m_CORE_ATOM
 326           | m_GENERIC)
 327
 328 /* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  */
 329 DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
 330           ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 331             | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))
 332
 333 /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
 334 DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
 335           m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 336           | m_LAKEMONT | m_AMD_MULTIPLE | m_LUJIAZUI | m_GOLDMONT
 337           | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
 338           | m_CORE_ATOM | m_GENERIC)
 339
 340 /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
 341    for bit-manipulation instructions.  */
 342 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
 343           m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512
 344           | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
 345           | m_LUJIAZUI | m_GENERIC)
 346
 347 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
 348    on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
 349    unrolling small loops less important.  For such architectures we adjust
 350    the unroll factor so that the unrolled loop fits the loop buffer.  */
 351 DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
 352
 353 /* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
 354    if-converted sequence to one.  */
 355 DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
 356           m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT
 357           | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
 358           | m_CORE_ATOM | m_LUJIAZUI | m_GENERIC)
 359
 360 /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
 361 DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
 362          m_CORE_ALL | m_BDVER | m_ZNVER | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE
 363          | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC)
 364
 365 /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
 366    generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) -
 367    (signed) x >> (W-1)) instead of cmove or SSE max/abs instructions.  */
 368 DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
 369           m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
 370           | m_GOLDMONT_PLUS | m_LUJIAZUI)
 371
 372 /*****************************************************************************/
 373 /* 387 instruction selection tuning                                          */
 374 /*****************************************************************************/
 375
 376 /* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
 377    integer operand.
 378    FIXME: Why is this disabled for modern chips?  */
 379 DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
 380           m_386 | m_486 | m_K6_GEODE)
 381
 382 /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
 383    integer operand.  */
 384 DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
 385           ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
 386             | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE
 387             | m_LUJIAZUI | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
 388             | m_ALDERLAKE | m_ARROWLAKE | m_CORE_ATOM
 389             | m_GENERIC))
 390
 391 /* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
 392 DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE | m_LUJIAZUI)
 393
 394 /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
 395 DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
 396           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 397           | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_LUJIAZUI
 398           | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
 399           | m_CORE_ATOM | m_GENERIC)
 400
 401 /*****************************************************************************/
 402 /* SSE instruction selection tuning                                          */
 403 /*****************************************************************************/
 404
 405 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
 406    regs instead of memory.  */
 407 DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
 408           m_CORE_ALL)
 409
 410 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
 411    of a sequence loading registers by parts.  */
 412 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
 413           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
 414           | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 415           | m_ARROWLAKE | m_CORE_ATOM | m_AMDFAM10 | m_BDVER
 416           | m_BTVER | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 417
 418 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
 419    instead of a sequence storing registers by parts.  */
 420 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
 421           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
 422           | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 423           | m_ARROWLAKE | m_CORE_ATOM | m_BDVER | m_ZNVER
 424           | m_LUJIAZUI | m_GENERIC)
 425
 426 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
 427    precision 128bit instructions instead of double where possible.   */
 428 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
 429           m_BDVER | m_ZNVER)
 430
 431 /* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit stores.  */
 432 DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
 433           m_AMD_MULTIPLE | m_LUJIAZUI | m_CORE_ALL | m_TREMONT | m_ALDERLAKE
 434           | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC)
 435
 436 /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
 437    xorps/xorpd and other variants.  */
 438 DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
 439           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
 440           | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_ARROWLAKE
 441           | m_CORE_ATOM | m_GENERIC)
 442
 443 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
 444    to SSE registers.  If disabled, the moves will be done by storing
 445    the value to memory and reloading.
 446    Enable this flag for generic - the only relevant architecture preferring
 447    no inter-unit moves is Bulldozer.  While this makes a small regression on
 448    SPECfp scores (sub 0.3%), disabling inter-unit moves noticeably penalizes
 449    hand-written vectorized code which uses e.g. _mm_set_epi16.  */
 450 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
 451           ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))
 452
 453 /* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
 454    to integer registers.  If disabled, the moves will be done by storing
 455    the value to memory and reloading.  */
 456 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
 457           ~m_ATHLON_K8)
 458
 459 /* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
 460    to use both SSE and integer registers at the same time.  */
 461 DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
 462           ~(m_AMDFAM10 | m_BDVER))
 463
 464 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
 465    fp converts to destination register.  */
 466 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
 467           m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS
 468           | m_INTEL)
 469
 470 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
 471    from FP to FP.  This form of instructions avoids partial write to the
 472    destination.  */
 473 DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
 474           m_AMDFAM10)
 475
 476 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
 477    from integer to FP. */
 478 DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 479
 480 /* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction.  */
 481 DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
 482           m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
 483           | m_GOLDMONT_PLUS | m_INTEL)
 484
 485 /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
 486 DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
 487           m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 488           | m_ARROWLAKE | m_CORE_ATOM | m_INTEL)
 489
 490 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
 491    elements.  */
 492 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
 493           ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
 494             | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC | m_GDS))
 495
 496 /* X86_TUNE_USE_SCATTER_2PARTS: Use scatter instructions for vectors with 2
 497    elements.  */
 498 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
 499           ~(m_ZNVER4))
 500
 501 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
 502    elements.  */
 503 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
 504           ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ALDERLAKE
 505             | m_ARROWLAKE | m_CORE_ATOM | m_GENERIC | m_GDS))
 506
 507 /* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
 508    elements.  */
 509 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
 510           ~(m_ZNVER4))
 511
 512 /* X86_TUNE_USE_GATHER_8PARTS: Use gather instructions for vectors with 8
 513    or more elements.  */
 514 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
 515           ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_ALDERLAKE | m_ARROWLAKE
 516             | m_CORE_ATOM | m_GENERIC | m_GDS))
 517
 518 /* X86_TUNE_USE_SCATTER_8PARTS: Use scatter instructions for vectors with 8
 519    or more elements.  */
 520 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
 521           ~(m_ZNVER4))
 522
 523 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
 524    smaller FMA chain.  */
 525 DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3)
 526
 527 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
 528    smaller FMA chain.  */
 529 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3
 530           | m_ALDERLAKE | m_ARROWLAKE | m_SAPPHIRERAPIDS
 531           | m_CORE_ATOM)
 532
 533 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
 534    smaller FMA chain.  */
 535 DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
 536
 537 /* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
 538    for v2df vector reduction.  */
 539 DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
 540           "v2df_reduction_prefer_haddpd", m_NONE)
 541
 542 /*****************************************************************************/
 543 /* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
 544 /*****************************************************************************/
 545
 546 /* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
 547    split.  */
 548 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
 549           ~(m_NEHALEM | m_SANDYBRIDGE))
 550
 551 /* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
 552    split.  */
 553 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
 554           ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1))
 555
 556 /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops.  */
 557 DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2
 558           | m_ZNVER1)
 559
 560 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
 561    the auto-vectorizer.  */
 562 DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
 563           | m_ZNVER1)
 564
 565 /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
 566    instructions in the auto-vectorizer.  */
 567 DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
 568
 569 /* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops.  */
 570 DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
 571
 572 /* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
 573    AVX instructions.  */
 574 DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
 575           m_ALDERLAKE | m_ARROWLAKE | m_CORE_AVX2 | m_ZNVER1
 576           | m_ZNVER2 | m_ZNVER3)
 577
 578 /* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
 579    AVX instructions.  */
 580 DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
 581           m_ALDERLAKE | m_ARROWLAKE | m_CORE_AVX2 | m_ZNVER1
 582           | m_ZNVER2 | m_ZNVER3)
 583
 584 /* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
 585    AVX instructions.  */
 586 DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
 587           m_SAPPHIRERAPIDS | m_ZNVER4)
 588
 589 /* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
 590    AVX instructions.  */
 591 DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
 592           m_SAPPHIRERAPIDS | m_ZNVER4)
 593
 594 /*****************************************************************************/
 595 /*****************************************************************************/
 596 /* Historical relics: tuning flags that help specific old CPU designs.       */
 597 /*****************************************************************************/
 598
 599 /* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double the value in
 600    an integer register.  */
 601 DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
 602
 603 /* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
 604    such as fsqrt, fprem, fsin, fcos, fsincos etc.
 605    Should be enabled for all targets that always have a coprocessor.  */
 606 DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
 607           ~(m_386 | m_486 | m_LAKEMONT))
 608
 609 /* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
 610    inline strlen.  This affects only -minline-all-stringops mode.  By
 611    default we always dispatch to a library since our internal strlen
 612    is bad.  */
 613 DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
 614
 615 /* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of
 616    longer "sal $1, reg".  */
 617 DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
 618
 619 /* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
 620    of movzbl/movzwl.  */
 621 DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
 622           m_486 | m_PENT)
 623
 624 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
 625    and SImode multiply, but 386 and 486 do HImode multiply faster.  */
 626 DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
 627           ~(m_386 | m_486))
 628
 629 /* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
 630    into 16bit/8bit when the resulting sequence is shorter.  For example,
 631    "and $-65536, reg" becomes a 16bit store of 0.  */
 632 DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
 633           ~(m_386 | m_486 | m_PENT | m_LAKEMONT))
 634
 635 /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read-modify-write instructions
 636    such as "add $1, mem".  */
 637 DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
 638           ~(m_PENT | m_LAKEMONT))
 639
 640 /* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
 641    than via MOV.  */
 642 DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)
 643
 644 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
 645    but one byte longer.  */
 646 DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)
 647
 648 /* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
 649    use of partial registers by renaming.  This improved performance of 16bit
 650    code where upper halves of registers are not used.  It also leads to
 651    a penalty whenever a 16bit store is followed by 32bit use.  This flag
 652    disables production of such sequences in common cases.
 653    See also X86_TUNE_HIMODE_MATH.
 654
 655    In the current implementation the partial register stalls are not eliminated
 656    very well - they can be introduced via subregs synthesized by combine
 657    and can happen in caller/callee saving sequences.  */
 658 DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
 659
 660 /* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to
 661    corresponding 32bit arithmetic.  */
 662 DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
 663           ~m_PPRO)
 664
 665 /* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
 666    partial register stalls on PentiumPro targets.  */
 667 DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
 668
 669 /* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
 670    On PPro this flag is meant to avoid partial register stalls.  */
 671 DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
 672
 673 /* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
 674    directly to memory.  */
 675 DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
 676
 677 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
 678 DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
 679
 680 /* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
 681    an integer register.  */
 682 DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
 683
 684 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
 685    operand that cannot be represented using a ModRM byte.  The XOR
 686    replacement is long decoded, so this split helps here as well.  */
 687 DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
 688
 689 /* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
 690    forms of instructions on K8 targets.  */
 691 DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
 692           m_K8)
 693
 694 /*****************************************************************************/
 695 /* This never worked well before.                                            */
 696 /*****************************************************************************/
 697
 698 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
 699    on simulation results.  But after P4 was made, no performance benefit
 700    was observed with branch hints.  It also increases the code size.
 701    As a result, icc never generates branch hints.  */
 702 DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE)
 703
 704 /* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
 705 DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)
 706
 707 /* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
 708    arithmetic to 32bit via the PROMOTE_MODE macro.  This code generation scheme
 709    is usually used for RISC targets.  */
 710 DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
 711
 712 /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
 713    before a transfer of control flow out of the function.  */
 714 DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)
 715
 716 /* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
 717    modifications on architectures where these operations are slow.  */
 718 DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)