gcc/config/i386/x86-tune.def

   1 /* Definitions of x86 tunable features.
   2    Copyright (C) 2013-2022 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License and
  17 a copy of the GCC Runtime Library Exception along with this program;
  18 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Tuning for a given CPU XXXX consists of:
  22     - adding new CPU into:
  23         - adding PROCESSOR_XXX to processor_type (in i386.h)
  24         - possibly adding XXX into CPU attribute in i386.md
  25         - adding XXX to processor_alias_table (in i386.cc)
  26     - introducing ix86_XXX_cost in i386.cc
  27         - Stringop generation table can be built based on test_stringop
  28         - script (once rest of tuning is complete)
  29     - designing a scheduler model in
  30         - XXXX.md file
  31         - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
  32         - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
  33           and ix86_sched_init_global if those tricks are needed.
  34     - Tuning the flags below. Those are split into sections and each
  35       section is very roughly ordered by importance.  */
  36
  37 /*****************************************************************************/
  38 /* Scheduling flags.                                                         */
  39 /*****************************************************************************/
  40
  41 /* X86_TUNE_SCHEDULE: Enable scheduling.  */
  42 DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
  43           m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
  44           | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI
  45           | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)
  46
  47 /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
  48    on modern chips.  Prefer stores affecting whole integer register
  49    over partial stores.  For example prefer MOVZBL or MOVQ to load 8bit
  50    value over movb.  */
  51 DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
  52           m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_CORE_AVX2
  53           | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
  54           | m_KNL | m_KNM | m_AMD_MULTIPLE | m_LUJIAZUI | m_TREMONT
  55           | m_ALDERLAKE | m_GENERIC)
  56
  57 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
  58    destinations to be 128bit to allow register renaming on 128bit SSE units,
  59    but usually results in one extra microop on 64bit SSE units.
  60    Experimental results show that disabling this option on P4 brings over 20%
  61    SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
  62    that can be partly masked by careful scheduling of moves.  */
  63 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
  64           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  65           | m_BDVER | m_ZNVER | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE
  66           | m_GENERIC)
  67
  68 /* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids
  69    partial write to the destination in scalar SSE conversion from FP
  70    to FP.  */
  71 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
  72           "sse_partial_reg_fp_converts_dependency",
  73           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  74           | m_BDVER | m_ZNVER | m_LUJIAZUI | m_ALDERLAKE | m_GENERIC)
  75
  76 /* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial
  77    write to the destination in scalar SSE conversion from integer to FP.  */
  78 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
  79           "sse_partial_reg_converts_dependency",
  80           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  81           | m_BDVER | m_ZNVER | m_LUJIAZUI | m_ALDERLAKE | m_GENERIC)
  82
  83 /* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before
  84    several insns to break false dependency on the dest register for GLC
  85    micro-architecture.  */
  86 DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
  87           "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_ALDERLAKE)
  88
  89 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
  90    are resolved on SSE register parts instead of whole registers, so we may
  91    maintain just lower part of scalar values in proper format leaving the
  92    upper part undefined.  */
  93 DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
  94
  95 /* X86_TUNE_PARTIAL_FLAG_REG_STALL: This flag disables use of flags
  96    set by instructions affecting just some flags (in particular shifts).
  97    This is because Core2 resolves dependencies on whole flags register
  98    and such sequences introduce false dependency on previous instruction
  99    setting full flags.
 100
 101    This flag does not affect generation of INC and DEC, which is controlled
 102    by X86_TUNE_USE_INCDEC.  */
 103
 104 DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
 105           m_CORE2)
 106
 107 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
 108    partial dependencies.  */
 109 DEF_TUNE (X86_TUNE_MOVX, "movx",
 110           m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE
 111           | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL
 112           | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI
 113           | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 114
 115 /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
 116    full sized loads.  */
 117 DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
 118           m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
 119           | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE
 120           | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 121
 122 /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
 123    conditional jump instruction for 32 bit TARGET.  */
 124 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
 125           m_CORE_ALL | m_BDVER | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 126
 127 /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
 128    conditional jump instruction for TARGET_64BIT.  */
 129 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
 130           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
 131           | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 132
 133 /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
 134    subsequent conditional jump instruction when the conditional jump
 135    checks the sign flag (SF) or overflow flag (OF).  */
 136 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
 137           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
 138           | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 139
 140 /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
 141    jump instruction when the alu instruction produces the CCFLAG consumed by
 142    the conditional jump instruction.  */
 143 DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
 144           m_SANDYBRIDGE | m_CORE_AVX2 | m_LUJIAZUI | m_GENERIC)
 145
 146
 147 /*****************************************************************************/
 148 /* Function prologue, epilogue and function calling sequences.               */
 149 /*****************************************************************************/
 150
 151 /* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
 152    arguments in prologue/epilogue instead of separately for each call
 153    by push/pop instructions.
 154    This increases code size by about 5% in 32bit mode, less so in 64bit mode
 155    because parameters are passed in registers.  It is a considerable
 156    win for targets without a stack engine, whose absence prevents multiple
 157    push operations from happening in parallel.  */
 158
 159 DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
 160           m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 161           | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_LUJIAZUI)
 162
 163 /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
 164    considered on critical path.  */
 165 DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
 166           m_PPRO | m_ATHLON_K8)
 167
 168 /* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
 169    considered on critical path.  */
 170 DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
 171           m_PPRO | m_ATHLON_K8)
 172
 173 /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
 174 DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
 175           m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI
 176           | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 177
 178 /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
 179    Some chips, like 486 and Pentium, work faster with separate load
 180    and push instructions.  */
 181 DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
 182           m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
 183           | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 184
 185 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
 186    over esp subtraction.  */
 187 DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
 188           | m_LAKEMONT | m_K6_GEODE)
 189
 190 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
 191    over esp subtraction.  */
 192 DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
 193           | m_K6_GEODE)
 194
 195 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
 196    over esp addition.  */
 197 DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
 198           | m_LAKEMONT | m_PPRO)
 199
 200 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
 201    over esp addition.  */
 202 DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)
 203
 204 /*****************************************************************************/
 205 /* Branch predictor tuning                                                   */
 206 /*****************************************************************************/
 207
 208 /* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4
 209    instructions long.  */
 210 DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)
 211
 212 /* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination
 213    of conditional jump or directly preceded by other jump instruction.
 214    This is important for AMD K8-AMDFAM10 because the branch prediction
 215    architecture expects at most one jump per 2 byte window.  Failing to
 216    pad returns leads to misaligned return stack.  */
 217 DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
 218           m_ATHLON_K8 | m_AMDFAM10)
 219
 220 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
 221    than 4 branch instructions in the 16 byte window.  */
 222 DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
 223           m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
 224           | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
 225
 226 /*****************************************************************************/
 227 /* Integer instruction selection tuning                                      */
 228 /*****************************************************************************/
 229
 230 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
 231    at -O3.  For the moment, the prefetching seems badly tuned for Intel
 232    chips.  */
 233 DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
 234           m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
 235
 236 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
 237    on 16-bit immediate moves into memory on Core2 and Corei7.  */
 238 DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_LUJIAZUI | m_GENERIC)
 239
 240 /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
 241    as "add mem, reg".  */
 242 DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))
 243
 244 /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.
 245
 246    Core2 and Nehalem have a stall of 7 cycles for partial flag register stalls.
 247    Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell this extra
 248    uop is output only when the values really need to be merged, which is not
 249    done by GCC generated code.  */
 250 DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
 251           ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE
 252             | m_BONNELL | m_SILVERMONT | m_INTEL |  m_KNL | m_KNM | m_GOLDMONT
 253             | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI
 254             | m_GENERIC))
 255
 256 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
 257    for DFmode copies.  */
 258 DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
 259           ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 260             | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_LUJIAZUI
 261             | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC))
 262
 263 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
 264    will impact LEA instruction selection.  */
 265 DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
 266           | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_LUJIAZUI)
 267
 268 /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
 269 DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
 270           m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS
 271           | m_KNL | m_KNM)
 272
 273 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
 274    vector path on AMD machines.
 275    FIXME: Do we need to enable this for core? */
 276 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
 277           m_K8 | m_AMDFAM10)
 278
 279 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
 280    machines.
 281    FIXME: Do we need to enable this for core? */
 282 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
 283           m_K8 | m_AMDFAM10)
 284
 285 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
 286    a conditional move.  */
 287 DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
 288           m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL
 289           | m_KNM | m_INTEL)
 290
 291 /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
 292    as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
 293 DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
 294
 295 /* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to
 296    move/set sequences of bytes with known size.  */
 297 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
 298           "prefer_known_rep_movsb_stosb",
 299           m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512 | m_LUJIAZUI)
 300
 301 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
 302    compact prologues and epilogues by issuing misaligned moves.  This
 303    requires the target to handle misaligned moves and partial memory stalls
 304    reasonably well.
 305    FIXME: This may actually be a win on more targets than listed here.  */
 306 DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
 307           "misaligned_move_string_pro_epilogues",
 308           m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_LUJIAZUI | m_TREMONT
 309           | m_ALDERLAKE | m_GENERIC)
 310
 311 /* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
 312 DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
 313           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 314           | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
 315           | m_BTVER | m_ZNVER | m_LUJIAZUI | m_GOLDMONT | m_GOLDMONT_PLUS
 316           | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 317
 318 /* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  */
 319 DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
 320           ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 321             | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))
 322
 323 /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
 324 DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
 325           m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 326           | m_LAKEMONT | m_AMD_MULTIPLE | m_LUJIAZUI | m_GOLDMONT
 327           | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 328
 329 /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
 330    for bit-manipulation instructions.  */
 331 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
 332           m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI
 333           | m_GENERIC)
 334
 335 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
 336    on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
 337    unrolling small loops less important.  For such architectures we adjust
 338    the unroll factor so that the unrolled loop fits the loop buffer.  */
 339 DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
 340
 341 /* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
 342    if-converted sequence to one.  */
 343 DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
 344           m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT
 345           | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_LUJIAZUI | m_GENERIC)
 346
 347 /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
 348 DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
 349           m_CORE_ALL | m_BDVER | m_ZNVER | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE
 350           | m_GENERIC)
 351
 352 /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
 353    generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) -
 354    (signed) x >> (W-1)) instead of cmove or SSE max/abs instructions.  */
 355 DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
 356           m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
 357           | m_GOLDMONT_PLUS | m_LUJIAZUI)
 358
 359 /*****************************************************************************/
 360 /* 387 instruction selection tuning                                          */
 361 /*****************************************************************************/
 362
 363 /* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
 364    integer operand.
 365    FIXME: Why this is disabled for modern chips?  */
 366 DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
 367           m_386 | m_486 | m_K6_GEODE)
 368
 369 /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
 370    integer operand.  */
 371 DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
 372           ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
 373             | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE
 374             | m_LUJIAZUI | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
 375             | m_ALDERLAKE | m_GENERIC))
 376
 377 /* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
 378 DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE | m_LUJIAZUI)
 379
 380 /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
 381 DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
 382           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 383           | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_LUJIAZUI
 384           | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 385
 386 /*****************************************************************************/
 387 /* SSE instruction selection tuning                                          */
 388 /*****************************************************************************/
 389
 390 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
 391    regs instead of memory.  */
 392 DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
 393           m_CORE_ALL)
 394
 395 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
 396    of a sequence loading registers by parts.  */
 397 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
 398           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
 399           | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 400           | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 401
 402 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
 403    instead of a sequence loading registers by parts.  */
 404 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
 405           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
 406           | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 407           | m_BDVER | m_ZNVER | m_LUJIAZUI | m_GENERIC)
 408
 409 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
 410    precision 128bit instructions instead of double where possible.   */
 411 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
 412           m_BDVER | m_ZNVER)
 413
 414 /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.   */
 415 DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
 416           m_AMD_MULTIPLE | m_LUJIAZUI | m_CORE_ALL | m_TREMONT | m_ALDERLAKE
 417           | m_GENERIC)
 418
 419 /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
 420    xorps/xorpd and other variants.  */
 421 DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
 422           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
 423           | m_LUJIAZUI | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 424
 425 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
 426    to SSE registers.  If disabled, the moves will be done by storing
 427    the value to memory and reloading.
 428    Enable this flag for generic - the only relevant architecture preferring
 429    no inter-unit moves is Bulldozer.  While this causes a small regression on
 430    SPECfp scores (sub 0.3%), disabling inter-unit moves noticeably penalizes
 431    hand written vectorized code which uses e.g. _mm_set_epi16.  */
 432 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
 433           ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))
 434
 435 /* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
 436    to integer registers.  If disabled, the moves will be done by storing
 437    the value to memory and reloading.  */
 438 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
 439           ~m_ATHLON_K8)
 440
 441 /* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
 442    to use both SSE and integer registers at the same time.  */
 443 DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
 444           ~(m_AMDFAM10 | m_BDVER))
 445
 446 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
 447    fp converts to destination register.  */
 448 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
 449           m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS
 450           | m_INTEL)
 451
 452 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
 453    from FP to FP.  This form of instructions avoids partial write to the
 454    destination.  */
 455 DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
 456           m_AMDFAM10)
 457
 458 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
 459    from integer to FP. */
 460 DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 461
 462 /* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction.  */
 463 DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
 464           m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
 465           | m_GOLDMONT_PLUS | m_INTEL)
 466
 467 /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
 468 DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
 469           m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 470           | m_INTEL)
 471
 472 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
 473    elements.  */
 474 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
 475           ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC))
 476
 477 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
 478    elements.  */
 479 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
 480           ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ALDERLAKE | m_GENERIC))
 481
 482 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
 483    elements.  */
 484 DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
 485           ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_GENERIC))
 486
 487 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
 488    smaller FMA chain.  */
 489 DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
 490
 491 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
 492    smaller FMA chain.  */
 493 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3)
 494
 495 /* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
 496    for v2df vector reduction.  */
 497 DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
 498           "v2df_reduction_prefer_haddpd", m_NONE)
 499
 500 /*****************************************************************************/
 501 /* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
 502 /*****************************************************************************/
 503
 504 /* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
 505    split.  */
 506 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
 507           ~(m_NEHALEM | m_SANDYBRIDGE))
 508
 509 /* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
 510    split.  */
 511 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
 512           ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1))
 513
 514 /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops.  */
 515 DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",
 516           m_BDVER | m_BTVER2 | m_ZNVER1)
 517
 518 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
 519    the auto-vectorizer.  */
 520 DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
 521           | m_ZNVER1)
 522
 523 /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
 524    instructions in the auto-vectorizer.  */
 525 DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
 526
 527 /* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
 528    AVX instructions.  */
 529 DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
 530           m_CORE_AVX512)
 531
 532 /* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
 533    AVX instructions.  */
 534 DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
 535           m_CORE_AVX512)
 536
 537 /* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
 538    AVX instructions.  */
 539 DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
 540           m_SAPPHIRERAPIDS)
 541
 542 /* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
 543    AVX instructions.  */
 544 DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
 545           m_SAPPHIRERAPIDS)
 546
 547 /*****************************************************************************/
 548 /*****************************************************************************/
 549 /* Historical relics: tuning flags that helps a specific old CPU designs     */
 550 /*****************************************************************************/
 551
 552 /* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
 553    an integer register.  */
 554 DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
 555
 556 /* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
 557    such as fsqrt, fprem, fsin, fcos, fsincos etc.
 558    Should be enabled for all targets that always have a coprocessor.  */
 559 DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
 560           ~(m_386 | m_486 | m_LAKEMONT))
 561
 562 /* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
 563    inline strlen.  This affects only -minline-all-stringops mode. By
 564    default we always dispatch to a library since our internal strlen
 565    is bad.  */
 566 DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
 567
 568 /* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of
 569    longer "sal $1, reg".  */
 570 DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
 571
 572 /* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
 573    of movzbl/movzwl.  */
 574 DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
 575           m_486 | m_PENT)
 576
 577 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
 578    and SImode multiply, but 386 and 486 do HImode multiply faster.  */
 579 DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
 580           ~(m_386 | m_486))
 581
 582 /* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
 583    into 16bit/8bit when resulting sequence is shorter.  For example
 584    for "and $-65536, reg" to 16bit store of 0.  */
 585 DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
 586           ~(m_386 | m_486 | m_PENT | m_LAKEMONT))
 587
 588 /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
 589    such as "add $1, mem".  */
 590 DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
 591           ~(m_PENT | m_LAKEMONT))
 592
 593 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
 594    than a MOV.  */
 595 DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)
 596
 597 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
 598    but one byte longer.  */
 599 DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)
 600
 601 /* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
 602    use of partial registers by renaming.  This improved performance of 16bit
 603    code where upper halves of registers are not used.  It also leads to
 604    a penalty whenever a 16bit store is followed by 32bit use.  This flag
 605    disables production of such sequences in common cases.
 606    See also X86_TUNE_HIMODE_MATH.
 607
 608    In current implementation the partial register stalls are not eliminated
 609    very well - they can be introduced via subregs synthesized by combine
 610    and can happen in caller/callee saving sequences.  */
 611 DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
 612
 613 /* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to
 614    corresponding 32bit arithmetic.  */
 615 DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
 616           ~m_PPRO)
 617
 618 /* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
 619    partial register stalls on PentiumPro targets.  */
 620 DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
 621
 622 /* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
 623    On PPro this flag is meant to avoid partial register stalls.  */
 624 DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
 625
 626 /* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
 627    directly to memory.  */
 628 DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
 629
 630 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
 631 DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
 632
 633 /* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
 634    integer register.  */
 635 DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
 636
 637 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
 638    operand that cannot be represented using a modRM byte.  The XOR
 639    replacement is long decoded, so this split helps here as well.  */
 640 DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
 641
 642 /* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
 643    forms of instructions on K8 targets.  */
 644 DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
 645           m_K8)
 646
 647 /*****************************************************************************/
 648 /* This never worked well before.                                            */
 649 /*****************************************************************************/
 650
 651 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
 652    on simulation result. But after P4 was made, no performance benefit
 653    was observed with branch hints.  It also increases the code size.
 654    As a result, icc never generates branch hints.  */
 655 DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE)
 656
 657 /* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
 658 DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)
 659
 660 /* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
 661    arithmetic to 32bit via PROMOTE_MODE macro.  This code generation scheme
 662    is usually used for RISC targets.  */
 663 DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
 664
 665 /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
 666    before a transfer of control flow out of the function.  */
 667 DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)