gcc/config/i386/x86-tune.def

   1 /* Definitions of x86 tunable features.
   2    Copyright (C) 2013-2022 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License and
  17 a copy of the GCC Runtime Library Exception along with this program;
  18 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 /* Tuning for a given CPU XXXX consists of:
  22     - adding new CPU into:
  23         - adding PROCESSOR_XXX to processor_type (in i386.h)
  24         - possibly adding XXX into CPU attribute in i386.md
  25         - adding XXX to processor_alias_table (in i386.cc)
  26     - introducing ix86_XXX_cost in i386.cc
  27         - Stringop generation table can be built based on test_stringop
  28           script (once rest of tuning is complete)
  29     - designing a scheduler model in
  30         - XXXX.md file
  31         - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
  32         - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
  33           and ix86_sched_init_global if those tricks are needed.
  34     - Tuning the flags below.  Those are split into sections and each
  35       section is very roughly ordered by importance.  */
  36
  37 /*****************************************************************************/
  38 /* Scheduling flags.                                                         */
  39 /*****************************************************************************/
  40
  41 /* X86_TUNE_SCHEDULE: Enable scheduling.  */
  42 DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
  43           m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
  44           | m_INTEL | m_KNL | m_KNM | m_K6_GEODE | m_AMD_MULTIPLE | m_GOLDMONT
  45           | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)
  46
  47 /* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
  48    on modern chips.  Prefer stores affecting whole integer register
  49    over partial stores.  For example prefer MOVZBL or MOVQ to load 8bit
  50    value over movb.  */
  51 DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
  52           m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE | m_CORE_AVX2
  53           | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
  54           | m_KNL | m_KNM | m_AMD_MULTIPLE | m_TREMONT | m_ALDERLAKE
  55           | m_GENERIC)
  56
  57 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
  58    destinations to be 128bit to allow register renaming on 128bit SSE units,
  59    but usually results in one extra microop on 64bit SSE units.
  60    Experimental results show that disabling this option on P4 brings over 20%
  61    SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
  62    that can be partly masked by careful scheduling of moves.  */
  63 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
  64           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  65           | m_BDVER | m_ZNVER | m_TREMONT | m_ALDERLAKE | m_GENERIC)
  66
  67 /* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids
  68    partial write to the destination in scalar SSE conversion from FP
  69    to FP.  */
  70 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
  71           "sse_partial_reg_fp_converts_dependency",
  72           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  73           | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC)
  74
  75 /* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial
  76    write to the destination in scalar SSE conversion from integer to FP.  */
  77 DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
  78           "sse_partial_reg_converts_dependency",
  79           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
  80           | m_BDVER | m_ZNVER | m_ALDERLAKE | m_GENERIC)
  81
  82 /* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts zero-idiom before
  83    several insns to break false dependency on the dest register for GLC
  84    micro-architecture.  */
  85 DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
  86           "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_ALDERLAKE)
  87
  88 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
  89    are resolved on SSE register parts instead of whole registers, so we may
  90    maintain just lower part of scalar values in proper format leaving the
  91    upper part undefined.  */
  92 DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
  93
  94 /* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of flags
  95    set by instructions affecting just some flags (in particular shifts).
  96    This is because Core2 resolves dependencies on whole flags register
  97    and such sequences introduce false dependency on previous instruction
  98    setting full flags.
  99
 100    This flag does not affect generation of INC and DEC, which is controlled
 101    by X86_TUNE_USE_INCDEC.  */
 102
 103 DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
 104           m_CORE2)
 105
 106 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
 107    partial dependencies.  */
 108 DEF_TUNE (X86_TUNE_MOVX, "movx",
 109           m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE
 110           | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_KNL | m_KNM | m_INTEL
 111           | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE
 112           | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 113
 114 /* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
 115    full sized loads.  */
 116 DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
 117           m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
 118           | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE
 119           | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 120
 121 /* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
 122    conditional jump instruction for 32 bit TARGET.  */
 123 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
 124           m_CORE_ALL | m_BDVER | m_ZNVER | m_GENERIC)
 125
 126 /* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
 127    conditional jump instruction for TARGET_64BIT.  */
 128 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
 129           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
 130           | m_ZNVER | m_GENERIC)
 131
 132 /* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
 133    subsequent conditional jump instruction when the conditional jump
 134    checks the sign flag (SF) or overflow flag (OF).  */
 135 DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
 136           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
 137           | m_ZNVER | m_GENERIC)
 138
 139 /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
 140    jump instruction when the alu instruction produces the CCFLAG consumed by
 141    the conditional jump instruction. */
 142 DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
 143           m_SANDYBRIDGE | m_CORE_AVX2 | m_GENERIC)
 144
 145
 146 /*****************************************************************************/
 147 /* Function prologue, epilogue and function calling sequences.               */
 148 /*****************************************************************************/
 149
 150 /* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
 151    arguments in prologue/epilogue instead of separately for each call
 152    by push/pop instructions.
 153    This increases code size by about 5% in 32bit mode, less so in 64bit mode
 154    because parameters are passed in registers.  It is a considerable win
 155    for targets without a stack engine, where multiple push operations
 156    cannot happen in parallel.  */
 157
 158 DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
 159           m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 160           | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8)
 161
 162 /* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
 163    considered on critical path.  */
 164 DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
 165           m_PPRO | m_ATHLON_K8)
 166
 167 /* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
 168    considered on critical path.  */
 169 DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
 170           m_PPRO | m_ATHLON_K8)
 171
 172 /* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
 173 DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
 174           m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_TREMONT
 175           | m_ALDERLAKE | m_GENERIC)
 176
 177 /* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
 178    Some chips, like 486 and Pentium work faster with separate load
 179    and push instructions.  */
 180 DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
 181           m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
 182           | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 183
 184 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
 185    over esp subtraction.  */
 186 DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
 187           | m_LAKEMONT | m_K6_GEODE)
 188
 189 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
 190    over esp subtraction.  */
 191 DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
 192           | m_K6_GEODE)
 193
 194 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
 195    over esp addition.  */
 196 DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
 197           | m_LAKEMONT | m_PPRO)
 198
 199 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
 200    over esp addition.  */
 201 DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)
 202
 203 /*****************************************************************************/
 204 /* Branch predictor tuning                                                   */
 205 /*****************************************************************************/
 206
 207 /* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4
 208    instructions long.  */
 209 DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)
 210
 211 /* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination
 212    of conditional jump or directly preceded by other jump instruction.
 213    This is important for AMD K8-AMDFAM10 because the branch prediction
 214    architecture expects at most one jump per 2 byte window.  Failing to
 215    pad returns leads to misaligned return stack.  */
 216 DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
 217           m_ATHLON_K8 | m_AMDFAM10)
 218
 219 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
 220    than 4 branch instructions in the 16 byte window.  */
 221 DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
 222           m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM
 223           | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
 224
 225 /*****************************************************************************/
 226 /* Integer instruction selection tuning                                      */
 227 /*****************************************************************************/
 228
 229 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
 230    at -O3.  For the moment, the prefetching seems badly tuned for Intel
 231    chips.  */
 232 DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
 233           m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
 234
 235 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
 236    on 16-bit immediate moves into memory on Core2 and Corei7.  */
 237 DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC)
 238
 239 /* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
 240    as "add mem, reg".  */
 241 DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))
 242
 243 /* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.
 244
 245    Core2 and Nehalem have a stall of 7 cycles for partial flag register stalls.
 246    Sandy Bridge and Ivy Bridge generate extra uop.  On Haswell this extra uop
 247    is output only when the values really need to be merged, which is not
 248    done by GCC generated code.  */
 249 DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
 250           ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM  | m_SANDYBRIDGE
 251             | m_BONNELL | m_SILVERMONT | m_INTEL |  m_KNL | m_KNM | m_GOLDMONT
 252             | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC))
 253
 254 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
 255    for DFmode copies */
 256 DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
 257           ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 258             | m_KNL | m_KNM | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GOLDMONT
 259             | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC))
 260
 261 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
 262    will impact LEA instruction selection. */
 263 DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL
 264          | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)
 265
 266 /* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
 267 DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
 268           m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS
 269           | m_KNL | m_KNM)
 270
 271 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
 272    vector path on AMD machines.
 273    FIXME: Do we need to enable this for core? */
 274 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
 275           m_K8 | m_AMDFAM10)
 276
 277 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
 278    machines.
 279    FIXME: Do we need to enable this for core? */
 280 DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
 281           m_K8 | m_AMDFAM10)
 282
 283 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
 284    a conditional move.  */
 285 DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
 286           m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_KNL
 287           | m_KNM | m_INTEL)
 288
 289 /* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
 290    as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
 291 DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
 292
 293 /* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to
 294    move/set sequences of bytes with known size.  */
 295 DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
 296           "prefer_known_rep_movsb_stosb",
 297           m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512)
 298
 299 /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
 300    compact prologues and epilogues by issuing misaligned moves.  This
 301    requires the target to handle misaligned moves and partial memory stalls
 302    reasonably well.
 303    FIXME: This may actually be a win on more targets than listed here.  */
 304 DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
 305           "misaligned_move_string_pro_epilogues",
 306           m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_TREMONT
 307           | m_ALDERLAKE | m_GENERIC)
 308
 309 /* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
 310 DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
 311           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 312           | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER
 313           | m_BTVER | m_ZNVER | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
 314           | m_ALDERLAKE | m_GENERIC)
 315
 316 /* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  */
 317 DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
 318           ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 319             | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))
 320
 321 /* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
 322 DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
 323           m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_INTEL
 324           | m_LAKEMONT | m_AMD_MULTIPLE | m_GOLDMONT | m_GOLDMONT_PLUS
 325           | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 326
 327 /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
 328    for bit-manipulation instructions.  */
 329 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
 330           m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 331
 332 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
 333    on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
 334    unrolling small loops less important.  For such architectures we adjust
 335    the unroll factor so that the unrolled loop fits the loop buffer.  */
 336 DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)
 337
 338 /* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in an
 339    if-converted sequence to one.  */
 340 DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
 341           m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_CORE_ALL | m_GOLDMONT
 342           | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 343
 344 /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
 345 DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
 346          m_CORE_ALL | m_BDVER | m_ZNVER | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 347
 348 /* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
 349    generating instructions for abs (x) = (((signed) x >> (W-1) ^ x) -
 350    (signed) x >> (W-1)) instead of cmove or SSE max/abs instructions.  */
 351 DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
 352           m_CORE_ALL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
 353           | m_GOLDMONT_PLUS)
 354
 355 /*****************************************************************************/
 356 /* 387 instruction selection tuning                                          */
 357 /*****************************************************************************/
 358
 359 /* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
 360    integer operand.
 361    FIXME: Why is this disabled for modern chips?  */
 362 DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
 363           m_386 | m_486 | m_K6_GEODE)
 364
 365 /* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
 366    integer operand.  */
 367 DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
 368           ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
 369             | m_SILVERMONT | m_KNL | m_KNM | m_INTEL | m_AMD_MULTIPLE
 370             | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 371             | m_GENERIC))
 372
 373 /* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
 374 DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE)
 375
 376 /* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
 377 DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
 378           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
 379           | m_KNL | m_KNM | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GOLDMONT
 380           | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 381
 382 /*****************************************************************************/
 383 /* SSE instruction selection tuning                                          */
 384 /*****************************************************************************/
 385
 386 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
 387    regs instead of memory.  */
 388 DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
 389           m_CORE_ALL)
 390
 391 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
 392    of a sequence loading registers by parts.  */
 393 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
 394           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
 395           | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 396           | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_GENERIC)
 397
 398 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
 399    instead of a sequence loading registers by parts.  */
 400 DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
 401           m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_KNL | m_KNM
 402           | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS
 403           | m_TREMONT | m_ALDERLAKE | m_BDVER | m_ZNVER | m_GENERIC)
 404
 405 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
 406    precision 128bit instructions instead of double where possible.   */
 407 DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
 408           m_BDVER | m_ZNVER)
 409
 410 /* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.   */
 411 DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
 412           m_AMD_MULTIPLE | m_CORE_ALL | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 413
 414 /* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
 415    xorps/xorpd and other variants.  */
 416 DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
 417           m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
 418           | m_TREMONT | m_ALDERLAKE | m_GENERIC)
 419
 420 /* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
 421    to SSE registers.  If disabled, the moves will be done by storing
 422    the value to memory and reloading.
 423    Enable this flag for generic - the only relevant architecture preferring
 424    no inter-unit moves is Bulldozer.  While this makes a small regression on
 425    SPECfp scores (sub 0.3%), disabling inter-unit moves noticeably penalizes
 426    hand written vectorized code which uses e.g. _mm_set_epi16.  */
 427 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
 428           ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))
 429
 430 /* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
 431    to integer registers.  If disabled, the moves will be done by storing
 432    the value to memory and reloading.  */
 433 DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
 434           ~m_ATHLON_K8)
 435
 436 /* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
 437    to use both SSE and integer registers at the same time.  */
 438 DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
 439           ~(m_AMDFAM10 | m_BDVER))
 440
 441 /* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
 442    fp converts to destination register.  */
 443 DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
 444           m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT | m_GOLDMONT_PLUS
 445           | m_INTEL)
 446
 447 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
 448    from FP to FP.  This form of instructions avoids partial write to the
 449    destination.  */
 450 DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
 451           m_AMDFAM10)
 452
 453 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
 454    from integer to FP. */
 455 DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 456
 457 /* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction.  */
 458 DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
 459           m_BONNELL | m_SILVERMONT | m_KNL | m_KNM | m_GOLDMONT
 460           | m_GOLDMONT_PLUS | m_INTEL)
 461
 462 /* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
 463 DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
 464           m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_ALDERLAKE
 465           | m_INTEL)
 466
 467 /* X86_TUNE_USE_GATHER: Use gather instructions.  */
 468 DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
 469           ~(m_ZNVER1 | m_ZNVER2 | m_ALDERLAKE | m_GENERIC))
 470
 471 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
 472    smaller FMA chain.  */
 473 DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER)
 474
 475 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
 476    smaller FMA chain.  */
 477 DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3)
 478
 479 /* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
 480    for v2df vector reduction.  */
 481 DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
 482           "v2df_reduction_prefer_haddpd", m_NONE)
 483
 484 /*****************************************************************************/
 485 /* AVX instruction selection tuning (some SSE flags affect AVX, too)         */
 486 /*****************************************************************************/
 487
 488 /* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
 489    split.  */
 490 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
 491           ~(m_NEHALEM | m_SANDYBRIDGE))
 492
 493 /* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
 494    split.  */
 495 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
 496           ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1))
 497
 498 /* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops.  */
 499 DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs",m_BDVER | m_BTVER2
 500           | m_ZNVER1)
 501
 502 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
 503    the auto-vectorizer.  */
 504 DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
 505           | m_ZNVER1)
 506
 507 /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
 508    instructions in the auto-vectorizer.  */
 509 DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)
 510
 511 /* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
 512    AVX instructions.  */
 513 DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
 514           m_CORE_AVX512)
 515
 516 /* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
 517    AVX instructions.  */
 518 DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
 519           m_CORE_AVX512)
 520
 521 /* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
 522    AVX instructions.  */
 523 DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
 524           m_SAPPHIRERAPIDS)
 525
 526 /* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
 527    AVX instructions.  */
 528 DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
 529           m_SAPPHIRERAPIDS)
 530
 531 /*****************************************************************************/
 532 /*****************************************************************************/
 533 /* Historical relics: tuning flags that help specific old CPU designs        */
 534 /*****************************************************************************/
 535
 536 /* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
 537    an integer register.  */
 538 DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
 539
 540 /* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
 541    such as fsqrt, fprem, fsin, fcos, fsincos etc.
 542    Should be enabled for all targets that always have a coprocessor.  */
 543 DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
 544           ~(m_386 | m_486 | m_LAKEMONT))
 545
 546 /* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
 547    inline strlen.  This affects only -minline-all-stringops mode. By
 548    default we always dispatch to a library since our internal strlen
 549    is bad.  */
 550 DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
 551
 552 /* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of
 553    longer "sal $1, reg".  */
 554 DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)
 555
 556 /* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
 557    of movzbl/movzwl.  */
 558 DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
 559           m_486 | m_PENT)
 560
 561 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
 562    and SImode multiply, but 386 and 486 do HImode multiply faster.  */
 563 DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
 564           ~(m_386 | m_486))
 565
 566 /* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
 567    into 16bit/8bit when resulting sequence is shorter.  For example
 568    for "and $-65536, reg" to 16bit store of 0.  */
 569 DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
 570           ~(m_386 | m_486 | m_PENT | m_LAKEMONT))
 571
 572 /* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
 573    such as "add $1, mem".  */
 574 DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
 575           ~(m_PENT | m_LAKEMONT))
 576
 577 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
 578    than a MOV.  */
 579 DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)
 580
 581 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
 582    but one byte longer.  */
 583 DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)
 584
 585 /* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
 586    use of partial registers by renaming.  This improved performance of 16bit
 587    code where upper halves of registers are not used.  It also leads to
 588    a penalty whenever a 16bit store is followed by 32bit use.  This flag
 589    disables production of such sequences in common cases.
 590    See also X86_TUNE_HIMODE_MATH.
 591
 592    In current implementation the partial register stalls are not eliminated
 593    very well - they can be introduced via subregs synthesized by combine
 594    and can happen in caller/callee saving sequences.  */
 595 DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)
 596
 597 /* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to
 598    corresponding 32bit arithmetic.  */
 599 DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
 600           ~m_PPRO)
 601
 602 /* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
 603    partial register stalls on PentiumPro targets. */
 604 DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)
 605
 606 /* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
 607    On PPro this flag is meant to avoid partial register stalls.  */
 608 DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)
 609
 610 /* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
 611    directly to memory.  */
 612 DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
 613
 614 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
 615 DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)
 616
 617 /* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
 618    integer register.  */
 619 DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)
 620
 621 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
 622    operand that cannot be represented using a modRM byte.  The XOR
 623    replacement is long decoded, so this split helps here as well.  */
 624 DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)
 625
 626 /* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
 627    forms of instructions on K8 targets.  */
 628 DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
 629           m_K8)
 630
 631 /*****************************************************************************/
 632 /* This never worked well before.                                            */
 633 /*****************************************************************************/
 634
 635 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
 636    on simulation result. But after P4 was made, no performance benefit
 637    was observed with branch hints.  It also increases the code size.
 638    As a result, icc never generates branch hints.  */
 639 DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE)
 640
 641 /* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
 642 DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)
 643
 644 /* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
 645    arithmetic to 32bit via PROMOTE_MODE macro.  This code generation scheme
 646    is usually used for RISC targets.  */
 647 DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
 648
 649 /* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion
 650    before a transfer of control flow out of the function.  */
 651 DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL)