gcc/config/i386/x86-tune-costs.h

   1 /* Costs of operations of individual x86 CPUs.
   2    Copyright (C) 1988-2019 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 Under Section 7 of GPL version 3, you are granted additional
  17 permissions described in the GCC Runtime Library Exception, version
  18 3.1, as published by the Free Software Foundation.
  19
  20 You should have received a copy of the GNU General Public License and
  21 a copy of the GCC Runtime Library Exception along with this program;
  22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 <http://www.gnu.org/licenses/>.  */
  24 /* Processor costs (relative to an add) */
  25 /* COSTS_N_BYTES (N) expresses a size of N bytes on the COSTS_N_INSNS scale.
        We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes,
        so COSTS_N_BYTES (2) equals COSTS_N_INSNS (1).  */
  26 #define COSTS_N_BYTES(N) ((N) * 2)
  27
     /* Placeholder stringop_algs initializer in which every entry names libcall.
        The i386, i486 and pentium memcpy/memset tables below use it for their
        second slot.  */
  28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
  29
  30 static stringop_algs ix86_size_memcpy[2] = {
       /* memcpy algorithm table referenced by ix86_size_cost; both slots
          select rep_prefix_1_byte.
          NOTE(review): each inner triple is presumably {max size, algorithm,
          noalign} with -1 meaning "no upper bound", and the two slots
          presumably serve two target modes -- confirm against the
          stringop_algs declaration.  */
  31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  33 static stringop_algs ix86_size_memset[2] = {
       /* memset algorithm table referenced by ix86_size_cost; both slots
          select rep_prefix_1_byte.
          NOTE(review): each inner triple is presumably {max size, algorithm,
          noalign} with -1 meaning "no upper bound" -- confirm against the
          stringop_algs declaration.  */
  34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  36
  37 const
  38 struct processor_costs ix86_size_cost = {/* costs for tuning for size.
                                                 Instruction costs in this table
                                                 are written with COSTS_N_BYTES
                                                 (byte counts), whereas the
                                                 per-CPU tables in this file use
                                                 COSTS_N_INSNS.  The initializer
                                                 is positional, so the order of
                                                 the entries must not change.  */
  39   {
  40   /* Start of register allocator costs.  integer->integer move cost is 2. */
  41   2,                                 /* cost for loading QImode using movzbl */
  42   {2, 2, 2},                            /* cost of loading integer registers
  43                                            in QImode, HImode and SImode.
  44                                            Relative to reg-reg move (2).  */
  45   {2, 2, 2},                            /* cost of storing integer registers */
  46   2,                                    /* cost of reg,reg fld/fst */
  47   {2, 2, 2},                            /* cost of loading fp registers
  48                                            in SFmode, DFmode and XFmode */
  49   {2, 2, 2},                            /* cost of storing fp registers
  50                                            in SFmode, DFmode and XFmode */
  51   3,                                    /* cost of moving MMX register */
  52   {3, 3},                               /* cost of loading MMX registers
  53                                            in SImode and DImode */
  54   {3, 3},                               /* cost of storing MMX registers
  55                                            in SImode and DImode */
  56   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  57   {3, 3, 3, 3, 3},                      /* cost of loading SSE registers
  58                                            in 32,64,128,256 and 512-bit */
  59   {3, 3, 3, 3, 3},                      /* cost of storing SSE registers
  60                                            in 32,64,128,256 and 512-bit */
  61   3, 3,                                 /* SSE->integer and integer->SSE moves */
  62   /* End of register allocator costs.  */
  63   },
  64
  65   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  66   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  67   COSTS_N_BYTES (2),                    /* variable shift costs */
  68   COSTS_N_BYTES (3),                    /* constant shift costs */
  69   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  70    COSTS_N_BYTES (3),                   /*                               HI */
  71    COSTS_N_BYTES (3),                   /*                               SI */
  72    COSTS_N_BYTES (3),                   /*                               DI */
  73    COSTS_N_BYTES (5)},                  /*                            other */
  74   0,                                    /* cost of multiply per each bit set */
  75   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  76    COSTS_N_BYTES (3),                   /*                          HI */
  77    COSTS_N_BYTES (3),                   /*                          SI */
  78    COSTS_N_BYTES (3),                   /*                          DI */
  79    COSTS_N_BYTES (5)},                  /*                          other */
  80   COSTS_N_BYTES (3),                    /* cost of movsx */
  81   COSTS_N_BYTES (3),                    /* cost of movzx */
  82   0,                                    /* "large" insn */
  83   2,                                    /* MOVE_RATIO */
  84   2,                                    /* CLEAR_RATIO */
  85   {2, 2, 2},                            /* cost of loading integer registers
  86                                            in QImode, HImode and SImode.
  87                                            Relative to reg-reg move (2).  */
  88   {2, 2, 2},                            /* cost of storing integer registers */
  89   {3, 3, 3, 3, 3},                      /* cost of loading SSE register
  90                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  91   {3, 3, 3, 3, 3},                      /* cost of storing SSE register
  92                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  93   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE load.
  94                                            NOTE(review): five entries are given;
                                                presumably 32bit, 64bit, 128bit,
                                                256bit and 512bit like the tables
                                                above -- confirm.  */
  95   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE store.
  96                                            NOTE(review): five entries are given;
                                                presumably 32bit, 64bit, 128bit,
                                                256bit and 512bit like the tables
                                                above -- confirm.  */
  97   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  98   3,                                    /* cost of moving SSE register to integer.  */
  99   5, 0,                                 /* Gather load static, per_elt.  */
 100   5, 0,                                 /* Gather store static, per_elt.  */
 101   0,                                    /* size of l1 cache  */
 102   0,                                    /* size of l2 cache  */
 103   0,                                    /* size of prefetch block */
 104   0,                                    /* number of parallel prefetches */
 105   2,                                    /* Branch cost */
 106   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
 107   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
 108   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
 109   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
 110   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
 111   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
 112
 113   COSTS_N_BYTES (2),                    /* cost of cheap SSE instruction.  */
 114   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 115   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
 116   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
 117   COSTS_N_BYTES (2),                    /* cost of FMA SS instruction.  */
 118   COSTS_N_BYTES (2),                    /* cost of FMA SD instruction.  */
 119   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
 120   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
 121   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
 122   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
 123   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 124   ix86_size_memcpy,                     /* stringop algorithms for memcpy.  */
 125   ix86_size_memset,                     /* stringop algorithms for memset.  */
 126   COSTS_N_BYTES (1),                    /* cond_taken_branch_cost.  */
 127   COSTS_N_BYTES (1),                    /* cond_not_taken_branch_cost.  */
 128   NULL,                                 /* Loop alignment.  */
 129   NULL,                                 /* Jump alignment.  */
 130   NULL,                                 /* Label alignment.  */
 131   NULL,                                 /* Func alignment.  */
 132 };
 133
 134 /* Processor costs (relative to an add) */
 135 static stringop_algs i386_memcpy[2] = {
       /* memcpy algorithm table referenced by i386_cost: the first slot selects
          rep_prefix_1_byte, the second is the all-libcall placeholder.
          NOTE(review): the two slots presumably serve two target modes --
          confirm against the stringop_algs users.  */
 136   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 137   DUMMY_STRINGOP_ALGS};
 138 static stringop_algs i386_memset[2] = {
       /* memset algorithm table referenced by i386_cost: the first slot selects
          rep_prefix_1_byte, the second is the all-libcall placeholder.  */
 139   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 140   DUMMY_STRINGOP_ALGS};
 141
 142 static const
 143 struct processor_costs i386_cost = {    /* 386 specific costs.  Instruction
                                                costs are written with
                                                COSTS_N_INSNS.  The initializer is
                                                positional, so the order of the
                                                entries must not change.  */
 144   {
 145   /* Start of register allocator costs.  integer->integer move cost is 2. */
 146   4,                                 /* cost for loading QImode using movzbl */
 147   {2, 4, 2},                            /* cost of loading integer registers
 148                                            in QImode, HImode and SImode.
 149                                            Relative to reg-reg move (2).  */
 150   {2, 4, 2},                            /* cost of storing integer registers */
 151   2,                                    /* cost of reg,reg fld/fst */
 152   {8, 8, 8},                            /* cost of loading fp registers
 153                                            in SFmode, DFmode and XFmode */
 154   {8, 8, 8},                            /* cost of storing fp registers
 155                                            in SFmode, DFmode and XFmode */
 156   2,                                    /* cost of moving MMX register */
 157   {4, 8},                               /* cost of loading MMX registers
 158                                            in SImode and DImode */
 159   {4, 8},                               /* cost of storing MMX registers
 160                                            in SImode and DImode */
 161   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 162   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 163                                            in 32,64,128,256 and 512-bit */
 164   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 165                                            in 32,64,128,256 and 512-bit */
 166   3, 3,                                 /* SSE->integer and integer->SSE moves */
 167   /* End of register allocator costs.  */
 168   },
 169
 170   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 171   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 172   COSTS_N_INSNS (3),                    /* variable shift costs */
 173   COSTS_N_INSNS (2),                    /* constant shift costs */
 174   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 175    COSTS_N_INSNS (6),                   /*                               HI */
 176    COSTS_N_INSNS (6),                   /*                               SI */
 177    COSTS_N_INSNS (6),                   /*                               DI */
 178    COSTS_N_INSNS (6)},                  /*                            other */
 179   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 180   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 181    COSTS_N_INSNS (23),                  /*                          HI */
 182    COSTS_N_INSNS (23),                  /*                          SI */
 183    COSTS_N_INSNS (23),                  /*                          DI */
 184    COSTS_N_INSNS (23)},                 /*                          other */
 185   COSTS_N_INSNS (3),                    /* cost of movsx */
 186   COSTS_N_INSNS (2),                    /* cost of movzx */
 187   15,                                   /* "large" insn */
 188   3,                                    /* MOVE_RATIO */
 189   3,                                    /* CLEAR_RATIO */
 190   {2, 4, 2},                            /* cost of loading integer registers
 191                                            in QImode, HImode and SImode.
 192                                            Relative to reg-reg move (2).  */
 193   {2, 4, 2},                            /* cost of storing integer registers */
 194   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 195                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 196   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 197                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 198   {4, 8, 16, 32, 64},                   /* cost of unaligned loads; five entries,
                                                presumably the same widths as the
                                                SSE load table above -- confirm.  */
 199   {4, 8, 16, 32, 64},                   /* cost of unaligned stores; five entries,
                                                presumably the same widths as the
                                                SSE store table above -- confirm.  */
 200   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 201   3,                                    /* cost of moving SSE register to integer.  */
 202   4, 4,                                 /* Gather load static, per_elt.  */
 203   4, 4,                                 /* Gather store static, per_elt.  */
 204   0,                                    /* size of l1 cache  */
 205   0,                                    /* size of l2 cache  */
 206   0,                                    /* size of prefetch block */
 207   0,                                    /* number of parallel prefetches */
 208   1,                                    /* Branch cost */
 209   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 210   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 211   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 212   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 213   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 214   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 215
 216   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 217   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 218   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 219   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 220   COSTS_N_INSNS (27),                   /* cost of FMA SS instruction.  */
 221   COSTS_N_INSNS (27),                   /* cost of FMA SD instruction.  */
 222   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 223   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 224   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 225   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 226   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 227   i386_memcpy,                          /* stringop algorithms for memcpy.  */
 228   i386_memset,                          /* stringop algorithms for memset.  */
 229   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 230   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 231   "4",                                  /* Loop alignment.  */
 232   "4",                                  /* Jump alignment.  */
 233   NULL,                                 /* Label alignment.  */
 234   "4",                                  /* Func alignment.  */
 235 };
 236
 237 static stringop_algs i486_memcpy[2] = {
       /* memcpy algorithm table referenced by i486_cost: the first slot selects
          rep_prefix_4_byte, the second is the all-libcall placeholder.  */
 238   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 239   DUMMY_STRINGOP_ALGS};
 240 static stringop_algs i486_memset[2] = {
       /* memset algorithm table referenced by i486_cost: the first slot selects
          rep_prefix_4_byte, the second is the all-libcall placeholder.  */
 241   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 242   DUMMY_STRINGOP_ALGS};
 243
 244 static const
 245 struct processor_costs i486_cost = {    /* 486 specific costs.  Instruction
                                                costs are written with
                                                COSTS_N_INSNS.  The initializer is
                                                positional, so the order of the
                                                entries must not change.  */
 246   {
 247   /* Start of register allocator costs.  integer->integer move cost is 2. */
 248   4,                                 /* cost for loading QImode using movzbl */
 249   {2, 4, 2},                            /* cost of loading integer registers
 250                                            in QImode, HImode and SImode.
 251                                            Relative to reg-reg move (2).  */
 252   {2, 4, 2},                            /* cost of storing integer registers */
 253   2,                                    /* cost of reg,reg fld/fst */
 254   {8, 8, 8},                            /* cost of loading fp registers
 255                                            in SFmode, DFmode and XFmode */
 256   {8, 8, 8},                            /* cost of storing fp registers
 257                                            in SFmode, DFmode and XFmode */
 258   2,                                    /* cost of moving MMX register */
 259   {4, 8},                               /* cost of loading MMX registers
 260                                            in SImode and DImode */
 261   {4, 8},                               /* cost of storing MMX registers
 262                                            in SImode and DImode */
 263   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 264   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 265                                            in 32,64,128,256 and 512-bit */
 266   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 267                                            in 32,64,128,256 and 512-bit */
 268   3, 3,                                 /* SSE->integer and integer->SSE moves */
 269   /* End of register allocator costs.  */
 270   },
 271
 272   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 273   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 274   COSTS_N_INSNS (3),                    /* variable shift costs */
 275   COSTS_N_INSNS (2),                    /* constant shift costs */
 276   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 277    COSTS_N_INSNS (12),                  /*                               HI */
 278    COSTS_N_INSNS (12),                  /*                               SI */
 279    COSTS_N_INSNS (12),                  /*                               DI */
 280    COSTS_N_INSNS (12)},                 /*                            other */
 281   1,                                    /* cost of multiply per each bit set.
                                                NOTE(review): a bare 1 here, while
                                                i386_cost writes COSTS_N_INSNS (1)
                                                for this entry -- confirm which
                                                unit is intended.  */
 282   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 283    COSTS_N_INSNS (40),                  /*                          HI */
 284    COSTS_N_INSNS (40),                  /*                          SI */
 285    COSTS_N_INSNS (40),                  /*                          DI */
 286    COSTS_N_INSNS (40)},                 /*                          other */
 287   COSTS_N_INSNS (3),                    /* cost of movsx */
 288   COSTS_N_INSNS (2),                    /* cost of movzx */
 289   15,                                   /* "large" insn */
 290   3,                                    /* MOVE_RATIO */
 291   3,                                    /* CLEAR_RATIO */
 292   {2, 4, 2},                            /* cost of loading integer registers
 293                                            in QImode, HImode and SImode.
 294                                            Relative to reg-reg move (2).  */
 295   {2, 4, 2},                            /* cost of storing integer registers */
 296   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 297                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 298   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 299                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 300   {4, 8, 16, 32, 64},                   /* cost of unaligned loads; five entries,
                                                presumably the same widths as the
                                                SSE load table above -- confirm.  */
 301   {4, 8, 16, 32, 64},                   /* cost of unaligned stores; five entries,
                                                presumably the same widths as the
                                                SSE store table above -- confirm.  */
 302   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 303   3,                                    /* cost of moving SSE register to integer.  */
 304   4, 4,                                 /* Gather load static, per_elt.  */
 305   4, 4,                                 /* Gather store static, per_elt.  */
 306   4,                                    /* size of l1 cache.  486 has 8kB cache
 307                                            shared for code and data, so 4kB is
 308                                            not really precise.  */
 309   4,                                    /* size of l2 cache  */
 310   0,                                    /* size of prefetch block */
 311   0,                                    /* number of parallel prefetches */
 312   1,                                    /* Branch cost */
 313   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 314   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 315   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 316   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 317   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 318   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 319
 320   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 321   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 322   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 323   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 324   COSTS_N_INSNS (16),                   /* cost of FMA SS instruction.  */
 325   COSTS_N_INSNS (16),                   /* cost of FMA SD instruction.  */
 326   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 327   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.
                                                NOTE(review): 74, one more than the
                                                DIVSS and FDIV entries (73) in this
                                                table -- confirm intentional.  */
 328   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 329   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 330   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 331   i486_memcpy,                          /* stringop algorithms for memcpy.  */
 332   i486_memset,                          /* stringop algorithms for memset.  */
 333   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 334   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 335   "16",                                 /* Loop alignment.  */
 336   "16",                                 /* Jump alignment.  */
 337   "0:0:8",                              /* Label alignment.  */
 338   "16",                                 /* Func alignment.  */
 339 };
 340
 341 static stringop_algs pentium_memcpy[2] = {
       /* memcpy algorithm table referenced by pentium_cost.  The first slot
          lists rep_prefix_4_byte with 256 and libcall with -1; the second slot
          is the all-libcall placeholder.
          NOTE(review): 256 and -1 look like upper size bounds (with -1 meaning
          "no bound"), i.e. rep_prefix_4_byte up to 256 bytes and libcall
          beyond -- confirm against the stringop_algs declaration.  */
 342   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 343   DUMMY_STRINGOP_ALGS};
 344 static stringop_algs pentium_memset[2] = {
       /* memset algorithm table referenced by pentium_cost.  The first slot
          names libcall as its leading member and rep_prefix_4_byte in its
          single -1 entry; the second slot is the all-libcall placeholder.  */
 345   {libcall, {{-1, rep_prefix_4_byte, false}}},
 346   DUMMY_STRINGOP_ALGS};
 347
 348 static const
 349 struct processor_costs pentium_cost = { /* Pentium cost table.  Instruction
                                                costs are written with
                                                COSTS_N_INSNS.  The initializer is
                                                positional, so the order of the
                                                entries must not change.  */
 350   {
 351   /* Start of register allocator costs.  integer->integer move cost is 2. */
 352   6,                                 /* cost for loading QImode using movzbl */
 353   {2, 4, 2},                            /* cost of loading integer registers
 354                                            in QImode, HImode and SImode.
 355                                            Relative to reg-reg move (2).  */
 356   {2, 4, 2},                            /* cost of storing integer registers */
 357   2,                                    /* cost of reg,reg fld/fst */
 358   {2, 2, 6},                            /* cost of loading fp registers
 359                                            in SFmode, DFmode and XFmode */
 360   {4, 4, 6},                            /* cost of storing fp registers
 361                                            in SFmode, DFmode and XFmode */
 362   8,                                    /* cost of moving MMX register */
 363   {8, 8},                               /* cost of loading MMX registers
 364                                            in SImode and DImode */
 365   {8, 8},                               /* cost of storing MMX registers
 366                                            in SImode and DImode */
 367   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 368   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 369                                            in 32,64,128,256 and 512-bit */
 370   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 371                                            in 32,64,128,256 and 512-bit */
 372   3, 3,                                 /* SSE->integer and integer->SSE moves */
 373   /* End of register allocator costs.  */
 374   },
 375
 376   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 377   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 378   COSTS_N_INSNS (4),                    /* variable shift costs */
 379   COSTS_N_INSNS (1),                    /* constant shift costs */
 380   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 381    COSTS_N_INSNS (11),                  /*                               HI */
 382    COSTS_N_INSNS (11),                  /*                               SI */
 383    COSTS_N_INSNS (11),                  /*                               DI */
 384    COSTS_N_INSNS (11)},                 /*                            other */
 385   0,                                    /* cost of multiply per each bit set */
 386   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 387    COSTS_N_INSNS (25),                  /*                          HI */
 388    COSTS_N_INSNS (25),                  /*                          SI */
 389    COSTS_N_INSNS (25),                  /*                          DI */
 390    COSTS_N_INSNS (25)},                 /*                          other */
 391   COSTS_N_INSNS (3),                    /* cost of movsx */
 392   COSTS_N_INSNS (2),                    /* cost of movzx */
 393   8,                                    /* "large" insn */
 394   6,                                    /* MOVE_RATIO */
 395   6,                                    /* CLEAR_RATIO */
 396   {2, 4, 2},                            /* cost of loading integer registers
 397                                            in QImode, HImode and SImode.
 398                                            Relative to reg-reg move (2).  */
 399   {2, 4, 2},                            /* cost of storing integer registers */
 400   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 401                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 402   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 403                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 404   {4, 8, 16, 32, 64},                   /* cost of unaligned loads; five entries,
                                                presumably the same widths as the
                                                SSE load table above -- confirm.  */
 405   {4, 8, 16, 32, 64},                   /* cost of unaligned stores; five entries,
                                                presumably the same widths as the
                                                SSE store table above -- confirm.  */
 406   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 407   3,                                    /* cost of moving SSE register to integer.  */
 408   4, 4,                                 /* Gather load static, per_elt.  */
 409   4, 4,                                 /* Gather store static, per_elt.  */
 410   8,                                    /* size of l1 cache.  */
 411   8,                                    /* size of l2 cache  */
 412   0,                                    /* size of prefetch block */
 413   0,                                    /* number of parallel prefetches */
 414   2,                                    /* Branch cost */
 415   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 416   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 417   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 418   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 419   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 420   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 421
 422   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 423   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 424   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 425   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 426   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
 427   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
 428   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 429   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 430   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 431   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 432   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 433   pentium_memcpy,                       /* stringop algorithms for memcpy.  */
 434   pentium_memset,                       /* stringop algorithms for memset.  */
 435   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 436   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 437   "16:8:8",                             /* Loop alignment.  NOTE(review): the
                                                colon-separated string presumably
                                                follows the -falign-loops=n:m:n2
                                                option syntax -- confirm.  */
 438   "16:8:8",                             /* Jump alignment.  */
 439   "0:0:8",                              /* Label alignment.  */
 440   "16",                                 /* Func alignment.  */
 441 };
 442
 443 static const
 444 struct processor_costs lakemont_cost = {  /* costs for tuning for Lakemont */
 445   {
 446   /* Start of register allocator costs.  integer->integer move cost is 2.  */
 447   6,                                 /* cost for loading QImode using movzbl */
 448   {2, 4, 2},                            /* cost of loading integer registers
 449                                            in QImode, HImode and SImode.
 450                                            Relative to reg-reg move (2).  */
 451   {2, 4, 2},                            /* cost of storing integer registers */
 452   2,                                    /* cost of reg,reg fld/fst */
 453   {2, 2, 6},                            /* cost of loading fp registers
 454                                            in SFmode, DFmode and XFmode */
 455   {4, 4, 6},                            /* cost of storing fp registers
 456                                            in SFmode, DFmode and XFmode */
 457   8,                                    /* cost of moving MMX register */
 458   {8, 8},                               /* cost of loading MMX registers
 459                                            in SImode and DImode */
 460   {8, 8},                               /* cost of storing MMX registers
 461                                            in SImode and DImode */
 462   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 463   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 464                                            in 32,64,128,256 and 512-bit */
 465   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 466                                            in 32,64,128,256 and 512-bit */
 467   3, 3,                                 /* SSE->integer and integer->SSE moves */
 468   /* End of register allocator costs.  */
 469   },
 470
 471   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 472   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction: one unit above an add */
 473   COSTS_N_INSNS (1),                    /* variable shift costs */
 474   COSTS_N_INSNS (1),                    /* constant shift costs */
 475   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 476    COSTS_N_INSNS (11),                  /*                               HI */
 477    COSTS_N_INSNS (11),                  /*                               SI */
 478    COSTS_N_INSNS (11),                  /*                               DI */
 479    COSTS_N_INSNS (11)},                 /*                            other */
 480   0,                                    /* cost of multiply per each bit set */
 481   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 482    COSTS_N_INSNS (25),                  /*                          HI */
 483    COSTS_N_INSNS (25),                  /*                          SI */
 484    COSTS_N_INSNS (25),                  /*                          DI */
 485    COSTS_N_INSNS (25)},                 /*                          other */
 486   COSTS_N_INSNS (3),                    /* cost of movsx */
 487   COSTS_N_INSNS (2),                    /* cost of movzx */
 488   8,                                    /* "large" insn */
 489   17,                                   /* MOVE_RATIO */
 490   6,                                    /* CLEAR_RATIO */
 491   {2, 4, 2},                            /* cost of loading integer registers
 492                                            in QImode, HImode and SImode.
 493                                            Relative to reg-reg move (2).  */
 494   {2, 4, 2},                            /* cost of storing integer registers */
 495   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 496                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 497   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 498                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 499   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 500   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 501   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 502   3,                                    /* cost of moving SSE register to integer.  */
 503   4, 4,                                 /* Gather load static, per_elt.  */
 504   4, 4,                                 /* Gather store static, per_elt.  */
 505   8,                                    /* size of l1 cache.  */
 506   8,                                    /* size of l2 cache.  */
 507   0,                                    /* size of prefetch block */
 508   0,                                    /* number of parallel prefetches */
 509   2,                                    /* Branch cost */
 510   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 511   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 512   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 513   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 514   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 515   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 516
 517   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 518   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 519   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 520   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 521   COSTS_N_INSNS (10),                   /* cost of FMA SS instruction.  */
 522   COSTS_N_INSNS (10),                   /* cost of FMA SD instruction.  */
 523   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 524   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 525   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 526   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 527   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 528   pentium_memcpy,                       /* memcpy strategies; reuses the pentium table.  */
 529   pentium_memset,                       /* memset strategies; reuses the pentium table.  */
 530   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 531   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 532   "16:8:8",                             /* Loop alignment.  */
 533   "16:8:8",                             /* Jump alignment.  */
 534   "0:0:8",                              /* Label alignment.  */
 535   "16",                                 /* Func alignment.  */
 536 };
 537
 538 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
 539    (we ensure the alignment).  For small blocks an inline loop is still a
 540    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 541    way to go.  Rep movsb apparently has a more expensive startup time in the
 542    CPU, but after 4K the difference is down in the noise.  */
 543 static stringop_algs pentiumpro_memcpy[2] = {
 544   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 545                        {8192, rep_prefix_4_byte, false},
 546                        {-1, rep_prefix_1_byte, false}}},
 547   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder */
 548 static stringop_algs pentiumpro_memset[2] = {  /* memset strategies for PentiumPro */
 549   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 550                        {8192, rep_prefix_4_byte, false},
 551                        {-1, libcall, false}}},
 552   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder */
 553 static const
 554 struct processor_costs pentiumpro_cost = {  /* costs for tuning for PentiumPro */
 555   {
 556   /* Start of register allocator costs.  integer->integer move cost is 2.  */
 557   2,                                 /* cost for loading QImode using movzbl */
 558   {4, 4, 4},                            /* cost of loading integer registers
 559                                            in QImode, HImode and SImode.
 560                                            Relative to reg-reg move (2).  */
 561   {2, 2, 2},                            /* cost of storing integer registers */
 562   2,                                    /* cost of reg,reg fld/fst */
 563   {2, 2, 6},                            /* cost of loading fp registers
 564                                            in SFmode, DFmode and XFmode */
 565   {4, 4, 6},                            /* cost of storing fp registers
 566                                            in SFmode, DFmode and XFmode */
 567   2,                                    /* cost of moving MMX register */
 568   {2, 2},                               /* cost of loading MMX registers
 569                                            in SImode and DImode */
 570   {2, 2},                               /* cost of storing MMX registers
 571                                            in SImode and DImode */
 572   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 573   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 574                                            in 32,64,128,256 and 512-bit */
 575   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 576                                            in 32,64,128,256 and 512-bit */
 577   3, 3,                                 /* SSE->integer and integer->SSE moves */
 578   /* End of register allocator costs.  */
 579   },
 580
 581   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 582   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 583   COSTS_N_INSNS (1),                    /* variable shift costs */
 584   COSTS_N_INSNS (1),                    /* constant shift costs */
 585   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 586    COSTS_N_INSNS (4),                   /*                               HI */
 587    COSTS_N_INSNS (4),                   /*                               SI */
 588    COSTS_N_INSNS (4),                   /*                               DI */
 589    COSTS_N_INSNS (4)},                  /*                            other */
 590   0,                                    /* cost of multiply per each bit set */
 591   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 592    COSTS_N_INSNS (17),                  /*                          HI */
 593    COSTS_N_INSNS (17),                  /*                          SI */
 594    COSTS_N_INSNS (17),                  /*                          DI */
 595    COSTS_N_INSNS (17)},                 /*                          other */
 596   COSTS_N_INSNS (1),                    /* cost of movsx */
 597   COSTS_N_INSNS (1),                    /* cost of movzx */
 598   8,                                    /* "large" insn */
 599   6,                                    /* MOVE_RATIO */
 600   6,                                    /* CLEAR_RATIO */
 601   {4, 4, 4},                            /* cost of loading integer registers
 602                                            in QImode, HImode and SImode.
 603                                            Relative to reg-reg move (2).  */
 604   {2, 2, 2},                            /* cost of storing integer registers */
 605   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 606                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 607   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 608                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 609   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 610   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 611   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 612   3,                                    /* cost of moving SSE register to integer.  */
 613   4, 4,                                 /* Gather load static, per_elt.  */
 614   4, 4,                                 /* Gather store static, per_elt.  */
 615   8,                                    /* size of l1 cache.  */
 616   256,                                  /* size of l2 cache.  */
 617   32,                                   /* size of prefetch block */
 618   6,                                    /* number of parallel prefetches */
 619   2,                                    /* Branch cost */
 620   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 621   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 622   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 623   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 624   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 625   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 626
 627   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 628   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 629   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 630   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 631   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
 632   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
 633   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 634   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 635   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 636   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 637   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 638   pentiumpro_memcpy,                    /* memcpy strategies (table defined above).  */
 639   pentiumpro_memset,                    /* memset strategies (table defined above).  */
 640   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 641   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 642   "16",                                 /* Loop alignment.  */
 643   "16:11:8",                            /* Jump alignment.  */
 644   "0:0:8",                              /* Label alignment.  */
 645   "16",                                 /* Func alignment.  */
 646 };
 647
 648 static stringop_algs geode_memcpy[2] = {  /* memcpy strategies for Geode */
 649   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 650   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder */
 651 static stringop_algs geode_memset[2] = {  /* memset strategies for Geode */
 652   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 653   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder */
 654 static const
 655 struct processor_costs geode_cost = {  /* costs for tuning for Geode */
 656   {
 657   /* Start of register allocator costs.  integer->integer move cost is 2.  */
 658   2,                                 /* cost for loading QImode using movzbl */
 659   {2, 2, 2},                            /* cost of loading integer registers
 660                                            in QImode, HImode and SImode.
 661                                            Relative to reg-reg move (2).  */
 662   {2, 2, 2},                            /* cost of storing integer registers */
 663   2,                                    /* cost of reg,reg fld/fst */
 664   {2, 2, 2},                            /* cost of loading fp registers
 665                                            in SFmode, DFmode and XFmode */
 666   {4, 6, 6},                            /* cost of storing fp registers
 667                                            in SFmode, DFmode and XFmode */
 668   2,                                    /* cost of moving MMX register */
 669   {2, 2},                               /* cost of loading MMX registers
 670                                            in SImode and DImode */
 671   {2, 2},                               /* cost of storing MMX registers
 672                                            in SImode and DImode */
 673   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 674   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 675                                            in 32,64,128,256 and 512-bit */
 676   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 677                                            in 32,64,128,256 and 512-bit */
 678   6, 6,                                 /* SSE->integer and integer->SSE moves */
 679   /* End of register allocator costs.  */
 680   },
 681
 682   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 683   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 684   COSTS_N_INSNS (2),                    /* variable shift costs */
 685   COSTS_N_INSNS (1),                    /* constant shift costs */
 686   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 687    COSTS_N_INSNS (4),                   /*                               HI */
 688    COSTS_N_INSNS (7),                   /*                               SI */
 689    COSTS_N_INSNS (7),                   /*                               DI */
 690    COSTS_N_INSNS (7)},                  /*                            other */
 691   0,                                    /* cost of multiply per each bit set */
 692   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 693    COSTS_N_INSNS (23),                  /*                          HI */
 694    COSTS_N_INSNS (39),                  /*                          SI */
 695    COSTS_N_INSNS (39),                  /*                          DI */
 696    COSTS_N_INSNS (39)},                 /*                          other */
 697   COSTS_N_INSNS (1),                    /* cost of movsx */
 698   COSTS_N_INSNS (1),                    /* cost of movzx */
 699   8,                                    /* "large" insn */
 700   4,                                    /* MOVE_RATIO */
 701   4,                                    /* CLEAR_RATIO */
 702   {2, 2, 2},                            /* cost of loading integer registers
 703                                            in QImode, HImode and SImode.
 704                                            Relative to reg-reg move (2).  */
 705   {2, 2, 2},                            /* cost of storing integer registers */
 706   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 707                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 708   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 709                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 710   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 711   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 712   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 713   6,                                    /* cost of moving SSE register to integer.  */
 714   2, 2,                                 /* Gather load static, per_elt.  */
 715   2, 2,                                 /* Gather store static, per_elt.  */
 716   64,                                   /* size of l1 cache.  */
 717   128,                                  /* size of l2 cache.  */
 718   32,                                   /* size of prefetch block */
 719   1,                                    /* number of parallel prefetches */
 720   1,                                    /* Branch cost */
 721   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 722   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 723   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 724   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 725   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 726   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 727
 728   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 729   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 730   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 731   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 732   COSTS_N_INSNS (17),                   /* cost of FMA SS instruction.  */
 733   COSTS_N_INSNS (17),                   /* cost of FMA SD instruction.  */
 734   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 735   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 736   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 737   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 738   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 739   geode_memcpy,                         /* memcpy strategies (table defined above).  */
 740   geode_memset,                         /* memset strategies (table defined above).  */
 741   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 742   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 743   NULL,                                 /* Loop alignment (NULL: none given here).  */
 744   NULL,                                 /* Jump alignment (NULL: none given here).  */
 745   NULL,                                 /* Label alignment (NULL: none given here).  */
 746   NULL,                                 /* Func alignment (NULL: none given here).  */
 747 };
 748
 749 static stringop_algs k6_memcpy[2] = {  /* memcpy strategies for K6 */
 750   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 751   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder */
 752 static stringop_algs k6_memset[2] = {  /* memset strategies for K6 */
 753   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 754   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder */
 755 static const
 756 struct processor_costs k6_cost = {  /* costs for tuning for K6 */
 757   {
 758   /* Start of register allocator costs.  integer->integer move cost is 2.  */
 759   3,                                 /* cost for loading QImode using movzbl */
 760   {4, 5, 4},                            /* cost of loading integer registers
 761                                            in QImode, HImode and SImode.
 762                                            Relative to reg-reg move (2).  */
 763   {2, 3, 2},                            /* cost of storing integer registers */
 764   4,                                    /* cost of reg,reg fld/fst */
 765   {6, 6, 6},                            /* cost of loading fp registers
 766                                            in SFmode, DFmode and XFmode */
 767   {4, 4, 4},                            /* cost of storing fp registers
 768                                            in SFmode, DFmode and XFmode */
 769   2,                                    /* cost of moving MMX register */
 770   {2, 2},                               /* cost of loading MMX registers
 771                                            in SImode and DImode */
 772   {2, 2},                               /* cost of storing MMX registers
 773                                            in SImode and DImode */
 774   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 775   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 776                                            in 32,64,128,256 and 512-bit */
 777   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 778                                            in 32,64,128,256 and 512-bit */
 779   6, 6,                                 /* SSE->integer and integer->SSE moves */
 780   /* End of register allocator costs.  */
 781   },
 782
 783   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 784   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 785   COSTS_N_INSNS (1),                    /* variable shift costs */
 786   COSTS_N_INSNS (1),                    /* constant shift costs */
 787   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 788    COSTS_N_INSNS (3),                   /*                               HI */
 789    COSTS_N_INSNS (3),                   /*                               SI */
 790    COSTS_N_INSNS (3),                   /*                               DI */
 791    COSTS_N_INSNS (3)},                  /*                            other */
 792   0,                                    /* cost of multiply per each bit set */
 793   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 794    COSTS_N_INSNS (18),                  /*                          HI */
 795    COSTS_N_INSNS (18),                  /*                          SI */
 796    COSTS_N_INSNS (18),                  /*                          DI */
 797    COSTS_N_INSNS (18)},                 /*                          other */
 798   COSTS_N_INSNS (2),                    /* cost of movsx */
 799   COSTS_N_INSNS (2),                    /* cost of movzx */
 800   8,                                    /* "large" insn */
 801   4,                                    /* MOVE_RATIO */
 802   4,                                    /* CLEAR_RATIO */
 803   {4, 5, 4},                            /* cost of loading integer registers
 804                                            in QImode, HImode and SImode.
 805                                            Relative to reg-reg move (2).  */
 806   {2, 3, 2},                            /* cost of storing integer registers */
 807   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 808                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 809   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 810                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 811   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 812   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 813   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 814   6,                                    /* cost of moving SSE register to integer.  */
 815   2, 2,                                 /* Gather load static, per_elt.  */
 816   2, 2,                                 /* Gather store static, per_elt.  */
 817   32,                                   /* size of l1 cache.  */
 818   32,                                   /* size of l2 cache.  Some models
 819                                            have an integrated l2 cache, but
 820                                            optimizing for K6 is not important
 821                                            enough to worry about that.  */
 822   32,                                   /* size of prefetch block */
 823   1,                                    /* number of parallel prefetches */
 824   1,                                    /* Branch cost */
 825   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 826   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 827   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 828   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 829   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 830   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 831
 832   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 833   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 834   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 835   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 836   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
 837   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
 838   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 839   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 840   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 841   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 842   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 843   k6_memcpy,                            /* memcpy strategies (table defined above).  */
 844   k6_memset,                            /* memset strategies (table defined above).  */
 845   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 846   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 847   "32:8:8",                             /* Loop alignment.  */
 848   "32:8:8",                             /* Jump alignment.  */
 849   "0:0:8",                              /* Label alignment.  */
 850   "32",                                 /* Func alignment.  */
 851 };
 852
 853 /* For some reason, Athlon deals better with the REP prefix (relative to
 854    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
 855    and 128 bytes for memset.  */
 856 static stringop_algs athlon_memcpy[2] = {
 857   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 858   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder */
 859 static stringop_algs athlon_memset[2] = {  /* memset strategies for Athlon */
 860   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 861   DUMMY_STRINGOP_ALGS};                 /* second slot: libcall-only placeholder */
 862 static const
 863 struct processor_costs athlon_cost = {
 864   {
 865   /* Start of register allocator costs.  integer->integer move cost is 2. */
 866   4,                                 /* cost for loading QImode using movzbl */
 867   {3, 4, 3},                            /* cost of loading integer registers
 868                                            in QImode, HImode and SImode.
 869                                            Relative to reg-reg move (2).  */
 870   {3, 4, 3},                            /* cost of storing integer registers */
 871   4,                                    /* cost of reg,reg fld/fst */
 872   {4, 4, 12},                           /* cost of loading fp registers
 873                                            in SFmode, DFmode and XFmode */
 874   {6, 6, 8},                            /* cost of storing fp registers
 875                                            in SFmode, DFmode and XFmode */
 876   2,                                    /* cost of moving MMX register */
 877   {4, 4},                               /* cost of loading MMX registers
 878                                            in SImode and DImode */
 879   {4, 4},                               /* cost of storing MMX registers
 880                                            in SImode and DImode */
 881   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 882   {4, 4, 12, 12, 24},                   /* cost of loading SSE registers
 883                                            in 32,64,128,256 and 512-bit */
 884   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
 885                                            in 32,64,128,256 and 512-bit */
 886   5, 5,                                 /* SSE->integer and integer->SSE moves */
 887   /* End of register allocator costs.  */
 888   },
 889
 890   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 891   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 892   COSTS_N_INSNS (1),                    /* variable shift costs */
 893   COSTS_N_INSNS (1),                    /* constant shift costs */
 894   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 895    COSTS_N_INSNS (5),                   /*                               HI */
 896    COSTS_N_INSNS (5),                   /*                               SI */
 897    COSTS_N_INSNS (5),                   /*                               DI */
 898    COSTS_N_INSNS (5)},                  /*                            other */
 899   0,                                    /* cost of multiply per each bit set */
 900   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 901    COSTS_N_INSNS (26),                  /*                          HI */
 902    COSTS_N_INSNS (42),                  /*                          SI */
 903    COSTS_N_INSNS (74),                  /*                          DI */
 904    COSTS_N_INSNS (74)},                 /*                          other */
 905   COSTS_N_INSNS (1),                    /* cost of movsx */
 906   COSTS_N_INSNS (1),                    /* cost of movzx */
 907   8,                                    /* "large" insn */
 908   9,                                    /* MOVE_RATIO */
 909   6,                                    /* CLEAR_RATIO */
 910   {3, 4, 3},                            /* cost of loading integer registers
 911                                            in QImode, HImode and SImode.
 912                                            Relative to reg-reg move (2).  */
 913   {3, 4, 3},                            /* cost of storing integer registers */
 914   {4, 4, 12, 12, 24},                   /* cost of loading SSE register
 915                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 916   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
 917                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 918   {4, 4, 12, 12, 24},                   /* cost of unaligned loads.  */
 919   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
 920   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 921   5,                                    /* cost of moving SSE register to integer.  */
 922   4, 4,                                 /* Gather load static, per_elt.  */
 923   4, 4,                                 /* Gather store static, per_elt.  */
 924   64,                                   /* size of l1 cache.  */
 925   256,                                  /* size of l2 cache.  */
 926   64,                                   /* size of prefetch block */
 927   6,                                    /* number of parallel prefetches */
 928   5,                                    /* Branch cost */
 929   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 930   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 931   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
 932   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 933   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 934   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 935
 936   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 937   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 938   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 939   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 940   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 941   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 942   /* 11-16  */
 943   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 944   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
 945   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 946   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
 947   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 948   athlon_memcpy,
 949   athlon_memset,
 950   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 951   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 952   "16:8:8",                             /* Loop alignment.  */
 953   "16:8:8",                             /* Jump alignment.  */
 954   "0:0:8",                              /* Label alignment.  */
 955   "16",                                 /* Func alignment.  */
 956 };
 957
 958 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 959    small blocks it is better to use a loop.  For large blocks, libcall can
 960    do non-temporal accesses and beat inline code considerably.  */
 961 static stringop_algs k8_memcpy[2] = {  /* K8 memcpy strategy table.  */
 962   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* very small blocks: loops */
 963              {-1, rep_prefix_4_byte, false}}},  /* -1: presumably "any larger size" -- confirm */
 964   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* medium blocks: REP */
 965              {-1, libcall, false}}}};  /* large blocks: libcall */
 966 static stringop_algs k8_memset[2] = {  /* K8 memset strategy table.  */
 967   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* very small blocks: loops */
 968              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* REP, then libcall */
 969   {libcall, {{48, unrolled_loop, false},  /* very small blocks: unrolled loop */
 970              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* REP, then libcall */
 971 static const
 972 struct processor_costs k8_cost = {  /* Operation costs for K8.  */
 973   {
 974   /* Start of register allocator costs.  integer->integer move cost is 2. */
 975   4,                                 /* cost for loading QImode using movzbl */
 976   {3, 4, 3},                            /* cost of loading integer registers
 977                                            in QImode, HImode and SImode.
 978                                            Relative to reg-reg move (2).  */
 979   {3, 4, 3},                            /* cost of storing integer registers */
 980   4,                                    /* cost of reg,reg fld/fst */
 981   {4, 4, 12},                           /* cost of loading fp registers
 982                                            in SFmode, DFmode and XFmode */
 983   {6, 6, 8},                            /* cost of storing fp registers
 984                                            in SFmode, DFmode and XFmode */
 985   2,                                    /* cost of moving MMX register */
 986   {3, 3},                               /* cost of loading MMX registers
 987                                            in SImode and DImode */
 988   {4, 4},                               /* cost of storing MMX registers
 989                                            in SImode and DImode */
 990   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 991   {4, 3, 12, 12, 24},                   /* cost of loading SSE registers
 992                                            in 32,64,128,256 and 512-bit */
 993   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
 994                                            in 32,64,128,256 and 512-bit */
 995   5, 5,                                 /* SSE->integer and integer->SSE moves */
 996   /* End of register allocator costs.  */
 997   },
 998
 999   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1000   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1001   COSTS_N_INSNS (1),                    /* variable shift costs */
1002   COSTS_N_INSNS (1),                    /* constant shift costs */
1003   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1004    COSTS_N_INSNS (4),                   /*                               HI */
1005    COSTS_N_INSNS (3),                   /*                               SI */
1006    COSTS_N_INSNS (4),                   /*                               DI */
1007    COSTS_N_INSNS (5)},                  /*                            other */
1008   0,                                    /* cost of multiply per each bit set */
1009   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1010    COSTS_N_INSNS (26),                  /*                          HI */
1011    COSTS_N_INSNS (42),                  /*                          SI */
1012    COSTS_N_INSNS (74),                  /*                          DI */
1013    COSTS_N_INSNS (74)},                 /*                          other */
1014   COSTS_N_INSNS (1),                    /* cost of movsx */
1015   COSTS_N_INSNS (1),                    /* cost of movzx */
1016   8,                                    /* "large" insn */
1017   9,                                    /* MOVE_RATIO */
1018   6,                                    /* CLEAR_RATIO */
1019   {3, 4, 3},                            /* cost of loading integer registers
1020                                            in QImode, HImode and SImode.
1021                                            Relative to reg-reg move (2).  */
1022   {3, 4, 3},                            /* cost of storing integer registers */
1023   {4, 3, 12, 12, 24},                   /* cost of loading SSE register
1024                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1025   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
1026                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1027   {4, 3, 12, 12, 24},                   /* cost of unaligned loads.  */
1028   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
1029   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1030   5,                                    /* cost of moving SSE register to integer.  */
1031   4, 4,                                 /* Gather load static, per_elt.  */
1032   4, 4,                                 /* Gather store static, per_elt.  */
1033   64,                                   /* size of l1 cache.  */
1034   512,                                  /* size of l2 cache.  */
1035   64,                                   /* size of prefetch block */
1036   /* New AMD processors never drop prefetches; if they cannot be performed
1037      immediately, they are queued.  We set number of simultaneous prefetches
1038      to a large constant to reflect this (it probably is not a good idea not
1039      to limit number of prefetches at all, as their execution also takes some
1040      time).  */
1041   100,                                  /* number of parallel prefetches */
1042   3,                                    /* Branch cost */
1043   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1044   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1045   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1046   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1047   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1048   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1049
1050   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1051   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1052   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1053   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1054   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1055   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1056   /* 11-16: presumably the DIVSS latency range; the upper bound is used below.  */
1057   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1058   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1059   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1060   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1061   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1062   k8_memcpy,                            /* memcpy strategy table (defined above).  */
1063   k8_memset,                            /* memset strategy table (defined above).  */
1064   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1065   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1066   "16:8:8",                             /* Loop alignment.  */
1067   "16:8:8",                             /* Jump alignment.  */
1068   "0:0:8",                              /* Label alignment.  */
1069   "16",                                 /* Func alignment.  */
1070 };
1071
1072 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1073    very small blocks it is better to use a loop.  For large blocks, libcall can
1074    do non-temporal accesses and beat inline code considerably.  */
1075 static stringop_algs amdfam10_memcpy[2] = {  /* AMDFAM10 memcpy strategy table.  */
1076   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* very small blocks: loops */
1077              {-1, rep_prefix_4_byte, false}}},  /* -1: presumably "any larger size" -- confirm */
1078   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* medium blocks: REP */
1079              {-1, libcall, false}}}};  /* large blocks: libcall */
1080 static stringop_algs amdfam10_memset[2] = {  /* AMDFAM10 memset strategy table.  */
1081   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* very small blocks: loops */
1082              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* REP, then libcall */
1083   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},  /* unrolled loop, then REP */
1084              {-1, libcall, false}}}};  /* large blocks: libcall */
1085 struct processor_costs amdfam10_cost = {  /* Operation costs for AMDFAM10.  NOTE(review): not const, unlike k8_cost and bdver_cost -- confirm.  */
1086   {
1087   /* Start of register allocator costs.  integer->integer move cost is 2. */
1088   4,                                 /* cost for loading QImode using movzbl */
1089   {3, 4, 3},                            /* cost of loading integer registers
1090                                            in QImode, HImode and SImode.
1091                                            Relative to reg-reg move (2).  */
1092   {3, 4, 3},                            /* cost of storing integer registers */
1093   4,                                    /* cost of reg,reg fld/fst */
1094   {4, 4, 12},                           /* cost of loading fp registers
1095                                            in SFmode, DFmode and XFmode */
1096   {6, 6, 8},                            /* cost of storing fp registers
1097                                            in SFmode, DFmode and XFmode */
1098   2,                                    /* cost of moving MMX register */
1099   {3, 3},                               /* cost of loading MMX registers
1100                                            in SImode and DImode */
1101   {4, 4},                               /* cost of storing MMX registers
1102                                            in SImode and DImode */
1103   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1104   {4, 4, 3, 6, 12},                     /* cost of loading SSE registers
1105                                            in 32,64,128,256 and 512-bit */
1106   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
1107                                            in 32,64,128,256 and 512-bit */
1108   3, 3,                                 /* SSE->integer and integer->SSE moves */
1109
1110                                         /* On K8:
1111                                             MOVD reg64, xmmreg Double FSTORE 4
1112                                             MOVD reg32, xmmreg Double FSTORE 4
1113                                            On AMDFAM10:
1114                                             MOVD reg64, xmmreg Double FADD 3
1115                                                                1/1  1/1
1116                                             MOVD reg32, xmmreg Double FADD 3
1117                                                                1/1  1/1 */
1118   /* End of register allocator costs.  */
1119   },
1120
1121   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1122   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1123   COSTS_N_INSNS (1),                    /* variable shift costs */
1124   COSTS_N_INSNS (1),                    /* constant shift costs */
1125   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1126    COSTS_N_INSNS (4),                   /*                               HI */
1127    COSTS_N_INSNS (3),                   /*                               SI */
1128    COSTS_N_INSNS (4),                   /*                               DI */
1129    COSTS_N_INSNS (5)},                  /*                            other */
1130   0,                                    /* cost of multiply per each bit set */
1131   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1132    COSTS_N_INSNS (35),                  /*                          HI */
1133    COSTS_N_INSNS (51),                  /*                          SI */
1134    COSTS_N_INSNS (83),                  /*                          DI */
1135    COSTS_N_INSNS (83)},                 /*                          other */
1136   COSTS_N_INSNS (1),                    /* cost of movsx */
1137   COSTS_N_INSNS (1),                    /* cost of movzx */
1138   8,                                    /* "large" insn */
1139   9,                                    /* MOVE_RATIO */
1140   6,                                    /* CLEAR_RATIO */
1141   {3, 4, 3},                            /* cost of loading integer registers
1142                                            in QImode, HImode and SImode.
1143                                            Relative to reg-reg move (2).  */
1144   {3, 4, 3},                            /* cost of storing integer registers */
1145   {4, 4, 3, 6, 12},                     /* cost of loading SSE register
1146                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1147   {4, 4, 5, 10, 20},                    /* cost of storing SSE register
1148                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1149   {4, 4, 3, 7, 12},                     /* cost of unaligned loads.  NOTE(review): 256-bit is 7 vs. 6 aligned -- confirm.  */
1150   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
1151   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1152   3,                                    /* cost of moving SSE register to integer.  */
1153   4, 4,                                 /* Gather load static, per_elt.  */
1154   4, 4,                                 /* Gather store static, per_elt.  */
1155   64,                                   /* size of l1 cache.  */
1156   512,                                  /* size of l2 cache.  */
1157   64,                                   /* size of prefetch block */
1158   /* New AMD processors never drop prefetches; if they cannot be performed
1159      immediately, they are queued.  We set number of simultaneous prefetches
1160      to a large constant to reflect this (it probably is not a good idea not
1161      to limit number of prefetches at all, as their execution also takes some
1162      time).  */
1163   100,                                  /* number of parallel prefetches */
1164   2,                                    /* Branch cost */
1165   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1166   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1167   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1168   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1169   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1170   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1171
1172   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1173   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1174   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1175   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1176   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1177   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1178   /* 11-16: presumably the DIVSS latency range; the upper bound is used below.  */
1179   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1180   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1181   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1182   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1183   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1184   amdfam10_memcpy,                      /* memcpy strategy table (defined above).  */
1185   amdfam10_memset,                      /* memset strategy table (defined above).  */
1186   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1187   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1188   "32:25:8",                            /* Loop alignment.  */
1189   "32:8:8",                             /* Jump alignment.  */
1190   "0:0:8",                              /* Label alignment.  */
1191   "32",                                 /* Func alignment.  */
1192 };
1193
1194 /*  BDVER has an optimized REP instruction for medium-sized blocks, but for
1195     very small blocks it is better to use a loop.  For large blocks, libcall
1196     can do non-temporal accesses and beat inline code considerably.  */
1197 static stringop_algs bdver_memcpy[2] = {  /* BDVER memcpy strategy table.  */
1198   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* very small blocks: loops */
1199              {-1, rep_prefix_4_byte, false}}},  /* -1: presumably "any larger size" -- confirm */
1200   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* medium blocks: REP */
1201              {-1, libcall, false}}}};  /* large blocks: libcall */
1202 static stringop_algs bdver_memset[2] = {  /* BDVER memset strategy table.  */
1203   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* very small blocks: loops */
1204              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* REP, then libcall */
1205   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},  /* unrolled loop, then REP */
1206              {-1, libcall, false}}}};  /* large blocks: libcall */
1207
1208 const struct processor_costs bdver_cost = {  /* Operation costs for BDVER.  */
1209   {
1210   /* Start of register allocator costs.  integer->integer move cost is 2. */
1211   8,                                 /* cost for loading QImode using movzbl */
1212   {8, 8, 8},                            /* cost of loading integer registers
1213                                            in QImode, HImode and SImode.
1214                                            Relative to reg-reg move (2).  */
1215   {8, 8, 8},                            /* cost of storing integer registers */
1216   4,                                    /* cost of reg,reg fld/fst */
1217   {12, 12, 28},                         /* cost of loading fp registers
1218                                            in SFmode, DFmode and XFmode */
1219   {10, 10, 18},                         /* cost of storing fp registers
1220                                            in SFmode, DFmode and XFmode */
1221   4,                                    /* cost of moving MMX register */
1222   {12, 12},                             /* cost of loading MMX registers
1223                                            in SImode and DImode */
1224   {10, 10},                             /* cost of storing MMX registers
1225                                            in SImode and DImode */
1226   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1227   {12, 12, 10, 40, 60},                 /* cost of loading SSE registers
1228                                            in 32,64,128,256 and 512-bit */
1229   {10, 10, 10, 40, 60},                 /* cost of storing SSE registers
1230                                            in 32,64,128,256 and 512-bit */
1231   16, 20,                               /* SSE->integer and integer->SSE moves */
1232   /* End of register allocator costs.  */
1233   },
1234
1235   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1236   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1237   COSTS_N_INSNS (1),                    /* variable shift costs */
1238   COSTS_N_INSNS (1),                    /* constant shift costs */
1239   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1240    COSTS_N_INSNS (4),                   /*                               HI */
1241    COSTS_N_INSNS (4),                   /*                               SI */
1242    COSTS_N_INSNS (6),                   /*                               DI */
1243    COSTS_N_INSNS (6)},                  /*                            other */
1244   0,                                    /* cost of multiply per each bit set */
1245   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1246    COSTS_N_INSNS (35),                  /*                          HI */
1247    COSTS_N_INSNS (51),                  /*                          SI */
1248    COSTS_N_INSNS (83),                  /*                          DI */
1249    COSTS_N_INSNS (83)},                 /*                          other */
1250   COSTS_N_INSNS (1),                    /* cost of movsx */
1251   COSTS_N_INSNS (1),                    /* cost of movzx */
1252   8,                                    /* "large" insn */
1253   9,                                    /* MOVE_RATIO */
1254   6,                                    /* CLEAR_RATIO */
1255   {8, 8, 8},                            /* cost of loading integer registers
1256                                            in QImode, HImode and SImode.
1257                                            Relative to reg-reg move (2).  */
1258   {8, 8, 8},                            /* cost of storing integer registers */
1259   {12, 12, 10, 40, 60},                 /* cost of loading SSE register
1260                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1261   {10, 10, 10, 40, 60},                 /* cost of storing SSE register
1262                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1263   {12, 12, 10, 40, 60},                 /* cost of unaligned loads.  */
1264   {10, 10, 10, 40, 60},                 /* cost of unaligned stores.  */
1265   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1266   16,                                   /* cost of moving SSE register to integer.  */
1267   12, 12,                               /* Gather load static, per_elt.  */
1268   10, 10,                               /* Gather store static, per_elt.  */
1269   16,                                   /* size of l1 cache.  */
1270   2048,                                 /* size of l2 cache.  */
1271   64,                                   /* size of prefetch block */
1272   /* New AMD processors never drop prefetches; if they cannot be performed
1273      immediately, they are queued.  We set number of simultaneous prefetches
1274      to a large constant to reflect this (it probably is not a good idea not
1275      to limit number of prefetches at all, as their execution also takes some
1276      time).  */
1277   100,                                  /* number of parallel prefetches */
1278   2,                                    /* Branch cost */
1279   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1280   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1281   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1282   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1283   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1284   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1285
1286   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1287   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1288   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1289   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1290   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1291   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1292   /* 9-24: presumably the DIVSS latency range; the upper bound is used below.  */
1293   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1294   /* 9-27: presumably the DIVSD latency range; the upper bound is used below.  */
1295   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1296   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1297   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1298   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1299   bdver_memcpy,                         /* memcpy strategy table (defined above).  */
1300   bdver_memset,                         /* memset strategy table (defined above).  */
1301   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1302   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1303   "16:11:8",                            /* Loop alignment.  */
1304   "16:8:8",                             /* Jump alignment.  */
1305   "0:0:8",                              /* Label alignment.  */
1306   "11",                                 /* Func alignment.  NOTE(review): 11 is not a power of two, unlike "16"/"32" in the sibling tables -- confirm.  */
1307 };
1308
1309
1310 /*  ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1311     very small blocks it is better to use a loop.  For large blocks, libcall
1312     can do non-temporal accesses and beat inline code considerably.  */
1313 static stringop_algs znver1_memcpy[2] = {  /* ZNVER1 memcpy strategy table.  */
1314   {libcall, {{6, loop, false}, {14, unrolled_loop, false},  /* very small blocks: loops */
1315              {-1, rep_prefix_4_byte, false}}},  /* -1: presumably "any larger size" -- confirm */
1316   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},  /* medium blocks: REP */
1317              {-1, libcall, false}}}};  /* large blocks: libcall */
1318 static stringop_algs znver1_memset[2] = {  /* ZNVER1 memset strategy table.  */
1319   {libcall, {{8, loop, false}, {24, unrolled_loop, false},  /* very small blocks: loops */
1320              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* REP, then libcall */
1321   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},  /* unrolled loop, then REP */
1322              {-1, libcall, false}}}};  /* large blocks: libcall */
1323 struct processor_costs znver1_cost = {
1324   {
1325   /* Start of register allocator costs.  integer->integer move cost is 2. */
1326
1327   /* reg-reg moves are done by renaming and thus they are even cheaper than
1328      1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1329      to doubles of latencies, we do not model this correctly.  It does not
1330      seem to make practical difference to bump prices up even more.  */
1331   6,                                    /* cost for loading QImode using
1332                                            movzbl.  */
1333   {6, 6, 6},                            /* cost of loading integer registers
1334                                            in QImode, HImode and SImode.
1335                                            Relative to reg-reg move (2).  */
1336   {8, 8, 8},                            /* cost of storing integer
1337                                            registers.  */
1338   2,                                    /* cost of reg,reg fld/fst.  */
1339   {6, 6, 16},                           /* cost of loading fp registers
1340                                            in SFmode, DFmode and XFmode.  */
1341   {8, 8, 16},                           /* cost of storing fp registers
1342                                            in SFmode, DFmode and XFmode.  */
1343   2,                                    /* cost of moving MMX register.  */
1344   {6, 6},                               /* cost of loading MMX registers
1345                                            in SImode and DImode.  */
1346   {8, 8},                               /* cost of storing MMX registers
1347                                            in SImode and DImode.  */
1348   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1349   {6, 6, 6, 12, 24},                    /* cost of loading SSE registers
1350                                            in 32,64,128,256 and 512-bit.  */
1351   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
1352                                            in 32,64,128,256 and 512-bit.  */
1353   6, 6,                                 /* SSE->integer and integer->SSE moves.  */
1354   /* End of register allocator costs.  */
1355   },
1356
1357   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1358   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1359   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1360   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1361   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1362    COSTS_N_INSNS (3),                   /*                               HI.  */
1363    COSTS_N_INSNS (3),                   /*                               SI.  */
1364    COSTS_N_INSNS (3),                   /*                               DI.  */
1365    COSTS_N_INSNS (3)},                  /*                            other.  */
1366   0,                                    /* cost of multiply per each bit
1367                                             set.  */
1368    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1369       bound.  */
1370   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1371    COSTS_N_INSNS (22),                  /*                          HI.  */
1372    COSTS_N_INSNS (30),                  /*                          SI.  */
1373    COSTS_N_INSNS (45),                  /*                          DI.  */
1374    COSTS_N_INSNS (45)},                 /*                          other.  */
1375   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1376   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1377   8,                                    /* "large" insn.  */
1378   9,                                    /* MOVE_RATIO.  */
1379   6,                                    /* CLEAR_RATIO */
1380   {6, 6, 6},                            /* cost of loading integer registers
1381                                            in QImode, HImode and SImode.
1382                                            Relative to reg-reg move (2).  */
1383   {8, 8, 8},                            /* cost of storing integer
1384                                            registers.  */
1385   {6, 6, 6, 12, 24},                    /* cost of loading SSE register
1386                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1387   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
1388                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1389   {6, 6, 6, 12, 24},                    /* cost of unaligned loads.  */
1390   {8, 8, 8, 16, 32},                    /* cost of unaligned stores.  */
1391   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1392   6,                                    /* cost of moving SSE register to integer.  */
1393   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1394      throughput 12.  Approx 9 uops do not depend on vector size and every load
1395      is 7 uops.  */
1396   18, 8,                                /* Gather load static, per_elt.  */
1397   18, 10,                               /* Gather store static, per_elt.  */
1398   32,                                   /* size of l1 cache.  */
1399   512,                                  /* size of l2 cache.  */
1400   64,                                   /* size of prefetch block.  */
1401   /* New AMD processors never drop prefetches; if they cannot be performed
1402      immediately, they are queued.  We set number of simultaneous prefetches
1403      to a large constant to reflect this (it probably is not a good idea not
1404      to limit number of prefetches at all, as their execution also takes some
1405      time).  */
1406   100,                                  /* number of parallel prefetches.  */
1407   3,                                    /* Branch cost.  */
1408   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1409   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1410   /* Latency of fdiv is 8-15.  */
1411   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1412   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1413   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1414   /* Latency of fsqrt is 4-10.  */
1415   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1416
1417   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1418   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1419   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1420   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1421   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1422   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1423   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1424   /* 9-13  */
1425   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1426   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1427   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1428   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1429      and it can execute 2 integer additions and 2 multiplications, thus
1430      reassociation may make sense up to a width of 6.  SPEC2k6 benchmarks suggest
1431      that 4 works better than 6, probably due to register pressure.
1432
1433      Integer vector operations are taken by the FP unit and execute 3 vector
1434      plus/minus operations per cycle but only one multiply.  This is adjusted
1435      in ix86_reassociation_width.  */
1436   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1437   znver1_memcpy,
1438   znver1_memset,
1439   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1440   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1441   "16",                                 /* Loop alignment.  */
1442   "16",                                 /* Jump alignment.  */
1443   "0:0:8",                              /* Label alignment.  */
1444   "16",                                 /* Func alignment.  */
1445 };
1446
1447 /*  ZNVER2 has optimized REP instruction for medium sized blocks, but for
1448     very small blocks it is better to use a loop.  For large blocks, libcall
1449     can do non-temporal accesses and beat inline considerably.  */
/* memcpy strategy table for znver2.  It holds two stringop_algs entries;
   each entry is a leading algorithm (libcall here) followed by a list of
   {bound, algorithm, flag} triples whose last triple has bound -1.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants and the bound presumably a maximum block size -- confirm
   against the stringop_algs declaration, which is not visible here.  */
1450 static stringop_algs znver2_memcpy[2] = {
1451   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1452              {-1, rep_prefix_4_byte, false}}},
1453   {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
1454              {-1, libcall, false}}}};
/* memset strategy table for znver2.  It holds two stringop_algs entries;
   each entry is a leading algorithm (libcall here) followed by a list of
   {bound, algorithm, flag} triples whose last triple has bound -1.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants and the bound presumably a maximum block size -- confirm
   against the stringop_algs declaration, which is not visible here.  */
1455 static stringop_algs znver2_memset[2] = {
1456   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1457              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1458   {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
1459              {-1, libcall, false}}}};
1460
/* Cost table named for znver2 (the in-line comments refer to Zen/Ryzen).
   The initializer is positional, so its order must match the field order
   of struct processor_costs: a nested group of register allocator costs
   comes first, followed by integer instruction costs, load/store and move
   costs, gather costs, cache and prefetch parameters, x87 and SSE
   instruction costs, reassociation widths, the znver2_memcpy and
   znver2_memset strategy tables, branch costs and the alignment strings.
   NOTE(review): unlike skylake_cost and btver1_cost in this file, this
   table is not declared const -- confirm that is intended.  */
1461 struct processor_costs znver2_cost = {
1462   {
1463   /* Start of register allocator costs.  integer->integer move cost is 2. */
1464
1465   /* reg-reg moves are done by renaming and thus they are even cheaper than
1466      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1467      to doubles of latencies, we do not model this correctly.  It does not
1468      seem to make practical difference to bump prices up even more.  */
1469   6,                                    /* cost for loading QImode using
1470                                            movzbl.  */
1471   {6, 6, 6},                            /* cost of loading integer registers
1472                                            in QImode, HImode and SImode.
1473                                            Relative to reg-reg move (2).  */
1474   {8, 8, 8},                            /* cost of storing integer
1475                                            registers.  */
1476   2,                                    /* cost of reg,reg fld/fst.  */
1477   {6, 6, 16},                           /* cost of loading fp registers
1478                                            in SFmode, DFmode and XFmode.  */
1479   {8, 8, 16},                           /* cost of storing fp registers
1480                                            in SFmode, DFmode and XFmode.  */
1481   2,                                    /* cost of moving MMX register.  */
1482   {6, 6},                               /* cost of loading MMX registers
1483                                            in SImode and DImode.  */
1484   {8, 8},                               /* cost of storing MMX registers
1485                                            in SImode and DImode.  */
1486   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1487                                            register.  */
1488   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1489                                            in 32,64,128,256 and 512-bit.  */
1490   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1491                                            in 32,64,128,256 and 512-bit.  */
1492   6, 6,                                 /* SSE->integer and integer->SSE
1493                                            moves.  */
1494   /* End of register allocator costs.  */
1495   },
1496
1497   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1498   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1499   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1500   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1501   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1502    COSTS_N_INSNS (3),                   /*                               HI.  */
1503    COSTS_N_INSNS (3),                   /*                               SI.  */
1504    COSTS_N_INSNS (3),                   /*                               DI.  */
1505    COSTS_N_INSNS (3)},                  /*                            other.  */
1506   0,                                    /* cost of multiply per each bit
1507                                            set.  */
1508    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1509       bound.  */
1510   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1511    COSTS_N_INSNS (22),                  /*                          HI.  */
1512    COSTS_N_INSNS (30),                  /*                          SI.  */
1513    COSTS_N_INSNS (45),                  /*                          DI.  */
1514    COSTS_N_INSNS (45)},                 /*                          other.  */
1515   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1516   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1517   8,                                    /* "large" insn.  */
1518   9,                                    /* MOVE_RATIO.  */
1519   6,                                    /* CLEAR_RATIO */
1520   {6, 6, 6},                            /* cost of loading integer registers
1521                                            in QImode, HImode and SImode.
1522                                            Relative to reg-reg move (2).  */
1523   {8, 8, 8},                            /* cost of storing integer
1524                                            registers.  */
1525   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1526                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1527   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
1528                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1529   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
1530   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1531   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1532                                            register.  */
1533   6,                                    /* cost of moving SSE register to integer.  */
1534   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1535      throughput 12.  Approx 9 uops do not depend on vector size and every load
1536      is 7 uops.  */
1537   18, 8,                                /* Gather load static, per_elt.  */
1538   18, 10,                               /* Gather store static, per_elt.  */
1539   32,                                   /* size of l1 cache.  */
1540   512,                                  /* size of l2 cache.  */
1541   64,                                   /* size of prefetch block.  */
1542   /* New AMD processors never drop prefetches; if they cannot be performed
1543      immediately, they are queued.  We set number of simultaneous prefetches
1544      to a large constant to reflect this (it probably is not a good idea not
1545      to limit number of prefetches at all, as their execution also takes some
1546      time).  */
1547   100,                                  /* number of parallel prefetches.  */
1548   3,                                    /* Branch cost.  */
1549   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1550   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1551   /* Latency of fdiv is 8-15.  */
1552   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1553   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1554   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1555   /* Latency of fsqrt is 4-10.  */
1556   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1557
1558   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1559   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1560   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1561   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1562   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1563   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1564   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1565   /* 9-13.  */
1566   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1567   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1568   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1569   /* Zen can execute 4 integer operations per cycle.  FP operations
1570      take 3 cycles and it can execute 2 integer additions and 2
1571      multiplications, thus reassociation may make sense up to a width
1572      of 6.  SPEC2k6 benchmarks suggest that 4 works better than 6,
1573      probably due to register pressure.
1574
1575      Integer vector operations are taken by the FP unit and execute 3 vector
1576      plus/minus operations per cycle but only one multiply.  This is adjusted
1577      in ix86_reassociation_width.  */
1578   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1579   znver2_memcpy,
1580   znver2_memset,
1581   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1582   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1583   "16",                                 /* Loop alignment.  */
1584   "16",                                 /* Jump alignment.  */
1585   "0:0:8",                              /* Label alignment.  */
1586   "16",                                 /* Func alignment.  */
1587 };
1588
1589 /* skylake_cost should produce code tuned for the Skylake family of CPUs.  */
/* memcpy strategy table referenced by skylake_cost.  It holds two
   stringop_algs entries; each entry is a leading algorithm (libcall here)
   followed by a list of {bound, algorithm, flag} triples whose last triple
   has bound -1.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants and the bound presumably a maximum block size; the meaning of
   the boolean flag is defined by stringop_algs, which is not visible
   here -- confirm against its declaration.  */
1590 static stringop_algs skylake_memcpy[2] =   {
1591   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1592   {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1593              {-1, libcall, false}}}};
1594
/* memset strategy table referenced by skylake_cost.  It holds two
   stringop_algs entries; each entry is a leading algorithm (libcall here)
   followed by a list of {bound, algorithm, flag} triples whose last triple
   has bound -1.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants and the bound presumably a maximum block size; the meaning of
   the boolean flag is defined by stringop_algs, which is not visible
   here -- confirm against its declaration.  */
1595 static stringop_algs skylake_memset[2] = {
1596   {libcall, {{6, loop_1_byte, true},
1597              {24, loop, true},
1598              {8192, rep_prefix_4_byte, true},
1599              {-1, libcall, false}}},
1600   {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1601              {-1, libcall, false}}}};
1602
/* Cost table for skylake.  The initializer is positional, so its order
   must match the field order of struct processor_costs: a nested group of
   register allocator costs comes first, followed by integer instruction
   costs, load/store and move costs, gather costs, cache and prefetch
   parameters, x87 and SSE instruction costs, reassociation widths, the
   skylake_memcpy and skylake_memset strategy tables, branch costs and the
   alignment strings.  */
1603 static const
1604 struct processor_costs skylake_cost = {
1605   {
1606   /* Start of register allocator costs.  integer->integer move cost is 2. */
1607   6,                                 /* cost for loading QImode using movzbl */
1608   {4, 4, 4},                            /* cost of loading integer registers
1609                                            in QImode, HImode and SImode.
1610                                            Relative to reg-reg move (2).  */
1611   {6, 6, 6},                            /* cost of storing integer registers */
1612   2,                                    /* cost of reg,reg fld/fst */
1613   {6, 6, 8},                            /* cost of loading fp registers
1614                                            in SFmode, DFmode and XFmode */
1615   {6, 6, 10},                           /* cost of storing fp registers
1616                                            in SFmode, DFmode and XFmode */
1617   2,                                    /* cost of moving MMX register */
1618   {6, 6},                               /* cost of loading MMX registers
1619                                            in SImode and DImode */
1620   {6, 6},                               /* cost of storing MMX registers
1621                                            in SImode and DImode */
1622   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
1623   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
1624                                            in 32,64,128,256 and 512-bit */
1625   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
1626                                            in 32,64,128,256 and 512-bit */
1627   6, 6,                                 /* SSE->integer and integer->SSE moves */
1628   /* End of register allocator costs.  */
1629   },
1630
1631   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1632   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction; the +1 keeps it just above an add */
1633   COSTS_N_INSNS (1),                    /* variable shift costs */
1634   COSTS_N_INSNS (1),                    /* constant shift costs */
1635   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1636    COSTS_N_INSNS (4),                   /*                               HI */
1637    COSTS_N_INSNS (3),                   /*                               SI */
1638    COSTS_N_INSNS (3),                   /*                               DI */
1639    COSTS_N_INSNS (3)},                  /*                            other */
1640   0,                                    /* cost of multiply per each bit set */
1641   /* Expanding div/mod currently doesn't consider parallelism.  So the cost
1642      model is not realistic.  We compensate by increasing the latencies a bit.  */
1643   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
1644    COSTS_N_INSNS (11),                  /*                          HI */
1645    COSTS_N_INSNS (14),                  /*                          SI */
1646    COSTS_N_INSNS (76),                  /*                          DI */
1647    COSTS_N_INSNS (76)},                 /*                          other */
1648   COSTS_N_INSNS (1),                    /* cost of movsx */
1649   COSTS_N_INSNS (0),                    /* cost of movzx */
1650   8,                                    /* "large" insn */
1651   17,                                   /* MOVE_RATIO */
1652   6,                                    /* CLEAR_RATIO */
1653   {4, 4, 4},                            /* cost of loading integer registers
1654                                            in QImode, HImode and SImode.
1655                                            Relative to reg-reg move (2).  */
1656   {6, 6, 6},                            /* cost of storing integer registers */
1657   {6, 6, 6, 10, 20},                    /* cost of loading SSE register
1658                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1659   {8, 8, 8, 12, 24},                    /* cost of storing SSE register
1660                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1661   {6, 6, 6, 10, 20},                    /* cost of unaligned loads.  */
  /* NOTE(review): the 256-bit and 512-bit unaligned store costs below
     (8, 16) are lower than the aligned store costs above (12, 24) --
     confirm this is intended.  */
1662   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1663   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
1664   2,                                    /* cost of moving SSE register to integer.  */
1665   20, 8,                                /* Gather load static, per_elt.  */
1666   22, 10,                               /* Gather store static, per_elt.  */
1667   64,                                   /* size of l1 cache.  */
1668   512,                                  /* size of l2 cache.  */
1669   64,                                   /* size of prefetch block */
1670   6,                                    /* number of parallel prefetches */
1671   3,                                    /* Branch cost */
1672   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
1673   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1674   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1675   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1676   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1677   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
1678
1679   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1680   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1681   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1682   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1683   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
1684   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
1685   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
1686   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
1687   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
1688   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
1689   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
1690   skylake_memcpy,
1691   skylake_memset,
1692   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1693   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1694   "16:11:8",                            /* Loop alignment.  */
1695   "16:11:8",                            /* Jump alignment.  */
1696   "0:0:8",                              /* Label alignment.  */
1697   "16",                                 /* Func alignment.  */
1698 };
1699   /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1700      very small blocks it is better to use a loop.  For large blocks, libcall
1701      can do non-temporal accesses and beat inline considerably.  */
/* memcpy strategy table referenced by btver1_cost.  It holds two
   stringop_algs entries; each entry is a leading algorithm (libcall here)
   followed by a list of {bound, algorithm, flag} triples whose last triple
   has bound -1.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants and the bound presumably a maximum block size -- confirm
   against the stringop_algs declaration, which is not visible here.  */
1702 static stringop_algs btver1_memcpy[2] = {
1703   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1704              {-1, rep_prefix_4_byte, false}}},
1705   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1706              {-1, libcall, false}}}};
/* memset strategy table referenced by btver1_cost.  It holds two
   stringop_algs entries; each entry is a leading algorithm (libcall here)
   followed by a list of {bound, algorithm, flag} triples whose last triple
   has bound -1.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants and the bound presumably a maximum block size -- confirm
   against the stringop_algs declaration, which is not visible here.  */
1707 static stringop_algs btver1_memset[2] = {
1708   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1709              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1710   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1711              {-1, libcall, false}}}};
/* Cost table for btver1.  The initializer is positional, so its order
   must match the field order of struct processor_costs: a nested group of
   register allocator costs comes first, followed by integer instruction
   costs, load/store and move costs, gather costs, cache and prefetch
   parameters, x87 and SSE instruction costs, reassociation widths, the
   btver1_memcpy and btver1_memset strategy tables, branch costs and the
   alignment strings.  */
1712 const struct processor_costs btver1_cost = {
1713   {
1714   /* Start of register allocator costs.  integer->integer move cost is 2. */
1715   8,                                 /* cost for loading QImode using movzbl */
1716   {6, 8, 6},                            /* cost of loading integer registers
1717                                            in QImode, HImode and SImode.
1718                                            Relative to reg-reg move (2).  */
1719   {6, 8, 6},                            /* cost of storing integer registers */
1720   4,                                    /* cost of reg,reg fld/fst */
1721   {12, 12, 28},                         /* cost of loading fp registers
1722                                            in SFmode, DFmode and XFmode */
1723   {12, 12, 38},                         /* cost of storing fp registers
1724                                            in SFmode, DFmode and XFmode */
1725   4,                                    /* cost of moving MMX register */
1726   {10, 10},                             /* cost of loading MMX registers
1727                                            in SImode and DImode */
1728   {12, 12},                             /* cost of storing MMX registers
1729                                            in SImode and DImode */
1730   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1731   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
1732                                            in 32,64,128,256 and 512-bit */
1733   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
1734                                            in 32,64,128,256 and 512-bit */
1735   14, 14,                               /* SSE->integer and integer->SSE moves */
1736   /* End of register allocator costs.  */
1737   },
1738
1739   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1740   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1741   COSTS_N_INSNS (1),                    /* variable shift costs */
1742   COSTS_N_INSNS (1),                    /* constant shift costs */
1743   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1744    COSTS_N_INSNS (4),                   /*                               HI */
1745    COSTS_N_INSNS (3),                   /*                               SI */
1746    COSTS_N_INSNS (4),                   /*                               DI */
1747    COSTS_N_INSNS (5)},                  /*                            other */
1748   0,                                    /* cost of multiply per each bit set */
1749   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1750    COSTS_N_INSNS (35),                  /*                          HI */
1751    COSTS_N_INSNS (51),                  /*                          SI */
1752    COSTS_N_INSNS (83),                  /*                          DI */
1753    COSTS_N_INSNS (83)},                 /*                          other */
1754   COSTS_N_INSNS (1),                    /* cost of movsx */
1755   COSTS_N_INSNS (1),                    /* cost of movzx */
1756   8,                                    /* "large" insn */
1757   9,                                    /* MOVE_RATIO */
1758   6,                                    /* CLEAR_RATIO */
1759   {6, 8, 6},                            /* cost of loading integer registers
1760                                            in QImode, HImode and SImode.
1761                                            Relative to reg-reg move (2).  */
1762   {6, 8, 6},                            /* cost of storing integer registers */
1763   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
1764                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1765   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
1766                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1767   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
1768   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
1769   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1770   14,                                   /* cost of moving SSE register to integer.  */
1771   10, 10,                               /* Gather load static, per_elt.  */
1772   10, 10,                               /* Gather store static, per_elt.  */
1773   32,                                   /* size of l1 cache.  */
1774   512,                                  /* size of l2 cache.  */
1775   64,                                   /* size of prefetch block */
1776   100,                                  /* number of parallel prefetches */
1777   2,                                    /* Branch cost */
1778   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1779   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1780   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1781   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1782   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1783   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1784
1785   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1786   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1787   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1788   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1789   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1790   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1791   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1792   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
1793   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
1794   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
1795   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1796   btver1_memcpy,
1797   btver1_memset,
1798   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1799   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1800   "16:11:8",                            /* Loop alignment.  */
1801   "16:8:8",                             /* Jump alignment.  */
1802   "0:0:8",                              /* Label alignment.  */
  /* NOTE(review): the znver2 and skylake tables in this file use "16"
     for function alignment -- confirm "11" is intended here.  */
1803   "11",                                 /* Func alignment.  */
1804 };
1805
/* memcpy strategy table named for btver2.  It holds two stringop_algs
   entries; each entry is a leading algorithm (libcall here) followed by a
   list of {bound, algorithm, flag} triples whose last triple has bound -1.
   The values are the same as in btver1_memcpy.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants and the bound presumably a maximum block size -- confirm
   against the stringop_algs declaration, which is not visible here.  */
1806 static stringop_algs btver2_memcpy[2] = {
1807   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1808              {-1, rep_prefix_4_byte, false}}},
1809   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1810              {-1, libcall, false}}}};
/* memset strategy table named for btver2.  It holds two stringop_algs
   entries; each entry is a leading algorithm (libcall here) followed by a
   list of {bound, algorithm, flag} triples whose last triple has bound -1.
   The values are the same as in btver1_memset.
   NOTE(review): the two entries are presumably the 32-bit and 64-bit
   variants and the bound presumably a maximum block size -- confirm
   against the stringop_algs declaration, which is not visible here.  */
1811 static stringop_algs btver2_memset[2] = {
1812   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1813              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1814   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1815              {-1, libcall, false}}}};
1816 const struct processor_costs btver2_cost = {
1817   {
1818   /* Start of register allocator costs.  integer->integer move cost is 2. */
1819   8,                                 /* cost for loading QImode using movzbl */
1820   {8, 8, 6},                            /* cost of loading integer registers
1821                                            in QImode, HImode and SImode.
1822                                            Relative to reg-reg move (2).  */
1823   {8, 8, 6},                            /* cost of storing integer registers */
1824   4,                                    /* cost of reg,reg fld/fst */
1825   {12, 12, 28},                         /* cost of loading fp registers
1826                                            in SFmode, DFmode and XFmode */
1827   {12, 12, 38},                         /* cost of storing fp registers
1828                                            in SFmode, DFmode and XFmode */
1829   4,                                    /* cost of moving MMX register */
1830   {10, 10},                             /* cost of loading MMX registers
1831                                            in SImode and DImode */
1832   {12, 12},                             /* cost of storing MMX registers
1833                                            in SImode and DImode */
1834   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1835   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
1836                                            in 32,64,128,256 and 512-bit */
1837   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
1838                                            in 32,64,128,256 and 512-bit */
1839   14, 14,                               /* SSE->integer and integer->SSE moves */
1840   /* End of register allocator costs.  */
1841   },
1842
1843   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1844   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1845   COSTS_N_INSNS (1),                    /* variable shift costs */
1846   COSTS_N_INSNS (1),                    /* constant shift costs */
1847   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1848    COSTS_N_INSNS (4),                   /*                               HI */
1849    COSTS_N_INSNS (3),                   /*                               SI */
1850    COSTS_N_INSNS (4),                   /*                               DI */
1851    COSTS_N_INSNS (5)},                  /*                            other */
1852   0,                                    /* cost of multiply per each bit set */
1853   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1854    COSTS_N_INSNS (35),                  /*                          HI */
1855    COSTS_N_INSNS (51),                  /*                          SI */
1856    COSTS_N_INSNS (83),                  /*                          DI */
1857    COSTS_N_INSNS (83)},                 /*                          other */
1858   COSTS_N_INSNS (1),                    /* cost of movsx */
1859   COSTS_N_INSNS (1),                    /* cost of movzx */
1860   8,                                    /* "large" insn */
1861   9,                                    /* MOVE_RATIO */
1862   6,                                    /* CLEAR_RATIO */
1863   {8, 8, 6},                            /* cost of loading integer registers
1864                                            in QImode, HImode and SImode.
1865                                            Relative to reg-reg move (2).  */
1866   {8, 8, 6},                            /* cost of storing integer registers */
1867   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
1868                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1869   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
1870                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1871   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
1872   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
1873   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1874   14,                                   /* cost of moving SSE register to integer.  */
1875   10, 10,                               /* Gather load static, per_elt.  */
1876   10, 10,                               /* Gather store static, per_elt.  */
1877   32,                                   /* size of l1 cache.  */
1878   2048,                                 /* size of l2 cache.  */
1879   64,                                   /* size of prefetch block */
1880   100,                                  /* number of parallel prefetches */
1881   2,                                    /* Branch cost */
1882   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1883   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1884   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1885   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1886   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1887   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1888
1889   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1890   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1891   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1892   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1893   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1894   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1895   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1896   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
1897   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
1898   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
1899   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1900   btver2_memcpy,
1901   btver2_memset,
1902   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1903   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1904   "16:11:8",                            /* Loop alignment.  */
1905   "16:8:8",                             /* Jump alignment.  */
1906   "0:0:8",                              /* Label alignment.  */
1907   "11",                                 /* Func alignment.  */
1908 };
1909
1910 static stringop_algs pentium4_memcpy[2] = { /* memcpy strategies referenced by pentium4_cost */
1911   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1912   DUMMY_STRINGOP_ALGS};                 /* placeholder entry: libcall only */
1913 static stringop_algs pentium4_memset[2] = { /* memset strategies referenced by pentium4_cost */
1914   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1915              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1916   DUMMY_STRINGOP_ALGS};                 /* placeholder entry: libcall only */
1917
1918 static const
1919 struct processor_costs pentium4_cost = {/* costs for tuning for pentium4 */
1920   {
1921   /* Start of register allocator costs.  integer->integer move cost is 2. */
1922   5,                                    /* cost for loading QImode using movzbl */
1923   {4, 5, 4},                            /* cost of loading integer registers
1924                                            in QImode, HImode and SImode.
1925                                            Relative to reg-reg move (2).  */
1926   {2, 3, 2},                            /* cost of storing integer registers */
1927   12,                                   /* cost of reg,reg fld/fst */
1928   {14, 14, 14},                         /* cost of loading fp registers
1929                                            in SFmode, DFmode and XFmode */
1930   {14, 14, 14},                         /* cost of storing fp registers
1931                                            in SFmode, DFmode and XFmode */
1932   12,                                   /* cost of moving MMX register */
1933   {16, 16},                             /* cost of loading MMX registers
1934                                            in SImode and DImode */
1935   {16, 16},                             /* cost of storing MMX registers
1936                                            in SImode and DImode */
1937   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
1938   {16, 16, 16, 32, 64},                 /* cost of loading SSE registers
1939                                            in 32,64,128,256 and 512-bit */
1940   {16, 16, 16, 32, 64},                 /* cost of storing SSE registers
1941                                            in 32,64,128,256 and 512-bit */
1942   20, 12,                               /* SSE->integer and integer->SSE moves */
1943   /* End of register allocator costs.  */
1944   },
1945
1946   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1947   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
1948   COSTS_N_INSNS (4),                    /* variable shift costs */
1949   COSTS_N_INSNS (4),                    /* constant shift costs */
1950   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
1951    COSTS_N_INSNS (15),                  /*                               HI */
1952    COSTS_N_INSNS (15),                  /*                               SI */
1953    COSTS_N_INSNS (15),                  /*                               DI */
1954    COSTS_N_INSNS (15)},                 /*                            other */
1955   0,                                    /* cost of multiply per each bit set */
1956   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
1957    COSTS_N_INSNS (56),                  /*                          HI */
1958    COSTS_N_INSNS (56),                  /*                          SI */
1959    COSTS_N_INSNS (56),                  /*                          DI */
1960    COSTS_N_INSNS (56)},                 /*                          other */
1961   COSTS_N_INSNS (1),                    /* cost of movsx */
1962   COSTS_N_INSNS (1),                    /* cost of movzx */
1963   16,                                   /* "large" insn */
1964   6,                                    /* MOVE_RATIO */
1965   6,                                    /* CLEAR_RATIO */
1966   {4, 5, 4},                            /* cost of loading integer registers
1967                                            in QImode, HImode and SImode.
1968                                            Relative to reg-reg move (2).  */
1969   {2, 3, 2},                            /* cost of storing integer registers */
1970   {16, 16, 16, 32, 64},                 /* cost of loading SSE register
1971                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1972   {16, 16, 16, 32, 64},                 /* cost of storing SSE register
1973                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1974   {32, 32, 32, 64, 128},                /* cost of unaligned loads.  */
1975   {32, 32, 32, 64, 128},                /* cost of unaligned stores.  */
1976   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
1977   20,                                   /* cost of moving SSE register to integer.  */
1978   16, 16,                               /* Gather load static, per_elt.  */
1979   16, 16,                               /* Gather store static, per_elt.  */
1980   8,                                    /* size of l1 cache.  */
1981   256,                                  /* size of l2 cache.  */
1982   64,                                   /* size of prefetch block */
1983   6,                                    /* number of parallel prefetches */
1984   2,                                    /* Branch cost */
1985   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1986   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1987   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
1988   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1989   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1990   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
1991
1992   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1993   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1994   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1995   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1996   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1997   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1998   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
1999   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
2000   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
2001   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
2002   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2003   pentium4_memcpy,                      /* memcpy strategy table.  */
2004   pentium4_memset,                      /* memset strategy table.  */
2005   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2006   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2007   NULL,                                 /* Loop alignment (NULL: none specified).  */
2008   NULL,                                 /* Jump alignment (NULL: none specified).  */
2009   NULL,                                 /* Label alignment (NULL: none specified).  */
2010   NULL,                                 /* Func alignment (NULL: none specified).  */
2011 };
2012
2013 static stringop_algs nocona_memcpy[2] = { /* memcpy strategies referenced by nocona_cost */
2014   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2015   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2016              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2017
2018 static stringop_algs nocona_memset[2] = { /* memset strategies referenced by nocona_cost */
2019   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2020              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2021   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2022              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2023
2024 static const
2025 struct processor_costs nocona_cost = {/* costs for tuning for nocona */
2026   {
2027   /* Start of register allocator costs.  integer->integer move cost is 2. */
2028   4,                                    /* cost for loading QImode using movzbl */
2029   {4, 4, 4},                            /* cost of loading integer registers
2030                                            in QImode, HImode and SImode.
2031                                            Relative to reg-reg move (2).  */
2032   {4, 4, 4},                            /* cost of storing integer registers */
2033   12,                                   /* cost of reg,reg fld/fst */
2034   {14, 14, 14},                         /* cost of loading fp registers
2035                                            in SFmode, DFmode and XFmode */
2036   {14, 14, 14},                         /* cost of storing fp registers
2037                                            in SFmode, DFmode and XFmode */
2038   14,                                   /* cost of moving MMX register */
2039   {12, 12},                             /* cost of loading MMX registers
2040                                            in SImode and DImode */
2041   {12, 12},                             /* cost of storing MMX registers
2042                                            in SImode and DImode */
2043   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2044   {12, 12, 12, 24, 48},                 /* cost of loading SSE registers
2045                                            in 32,64,128,256 and 512-bit */
2046   {12, 12, 12, 24, 48},                 /* cost of storing SSE registers
2047                                            in 32,64,128,256 and 512-bit */
2048   20, 12,                               /* SSE->integer and integer->SSE moves */
2049   /* End of register allocator costs.  */
2050   },
2051
2052   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2053   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
2054   COSTS_N_INSNS (1),                    /* variable shift costs */
2055   COSTS_N_INSNS (1),                    /* constant shift costs */
2056   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
2057    COSTS_N_INSNS (10),                  /*                               HI */
2058    COSTS_N_INSNS (10),                  /*                               SI */
2059    COSTS_N_INSNS (10),                  /*                               DI */
2060    COSTS_N_INSNS (10)},                 /*                            other */
2061   0,                                    /* cost of multiply per each bit set */
2062   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
2063    COSTS_N_INSNS (66),                  /*                          HI */
2064    COSTS_N_INSNS (66),                  /*                          SI */
2065    COSTS_N_INSNS (66),                  /*                          DI */
2066    COSTS_N_INSNS (66)},                 /*                          other */
2067   COSTS_N_INSNS (1),                    /* cost of movsx */
2068   COSTS_N_INSNS (1),                    /* cost of movzx */
2069   16,                                   /* "large" insn */
2070   17,                                   /* MOVE_RATIO */
2071   6,                                    /* CLEAR_RATIO */
2072   {4, 4, 4},                            /* cost of loading integer registers
2073                                            in QImode, HImode and SImode.
2074                                            Relative to reg-reg move (2).  */
2075   {4, 4, 4},                            /* cost of storing integer registers */
2076   {12, 12, 12, 24, 48},                 /* cost of loading SSE register
2077                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2078   {12, 12, 12, 24, 48},                 /* cost of storing SSE register
2079                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2080   {24, 24, 24, 48, 96},                 /* cost of unaligned loads.  */
2081   {24, 24, 24, 48, 96},                 /* cost of unaligned stores.  */
2082   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2083   20,                                   /* cost of moving SSE register to integer.  */
2084   12, 12,                               /* Gather load static, per_elt.  */
2085   12, 12,                               /* Gather store static, per_elt.  */
2086   8,                                    /* size of l1 cache.  */
2087   1024,                                 /* size of l2 cache.  */
2088   64,                                   /* size of prefetch block */
2089   8,                                    /* number of parallel prefetches */
2090   1,                                    /* Branch cost */
2091   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
2092   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2093   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
2094   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
2095   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
2096   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
2097
2098   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2099   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2100   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
2101   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
2102   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
2103   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
2104   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
2105   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
2106   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
2107   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
2108   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2109   nocona_memcpy,                        /* memcpy strategy table.  */
2110   nocona_memset,                        /* memset strategy table.  */
2111   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2112   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2113   NULL,                                 /* Loop alignment (NULL: none specified).  */
2114   NULL,                                 /* Jump alignment (NULL: none specified).  */
2115   NULL,                                 /* Label alignment (NULL: none specified).  */
2116   NULL,                                 /* Func alignment (NULL: none specified).  */
2117 };
2118
2119 static stringop_algs atom_memcpy[2] = { /* memcpy strategies referenced by atom_cost */
2120   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2121   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2122              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2123 static stringop_algs atom_memset[2] = { /* memset strategies referenced by atom_cost */
2124   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2125              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2126   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2127              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2128 static const
2129 struct processor_costs atom_cost = {/* costs for tuning for atom */
2130   {
2131   /* Start of register allocator costs.  integer->integer move cost is 2. */
2132   6,                                    /* cost for loading QImode using movzbl */
2133   {6, 6, 6},                            /* cost of loading integer registers
2134                                            in QImode, HImode and SImode.
2135                                            Relative to reg-reg move (2).  */
2136   {6, 6, 6},                            /* cost of storing integer registers */
2137   4,                                    /* cost of reg,reg fld/fst */
2138   {6, 6, 18},                           /* cost of loading fp registers
2139                                            in SFmode, DFmode and XFmode */
2140   {14, 14, 24},                         /* cost of storing fp registers
2141                                            in SFmode, DFmode and XFmode */
2142   2,                                    /* cost of moving MMX register */
2143   {8, 8},                               /* cost of loading MMX registers
2144                                            in SImode and DImode */
2145   {10, 10},                             /* cost of storing MMX registers
2146                                            in SImode and DImode */
2147   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2148   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2149                                            in 32,64,128,256 and 512-bit */
2150   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2151                                            in 32,64,128,256 and 512-bit */
2152   8, 6,                                 /* SSE->integer and integer->SSE moves */
2153   /* End of register allocator costs.  */
2154   },
2155
2156   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2157   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction: add cost plus 1 */
2158   COSTS_N_INSNS (1),                    /* variable shift costs */
2159   COSTS_N_INSNS (1),                    /* constant shift costs */
2160   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2161    COSTS_N_INSNS (4),                   /*                               HI */
2162    COSTS_N_INSNS (3),                   /*                               SI */
2163    COSTS_N_INSNS (4),                   /*                               DI */
2164    COSTS_N_INSNS (2)},                  /*                            other */
2165   0,                                    /* cost of multiply per each bit set */
2166   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2167    COSTS_N_INSNS (26),                  /*                          HI */
2168    COSTS_N_INSNS (42),                  /*                          SI */
2169    COSTS_N_INSNS (74),                  /*                          DI */
2170    COSTS_N_INSNS (74)},                 /*                          other */
2171   COSTS_N_INSNS (1),                    /* cost of movsx */
2172   COSTS_N_INSNS (1),                    /* cost of movzx */
2173   8,                                    /* "large" insn */
2174   17,                                   /* MOVE_RATIO */
2175   6,                                    /* CLEAR_RATIO */
2176   {6, 6, 6},                            /* cost of loading integer registers
2177                                            in QImode, HImode and SImode.
2178                                            Relative to reg-reg move (2).  */
2179   {6, 6, 6},                            /* cost of storing integer registers */
2180   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
2181                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2182   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
2183                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2184   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
2185   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
2186   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2187   8,                                    /* cost of moving SSE register to integer.  */
2188   8, 8,                                 /* Gather load static, per_elt.  */
2189   8, 8,                                 /* Gather store static, per_elt.  */
2190   32,                                   /* size of l1 cache.  */
2191   256,                                  /* size of l2 cache.  */
2192   64,                                   /* size of prefetch block */
2193   6,                                    /* number of parallel prefetches */
2194   3,                                    /* Branch cost */
2195   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2196   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2197   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2198   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2199   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2200   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2201
2202   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2203   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2204   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2205   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2206   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2207   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2208   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
2209   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
2210   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
2211   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
2212   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2213   atom_memcpy,                          /* memcpy strategy table.  */
2214   atom_memset,                          /* memset strategy table.  */
2215   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2216   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2217   "16",                                 /* Loop alignment.  */
2218   "16:8:8",                             /* Jump alignment.  */
2219   "0:0:8",                              /* Label alignment.  */
2220   "16",                                 /* Func alignment.  */
2221 };
2222
2223 static stringop_algs slm_memcpy[2] = { /* memcpy strategies referenced by slm_cost */
2224   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2225   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2226              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2227 static stringop_algs slm_memset[2] = { /* memset strategies referenced by slm_cost */
2228   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2229              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2230   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2231              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2232 static const
2233 struct processor_costs slm_cost = {/* costs for tuning for slm */
2234   {
2235   /* Start of register allocator costs.  integer->integer move cost is 2. */
2236   8,                                    /* cost for loading QImode using movzbl */
2237   {8, 8, 8},                            /* cost of loading integer registers
2238                                            in QImode, HImode and SImode.
2239                                            Relative to reg-reg move (2).  */
2240   {6, 6, 6},                            /* cost of storing integer registers */
2241   2,                                    /* cost of reg,reg fld/fst */
2242   {8, 8, 18},                           /* cost of loading fp registers
2243                                            in SFmode, DFmode and XFmode */
2244   {6, 6, 18},                           /* cost of storing fp registers
2245                                            in SFmode, DFmode and XFmode */
2246   2,                                    /* cost of moving MMX register */
2247   {8, 8},                               /* cost of loading MMX registers
2248                                            in SImode and DImode */
2249   {6, 6},                               /* cost of storing MMX registers
2250                                            in SImode and DImode */
2251   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2252   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2253                                            in 32,64,128,256 and 512-bit */
2254   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2255                                            in 32,64,128,256 and 512-bit */
2256   8, 6,                                 /* SSE->integer and integer->SSE moves */
2257   /* End of register allocator costs.  */
2258   },
2259
2260   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2261   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction: add cost plus 1 */
2262   COSTS_N_INSNS (1),                    /* variable shift costs */
2263   COSTS_N_INSNS (1),                    /* constant shift costs */
2264   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2265    COSTS_N_INSNS (3),                   /*                               HI */
2266    COSTS_N_INSNS (3),                   /*                               SI */
2267    COSTS_N_INSNS (4),                   /*                               DI */
2268    COSTS_N_INSNS (2)},                  /*                            other */
2269   0,                                    /* cost of multiply per each bit set */
2270   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2271    COSTS_N_INSNS (26),                  /*                          HI */
2272    COSTS_N_INSNS (42),                  /*                          SI */
2273    COSTS_N_INSNS (74),                  /*                          DI */
2274    COSTS_N_INSNS (74)},                 /*                          other */
2275   COSTS_N_INSNS (1),                    /* cost of movsx */
2276   COSTS_N_INSNS (1),                    /* cost of movzx */
2277   8,                                    /* "large" insn */
2278   17,                                   /* MOVE_RATIO */
2279   6,                                    /* CLEAR_RATIO */
2280   {8, 8, 8},                            /* cost of loading integer registers
2281                                            in QImode, HImode and SImode.
2282                                            Relative to reg-reg move (2).  */
2283   {6, 6, 6},                            /* cost of storing integer registers */
2284   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
2285                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2286   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
2287                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2288   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
2289   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
2290   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2291   8,                                    /* cost of moving SSE register to integer.  */
2292   8, 8,                                 /* Gather load static, per_elt.  */
2293   8, 8,                                 /* Gather store static, per_elt.  */
2294   32,                                   /* size of l1 cache.  */
2295   256,                                  /* size of l2 cache.  */
2296   64,                                   /* size of prefetch block */
2297   6,                                    /* number of parallel prefetches */
2298   3,                                    /* Branch cost */
2299   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2300   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2301   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2302   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2303   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2304   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2305
2306   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2307   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2308   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2309   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2310   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2311   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2312   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
2313   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
2314   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
2315   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
2316   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2317   slm_memcpy,                           /* memcpy strategy table.  */
2318   slm_memset,                           /* memset strategy table.  */
2319   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2320   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2321   "16",                                 /* Loop alignment.  */
2322   "16:8:8",                             /* Jump alignment.  */
2323   "0:0:8",                              /* Label alignment.  */
2324   "16",                                 /* Func alignment.  */
2325 };
2326
2327 static stringop_algs intel_memcpy[2] = {   /* memcpy strategy table referenced by intel_cost.  */
2328   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},   /* NOTE(review): presumably the 32-bit entry -- confirm.  */
2329   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},   /* NOTE(review): presumably the 64-bit entry (it uses rep_prefix_8_byte) -- confirm.  */
2330              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2331 static stringop_algs intel_memset[2] = {   /* memset strategy table referenced by intel_cost.  */
2332   {libcall, {{8, loop, false}, {15, unrolled_loop, false},   /* NOTE(review): presumably the 32-bit entry -- confirm.  */
2333              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2334   {libcall, {{24, loop, false}, {32, unrolled_loop, false},   /* NOTE(review): presumably the 64-bit entry (it uses rep_prefix_8_byte) -- confirm.  */
2335              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2336 static const
2337 struct processor_costs intel_cost = {   /* Cost table intel_cost; string operations use intel_memcpy and intel_memset.  */
2338   {
2339   /* Start of register allocator costs.  integer->integer move cost is 2. */
2340   6,                                 /* cost for loading QImode using movzbl */
2341   {4, 4, 4},                            /* cost of loading integer registers
2342                                            in QImode, HImode and SImode.
2343                                            Relative to reg-reg move (2).  */
2344   {6, 6, 6},                            /* cost of storing integer registers */
2345   2,                                    /* cost of reg,reg fld/fst */
2346   {6, 6, 8},                            /* cost of loading fp registers
2347                                            in SFmode, DFmode and XFmode */
2348   {6, 6, 10},                           /* cost of storing fp registers
2349                                            in SFmode, DFmode and XFmode */
2350   2,                                    /* cost of moving MMX register */
2351   {6, 6},                               /* cost of loading MMX registers
2352                                            in SImode and DImode */
2353   {6, 6},                               /* cost of storing MMX registers
2354                                            in SImode and DImode */
2355   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
2356   {6, 6, 6, 6, 6},                      /* cost of loading SSE registers
2357                                            in 32,64,128,256 and 512-bit */
2358   {6, 6, 6, 6, 6},                      /* cost of storing SSE registers
2359                                            in 32,64,128,256 and 512-bit */
2360   4, 4,                                 /* SSE->integer and integer->SSE moves */
2361   /* End of register allocator costs.  */
2362   },
2363
2364   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2365   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2366   COSTS_N_INSNS (1),                    /* variable shift costs */
2367   COSTS_N_INSNS (1),                    /* constant shift costs */
2368   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2369    COSTS_N_INSNS (3),                   /*                               HI */
2370    COSTS_N_INSNS (3),                   /*                               SI */
2371    COSTS_N_INSNS (4),                   /*                               DI */
2372    COSTS_N_INSNS (2)},                  /*                            other */
2373   0,                                    /* cost of multiply per each bit set */
2374   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2375    COSTS_N_INSNS (26),                  /*                          HI */
2376    COSTS_N_INSNS (42),                  /*                          SI */
2377    COSTS_N_INSNS (74),                  /*                          DI */
2378    COSTS_N_INSNS (74)},                 /*                          other */
2379   COSTS_N_INSNS (1),                    /* cost of movsx */
2380   COSTS_N_INSNS (1),                    /* cost of movzx */
2381   8,                                    /* "large" insn */
2382   17,                                   /* MOVE_RATIO */
2383   6,                                    /* CLEAR_RATIO */
2384   {4, 4, 4},                            /* cost of loading integer registers
2385                                            in QImode, HImode and SImode.
2386                                            Relative to reg-reg move (2).  */
2387   {6, 6, 6},                            /* cost of storing integer registers */
2388   {6, 6, 6, 6, 6},                      /* cost of loading SSE register
2389                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2390   {6, 6, 6, 6, 6},                      /* cost of storing SSE register
2391                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2392   {10, 10, 10, 10, 10},                 /* cost of unaligned loads.  */
2393   {10, 10, 10, 10, 10},                 /* cost of unaligned stores.  */
2394   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
2395   4,                                    /* cost of moving SSE register to integer.  */
2396   6, 6,                                 /* Gather load static, per_elt.  */
2397   6, 6,                                 /* Gather store static, per_elt.  */
2398   32,                                   /* size of l1 cache.  */
2399   256,                                  /* size of l2 cache.  */
2400   64,                                   /* size of prefetch block */
2401   6,                                    /* number of parallel prefetches */
2402   3,                                    /* Branch cost */
2403   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2404   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2405   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2406   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2407   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2408   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2409
2410   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2411   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2412   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
2413   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
2414   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2415   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2416   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
2417   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
2418   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
2419   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
2420   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2421   intel_memcpy,
2422   intel_memset,
2423   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2424   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2425   "16",                                 /* Loop alignment.  */
2426   "16:8:8",                             /* Jump alignment.  */
2427   "0:0:8",                              /* Label alignment.  */
2428   "16",                                 /* Func alignment.  */
2429 };
2430
2431 /* Generic should produce code tuned for Core-i7 (and newer chips)
2432    and btver1 (and newer chips).  */
2433
2434 static stringop_algs generic_memcpy[2] = {   /* memcpy strategy table referenced by generic_cost.  */
2435   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},   /* NOTE(review): presumably the 32-bit entry -- confirm.  */
2436              {-1, libcall, false}}},
2437   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},   /* NOTE(review): presumably the 64-bit entry (it uses rep_prefix_8_byte) -- confirm.  */
2438              {-1, libcall, false}}}};
2439 static stringop_algs generic_memset[2] = {   /* memset strategy table referenced by generic_cost.  */
2440   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},   /* NOTE(review): presumably the 32-bit entry -- confirm.  */
2441              {-1, libcall, false}}},
2442   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},   /* NOTE(review): presumably the 64-bit entry (it uses rep_prefix_8_byte) -- confirm.  */
2443              {-1, libcall, false}}}};
2444 static const
2445 struct processor_costs generic_cost = {   /* Cost table generic_cost; string operations use generic_memcpy and generic_memset.  */
2446   {
2447   /* Start of register allocator costs.  integer->integer move cost is 2. */
2448   6,                                 /* cost for loading QImode using movzbl */
2449   {6, 6, 6},                            /* cost of loading integer registers
2450                                            in QImode, HImode and SImode.
2451                                            Relative to reg-reg move (2).  */
2452   {6, 6, 6},                            /* cost of storing integer registers */
2453   4,                                    /* cost of reg,reg fld/fst */
2454   {6, 6, 12},                           /* cost of loading fp registers
2455                                            in SFmode, DFmode and XFmode */
2456   {6, 6, 12},                           /* cost of storing fp registers
2457                                            in SFmode, DFmode and XFmode */
2458   2,                                    /* cost of moving MMX register */
2459   {6, 6},                               /* cost of loading MMX registers
2460                                            in SImode and DImode */
2461   {6, 6},                               /* cost of storing MMX registers
2462                                            in SImode and DImode */
2463   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2464   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
2465                                            in 32,64,128,256 and 512-bit */
2466   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
2467                                            in 32,64,128,256 and 512-bit */
2468   6, 6,                                 /* SSE->integer and integer->SSE moves */
2469   /* End of register allocator costs.  */
2470   },
2471
2472   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2473   /* Setting cost to 2 makes our current implementation of synth_mult result in
2474      use of unnecessary temporary registers causing regression on several
2475      SPECfp benchmarks.  */
2476   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2477   COSTS_N_INSNS (1),                    /* variable shift costs */
2478   COSTS_N_INSNS (1),                    /* constant shift costs */
2479   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2480    COSTS_N_INSNS (4),                   /*                               HI */
2481    COSTS_N_INSNS (3),                   /*                               SI */
2482    COSTS_N_INSNS (4),                   /*                               DI */
2483    COSTS_N_INSNS (4)},                  /*                            other */
2484   0,                                    /* cost of multiply per each bit set */
2485   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
2486    COSTS_N_INSNS (22),                  /*                          HI */
2487    COSTS_N_INSNS (30),                  /*                          SI */
2488    COSTS_N_INSNS (74),                  /*                          DI */
2489    COSTS_N_INSNS (74)},                 /*                          other */
2490   COSTS_N_INSNS (1),                    /* cost of movsx */
2491   COSTS_N_INSNS (1),                    /* cost of movzx */
2492   8,                                    /* "large" insn */
2493   17,                                   /* MOVE_RATIO */
2494   6,                                    /* CLEAR_RATIO */
2495   {6, 6, 6},                            /* cost of loading integer registers
2496                                            in QImode, HImode and SImode.
2497                                            Relative to reg-reg move (2).  */
2498   {6, 6, 6},                            /* cost of storing integer registers */
2499   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
2500                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2501   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
2502                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2503   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
2504   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
2505   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2506   6,                                    /* cost of moving SSE register to integer.  */
2507   18, 6,                                /* Gather load static, per_elt.  */
2508   18, 6,                                /* Gather store static, per_elt.  */
2509   32,                                   /* size of l1 cache.  */
2510   512,                                  /* size of l2 cache.  */
2511   64,                                   /* size of prefetch block */
2512   6,                                    /* number of parallel prefetches */
2513   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2514      value is increased to the perhaps more appropriate value of 5.  */
2515   3,                                    /* Branch cost */
2516   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2517   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2518   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
2519   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2520   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2521   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
2522
2523   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2524   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2525   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2526   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2527   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2528   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2529   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2530   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2531   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2532   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2533   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
2534   generic_memcpy,
2535   generic_memset,
2536   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2537   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2538   "16:11:8",                            /* Loop alignment.  */
2539   "16:11:8",                            /* Jump alignment.  */
2540   "0:0:8",                              /* Label alignment.  */
2541   "16",                                 /* Func alignment.  */
2542 };
2543
2544 /* core_cost should produce code tuned for the Core family of CPUs.  */
2545 static stringop_algs core_memcpy[2] = {   /* memcpy strategy table referenced by core_cost.  */
2546   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},   /* Third member is true on the non-libcall entries; NOTE(review): presumably the 32-bit entry -- confirm.  */
2547   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},   /* NOTE(review): presumably the 64-bit entry (it uses rep_prefix_8_byte) -- confirm.  */
2548              {-1, libcall, false}}}};
2549 static stringop_algs core_memset[2] = {   /* memset strategy table referenced by core_cost.  */
2550   {libcall, {{6, loop_1_byte, true},   /* Third member is true on the non-libcall entries; NOTE(review): presumably the 32-bit entry -- confirm.  */
2551              {24, loop, true},
2552              {8192, rep_prefix_4_byte, true},
2553              {-1, libcall, false}}},
2554   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},   /* NOTE(review): presumably the 64-bit entry (it uses rep_prefix_8_byte) -- confirm.  */
2555              {-1, libcall, false}}}};
2556
2557 static const
2558 struct processor_costs core_cost = {   /* Cost table core_cost; string operations use core_memcpy and core_memset.  */
2559   {
2560   /* Start of register allocator costs.  integer->integer move cost is 2. */
2561   6,                                 /* cost for loading QImode using movzbl */
2562   {4, 4, 4},                            /* cost of loading integer registers
2563                                            in QImode, HImode and SImode.
2564                                            Relative to reg-reg move (2).  */
2565   {6, 6, 6},                            /* cost of storing integer registers */
2566   2,                                    /* cost of reg,reg fld/fst */
2567   {6, 6, 8},                            /* cost of loading fp registers
2568                                            in SFmode, DFmode and XFmode */
2569   {6, 6, 10},                           /* cost of storing fp registers
2570                                            in SFmode, DFmode and XFmode */
2571   2,                                    /* cost of moving MMX register */
2572   {6, 6},                               /* cost of loading MMX registers
2573                                            in SImode and DImode */
2574   {6, 6},                               /* cost of storing MMX registers
2575                                            in SImode and DImode */
2576   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2577   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
2578                                            in 32,64,128,256 and 512-bit */
2579   {6, 6, 6, 6, 12},                     /* cost of storing SSE registers
2580                                            in 32,64,128,256 and 512-bit */
2581   6, 6,                                 /* SSE->integer and integer->SSE moves */
2582   /* End of register allocator costs.  */
2583   },
2584
2585   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2586   /* On all chips taken into consideration lea is 2 cycles and more.  With
2587      this cost however our current implementation of synth_mult results in
2588      use of unnecessary temporary registers causing regression on several
2589      SPECfp benchmarks.  */
2590   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2591   COSTS_N_INSNS (1),                    /* variable shift costs */
2592   COSTS_N_INSNS (1),                    /* constant shift costs */
2593   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2594    COSTS_N_INSNS (4),                   /*                               HI */
2595    COSTS_N_INSNS (3),                   /*                               SI */
2596    /* Here we tune for Sandybridge or newer.  */
2597    COSTS_N_INSNS (3),                   /*                               DI */
2598    COSTS_N_INSNS (3)},                  /*                            other */
2599   0,                                    /* cost of multiply per each bit set */
2600   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2601      model is not realistic. We compensate by increasing the latencies a bit.  */
2602   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2603    COSTS_N_INSNS (11),                  /*                          HI */
2604    COSTS_N_INSNS (14),                  /*                          SI */
2605    COSTS_N_INSNS (81),                  /*                          DI */
2606    COSTS_N_INSNS (81)},                 /*                          other */
2607   COSTS_N_INSNS (1),                    /* cost of movsx */
2608   COSTS_N_INSNS (1),                    /* cost of movzx */
2609   8,                                    /* "large" insn */
2610   17,                                   /* MOVE_RATIO */
2611   6,                                    /* CLEAR_RATIO */
2612   {4, 4, 4},                            /* cost of loading integer registers
2613                                            in QImode, HImode and SImode.
2614                                            Relative to reg-reg move (2).  */
2615   {6, 6, 6},                            /* cost of storing integer registers */
2616   {6, 6, 6, 6, 12},                     /* cost of loading SSE register
2617                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2618   {6, 6, 6, 6, 12},                     /* cost of storing SSE register
2619                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2620   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
2621   {6, 6, 6, 6, 12},                     /* cost of unaligned stores.  */
2622   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2623   2,                                    /* cost of moving SSE register to integer.  */
2624   /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPD is 9 uops,
2625      rec. throughput 6.  NOTE(review): VGATHERDPD is named twice; the second is presumably a different gather form (e.g. VGATHERDPS or a wider vector) -- confirm.
2626      So 5 uops statically and one uop per load.  */
2627   10, 6,                                /* Gather load static, per_elt.  */
2628   10, 6,                                /* Gather store static, per_elt.  */
2629   64,                                   /* size of l1 cache.  */
2630   512,                                  /* size of l2 cache.  */
2631   64,                                   /* size of prefetch block */
2632   6,                                    /* number of parallel prefetches */
2633   /* FIXME perhaps more appropriate value is 5.  */
2634   3,                                    /* Branch cost */
2635   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2636   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2637   /* 10-24; NOTE(review): presumably the FDIV latency range, with the upper bound used below -- confirm.  */
2638   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
2639   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2640   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2641   COSTS_N_INSNS (23),                   /* cost of FSQRT instruction.  */
2642
2643   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2644   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2645   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2646   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2647   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2648   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2649   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
2650   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
2651   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
2652   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
2653   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2654   core_memcpy,
2655   core_memset,
2656   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2657   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2658   "16:11:8",                            /* Loop alignment.  */
2659   "16:11:8",                            /* Jump alignment.  */
2660   "0:0:8",                              /* Label alignment.  */
2661   "16",                                 /* Func alignment.  */
2662 };
2663