gcc/config/i386/x86-tune-costs.h

   1 /* Costs of operations of individual x86 CPUs.
   2    Copyright (C) 1988-2021 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 Under Section 7 of GPL version 3, you are granted additional
  17 permissions described in the GCC Runtime Library Exception, version
  18 3.1, as published by the Free Software Foundation.
  19
  20 You should have received a copy of the GNU General Public License and
  21 a copy of the GCC Runtime Library Exception along with this program;
  22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 <http://www.gnu.org/licenses/>.  */
  24 /* Processor costs (relative to an add) */
  25 /* Size cost of an N-byte instruction.  We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes, so COSTS_N_BYTES (2) equals COSTS_N_INSNS (1).  */
  26 #define COSTS_N_BYTES(N) ((N) * 2)
  27
  28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} /* Placeholder stringop_algs entry naming libcall as its only strategy; used below as the second element of the i386, i486 and pentium memcpy/memset tables.  */
  29
  30 static stringop_algs ix86_size_memcpy[2] = { /* memcpy strategies referenced by ix86_size_cost; both entries use rep_prefix_1_byte.  */
  31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  33 static stringop_algs ix86_size_memset[2] = { /* memset strategies referenced by ix86_size_cost; both entries use rep_prefix_1_byte.  */
  34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
  35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  36
  37 const
  38 struct processor_costs ix86_size_cost = {/* costs for tuning for size; entries written with COSTS_N_BYTES are instruction sizes in bytes */
  39   {
  40   /* Start of register allocator costs.  integer->integer move cost is 2. */
  41   2,                                 /* cost for loading QImode using movzbl */
  42   {2, 2, 2},                            /* cost of loading integer registers
  43                                            in QImode, HImode and SImode.
  44                                            Relative to reg-reg move (2).  */
  45   {2, 2, 2},                            /* cost of storing integer registers */
  46   2,                                    /* cost of reg,reg fld/fst */
  47   {2, 2, 2},                            /* cost of loading fp registers
  48                                            in SFmode, DFmode and XFmode */
  49   {2, 2, 2},                            /* cost of storing fp registers
  50                                            in SFmode, DFmode and XFmode */
  51   3,                                    /* cost of moving MMX register */
  52   {3, 3},                               /* cost of loading MMX registers
  53                                            in SImode and DImode */
  54   {3, 3},                               /* cost of storing MMX registers
  55                                            in SImode and DImode */
  56   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  57   {3, 3, 3, 3, 3},                      /* cost of loading SSE registers
  58                                            in 32,64,128,256 and 512-bit */
  59   {3, 3, 3, 3, 3},                      /* cost of storing SSE registers
  60                                            in 32,64,128,256 and 512-bit */
  61   3, 3,                         /* SSE->integer and integer->SSE moves */
  62   3, 3,                         /* mask->integer and integer->mask moves */
  63   {2, 2, 2},                            /* cost of loading mask register
  64                                            in QImode, HImode, SImode.  */
  65   {2, 2, 2},                            /* cost of storing mask register
  66                                            in QImode, HImode, SImode.  */
  67   2,                                    /* cost of moving mask register.  */
  68   /* End of register allocator costs.  */
  69   },
  70
  71   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  72   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  73   COSTS_N_BYTES (2),                    /* variable shift costs */
  74   COSTS_N_BYTES (3),                    /* constant shift costs */
  75   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  76    COSTS_N_BYTES (3),                   /*                               HI */
  77    COSTS_N_BYTES (3),                   /*                               SI */
  78    COSTS_N_BYTES (3),                   /*                               DI */
  79    COSTS_N_BYTES (5)},                  /*                            other */
  80   0,                                    /* cost of multiply per each bit set */
  81   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  82    COSTS_N_BYTES (3),                   /*                          HI */
  83    COSTS_N_BYTES (3),                   /*                          SI */
  84    COSTS_N_BYTES (3),                   /*                          DI */
  85    COSTS_N_BYTES (5)},                  /*                          other */
  86   COSTS_N_BYTES (3),                    /* cost of movsx */
  87   COSTS_N_BYTES (3),                    /* cost of movzx */
  88   0,                                    /* "large" insn */
  89   2,                                    /* MOVE_RATIO */
  90   2,                                    /* CLEAR_RATIO */
  91   {2, 2, 2},                            /* cost of loading integer registers
  92                                            in QImode, HImode and SImode.
  93                                            Relative to reg-reg move (2).  */
  94   {2, 2, 2},                            /* cost of storing integer registers */
  95   {3, 3, 3, 3, 3},                      /* cost of loading SSE register
  96                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  97   {3, 3, 3, 3, 3},                      /* cost of storing SSE register
  98                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
  99   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE load; five entries,
 100                                            presumably 32bit, 64bit, 128bit, 256bit and 512bit as for the aligned tables above */
 101   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE store; five entries,
 102                                            presumably 32bit, 64bit, 128bit, 256bit and 512bit as for the aligned tables above */
 103   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
 104   3,                                    /* cost of moving SSE register to integer.  */
 105   5, 0,                                 /* Gather load static, per_elt.  */
 106   5, 0,                                 /* Gather store static, per_elt.  */
 107   0,                                    /* size of l1 cache  */
 108   0,                                    /* size of l2 cache  */
 109   0,                                    /* size of prefetch block */
 110   0,                                    /* number of parallel prefetches */
 111   2,                                    /* Branch cost */
 112   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
 113   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
 114   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
 115   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
 116   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
 117   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
 118
 119   COSTS_N_BYTES (2),                    /* cost of cheap SSE instruction.  */
 120   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 121   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
 122   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
 123   COSTS_N_BYTES (2),                    /* cost of FMA SS instruction.  */
 124   COSTS_N_BYTES (2),                    /* cost of FMA SD instruction.  */
 125   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
 126   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
 127   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
 128   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
 129   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 130   ix86_size_memcpy,
 131   ix86_size_memset,
 132   COSTS_N_BYTES (1),                    /* cond_taken_branch_cost.  */
 133   COSTS_N_BYTES (1),                    /* cond_not_taken_branch_cost.  */
 134   NULL,                                 /* Loop alignment.  */
 135   NULL,                                 /* Jump alignment.  */
 136   NULL,                                 /* Label alignment.  */
 137   NULL,                                 /* Func alignment.  */
 138 };
 139
 140 /* Processor costs (relative to an add) */
 141 static stringop_algs i386_memcpy[2] = { /* memcpy strategies referenced by i386_cost: rep_prefix_1_byte, then the libcall placeholder.  */
 142   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 143   DUMMY_STRINGOP_ALGS};
 144 static stringop_algs i386_memset[2] = { /* memset strategies referenced by i386_cost: rep_prefix_1_byte, then the libcall placeholder.  */
 145   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
 146   DUMMY_STRINGOP_ALGS};
 147
 148 static const
 149 struct processor_costs i386_cost = {    /* 386 specific costs */
 150   {
 151   /* Start of register allocator costs.  integer->integer move cost is 2. */
 152   4,                                 /* cost for loading QImode using movzbl */
 153   {2, 4, 2},                            /* cost of loading integer registers
 154                                            in QImode, HImode and SImode.
 155                                            Relative to reg-reg move (2).  */
 156   {2, 4, 2},                            /* cost of storing integer registers */
 157   2,                                    /* cost of reg,reg fld/fst */
 158   {8, 8, 8},                            /* cost of loading fp registers
 159                                            in SFmode, DFmode and XFmode */
 160   {8, 8, 8},                            /* cost of storing fp registers
 161                                            in SFmode, DFmode and XFmode */
 162   2,                                    /* cost of moving MMX register */
 163   {4, 8},                               /* cost of loading MMX registers
 164                                            in SImode and DImode */
 165   {4, 8},                               /* cost of storing MMX registers
 166                                            in SImode and DImode */
 167   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 168   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 169                                            in 32,64,128,256 and 512-bit */
 170   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 171                                            in 32,64,128,256 and 512-bit */
 172   3, 3,                         /* SSE->integer and integer->SSE moves */
 173   3, 3,                         /* mask->integer and integer->mask moves */
 174   {2, 4, 2},                            /* cost of loading mask register
 175                                            in QImode, HImode, SImode.  */
 176   {2, 4, 2},                            /* cost of storing mask register
 177                                            in QImode, HImode, SImode.  */
 178   2,                                    /* cost of moving mask register.  */
 179   /* End of register allocator costs.  */
 180   },
 181
 182   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 183   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 184   COSTS_N_INSNS (3),                    /* variable shift costs */
 185   COSTS_N_INSNS (2),                    /* constant shift costs */
 186   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 187    COSTS_N_INSNS (6),                   /*                               HI */
 188    COSTS_N_INSNS (6),                   /*                               SI */
 189    COSTS_N_INSNS (6),                   /*                               DI */
 190    COSTS_N_INSNS (6)},                  /*                            other */
 191   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 192   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 193    COSTS_N_INSNS (23),                  /*                          HI */
 194    COSTS_N_INSNS (23),                  /*                          SI */
 195    COSTS_N_INSNS (23),                  /*                          DI */
 196    COSTS_N_INSNS (23)},                 /*                          other */
 197   COSTS_N_INSNS (3),                    /* cost of movsx */
 198   COSTS_N_INSNS (2),                    /* cost of movzx */
 199   15,                                   /* "large" insn */
 200   3,                                    /* MOVE_RATIO */
 201   3,                                    /* CLEAR_RATIO */
 202   {2, 4, 2},                            /* cost of loading integer registers
 203                                            in QImode, HImode and SImode.
 204                                            Relative to reg-reg move (2).  */
 205   {2, 4, 2},                            /* cost of storing integer registers */
 206   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 207                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 208   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 209                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 210   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 211   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 212   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 213   3,                                    /* cost of moving SSE register to integer.  */
 214   4, 4,                                 /* Gather load static, per_elt.  */
 215   4, 4,                                 /* Gather store static, per_elt.  */
 216   0,                                    /* size of l1 cache  */
 217   0,                                    /* size of l2 cache  */
 218   0,                                    /* size of prefetch block */
 219   0,                                    /* number of parallel prefetches */
 220   1,                                    /* Branch cost */
 221   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 222   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 223   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 224   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 225   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 226   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 227
 228   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 229   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 230   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 231   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 232   COSTS_N_INSNS (27),                   /* cost of FMA SS instruction.  */
 233   COSTS_N_INSNS (27),                   /* cost of FMA SD instruction.  */
 234   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 235   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 236   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 237   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 238   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 239   i386_memcpy,
 240   i386_memset,
 241   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 242   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 243   "4",                                  /* Loop alignment.  */
 244   "4",                                  /* Jump alignment.  */
 245   NULL,                                 /* Label alignment.  */
 246   "4",                                  /* Func alignment.  */
 247 };
 248
 249 static stringop_algs i486_memcpy[2] = { /* memcpy strategies referenced by i486_cost: rep_prefix_4_byte, then the libcall placeholder.  */
 250   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 251   DUMMY_STRINGOP_ALGS};
 252 static stringop_algs i486_memset[2] = { /* memset strategies referenced by i486_cost: rep_prefix_4_byte, then the libcall placeholder.  */
 253   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
 254   DUMMY_STRINGOP_ALGS};
 255
 256 static const
 257 struct processor_costs i486_cost = {    /* 486 specific costs */
 258   {
 259   /* Start of register allocator costs.  integer->integer move cost is 2. */
 260   4,                                 /* cost for loading QImode using movzbl */
 261   {2, 4, 2},                            /* cost of loading integer registers
 262                                            in QImode, HImode and SImode.
 263                                            Relative to reg-reg move (2).  */
 264   {2, 4, 2},                            /* cost of storing integer registers */
 265   2,                                    /* cost of reg,reg fld/fst */
 266   {8, 8, 8},                            /* cost of loading fp registers
 267                                            in SFmode, DFmode and XFmode */
 268   {8, 8, 8},                            /* cost of storing fp registers
 269                                            in SFmode, DFmode and XFmode */
 270   2,                                    /* cost of moving MMX register */
 271   {4, 8},                               /* cost of loading MMX registers
 272                                            in SImode and DImode */
 273   {4, 8},                               /* cost of storing MMX registers
 274                                            in SImode and DImode */
 275   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 276   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 277                                            in 32,64,128,256 and 512-bit */
 278   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 279                                            in 32,64,128,256 and 512-bit */
 280   3, 3,                         /* SSE->integer and integer->SSE moves */
 281   3, 3,                         /* mask->integer and integer->mask moves */
 282   {2, 4, 2},                            /* cost of loading mask register
 283                                            in QImode, HImode, SImode.  */
 284   {2, 4, 2},                            /* cost of storing mask register
 285                                            in QImode, HImode, SImode.  */
 286   2,                                    /* cost of moving mask register.  */
 287   /* End of register allocator costs.  */
 288   },
 289
 290   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 291   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 292   COSTS_N_INSNS (3),                    /* variable shift costs */
 293   COSTS_N_INSNS (2),                    /* constant shift costs */
 294   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 295    COSTS_N_INSNS (12),                  /*                               HI */
 296    COSTS_N_INSNS (12),                  /*                               SI */
 297    COSTS_N_INSNS (12),                  /*                               DI */
 298    COSTS_N_INSNS (12)},                 /*                            other */
 299   1,                                    /* cost of multiply per each bit set */
 300   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 301    COSTS_N_INSNS (40),                  /*                          HI */
 302    COSTS_N_INSNS (40),                  /*                          SI */
 303    COSTS_N_INSNS (40),                  /*                          DI */
 304    COSTS_N_INSNS (40)},                 /*                          other */
 305   COSTS_N_INSNS (3),                    /* cost of movsx */
 306   COSTS_N_INSNS (2),                    /* cost of movzx */
 307   15,                                   /* "large" insn */
 308   3,                                    /* MOVE_RATIO */
 309   3,                                    /* CLEAR_RATIO */
 310   {2, 4, 2},                            /* cost of loading integer registers
 311                                            in QImode, HImode and SImode.
 312                                            Relative to reg-reg move (2).  */
 313   {2, 4, 2},                            /* cost of storing integer registers */
 314   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 315                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 316   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 317                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 318   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 319   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 320   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 321   3,                                    /* cost of moving SSE register to integer.  */
 322   4, 4,                                 /* Gather load static, per_elt.  */
 323   4, 4,                                 /* Gather store static, per_elt.  */
 324   4,                                    /* size of l1 cache.  486 has 8kB cache
 325                                            shared for code and data, so 4kB is
 326                                            not really precise.  */
 327   4,                                    /* size of l2 cache  */
 328   0,                                    /* size of prefetch block */
 329   0,                                    /* number of parallel prefetches */
 330   1,                                    /* Branch cost */
 331   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 332   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 333   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 334   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 335   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 336   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 337
 338   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 339   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 340   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 341   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 342   COSTS_N_INSNS (16),                   /* cost of FMA SS instruction.  */
 343   COSTS_N_INSNS (16),                   /* cost of FMA SD instruction.  */
 344   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 345   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.  */
 346   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 347   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 348   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 349   i486_memcpy,
 350   i486_memset,
 351   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 352   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 353   "16",                                 /* Loop alignment.  */
 354   "16",                                 /* Jump alignment.  */
 355   "0:0:8",                              /* Label alignment.  */
 356   "16",                                 /* Func alignment.  */
 357 };
 358
 359 static stringop_algs pentium_memcpy[2] = { /* memcpy strategies referenced by pentium_cost; first entry lists rep_prefix_4_byte with bound 256, then libcall (presumably 256 is a byte-size limit -- confirm against stringop_algs).  */
 360   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 361   DUMMY_STRINGOP_ALGS};
 362 static stringop_algs pentium_memset[2] = { /* memset strategies referenced by pentium_cost; first entry lists rep_prefix_4_byte as its only size bucket, then the libcall placeholder.  */
 363   {libcall, {{-1, rep_prefix_4_byte, false}}},
 364   DUMMY_STRINGOP_ALGS};
 365
 366 static const
 367 struct processor_costs pentium_cost = { /* Pentium specific costs */
 368   {
 369   /* Start of register allocator costs.  integer->integer move cost is 2. */
 370   6,                                 /* cost for loading QImode using movzbl */
 371   {2, 4, 2},                            /* cost of loading integer registers
 372                                            in QImode, HImode and SImode.
 373                                            Relative to reg-reg move (2).  */
 374   {2, 4, 2},                            /* cost of storing integer registers */
 375   2,                                    /* cost of reg,reg fld/fst */
 376   {2, 2, 6},                            /* cost of loading fp registers
 377                                            in SFmode, DFmode and XFmode */
 378   {4, 4, 6},                            /* cost of storing fp registers
 379                                            in SFmode, DFmode and XFmode */
 380   8,                                    /* cost of moving MMX register */
 381   {8, 8},                               /* cost of loading MMX registers
 382                                            in SImode and DImode */
 383   {8, 8},                               /* cost of storing MMX registers
 384                                            in SImode and DImode */
 385   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 386   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 387                                            in 32,64,128,256 and 512-bit */
 388   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 389                                            in 32,64,128,256 and 512-bit */
 390   3, 3,                         /* SSE->integer and integer->SSE moves */
 391   3, 3,                         /* mask->integer and integer->mask moves */
 392   {2, 4, 2},                            /* cost of loading mask register
 393                                            in QImode, HImode, SImode.  */
 394   {2, 4, 2},                            /* cost of storing mask register
 395                                            in QImode, HImode, SImode.  */
 396   2,                                    /* cost of moving mask register.  */
 397   /* End of register allocator costs.  */
 398   },
 399
 400   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 401   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 402   COSTS_N_INSNS (4),                    /* variable shift costs */
 403   COSTS_N_INSNS (1),                    /* constant shift costs */
 404   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 405    COSTS_N_INSNS (11),                  /*                               HI */
 406    COSTS_N_INSNS (11),                  /*                               SI */
 407    COSTS_N_INSNS (11),                  /*                               DI */
 408    COSTS_N_INSNS (11)},                 /*                            other */
 409   0,                                    /* cost of multiply per each bit set */
 410   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 411    COSTS_N_INSNS (25),                  /*                          HI */
 412    COSTS_N_INSNS (25),                  /*                          SI */
 413    COSTS_N_INSNS (25),                  /*                          DI */
 414    COSTS_N_INSNS (25)},                 /*                          other */
 415   COSTS_N_INSNS (3),                    /* cost of movsx */
 416   COSTS_N_INSNS (2),                    /* cost of movzx */
 417   8,                                    /* "large" insn */
 418   6,                                    /* MOVE_RATIO */
 419   6,                                    /* CLEAR_RATIO */
 420   {2, 4, 2},                            /* cost of loading integer registers
 421                                            in QImode, HImode and SImode.
 422                                            Relative to reg-reg move (2).  */
 423   {2, 4, 2},                            /* cost of storing integer registers */
 424   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 425                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 426   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 427                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 428   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 429   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 430   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 431   3,                                    /* cost of moving SSE register to integer.  */
 432   4, 4,                                 /* Gather load static, per_elt.  */
 433   4, 4,                                 /* Gather store static, per_elt.  */
 434   8,                                    /* size of l1 cache.  */
 435   8,                                    /* size of l2 cache  */
 436   0,                                    /* size of prefetch block */
 437   0,                                    /* number of parallel prefetches */
 438   2,                                    /* Branch cost */
 439   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 440   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 441   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 442   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 443   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 444   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 445
 446   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 447   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 448   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 449   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 450   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
 451   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
 452   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 453   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 454   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 455   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 456   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 457   pentium_memcpy,
 458   pentium_memset,
 459   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 460   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 461   "16:8:8",                             /* Loop alignment.  */
 462   "16:8:8",                             /* Jump alignment.  */
 463   "0:0:8",                              /* Label alignment.  */
 464   "16",                                 /* Func alignment.  */
 465 };
 466
 467 static const
 468 struct processor_costs lakemont_cost = {  /* operation costs for Lakemont */
 469   {
 470   /* Start of register allocator costs.  integer->integer move cost is 2. */
 471   6,                                 /* cost for loading QImode using movzbl */
 472   {2, 4, 2},                            /* cost of loading integer registers
 473                                            in QImode, HImode and SImode.
 474                                            Relative to reg-reg move (2).  */
 475   {2, 4, 2},                            /* cost of storing integer registers */
 476   2,                                    /* cost of reg,reg fld/fst */
 477   {2, 2, 6},                            /* cost of loading fp registers
 478                                            in SFmode, DFmode and XFmode */
 479   {4, 4, 6},                            /* cost of storing fp registers
 480                                            in SFmode, DFmode and XFmode */
 481   8,                                    /* cost of moving MMX register */
 482   {8, 8},                               /* cost of loading MMX registers
 483                                            in SImode and DImode */
 484   {8, 8},                               /* cost of storing MMX registers
 485                                            in SImode and DImode */
 486   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 487   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 488                                            in 32,64,128,256 and 512-bit */
 489   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 490                                            in 32,64,128,256 and 512-bit */
 491   3, 3,                         /* SSE->integer and integer->SSE moves */
 492   3, 3,                         /* mask->integer and integer->mask moves */
 493   {2, 4, 2},                            /* cost of loading mask register
 494                                            in QImode, HImode, SImode.  */
 495   {2, 4, 2},                            /* cost of storing mask register
 496                                            in QImode, HImode, SImode.  */
 497   2,                                    /* cost of moving mask register.  */
 498   /* End of register allocator costs.  */
 499   },
 500
 501   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 502   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; just above an add */
 503   COSTS_N_INSNS (1),                    /* variable shift costs */
 504   COSTS_N_INSNS (1),                    /* constant shift costs */
 505   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 506    COSTS_N_INSNS (11),                  /*                               HI */
 507    COSTS_N_INSNS (11),                  /*                               SI */
 508    COSTS_N_INSNS (11),                  /*                               DI */
 509    COSTS_N_INSNS (11)},                 /*                            other */
 510   0,                                    /* cost of multiply per each bit set */
 511   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 512    COSTS_N_INSNS (25),                  /*                          HI */
 513    COSTS_N_INSNS (25),                  /*                          SI */
 514    COSTS_N_INSNS (25),                  /*                          DI */
 515    COSTS_N_INSNS (25)},                 /*                          other */
 516   COSTS_N_INSNS (3),                    /* cost of movsx */
 517   COSTS_N_INSNS (2),                    /* cost of movzx */
 518   8,                                    /* "large" insn */
 519   17,                                   /* MOVE_RATIO */
 520   6,                                    /* CLEAR_RATIO */
 521   {2, 4, 2},                            /* cost of loading integer registers
 522                                            in QImode, HImode and SImode.
 523                                            Relative to reg-reg move (2).  */
 524   {2, 4, 2},                            /* cost of storing integer registers */
 525   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 526                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 527   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 528                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 529   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 530   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 531   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 532   3,                                    /* cost of moving SSE register to integer.  */
 533   4, 4,                                 /* Gather load static, per_elt.  */
 534   4, 4,                                 /* Gather store static, per_elt.  */
 535   8,                                    /* size of l1 cache.  */
 536   8,                                    /* size of l2 cache.  */
 537   0,                                    /* size of prefetch block */
 538   0,                                    /* number of parallel prefetches */
 539   2,                                    /* Branch cost */
 540   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 541   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 542   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 543   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 544   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 545   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 546
 547   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 548   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 549   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 550   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 551   COSTS_N_INSNS (10),                   /* cost of FMA SS instruction.  */
 552   COSTS_N_INSNS (10),                   /* cost of FMA SD instruction.  */
 553   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 554   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 555   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 556   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 557   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 558   pentium_memcpy,                       /* memcpy stringop table; reuses pentium_memcpy.  */
 559   pentium_memset,                       /* memset stringop table; reuses pentium_memset.  */
 560   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 561   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 562   "16:8:8",                             /* Loop alignment.  */
 563   "16:8:8",                             /* Jump alignment.  */
 564   "0:0:8",                              /* Label alignment.  */
 565   "16",                                 /* Func alignment.  */
 566 };
 567
 568 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
 569    (we ensure the alignment).  For small blocks an inline loop is still a
 570    noticeable win; for bigger blocks either rep movsl or rep movsb is the
 571    way to go.  Rep movsb apparently has a more expensive startup time in
 572    the CPU, but after 4K the difference is down in the noise.  */
 573 static stringop_algs pentiumpro_memcpy[2] = {  /* memcpy strategies for PentiumPro */
 574   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 575                        {8192, rep_prefix_4_byte, false},
 576                        {-1, rep_prefix_1_byte, false}}},  /* -1: presumably no size limit; confirm */
 577   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 578 static stringop_algs pentiumpro_memset[2] = {  /* memset strategies for PentiumPro */
 579   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 580                        {8192, rep_prefix_4_byte, false},
 581                        {-1, libcall, false}}},  /* -1: presumably no size limit; confirm */
 582   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 583 static const
 584 struct processor_costs pentiumpro_cost = {  /* operation costs for PentiumPro */
 585   {
 586   /* Start of register allocator costs.  integer->integer move cost is 2. */
 587   2,                                 /* cost for loading QImode using movzbl */
 588   {4, 4, 4},                            /* cost of loading integer registers
 589                                            in QImode, HImode and SImode.
 590                                            Relative to reg-reg move (2).  */
 591   {2, 2, 2},                            /* cost of storing integer registers */
 592   2,                                    /* cost of reg,reg fld/fst */
 593   {2, 2, 6},                            /* cost of loading fp registers
 594                                            in SFmode, DFmode and XFmode */
 595   {4, 4, 6},                            /* cost of storing fp registers
 596                                            in SFmode, DFmode and XFmode */
 597   2,                                    /* cost of moving MMX register */
 598   {2, 2},                               /* cost of loading MMX registers
 599                                            in SImode and DImode */
 600   {2, 2},                               /* cost of storing MMX registers
 601                                            in SImode and DImode */
 602   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 603   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 604                                            in 32,64,128,256 and 512-bit */
 605   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 606                                            in 32,64,128,256 and 512-bit */
 607   3, 3,                         /* SSE->integer and integer->SSE moves */
 608   3, 3,                         /* mask->integer and integer->mask moves */
 609   {4, 4, 4},                            /* cost of loading mask register
 610                                            in QImode, HImode, SImode.  */
 611   {2, 2, 2},                            /* cost of storing mask register
 612                                            in QImode, HImode, SImode.  */
 613   2,                                    /* cost of moving mask register.  */
 614   /* End of register allocator costs.  */
 615   },
 616
 617   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 618   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 619   COSTS_N_INSNS (1),                    /* variable shift costs */
 620   COSTS_N_INSNS (1),                    /* constant shift costs */
 621   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 622    COSTS_N_INSNS (4),                   /*                               HI */
 623    COSTS_N_INSNS (4),                   /*                               SI */
 624    COSTS_N_INSNS (4),                   /*                               DI */
 625    COSTS_N_INSNS (4)},                  /*                            other */
 626   0,                                    /* cost of multiply per each bit set */
 627   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 628    COSTS_N_INSNS (17),                  /*                          HI */
 629    COSTS_N_INSNS (17),                  /*                          SI */
 630    COSTS_N_INSNS (17),                  /*                          DI */
 631    COSTS_N_INSNS (17)},                 /*                          other */
 632   COSTS_N_INSNS (1),                    /* cost of movsx */
 633   COSTS_N_INSNS (1),                    /* cost of movzx */
 634   8,                                    /* "large" insn */
 635   6,                                    /* MOVE_RATIO */
 636   6,                                    /* CLEAR_RATIO */
 637   {4, 4, 4},                            /* cost of loading integer registers
 638                                            in QImode, HImode and SImode.
 639                                            Relative to reg-reg move (2).  */
 640   {2, 2, 2},                            /* cost of storing integer registers */
 641   {4, 8, 16, 32, 64},                   /* cost of loading SSE register
 642                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 643   {4, 8, 16, 32, 64},                   /* cost of storing SSE register
 644                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 645   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 646   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 647   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 648   3,                                    /* cost of moving SSE register to integer.  */
 649   4, 4,                                 /* Gather load static, per_elt.  */
 650   4, 4,                                 /* Gather store static, per_elt.  */
 651   8,                                    /* size of l1 cache.  */
 652   256,                                  /* size of l2 cache.  */
 653   32,                                   /* size of prefetch block */
 654   6,                                    /* number of parallel prefetches */
 655   2,                                    /* Branch cost */
 656   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 657   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 658   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 659   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 660   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 661   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 662
 663   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 664   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 665   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 666   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 667   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
 668   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
 669   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 670   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 671   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 672   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 673   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 674   pentiumpro_memcpy,                    /* memcpy stringop table.  */
 675   pentiumpro_memset,                    /* memset stringop table.  */
 676   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 677   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 678   "16",                                 /* Loop alignment.  */
 679   "16:11:8",                            /* Jump alignment.  */
 680   "0:0:8",                              /* Label alignment.  */
 681   "16",                                 /* Func alignment.  */
 682 };
 683
 684 static stringop_algs geode_memcpy[2] = {  /* memcpy strategies for Geode */
 685   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 686   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 687 static stringop_algs geode_memset[2] = {  /* memset strategies for Geode */
 688   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 689   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 690 static const
 691 struct processor_costs geode_cost = {  /* operation costs for Geode */
 692   {
 693   /* Start of register allocator costs.  integer->integer move cost is 2. */
 694   2,                                 /* cost for loading QImode using movzbl */
 695   {2, 2, 2},                            /* cost of loading integer registers
 696                                            in QImode, HImode and SImode.
 697                                            Relative to reg-reg move (2).  */
 698   {2, 2, 2},                            /* cost of storing integer registers */
 699   2,                                    /* cost of reg,reg fld/fst */
 700   {2, 2, 2},                            /* cost of loading fp registers
 701                                            in SFmode, DFmode and XFmode */
 702   {4, 6, 6},                            /* cost of storing fp registers
 703                                            in SFmode, DFmode and XFmode */
 704   2,                                    /* cost of moving MMX register */
 705   {2, 2},                               /* cost of loading MMX registers
 706                                            in SImode and DImode */
 707   {2, 2},                               /* cost of storing MMX registers
 708                                            in SImode and DImode */
 709   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 710   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 711                                            in 32,64,128,256 and 512-bit */
 712   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 713                                            in 32,64,128,256 and 512-bit */
 714   6, 6,                         /* SSE->integer and integer->SSE moves */
 715   6, 6,                         /* mask->integer and integer->mask moves */
 716   {2, 2, 2},                            /* cost of loading mask register
 717                                            in QImode, HImode, SImode.  */
 718   {2, 2, 2},                            /* cost of storing mask register
 719                                            in QImode, HImode, SImode.  */
 720   2,                                    /* cost of moving mask register.  */
 721   /* End of register allocator costs.  */
 722   },
 723
 724   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 725   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 726   COSTS_N_INSNS (2),                    /* variable shift costs */
 727   COSTS_N_INSNS (1),                    /* constant shift costs */
 728   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 729    COSTS_N_INSNS (4),                   /*                               HI */
 730    COSTS_N_INSNS (7),                   /*                               SI */
 731    COSTS_N_INSNS (7),                   /*                               DI */
 732    COSTS_N_INSNS (7)},                  /*                            other */
 733   0,                                    /* cost of multiply per each bit set */
 734   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 735    COSTS_N_INSNS (23),                  /*                          HI */
 736    COSTS_N_INSNS (39),                  /*                          SI */
 737    COSTS_N_INSNS (39),                  /*                          DI */
 738    COSTS_N_INSNS (39)},                 /*                          other */
 739   COSTS_N_INSNS (1),                    /* cost of movsx */
 740   COSTS_N_INSNS (1),                    /* cost of movzx */
 741   8,                                    /* "large" insn */
 742   4,                                    /* MOVE_RATIO */
 743   4,                                    /* CLEAR_RATIO */
 744   {2, 2, 2},                            /* cost of loading integer registers
 745                                            in QImode, HImode and SImode.
 746                                            Relative to reg-reg move (2).  */
 747   {2, 2, 2},                            /* cost of storing integer registers */
 748   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 749                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 750   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 751                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 752   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 753   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 754   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 755   6,                                    /* cost of moving SSE register to integer.  */
 756   2, 2,                                 /* Gather load static, per_elt.  */
 757   2, 2,                                 /* Gather store static, per_elt.  */
 758   64,                                   /* size of l1 cache.  */
 759   128,                                  /* size of l2 cache.  */
 760   32,                                   /* size of prefetch block */
 761   1,                                    /* number of parallel prefetches */
 762   1,                                    /* Branch cost */
 763   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 764   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 765   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 766   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 767   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 768   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 769
 770   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 771   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 772   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 773   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 774   COSTS_N_INSNS (17),                   /* cost of FMA SS instruction.  */
 775   COSTS_N_INSNS (17),                   /* cost of FMA SD instruction.  */
 776   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 777   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 778   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 779   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 780   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 781   geode_memcpy,                         /* memcpy stringop table.  */
 782   geode_memset,                         /* memset stringop table.  */
 783   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 784   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 785   NULL,                                 /* Loop alignment.  */
 786   NULL,                                 /* Jump alignment.  */
 787   NULL,                                 /* Label alignment.  */
 788   NULL,                                 /* Func alignment.  */
 789 };
 790
 791 static stringop_algs k6_memcpy[2] = {  /* memcpy strategies for K6 */
 792   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 793   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 794 static stringop_algs k6_memset[2] = {  /* memset strategies for K6 */
 795   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 796   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 797 static const
 798 struct processor_costs k6_cost = {  /* operation costs for K6 */
 799   {
 800   /* Start of register allocator costs.  integer->integer move cost is 2. */
 801   3,                                 /* cost for loading QImode using movzbl */
 802   {4, 5, 4},                            /* cost of loading integer registers
 803                                            in QImode, HImode and SImode.
 804                                            Relative to reg-reg move (2).  */
 805   {2, 3, 2},                            /* cost of storing integer registers */
 806   4,                                    /* cost of reg,reg fld/fst */
 807   {6, 6, 6},                            /* cost of loading fp registers
 808                                            in SFmode, DFmode and XFmode */
 809   {4, 4, 4},                            /* cost of storing fp registers
 810                                            in SFmode, DFmode and XFmode */
 811   2,                                    /* cost of moving MMX register */
 812   {2, 2},                               /* cost of loading MMX registers
 813                                            in SImode and DImode */
 814   {2, 2},                               /* cost of storing MMX registers
 815                                            in SImode and DImode */
 816   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 817   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 818                                            in 32,64,128,256 and 512-bit */
 819   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 820                                            in 32,64,128,256 and 512-bit */
 821   6, 6,                         /* SSE->integer and integer->SSE moves */
 822   6, 6,                         /* mask->integer and integer->mask moves */
 823   {4, 5, 4},                            /* cost of loading mask register
 824                                            in QImode, HImode, SImode.  */
 825   {2, 3, 2},                            /* cost of storing mask register
 826                                            in QImode, HImode, SImode.  */
 827   2,                                    /* cost of moving mask register.  */
 828   /* End of register allocator costs.  */
 829   },
 830
 831   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 832   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 833   COSTS_N_INSNS (1),                    /* variable shift costs */
 834   COSTS_N_INSNS (1),                    /* constant shift costs */
 835   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 836    COSTS_N_INSNS (3),                   /*                               HI */
 837    COSTS_N_INSNS (3),                   /*                               SI */
 838    COSTS_N_INSNS (3),                   /*                               DI */
 839    COSTS_N_INSNS (3)},                  /*                            other */
 840   0,                                    /* cost of multiply per each bit set */
 841   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 842    COSTS_N_INSNS (18),                  /*                          HI */
 843    COSTS_N_INSNS (18),                  /*                          SI */
 844    COSTS_N_INSNS (18),                  /*                          DI */
 845    COSTS_N_INSNS (18)},                 /*                          other */
 846   COSTS_N_INSNS (2),                    /* cost of movsx */
 847   COSTS_N_INSNS (2),                    /* cost of movzx */
 848   8,                                    /* "large" insn */
 849   4,                                    /* MOVE_RATIO */
 850   4,                                    /* CLEAR_RATIO */
 851   {4, 5, 4},                            /* cost of loading integer registers
 852                                            in QImode, HImode and SImode.
 853                                            Relative to reg-reg move (2).  */
 854   {2, 3, 2},                            /* cost of storing integer registers */
 855   {2, 2, 8, 16, 32},                    /* cost of loading SSE register
 856                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 857   {2, 2, 8, 16, 32},                    /* cost of storing SSE register
 858                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 859   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 860   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 861   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 862   6,                                    /* cost of moving SSE register to integer.  */
 863   2, 2,                                 /* Gather load static, per_elt.  */
 864   2, 2,                                 /* Gather store static, per_elt.  */
 865   32,                                   /* size of l1 cache.  */
 866   32,                                   /* size of l2 cache.  Some models
 867                                            have integrated l2 cache, but
 868                                            optimizing for k6 is not important
 869                                            enough to worry about that.  */
 870   32,                                   /* size of prefetch block */
 871   1,                                    /* number of parallel prefetches */
 872   1,                                    /* Branch cost */
 873   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 874   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 875   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 876   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 877   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 878   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 879
 880   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 881   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 882   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 883   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 884   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
 885   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
 886   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 887   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 888   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 889   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 890   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 891   k6_memcpy,                            /* memcpy stringop table.  */
 892   k6_memset,                            /* memset stringop table.  */
 893   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 894   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 895   "32:8:8",                             /* Loop alignment.  */
 896   "32:8:8",                             /* Jump alignment.  */
 897   "0:0:8",                              /* Label alignment.  */
 898   "32",                                 /* Func alignment.  */
 899 };
 900
 901 /* For some reason, Athlon deals better with the REP prefix (relative to
 902    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
 903    and 128 bytes for memset.  */
 904 static stringop_algs athlon_memcpy[2] = {  /* memcpy strategies for Athlon */
 905   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 906   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 907 static stringop_algs athlon_memset[2] = {  /* memset strategies for Athlon */
 908   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 909   DUMMY_STRINGOP_ALGS};  /* second entry is the libcall-only placeholder */
 910 static const
 911 struct processor_costs athlon_cost = {
 912   {
 913   /* Start of register allocator costs.  integer->integer move cost is 2. */
 914   4,                                 /* cost for loading QImode using movzbl */
 915   {3, 4, 3},                            /* cost of loading integer registers
 916                                            in QImode, HImode and SImode.
 917                                            Relative to reg-reg move (2).  */
 918   {3, 4, 3},                            /* cost of storing integer registers */
 919   4,                                    /* cost of reg,reg fld/fst */
 920   {4, 4, 12},                           /* cost of loading fp registers
 921                                            in SFmode, DFmode and XFmode */
 922   {6, 6, 8},                            /* cost of storing fp registers
 923                                            in SFmode, DFmode and XFmode */
 924   2,                                    /* cost of moving MMX register */
 925   {4, 4},                               /* cost of loading MMX registers
 926                                            in SImode and DImode */
 927   {4, 4},                               /* cost of storing MMX registers
 928                                            in SImode and DImode */
 929   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 930   {4, 4, 12, 12, 24},                   /* cost of loading SSE registers
 931                                            in 32,64,128,256 and 512-bit */
 932   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
 933                                            in 32,64,128,256 and 512-bit */
 934   5, 5,                         /* SSE->integer and integer->SSE moves */
 935   5, 5,                         /* mask->integer and integer->mask moves */
 936   {3, 4, 3},                            /* cost of loading mask register
 937                                            in QImode, HImode, SImode.  */
 938   {3, 4, 3},                            /* cost if storing mask register
 939                                            in QImode, HImode, SImode.  */
 940   2,                                    /* cost of moving mask register.  */
 941   /* End of register allocator costs.  */
 942   },
 943
 944   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 945   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 946   COSTS_N_INSNS (1),                    /* variable shift costs */
 947   COSTS_N_INSNS (1),                    /* constant shift costs */
 948   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 949    COSTS_N_INSNS (5),                   /*                               HI */
 950    COSTS_N_INSNS (5),                   /*                               SI */
 951    COSTS_N_INSNS (5),                   /*                               DI */
 952    COSTS_N_INSNS (5)},                  /*                            other */
 953   0,                                    /* cost of multiply per each bit set */
 954   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 955    COSTS_N_INSNS (26),                  /*                          HI */
 956    COSTS_N_INSNS (42),                  /*                          SI */
 957    COSTS_N_INSNS (74),                  /*                          DI */
 958    COSTS_N_INSNS (74)},                 /*                          other */
 959   COSTS_N_INSNS (1),                    /* cost of movsx */
 960   COSTS_N_INSNS (1),                    /* cost of movzx */
 961   8,                                    /* "large" insn */
 962   9,                                    /* MOVE_RATIO */
 963   6,                                    /* CLEAR_RATIO */
 964   {3, 4, 3},                            /* cost of loading integer registers
 965                                            in QImode, HImode and SImode.
 966                                            Relative to reg-reg move (2).  */
 967   {3, 4, 3},                            /* cost of storing integer registers */
 968   {4, 4, 12, 12, 24},                   /* cost of loading SSE register
 969                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 970   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
 971                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
 972   {4, 4, 12, 12, 24},                   /* cost of unaligned loads.  */
 973   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
 974   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 975   5,                                    /* cost of moving SSE register to integer.  */
 976   4, 4,                                 /* Gather load static, per_elt.  */
 977   4, 4,                                 /* Gather store static, per_elt.  */
 978   64,                                   /* size of l1 cache.  */
 979   256,                                  /* size of l2 cache.  */
 980   64,                                   /* size of prefetch block */
 981   6,                                    /* number of parallel prefetches */
 982   5,                                    /* Branch cost */
 983   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 984   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 985   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
 986   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 987   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 988   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 989
 990   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 991   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 992   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 993   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 994   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 995   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 996   /* 11-16  */
 997   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 998   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
 999   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1000   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
1001   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1002   athlon_memcpy,
1003   athlon_memset,
1004   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1005   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1006   "16:8:8",                             /* Loop alignment.  */
1007   "16:8:8",                             /* Jump alignment.  */
1008   "0:0:8",                              /* Label alignment.  */
1009   "16",                                 /* Func alignment.  */
1010 };
1011
/* K8 has an optimized REP instruction for medium-sized blocks, but for very
   small blocks it is better to use a loop.  For large blocks, a libcall can
   do non-temporal accesses and beat inline code considerably.  */
1015 static stringop_algs k8_memcpy[2] = {
1016   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1017              {-1, rep_prefix_4_byte, false}}},
1018   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1019              {-1, libcall, false}}}};
1020 static stringop_algs k8_memset[2] = {
1021   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1022              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1023   {libcall, {{48, unrolled_loop, false},
1024              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1025 static const
1026 struct processor_costs k8_cost = {
1027   {
1028   /* Start of register allocator costs.  integer->integer move cost is 2. */
1029   4,                                 /* cost for loading QImode using movzbl */
1030   {3, 4, 3},                            /* cost of loading integer registers
1031                                            in QImode, HImode and SImode.
1032                                            Relative to reg-reg move (2).  */
1033   {3, 4, 3},                            /* cost of storing integer registers */
1034   4,                                    /* cost of reg,reg fld/fst */
1035   {4, 4, 12},                           /* cost of loading fp registers
1036                                            in SFmode, DFmode and XFmode */
1037   {6, 6, 8},                            /* cost of storing fp registers
1038                                            in SFmode, DFmode and XFmode */
1039   2,                                    /* cost of moving MMX register */
1040   {3, 3},                               /* cost of loading MMX registers
1041                                            in SImode and DImode */
1042   {4, 4},                               /* cost of storing MMX registers
1043                                            in SImode and DImode */
1044   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1045   {4, 3, 12, 12, 24},                   /* cost of loading SSE registers
1046                                            in 32,64,128,256 and 512-bit */
1047   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
1048                                            in 32,64,128,256 and 512-bit */
1049   5, 5,                         /* SSE->integer and integer->SSE moves */
1050   5, 5,                         /* mask->integer and integer->mask moves */
1051   {3, 4, 3},                            /* cost of loading mask register
1052                                            in QImode, HImode, SImode.  */
1053   {3, 4, 3},                            /* cost if storing mask register
1054                                            in QImode, HImode, SImode.  */
1055   2,                                    /* cost of moving mask register.  */
1056   /* End of register allocator costs.  */
1057   },
1058
1059   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1060   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1061   COSTS_N_INSNS (1),                    /* variable shift costs */
1062   COSTS_N_INSNS (1),                    /* constant shift costs */
1063   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1064    COSTS_N_INSNS (4),                   /*                               HI */
1065    COSTS_N_INSNS (3),                   /*                               SI */
1066    COSTS_N_INSNS (4),                   /*                               DI */
1067    COSTS_N_INSNS (5)},                  /*                            other */
1068   0,                                    /* cost of multiply per each bit set */
1069   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1070    COSTS_N_INSNS (26),                  /*                          HI */
1071    COSTS_N_INSNS (42),                  /*                          SI */
1072    COSTS_N_INSNS (74),                  /*                          DI */
1073    COSTS_N_INSNS (74)},                 /*                          other */
1074   COSTS_N_INSNS (1),                    /* cost of movsx */
1075   COSTS_N_INSNS (1),                    /* cost of movzx */
1076   8,                                    /* "large" insn */
1077   9,                                    /* MOVE_RATIO */
1078   6,                                    /* CLEAR_RATIO */
1079   {3, 4, 3},                            /* cost of loading integer registers
1080                                            in QImode, HImode and SImode.
1081                                            Relative to reg-reg move (2).  */
1082   {3, 4, 3},                            /* cost of storing integer registers */
1083   {4, 3, 12, 12, 24},                   /* cost of loading SSE register
1084                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1085   {4, 4, 10, 10, 20},                   /* cost of storing SSE register
1086                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1087   {4, 3, 12, 12, 24},                   /* cost of unaligned loads.  */
1088   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
1089   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1090   5,                                    /* cost of moving SSE register to integer.  */
1091   4, 4,                                 /* Gather load static, per_elt.  */
1092   4, 4,                                 /* Gather store static, per_elt.  */
1093   64,                                   /* size of l1 cache.  */
1094   512,                                  /* size of l2 cache.  */
1095   64,                                   /* size of prefetch block */
1096   /* New AMD processors never drop prefetches; if they cannot be performed
1097      immediately, they are queued.  We set number of simultaneous prefetches
1098      to a large constant to reflect this (it probably is not a good idea not
1099      to limit number of prefetches at all, as their execution also takes some
1100      time).  */
1101   100,                                  /* number of parallel prefetches */
1102   3,                                    /* Branch cost */
1103   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1104   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1105   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1106   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1107   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1108   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1109
1110   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1111   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1112   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1113   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1114   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1115   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1116   /* 11-16  */
1117   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1118   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1119   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1120   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1121   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1122   k8_memcpy,
1123   k8_memset,
1124   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1125   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1126   "16:8:8",                             /* Loop alignment.  */
1127   "16:8:8",                             /* Jump alignment.  */
1128   "0:0:8",                              /* Label alignment.  */
1129   "16",                                 /* Func alignment.  */
1130 };
1131
/* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably.  */
1135 static stringop_algs amdfam10_memcpy[2] = {
1136   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1137              {-1, rep_prefix_4_byte, false}}},
1138   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1139              {-1, libcall, false}}}};
1140 static stringop_algs amdfam10_memset[2] = {
1141   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1142              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1143   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1144              {-1, libcall, false}}}};
1145 struct processor_costs amdfam10_cost = {
1146   {
1147   /* Start of register allocator costs.  integer->integer move cost is 2. */
1148   4,                                 /* cost for loading QImode using movzbl */
1149   {3, 4, 3},                            /* cost of loading integer registers
1150                                            in QImode, HImode and SImode.
1151                                            Relative to reg-reg move (2).  */
1152   {3, 4, 3},                            /* cost of storing integer registers */
1153   4,                                    /* cost of reg,reg fld/fst */
1154   {4, 4, 12},                           /* cost of loading fp registers
1155                                            in SFmode, DFmode and XFmode */
1156   {6, 6, 8},                            /* cost of storing fp registers
1157                                            in SFmode, DFmode and XFmode */
1158   2,                                    /* cost of moving MMX register */
1159   {3, 3},                               /* cost of loading MMX registers
1160                                            in SImode and DImode */
1161   {4, 4},                               /* cost of storing MMX registers
1162                                            in SImode and DImode */
1163   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1164   {4, 4, 3, 6, 12},                     /* cost of loading SSE registers
1165                                            in 32,64,128,256 and 512-bit */
1166   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
1167                                            in 32,64,128,256 and 512-bit */
1168   3, 3,                         /* SSE->integer and integer->SSE moves */
1169   3, 3,                         /* mask->integer and integer->mask moves */
1170   {3, 4, 3},                            /* cost of loading mask register
1171                                            in QImode, HImode, SImode.  */
1172   {3, 4, 3},                            /* cost if storing mask register
1173                                            in QImode, HImode, SImode.  */
1174   2,                                    /* cost of moving mask register.  */
1175
1176                                         /* On K8:
1177                                             MOVD reg64, xmmreg Double FSTORE 4
1178                                             MOVD reg32, xmmreg Double FSTORE 4
1179                                            On AMDFAM10:
1180                                             MOVD reg64, xmmreg Double FADD 3
1181                                                                1/1  1/1
1182                                             MOVD reg32, xmmreg Double FADD 3
1183                                                                1/1  1/1 */
1184   /* End of register allocator costs.  */
1185   },
1186
1187   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1188   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1189   COSTS_N_INSNS (1),                    /* variable shift costs */
1190   COSTS_N_INSNS (1),                    /* constant shift costs */
1191   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1192    COSTS_N_INSNS (4),                   /*                               HI */
1193    COSTS_N_INSNS (3),                   /*                               SI */
1194    COSTS_N_INSNS (4),                   /*                               DI */
1195    COSTS_N_INSNS (5)},                  /*                            other */
1196   0,                                    /* cost of multiply per each bit set */
1197   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1198    COSTS_N_INSNS (35),                  /*                          HI */
1199    COSTS_N_INSNS (51),                  /*                          SI */
1200    COSTS_N_INSNS (83),                  /*                          DI */
1201    COSTS_N_INSNS (83)},                 /*                          other */
1202   COSTS_N_INSNS (1),                    /* cost of movsx */
1203   COSTS_N_INSNS (1),                    /* cost of movzx */
1204   8,                                    /* "large" insn */
1205   9,                                    /* MOVE_RATIO */
1206   6,                                    /* CLEAR_RATIO */
1207   {3, 4, 3},                            /* cost of loading integer registers
1208                                            in QImode, HImode and SImode.
1209                                            Relative to reg-reg move (2).  */
1210   {3, 4, 3},                            /* cost of storing integer registers */
1211   {4, 4, 3, 6, 12},                     /* cost of loading SSE register
1212                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1213   {4, 4, 5, 10, 20},                    /* cost of storing SSE register
1214                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1215   {4, 4, 3, 7, 12},                     /* cost of unaligned loads.  */
1216   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
1217   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1218   3,                                    /* cost of moving SSE register to integer.  */
1219   4, 4,                                 /* Gather load static, per_elt.  */
1220   4, 4,                                 /* Gather store static, per_elt.  */
1221   64,                                   /* size of l1 cache.  */
1222   512,                                  /* size of l2 cache.  */
1223   64,                                   /* size of prefetch block */
1224   /* New AMD processors never drop prefetches; if they cannot be performed
1225      immediately, they are queued.  We set number of simultaneous prefetches
1226      to a large constant to reflect this (it probably is not a good idea not
1227      to limit number of prefetches at all, as their execution also takes some
1228      time).  */
1229   100,                                  /* number of parallel prefetches */
1230   2,                                    /* Branch cost */
1231   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1232   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1233   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1234   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1235   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1236   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1237
1238   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1239   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1240   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1241   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1242   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1243   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1244   /* 11-16  */
1245   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1246   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1247   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1248   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1249   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1250   amdfam10_memcpy,
1251   amdfam10_memset,
1252   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1253   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1254   "32:25:8",                            /* Loop alignment.  */
1255   "32:8:8",                             /* Jump alignment.  */
1256   "0:0:8",                              /* Label alignment.  */
1257   "32",                                 /* Func alignment.  */
1258 };
1259
/* BDVER has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a libcall
   can do non-temporal accesses and beat inline code considerably.  */
1263 static stringop_algs bdver_memcpy[2] = {
1264   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1265              {-1, rep_prefix_4_byte, false}}},
1266   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1267              {-1, libcall, false}}}};
1268 static stringop_algs bdver_memset[2] = {
1269   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1270              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1271   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1272              {-1, libcall, false}}}};
1273
1274 const struct processor_costs bdver_cost = {
1275   {
1276   /* Start of register allocator costs.  integer->integer move cost is 2. */
1277   8,                                 /* cost for loading QImode using movzbl */
1278   {8, 8, 8},                            /* cost of loading integer registers
1279                                            in QImode, HImode and SImode.
1280                                            Relative to reg-reg move (2).  */
1281   {8, 8, 8},                            /* cost of storing integer registers */
1282   4,                                    /* cost of reg,reg fld/fst */
1283   {12, 12, 28},                         /* cost of loading fp registers
1284                                            in SFmode, DFmode and XFmode */
1285   {10, 10, 18},                         /* cost of storing fp registers
1286                                            in SFmode, DFmode and XFmode */
1287   4,                                    /* cost of moving MMX register */
1288   {12, 12},                             /* cost of loading MMX registers
1289                                            in SImode and DImode */
1290   {10, 10},                             /* cost of storing MMX registers
1291                                            in SImode and DImode */
1292   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1293   {12, 12, 10, 40, 60},                 /* cost of loading SSE registers
1294                                            in 32,64,128,256 and 512-bit */
1295   {10, 10, 10, 40, 60},                 /* cost of storing SSE registers
1296                                            in 32,64,128,256 and 512-bit */
1297   16, 20,                               /* SSE->integer and integer->SSE moves */
1298   16, 20,                               /* mask->integer and integer->mask moves */
1299   {8, 8, 8},                            /* cost of loading mask register
1300                                            in QImode, HImode, SImode.  */
1301   {8, 8, 8},                            /* cost if storing mask register
1302                                            in QImode, HImode, SImode.  */
1303   2,                                    /* cost of moving mask register.  */
1304   /* End of register allocator costs.  */
1305   },
1306
1307   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1308   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1309   COSTS_N_INSNS (1),                    /* variable shift costs */
1310   COSTS_N_INSNS (1),                    /* constant shift costs */
1311   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1312    COSTS_N_INSNS (4),                   /*                               HI */
1313    COSTS_N_INSNS (4),                   /*                               SI */
1314    COSTS_N_INSNS (6),                   /*                               DI */
1315    COSTS_N_INSNS (6)},                  /*                            other */
1316   0,                                    /* cost of multiply per each bit set */
1317   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1318    COSTS_N_INSNS (35),                  /*                          HI */
1319    COSTS_N_INSNS (51),                  /*                          SI */
1320    COSTS_N_INSNS (83),                  /*                          DI */
1321    COSTS_N_INSNS (83)},                 /*                          other */
1322   COSTS_N_INSNS (1),                    /* cost of movsx */
1323   COSTS_N_INSNS (1),                    /* cost of movzx */
1324   8,                                    /* "large" insn */
1325   9,                                    /* MOVE_RATIO */
1326   6,                                    /* CLEAR_RATIO */
1327   {8, 8, 8},                            /* cost of loading integer registers
1328                                            in QImode, HImode and SImode.
1329                                            Relative to reg-reg move (2).  */
1330   {8, 8, 8},                            /* cost of storing integer registers */
1331   {12, 12, 10, 40, 60},                 /* cost of loading SSE register
1332                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1333   {10, 10, 10, 40, 60},                 /* cost of storing SSE register
1334                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1335   {12, 12, 10, 40, 60},                 /* cost of unaligned loads.  */
1336   {10, 10, 10, 40, 60},                 /* cost of unaligned stores.  */
1337   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1338   16,                                   /* cost of moving SSE register to integer.  */
1339   12, 12,                               /* Gather load static, per_elt.  */
1340   10, 10,                               /* Gather store static, per_elt.  */
1341   16,                                   /* size of l1 cache.  */
1342   2048,                                 /* size of l2 cache.  */
1343   64,                                   /* size of prefetch block */
1344   /* New AMD processors never drop prefetches; if they cannot be performed
1345      immediately, they are queued.  We set number of simultaneous prefetches
1346      to a large constant to reflect this (it probably is not a good idea not
1347      to limit number of prefetches at all, as their execution also takes some
1348      time).  */
1349   100,                                  /* number of parallel prefetches */
1350   2,                                    /* Branch cost */
1351   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1352   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1353   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1354   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1355   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1356   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1357
1358   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1359   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1360   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1361   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1362   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1363   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1364   /* 9-24  */
1365   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1366   /* 9-27  */
1367   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1368   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1369   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1370   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1371   bdver_memcpy,
1372   bdver_memset,
1373   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1374   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1375   "16:11:8",                            /* Loop alignment.  */
1376   "16:8:8",                             /* Jump alignment.  */
1377   "0:0:8",                              /* Label alignment.  */
1378   "11",                                 /* Func alignment.  */
1379 };
1380
1381
1382 /*  ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1383     very small blocks it is better to use a loop.  For large blocks, libcall
1384     can do non-temporal accesses and beat inline considerably.  */
1385 static stringop_algs znver1_memcpy[2] = {  /* memcpy strategy table for ZNVER1 tuning; each entry looks like {size bound, algorithm, flag}, with -1 presumably meaning "no upper bound" -- confirm against stringop_algs.  */
1386   /* 32-bit tuning: loop, then unrolled_loop, then libcall; no REP variant is used in this row.  */
1387   {libcall, {{6, loop, false},
1388              {14, unrolled_loop, false},
1389              {-1, libcall, false}}},
1390   /* 64-bit tuning: loop, then rep_prefix_8_byte, then libcall for everything larger.  */
1391   {libcall, {{16, loop, false},
1392              {128, rep_prefix_8_byte, false},
1393              {-1, libcall, false}}}};
1394 static stringop_algs znver1_memset[2] = {  /* memset strategy table for ZNVER1 tuning; each entry looks like {size bound, algorithm, flag}, with -1 presumably meaning "no upper bound" -- confirm against stringop_algs.  */
1395   /* 32-bit tuning: loop, then unrolled_loop, then rep_prefix_4_byte, then libcall.  */
1396   {libcall, {{8, loop, false},
1397              {24, unrolled_loop, false},
1398              {128, rep_prefix_4_byte, false},
1399              {-1, libcall, false}}},
1400   /* 64-bit tuning: unrolled_loop, then rep_prefix_8_byte, then libcall; no plain loop entry.  */
1401   {libcall, {{48, unrolled_loop, false},
1402              {128, rep_prefix_8_byte, false},
1403              {-1, libcall, false}}}};
1404 struct processor_costs znver1_cost = {  /* Tuning costs for ZNVER1; positional initializer, so entry order is significant.  */
1405   {
1406   /* Start of register allocator costs.  integer->integer move cost is 2. */
1407
1408   /* reg-reg moves are done by renaming and thus they are even cheaper than
1409      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1410      to doubles of latencies, we do not model this correctly.  It does not
1411      seem to make practical difference to bump prices up even more.  */
1412   6,                                    /* cost for loading QImode using
1413                                            movzbl.  */
1414   {6, 6, 6},                            /* cost of loading integer registers
1415                                            in QImode, HImode and SImode.
1416                                            Relative to reg-reg move (2).  */
1417   {8, 8, 8},                            /* cost of storing integer
1418                                            registers.  */
1419   2,                                    /* cost of reg,reg fld/fst.  */
1420   {6, 6, 16},                           /* cost of loading fp registers
1421                                            in SFmode, DFmode and XFmode.  */
1422   {8, 8, 16},                           /* cost of storing fp registers
1423                                            in SFmode, DFmode and XFmode.  */
1424   2,                                    /* cost of moving MMX register.  */
1425   {6, 6},                               /* cost of loading MMX registers
1426                                            in SImode and DImode.  */
1427   {8, 8},                               /* cost of storing MMX registers
1428                                            in SImode and DImode.  */
1429   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1430   {6, 6, 6, 12, 24},                    /* cost of loading SSE registers
1431                                            in 32,64,128,256 and 512-bit.  */
1432   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
1433                                            in 32,64,128,256 and 512-bit.  */
1434   6, 6,                         /* SSE->integer and integer->SSE moves.  */
1435   8, 8,                         /* mask->integer and integer->mask moves */
1436   {6, 6, 6},                            /* cost of loading mask register
1437                                            in QImode, HImode, SImode.  */
1438   {8, 8, 8},                            /* cost of storing mask register
1439                                            in QImode, HImode, SImode.  */
1440   2,                                    /* cost of moving mask register.  */
1441   /* End of register allocator costs.  */
1442   },
1443
1444   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1445   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1446   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1447   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1448   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1449    COSTS_N_INSNS (3),                   /*                               HI.  */
1450    COSTS_N_INSNS (3),                   /*                               SI.  */
1451    COSTS_N_INSNS (3),                   /*                               DI.  */
1452    COSTS_N_INSNS (3)},                  /*                            other.  */
1453   0,                                    /* cost of multiply per each bit
1454                                             set.  */
1455    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1456       bound.  */
1457   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1458    COSTS_N_INSNS (22),                  /*                          HI.  */
1459    COSTS_N_INSNS (30),                  /*                          SI.  */
1460    COSTS_N_INSNS (45),                  /*                          DI.  */
1461    COSTS_N_INSNS (45)},                 /*                          other.  */
1462   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1463   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1464   8,                                    /* "large" insn.  */
1465   9,                                    /* MOVE_RATIO.  */
1466   6,                                    /* CLEAR_RATIO */
1467   {6, 6, 6},                            /* cost of loading integer registers
1468                                            in QImode, HImode and SImode.
1469                                            Relative to reg-reg move (2).  */
1470   {8, 8, 8},                            /* cost of storing integer
1471                                            registers.  */
1472   {6, 6, 6, 12, 24},                    /* cost of loading SSE register
1473                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1474   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
1475                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1476   {6, 6, 6, 12, 24},                    /* cost of unaligned loads.  */
1477   {8, 8, 8, 16, 32},                    /* cost of unaligned stores.  */
1478   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1479   6,                                    /* cost of moving SSE register to integer.  */
1480   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1481      throughput 12.  Approx 9 uops do not depend on vector size and every load
1482      is 7 uops.  NOTE(review): VGATHERDPD is named twice with different figures; the second is presumably another gather form or width -- confirm.  */
1483   18, 8,                                /* Gather load static, per_elt.  */
1484   18, 10,                               /* Gather store static, per_elt.  */
1485   32,                                   /* size of l1 cache.  */
1486   512,                                  /* size of l2 cache.  */
1487   64,                                   /* size of prefetch block.  */
1488   /* New AMD processors never drop prefetches; if they cannot be performed
1489      immediately, they are queued.  We set number of simultaneous prefetches
1490      to a large constant to reflect this (it probably is not a good idea not
1491      to limit number of prefetches at all, as their execution also takes some
1492      time).  */
1493   100,                                  /* number of parallel prefetches.  */
1494   3,                                    /* Branch cost.  */
1495   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1496   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1497   /* Latency of fdiv is 8-15.  */
1498   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1499   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1500   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1501   /* Latency of fsqrt is 4-10.  */
1502   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1503
1504   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1505   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1506   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1507   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1508   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1509   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1510   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1511   /* 9-13  */
1512   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1513   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1514   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1515   /* Zen can execute 4 integer operations per cycle.  FP operations take 3 cycles
1516      and it can execute 2 integer additions and 2 multiplications thus
1517      reassociation may make sense up to width of 6.  SPEC2k6 benchmarks suggest
1518      that 4 works better than 6 probably due to register pressure.
1519
1520      Integer vector operations are taken by FP unit and execute 3 vector
1521      plus/minus operations per cycle but only one multiply.  This is adjusted
1522      in ix86_reassociation_width.  */
1523   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1524   znver1_memcpy,
1525   znver1_memset,
1526   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1527   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1528   "16",                                 /* Loop alignment.  */
1529   "16",                                 /* Jump alignment.  */
1530   "0:0:8",                              /* Label alignment.  */
1531   "16",                                 /* Func alignment.  */
1532 };
1533
1534 /*  ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
1535     very small blocks it is better to use a loop.  For large blocks, libcall
1536     can do non-temporal accesses and beat inline considerably.  */
1537 static stringop_algs znver2_memcpy[2] = {  /* memcpy strategy table for ZNVER2 tuning; each entry looks like {size bound, algorithm, flag}, with -1 presumably meaning "no upper bound" -- confirm against stringop_algs.  */
1538   /* 32-bit tuning: loop, then unrolled_loop, then libcall; no REP variant is used in this row.  */
1539   {libcall, {{6, loop, false},
1540              {14, unrolled_loop, false},
1541              {-1, libcall, false}}},
1542   /* 64-bit tuning: loop, then rep_prefix_4_byte, then libcall for everything larger.  */
1543   {libcall, {{16, loop, false},
1544              {64, rep_prefix_4_byte, false},
1545              {-1, libcall, false}}}};
1546 static stringop_algs znver2_memset[2] = {  /* memset strategy table for ZNVER2 tuning; each entry looks like {size bound, algorithm, flag}, with -1 presumably meaning "no upper bound" -- confirm against stringop_algs.  */
1547   /* 32-bit tuning: loop, then unrolled_loop, then rep_prefix_4_byte, then libcall.  */
1548   {libcall, {{8, loop, false},
1549              {24, unrolled_loop, false},
1550              {128, rep_prefix_4_byte, false},
1551              {-1, libcall, false}}},
1552   /* 64-bit tuning: rep_prefix_4_byte, then rep_prefix_8_byte, then libcall; no loop entries.  */
1553   {libcall, {{24, rep_prefix_4_byte, false},
1554              {128, rep_prefix_8_byte, false},
1555              {-1, libcall, false}}}};
1556
1557 struct processor_costs znver2_cost = {  /* Tuning costs for ZNVER2; positional initializer, so entry order is significant.  */
1558   {
1559   /* Start of register allocator costs.  integer->integer move cost is 2. */
1560
1561   /* reg-reg moves are done by renaming and thus they are even cheaper than
1562      1 cycle.  Because reg-reg move cost is 2 and following tables correspond
1563      to doubles of latencies, we do not model this correctly.  It does not
1564      seem to make practical difference to bump prices up even more.  */
1565   6,                                    /* cost for loading QImode using
1566                                            movzbl.  */
1567   {6, 6, 6},                            /* cost of loading integer registers
1568                                            in QImode, HImode and SImode.
1569                                            Relative to reg-reg move (2).  */
1570   {8, 8, 8},                            /* cost of storing integer
1571                                            registers.  */
1572   2,                                    /* cost of reg,reg fld/fst.  */
1573   {6, 6, 16},                           /* cost of loading fp registers
1574                                            in SFmode, DFmode and XFmode.  */
1575   {8, 8, 16},                           /* cost of storing fp registers
1576                                            in SFmode, DFmode and XFmode.  */
1577   2,                                    /* cost of moving MMX register.  */
1578   {6, 6},                               /* cost of loading MMX registers
1579                                            in SImode and DImode.  */
1580   {8, 8},                               /* cost of storing MMX registers
1581                                            in SImode and DImode.  */
1582   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1583                                            register.  */
1584   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1585                                            in 32,64,128,256 and 512-bit.  */
1586   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1587                                            in 32,64,128,256 and 512-bit.  */
1588   6, 6,                                 /* SSE->integer and integer->SSE
1589                                            moves.  */
1590   8, 8,                         /* mask->integer and integer->mask moves */
1591   {6, 6, 6},                            /* cost of loading mask register
1592                                            in QImode, HImode, SImode.  */
1593   {8, 8, 8},                            /* cost of storing mask register
1594                                            in QImode, HImode, SImode.  */
1595   2,                                    /* cost of moving mask register.  */
1596   /* End of register allocator costs.  */
1597   },
1598
1599   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1600   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1601   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1602   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1603   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1604    COSTS_N_INSNS (3),                   /*                               HI.  */
1605    COSTS_N_INSNS (3),                   /*                               SI.  */
1606    COSTS_N_INSNS (3),                   /*                               DI.  */
1607    COSTS_N_INSNS (3)},                  /*                      other.  */
1608   0,                                    /* cost of multiply per each bit
1609                                            set.  */
1610    /* Depending on parameters, idiv can get faster on ryzen.  This is upper
1611       bound.  */
1612   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1613    COSTS_N_INSNS (22),                  /*                          HI.  */
1614    COSTS_N_INSNS (30),                  /*                          SI.  */
1615    COSTS_N_INSNS (45),                  /*                          DI.  */
1616    COSTS_N_INSNS (45)},                 /*                          other.  */
1617   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1618   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1619   8,                                    /* "large" insn.  */
1620   9,                                    /* MOVE_RATIO.  */
1621   6,                                    /* CLEAR_RATIO */
1622   {6, 6, 6},                            /* cost of loading integer registers
1623                                            in QImode, HImode and SImode.
1624                                            Relative to reg-reg move (2).  */
1625   {8, 8, 8},                            /* cost of storing integer
1626                                            registers.  */
1627   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
1628                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1629   {8, 8, 8, 8, 16},                     /* cost of storing SSE register
1630                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1631   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
1632   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1633   2, 2, 3,                              /* cost of moving XMM,YMM,ZMM
1634                                            register.  */
1635   6,                                    /* cost of moving SSE register to integer.  */
1636   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1637      throughput 12.  Approx 9 uops do not depend on vector size and every load
1638      is 7 uops.  NOTE(review): VGATHERDPD is named twice with different figures; the second is presumably another gather form or width -- confirm.  */
1639   18, 8,                                /* Gather load static, per_elt.  */
1640   18, 10,                               /* Gather store static, per_elt.  */
1641   32,                                   /* size of l1 cache.  */
1642   512,                                  /* size of l2 cache.  */
1643   64,                                   /* size of prefetch block.  */
1644   /* New AMD processors never drop prefetches; if they cannot be performed
1645      immediately, they are queued.  We set number of simultaneous prefetches
1646      to a large constant to reflect this (it probably is not a good idea not
1647      to limit number of prefetches at all, as their execution also takes some
1648      time).  */
1649   100,                                  /* number of parallel prefetches.  */
1650   3,                                    /* Branch cost.  */
1651   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1652   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1653   /* Latency of fdiv is 8-15.  */
1654   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1655   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1656   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1657   /* Latency of fsqrt is 4-10.  */
1658   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1659
1660   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1661   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1662   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1663   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
1664   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1665   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1666   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1667   /* 9-13.  */
1668   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1669   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1670   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1671   /* Zen can execute 4 integer operations per cycle.  FP operations
1672      take 3 cycles and it can execute 2 integer additions and 2
1673      multiplications thus reassociation may make sense up to width of 6.
1674      SPEC2k6 benchmarks suggest
1675      that 4 works better than 6 probably due to register pressure.
1676
1677      Integer vector operations are taken by FP unit and execute 3 vector
1678      plus/minus operations per cycle but only one multiply.  This is adjusted
1679      in ix86_reassociation_width.  */
1680   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1681   znver2_memcpy,
1682   znver2_memset,
1683   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1684   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1685   "16",                                 /* Loop alignment.  */
1686   "16",                                 /* Jump alignment.  */
1687   "0:0:8",                              /* Label alignment.  */
1688   "16",                                 /* Func alignment.  */
1689 };
1690
1691 /* skylake_cost should produce code tuned for the Skylake family of CPUs.  */
1692 static stringop_algs skylake_memcpy[2] =   {  /* memcpy strategy table for Skylake tuning; each entry looks like {size bound, algorithm, flag}, -1 presumably meaning "no upper bound" -- confirm.  */
1693   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},  /* presumably the 32-bit row, by analogy with the znver tables -- confirm.  */
1694   {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1695              {-1, libcall, false}}}};  /* presumably the 64-bit row: loop, unrolled_loop, then libcall -- confirm.  */
1696
1697 static stringop_algs skylake_memset[2] = {  /* memset strategy table for Skylake tuning; row order presumably 32-bit then 64-bit, by analogy with the znver tables -- confirm.  */
1698   {libcall, {{6, loop_1_byte, true},
1699              {24, loop, true},
1700              {8192, rep_prefix_4_byte, true},
1701              {-1, libcall, false}}},  /* first row: loop_1_byte, loop, rep_prefix_4_byte, then libcall.  */
1702   {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1703              {-1, libcall, false}}}};  /* second row: loop, unrolled_loop, then libcall; no REP variant.  */
1704
1705 static const
1706 struct processor_costs skylake_cost = {  /* Tuning costs for Skylake; positional initializer, so entry order is significant.  */
1707   {
1708   /* Start of register allocator costs.  integer->integer move cost is 2. */
1709   6,                                 /* cost for loading QImode using movzbl */
1710   {4, 4, 4},                            /* cost of loading integer registers
1711                                            in QImode, HImode and SImode.
1712                                            Relative to reg-reg move (2).  */
1713   {6, 6, 6},                            /* cost of storing integer registers */
1714   2,                                    /* cost of reg,reg fld/fst */
1715   {6, 6, 8},                            /* cost of loading fp registers
1716                                            in SFmode, DFmode and XFmode */
1717   {6, 6, 10},                           /* cost of storing fp registers
1718                                            in SFmode, DFmode and XFmode */
1719   2,                                    /* cost of moving MMX register */
1720   {6, 6},                               /* cost of loading MMX registers
1721                                            in SImode and DImode */
1722   {6, 6},                               /* cost of storing MMX registers
1723                                            in SImode and DImode */
1724   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
1725   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
1726                                            in 32,64,128,256 and 512-bit */
1727   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
1728                                            in 32,64,128,256 and 512-bit */
1729   6, 6,                         /* SSE->integer and integer->SSE moves */
1730   5, 5,                         /* mask->integer and integer->mask moves */
1731   {8, 8, 8},                            /* cost of loading mask register
1732                                            in QImode, HImode, SImode.  */
1733   {6, 6, 6},                            /* cost of storing mask register
1734                                            in QImode, HImode, SImode.  */
1735   3,                                    /* cost of moving mask register.  */
1736   /* End of register allocator costs.  */
1737   },
1738
1739   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1740   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction (one unit above an add) */
1741   COSTS_N_INSNS (1),                    /* variable shift costs */
1742   COSTS_N_INSNS (1),                    /* constant shift costs */
1743   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1744    COSTS_N_INSNS (4),                   /*                               HI */
1745    COSTS_N_INSNS (3),                   /*                               SI */
1746    COSTS_N_INSNS (3),                   /*                               DI */
1747    COSTS_N_INSNS (3)},                  /*                            other */
1748   0,                                    /* cost of multiply per each bit set */
1749   /* Expanding div/mod currently doesn't consider parallelism. So the cost
1750      model is not realistic. We compensate by increasing the latencies a bit.  */
1751   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
1752    COSTS_N_INSNS (11),                  /*                          HI */
1753    COSTS_N_INSNS (14),                  /*                          SI */
1754    COSTS_N_INSNS (76),                  /*                          DI */
1755    COSTS_N_INSNS (76)},                 /*                          other */
1756   COSTS_N_INSNS (1),                    /* cost of movsx */
1757   COSTS_N_INSNS (0),                    /* cost of movzx */
1758   8,                                    /* "large" insn */
1759   17,                                   /* MOVE_RATIO */
1760   6,                                    /* CLEAR_RATIO */
1761   {4, 4, 4},                            /* cost of loading integer registers
1762                                            in QImode, HImode and SImode.
1763                                            Relative to reg-reg move (2).  */
1764   {6, 6, 6},                            /* cost of storing integer registers */
1765   {6, 6, 6, 10, 20},                    /* cost of loading SSE register
1766                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1767   {8, 8, 8, 12, 24},                    /* cost of storing SSE register
1768                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1769   {6, 6, 6, 10, 20},                    /* cost of unaligned loads.  */
1770   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  NOTE(review): the 256/512-bit values are below the aligned store costs above -- confirm intended.  */
1771   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
1772   6,                                    /* cost of moving SSE register to integer.  */
1773   20, 8,                                /* Gather load static, per_elt.  */
1774   22, 10,                               /* Gather store static, per_elt.  */
1775   64,                                   /* size of l1 cache.  */
1776   512,                                  /* size of l2 cache.  */
1777   64,                                   /* size of prefetch block */
1778   6,                                    /* number of parallel prefetches */
1779   3,                                    /* Branch cost */
1780   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
1781   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1782   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1783   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1784   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1785   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
1786
1787   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1788   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1789   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1790   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1791   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
1792   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
1793   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
1794   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
1795   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
1796   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
1797   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
1798   skylake_memcpy,
1799   skylake_memset,
1800   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1801   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1802   "16:11:8",                            /* Loop alignment.  */
1803   "16:11:8",                            /* Jump alignment.  */
1804   "0:0:8",                              /* Label alignment.  */
1805   "16",                                 /* Func alignment.  */
1806 };
1807   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1808      very small blocks it is better to use a loop.  For large blocks, libcall can
1809      do non-temporal accesses and beat inline considerably.  */
1810 static stringop_algs btver1_memcpy[2] = {  /* memcpy strategy table for BTVER1 tuning; row order presumably 32-bit then 64-bit, by analogy with the znver tables -- confirm.  */
1811   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1812              {-1, rep_prefix_4_byte, false}}},  /* first row ends in rep_prefix_4_byte rather than libcall for the -1 entry.  */
1813   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1814              {-1, libcall, false}}}};  /* second row: loop, rep_prefix_8_byte, then libcall.  */
1815 static stringop_algs btver1_memset[2] = {  /* memset strategy table for BTVER1 tuning; row order presumably 32-bit then 64-bit, by analogy with the znver tables -- confirm.  */
1816   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1817              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* first row: loop, unrolled_loop, rep_prefix_4_byte, then libcall.  */
1818   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1819              {-1, libcall, false}}}};  /* second row: unrolled_loop, rep_prefix_8_byte, then libcall.  */
1820 const struct processor_costs btver1_cost = {
1821   {
1822   /* Start of register allocator costs.  integer->integer move cost is 2. */
1823   8,                                 /* cost for loading QImode using movzbl */
1824   {6, 8, 6},                            /* cost of loading integer registers
1825                                            in QImode, HImode and SImode.
1826                                            Relative to reg-reg move (2).  */
1827   {6, 8, 6},                            /* cost of storing integer registers */
1828   4,                                    /* cost of reg,reg fld/fst */
1829   {12, 12, 28},                         /* cost of loading fp registers
1830                                            in SFmode, DFmode and XFmode */
1831   {12, 12, 38},                         /* cost of storing fp registers
1832                                            in SFmode, DFmode and XFmode */
1833   4,                                    /* cost of moving MMX register */
1834   {10, 10},                             /* cost of loading MMX registers
1835                                            in SImode and DImode */
1836   {12, 12},                             /* cost of storing MMX registers
1837                                            in SImode and DImode */
1838   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1839   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
1840                                            in 32,64,128,256 and 512-bit */
1841   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
1842                                            in 32,64,128,256 and 512-bit */
1843   14, 14,                               /* SSE->integer and integer->SSE moves */
1844   14, 14,                               /* mask->integer and integer->mask moves */
1845   {6, 8, 6},                            /* cost of loading mask register
1846                                            in QImode, HImode, SImode.  */
1847   {6, 8, 6},                            /* cost if storing mask register
1848                                            in QImode, HImode, SImode.  */
1849   2,                                    /* cost of moving mask register.  */
1850   /* End of register allocator costs.  */
1851   },
1852
1853   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1854   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1855   COSTS_N_INSNS (1),                    /* variable shift costs */
1856   COSTS_N_INSNS (1),                    /* constant shift costs */
1857   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1858    COSTS_N_INSNS (4),                   /*                               HI */
1859    COSTS_N_INSNS (3),                   /*                               SI */
1860    COSTS_N_INSNS (4),                   /*                               DI */
1861    COSTS_N_INSNS (5)},                  /*                            other */
1862   0,                                    /* cost of multiply per each bit set */
1863   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1864    COSTS_N_INSNS (35),                  /*                          HI */
1865    COSTS_N_INSNS (51),                  /*                          SI */
1866    COSTS_N_INSNS (83),                  /*                          DI */
1867    COSTS_N_INSNS (83)},                 /*                          other */
1868   COSTS_N_INSNS (1),                    /* cost of movsx */
1869   COSTS_N_INSNS (1),                    /* cost of movzx */
1870   8,                                    /* "large" insn */
1871   9,                                    /* MOVE_RATIO */
1872   6,                                    /* CLEAR_RATIO */
1873   {6, 8, 6},                            /* cost of loading integer registers
1874                                            in QImode, HImode and SImode.
1875                                            Relative to reg-reg move (2).  */
1876   {6, 8, 6},                            /* cost of storing integer registers */
1877   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
1878                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1879   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
1880                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1881   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
1882   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
1883   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1884   14,                                   /* cost of moving SSE register to integer.  */
1885   10, 10,                               /* Gather load static, per_elt.  */
1886   10, 10,                               /* Gather store static, per_elt.  */
1887   32,                                   /* size of l1 cache.  */
1888   512,                                  /* size of l2 cache.  */
1889   64,                                   /* size of prefetch block */
1890   100,                                  /* number of parallel prefetches */
1891   2,                                    /* Branch cost */
1892   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1893   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1894   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1895   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1896   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1897   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1898
1899   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1900   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1901   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1902   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1903   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1904   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1905   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1906   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
1907   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
1908   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
1909   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1910   btver1_memcpy,
1911   btver1_memset,
1912   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1913   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1914   "16:11:8",                            /* Loop alignment.  */
1915   "16:8:8",                             /* Jump alignment.  */
1916   "0:0:8",                              /* Label alignment.  */
1917   "11",                                 /* Func alignment.  */
1918 };
1919
/* String-operation algorithm table for memcpy under btver2 tuning;
   btver2_cost below refers to it.  Two stringop_algs entries are given.
   NOTE(review): each inner triple looks like {upper size bound, algorithm,
   noalign flag}, with -1 marking the open-ended last range, and the two
   entries presumably cover 32-bit and 64-bit code respectively -- confirm
   against the stringop_algs definition.  */
1920 static stringop_algs btver2_memcpy[2] = {
1921   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1922              {-1, rep_prefix_4_byte, false}}},
1923   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1924              {-1, libcall, false}}}};
/* String-operation algorithm table for memset under btver2 tuning;
   btver2_cost below refers to it.  Both entries end in a {-1, libcall}
   range.
   NOTE(review): the inner triples are presumably {upper size bound,
   algorithm, noalign flag} and the two entries presumably select between
   32-bit and 64-bit code -- confirm against the stringop_algs
   definition.  */
1925 static stringop_algs btver2_memset[2] = {
1926   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1927              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1928   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1929              {-1, libcall, false}}}};
/* Cost table for btver2 tuning.  The initializer is positional: each
   value fills the next field in declaration order, and the trailing
   comment on each line names that field, so entries must not be
   reordered.  Values wrapped in COSTS_N_INSNS are counted in instructions
   (the note at the top of this file assumes COSTS_N_INSNS (N) is (N)*4);
   the register allocator section is stated relative to an
   integer->integer move cost of 2.  */
1930 const struct processor_costs btver2_cost = {
1931   {
1932   /* Start of register allocator costs.  integer->integer move cost is 2. */
1933   8,                                 /* cost for loading QImode using movzbl */
1934   {8, 8, 6},                            /* cost of loading integer registers
1935                                            in QImode, HImode and SImode.
1936                                            Relative to reg-reg move (2).  */
1937   {8, 8, 6},                            /* cost of storing integer registers */
1938   4,                                    /* cost of reg,reg fld/fst */
1939   {12, 12, 28},                         /* cost of loading fp registers
1940                                            in SFmode, DFmode and XFmode */
1941   {12, 12, 38},                         /* cost of storing fp registers
1942                                            in SFmode, DFmode and XFmode */
1943   4,                                    /* cost of moving MMX register */
1944   {10, 10},                             /* cost of loading MMX registers
1945                                            in SImode and DImode */
1946   {12, 12},                             /* cost of storing MMX registers
1947                                            in SImode and DImode */
1948   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1949   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
1950                                            in 32,64,128,256 and 512-bit */
1951   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
1952                                            in 32,64,128,256 and 512-bit */
1953   14, 14,                               /* SSE->integer and integer->SSE moves */
1954   14, 14,                               /* mask->integer and integer->mask moves */
1955   {8, 8, 6},                            /* cost of loading mask register
1956                                            in QImode, HImode, SImode.  */
1957   {8, 8, 6},                            /* cost of storing mask register
1958                                            in QImode, HImode, SImode.  */
1959   2,                                    /* cost of moving mask register.  */
1960   /* End of register allocator costs.  */
1961   },
1962
1963   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1964   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1965   COSTS_N_INSNS (1),                    /* variable shift costs */
1966   COSTS_N_INSNS (1),                    /* constant shift costs */
1967   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1968    COSTS_N_INSNS (4),                   /*                               HI */
1969    COSTS_N_INSNS (3),                   /*                               SI */
1970    COSTS_N_INSNS (4),                   /*                               DI */
1971    COSTS_N_INSNS (5)},                  /*                            other */
1972   0,                                    /* cost of multiply per each bit set */
1973   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1974    COSTS_N_INSNS (35),                  /*                          HI */
1975    COSTS_N_INSNS (51),                  /*                          SI */
1976    COSTS_N_INSNS (83),                  /*                          DI */
1977    COSTS_N_INSNS (83)},                 /*                          other */
1978   COSTS_N_INSNS (1),                    /* cost of movsx */
1979   COSTS_N_INSNS (1),                    /* cost of movzx */
1980   8,                                    /* "large" insn */
1981   9,                                    /* MOVE_RATIO */
1982   6,                                    /* CLEAR_RATIO */
       /* NOTE(review): the integer and SSE load/store costs, the
          XMM/YMM/ZMM move costs and the SSE->integer cost that follow
          currently repeat the values given in the register allocator
          section above; presumably the two copies should be kept in sync
          -- confirm before retuning one of them.  */
1983   {8, 8, 6},                            /* cost of loading integer registers
1984                                            in QImode, HImode and SImode.
1985                                            Relative to reg-reg move (2).  */
1986   {8, 8, 6},                            /* cost of storing integer registers */
1987   {10, 10, 12, 48, 96},                 /* cost of loading SSE register
1988                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1989   {10, 10, 12, 48, 96},                 /* cost of storing SSE register
1990                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
1991   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
1992   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
1993   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1994   14,                                   /* cost of moving SSE register to integer.  */
1995   10, 10,                               /* Gather load static, per_elt.  */
1996   10, 10,                               /* Gather store static, per_elt.  */
1997   32,                                   /* size of l1 cache.  */
1998   2048,                                 /* size of l2 cache.  */
1999   64,                                   /* size of prefetch block */
2000   100,                                  /* number of parallel prefetches */
2001   2,                                    /* Branch cost */
2002   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
2003   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
2004   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
2005   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2006   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2007   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
2008
2009   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2010   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2011   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
2012   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
2013   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2014   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2015   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2016   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
2017   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
2018   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
2019   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2020   btver2_memcpy,                        /* memcpy strategy table defined above.  */
2021   btver2_memset,                        /* memset strategy table defined above.  */
2022   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
2023   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2024   "16:11:8",                            /* Loop alignment.  */
2025   "16:8:8",                             /* Jump alignment.  */
2026   "0:0:8",                              /* Label alignment.  */
2027   "11",                                 /* Func alignment.  */
2028 };
2029
/* String-operation algorithm table for memcpy under pentium4 tuning;
   pentium4_cost below refers to it.  The second entry is
   DUMMY_STRINGOP_ALGS, which the macro near the top of this file defines
   as {libcall, {{-1, libcall, false}}}.
   NOTE(review): the inner triples are presumably {upper size bound,
   algorithm, noalign flag}, and the dummy second entry is presumably the
   64-bit slot -- confirm against the stringop_algs definition.  */
2030 static stringop_algs pentium4_memcpy[2] = {
2031   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2032   DUMMY_STRINGOP_ALGS};
/* String-operation algorithm table for memset under pentium4 tuning;
   pentium4_cost below refers to it.  The second entry is
   DUMMY_STRINGOP_ALGS, which the macro near the top of this file defines
   as {libcall, {{-1, libcall, false}}}.
   NOTE(review): the inner triples are presumably {upper size bound,
   algorithm, noalign flag} with -1 as the open-ended last range --
   confirm against the stringop_algs definition.  */
2033 static stringop_algs pentium4_memset[2] = {
2034   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2035              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2036   DUMMY_STRINGOP_ALGS};
2037
/* Cost table for pentium4 tuning.  The initializer is positional: each
   value fills the next field in declaration order, and the trailing
   comment on each line names that field, so entries must not be
   reordered.  Values wrapped in COSTS_N_INSNS are counted in instructions
   (the note at the top of this file assumes COSTS_N_INSNS (N) is (N)*4);
   the register allocator section is stated relative to an
   integer->integer move cost of 2.  All four alignment entries at the
   end are NULL for this table.  */
2038 static const
2039 struct processor_costs pentium4_cost = {
2040   {
2041   /* Start of register allocator costs.  integer->integer move cost is 2. */
2042   5,                                 /* cost for loading QImode using movzbl */
2043   {4, 5, 4},                            /* cost of loading integer registers
2044                                            in QImode, HImode and SImode.
2045                                            Relative to reg-reg move (2).  */
2046   {2, 3, 2},                            /* cost of storing integer registers */
2047   12,                                   /* cost of reg,reg fld/fst */
2048   {14, 14, 14},                         /* cost of loading fp registers
2049                                            in SFmode, DFmode and XFmode */
2050   {14, 14, 14},                         /* cost of storing fp registers
2051                                            in SFmode, DFmode and XFmode */
2052   12,                                   /* cost of moving MMX register */
2053   {16, 16},                             /* cost of loading MMX registers
2054                                            in SImode and DImode */
2055   {16, 16},                             /* cost of storing MMX registers
2056                                            in SImode and DImode */
2057   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
2058   {16, 16, 16, 32, 64},                 /* cost of loading SSE registers
2059                                            in 32,64,128,256 and 512-bit */
2060   {16, 16, 16, 32, 64},                 /* cost of storing SSE registers
2061                                            in 32,64,128,256 and 512-bit */
2062   20, 12,                               /* SSE->integer and integer->SSE moves */
2063   20, 12,                               /* mask->integer and integer->mask moves */
2064   {4, 5, 4},                            /* cost of loading mask register
2065                                            in QImode, HImode, SImode.  */
2066   {2, 3, 2},                            /* cost of storing mask register
2067                                            in QImode, HImode, SImode.  */
2068   2,                                    /* cost of moving mask register.  */
2069   /* End of register allocator costs.  */
2070   },
2071
2072   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2073   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
2074   COSTS_N_INSNS (4),                    /* variable shift costs */
2075   COSTS_N_INSNS (4),                    /* constant shift costs */
2076   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
2077    COSTS_N_INSNS (15),                  /*                               HI */
2078    COSTS_N_INSNS (15),                  /*                               SI */
2079    COSTS_N_INSNS (15),                  /*                               DI */
2080    COSTS_N_INSNS (15)},                 /*                            other */
2081   0,                                    /* cost of multiply per each bit set */
2082   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
2083    COSTS_N_INSNS (56),                  /*                          HI */
2084    COSTS_N_INSNS (56),                  /*                          SI */
2085    COSTS_N_INSNS (56),                  /*                          DI */
2086    COSTS_N_INSNS (56)},                 /*                          other */
2087   COSTS_N_INSNS (1),                    /* cost of movsx */
2088   COSTS_N_INSNS (1),                    /* cost of movzx */
2089   16,                                   /* "large" insn */
2090   6,                                    /* MOVE_RATIO */
2091   6,                                    /* CLEAR_RATIO */
       /* NOTE(review): the integer and SSE load/store costs, the
          XMM/YMM/ZMM move costs and the SSE->integer cost that follow
          currently repeat the values given in the register allocator
          section above; presumably the two copies should be kept in sync
          -- confirm before retuning one of them.  */
2092   {4, 5, 4},                            /* cost of loading integer registers
2093                                            in QImode, HImode and SImode.
2094                                            Relative to reg-reg move (2).  */
2095   {2, 3, 2},                            /* cost of storing integer registers */
2096   {16, 16, 16, 32, 64},                 /* cost of loading SSE register
2097                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2098   {16, 16, 16, 32, 64},                 /* cost of storing SSE register
2099                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2100   {32, 32, 32, 64, 128},                /* cost of unaligned loads.  */
2101   {32, 32, 32, 64, 128},                /* cost of unaligned stores.  */
2102   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
2103   20,                                   /* cost of moving SSE register to integer.  */
2104   16, 16,                               /* Gather load static, per_elt.  */
2105   16, 16,                               /* Gather store static, per_elt.  */
2106   8,                                    /* size of l1 cache.  */
2107   256,                                  /* size of l2 cache.  */
2108   64,                                   /* size of prefetch block */
2109   6,                                    /* number of parallel prefetches */
2110   2,                                    /* Branch cost */
2111   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
2112   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
2113   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
2114   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
2115   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
2116   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
2117
2118   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2119   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2120   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
2121   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
2122   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2123   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2124   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
2125   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
2126   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
2127   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
2128   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2129   pentium4_memcpy,                      /* memcpy strategy table defined above.  */
2130   pentium4_memset,                      /* memset strategy table defined above.  */
2131   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2132   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2133   NULL,                                 /* Loop alignment.  */
2134   NULL,                                 /* Jump alignment.  */
2135   NULL,                                 /* Label alignment.  */
2136   NULL,                                 /* Func alignment.  */
2137 };
2138
/* String-operation algorithm table for memcpy under nocona tuning;
   nocona_cost below refers to it.  Two stringop_algs entries are given.
   NOTE(review): each inner triple looks like {upper size bound, algorithm,
   noalign flag}, with -1 marking the open-ended last range, and the two
   entries presumably cover 32-bit and 64-bit code respectively -- confirm
   against the stringop_algs definition.  */
2139 static stringop_algs nocona_memcpy[2] = {
2140   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2141   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2142              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2143
/* String-operation algorithm table for memset under nocona tuning;
   nocona_cost below refers to it.  Both entries end in a {-1, libcall}
   range.
   NOTE(review): the inner triples are presumably {upper size bound,
   algorithm, noalign flag} and the two entries presumably select between
   32-bit and 64-bit code -- confirm against the stringop_algs
   definition.  */
2144 static stringop_algs nocona_memset[2] = {
2145   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2146              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2147   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2148              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2149
/* Cost table for nocona tuning.  The initializer is positional: each
   value fills the next field in declaration order, and the trailing
   comment on each line names that field, so entries must not be
   reordered.  Values wrapped in COSTS_N_INSNS are counted in instructions
   (the note at the top of this file assumes COSTS_N_INSNS (N) is (N)*4);
   the register allocator section is stated relative to an
   integer->integer move cost of 2.  All four alignment entries at the
   end are NULL for this table.  */
2150 static const
2151 struct processor_costs nocona_cost = {
2152   {
2153   /* Start of register allocator costs.  integer->integer move cost is 2. */
2154   4,                                 /* cost for loading QImode using movzbl */
2155   {4, 4, 4},                            /* cost of loading integer registers
2156                                            in QImode, HImode and SImode.
2157                                            Relative to reg-reg move (2).  */
2158   {4, 4, 4},                            /* cost of storing integer registers */
2159   12,                                   /* cost of reg,reg fld/fst */
2160   {14, 14, 14},                         /* cost of loading fp registers
2161                                            in SFmode, DFmode and XFmode */
2162   {14, 14, 14},                         /* cost of storing fp registers
2163                                            in SFmode, DFmode and XFmode */
2164   14,                                   /* cost of moving MMX register */
2165   {12, 12},                             /* cost of loading MMX registers
2166                                            in SImode and DImode */
2167   {12, 12},                             /* cost of storing MMX registers
2168                                            in SImode and DImode */
2169   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2170   {12, 12, 12, 24, 48},                 /* cost of loading SSE registers
2171                                            in 32,64,128,256 and 512-bit */
2172   {12, 12, 12, 24, 48},                 /* cost of storing SSE registers
2173                                            in 32,64,128,256 and 512-bit */
2174   20, 12,                               /* SSE->integer and integer->SSE moves */
2175   20, 12,                               /* mask->integer and integer->mask moves */
2176   {4, 4, 4},                            /* cost of loading mask register
2177                                            in QImode, HImode, SImode.  */
2178   {4, 4, 4},                            /* cost of storing mask register
2179                                            in QImode, HImode, SImode.  */
2180   2,                                    /* cost of moving mask register.  */
2181   /* End of register allocator costs.  */
2182   },
2183
2184   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2185   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
2186   COSTS_N_INSNS (1),                    /* variable shift costs */
2187   COSTS_N_INSNS (1),                    /* constant shift costs */
2188   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
2189    COSTS_N_INSNS (10),                  /*                               HI */
2190    COSTS_N_INSNS (10),                  /*                               SI */
2191    COSTS_N_INSNS (10),                  /*                               DI */
2192    COSTS_N_INSNS (10)},                 /*                            other */
2193   0,                                    /* cost of multiply per each bit set */
2194   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
2195    COSTS_N_INSNS (66),                  /*                          HI */
2196    COSTS_N_INSNS (66),                  /*                          SI */
2197    COSTS_N_INSNS (66),                  /*                          DI */
2198    COSTS_N_INSNS (66)},                 /*                          other */
2199   COSTS_N_INSNS (1),                    /* cost of movsx */
2200   COSTS_N_INSNS (1),                    /* cost of movzx */
2201   16,                                   /* "large" insn */
2202   17,                                   /* MOVE_RATIO */
2203   6,                                    /* CLEAR_RATIO */
       /* NOTE(review): the integer and SSE load/store costs, the
          XMM/YMM/ZMM move costs and the SSE->integer cost that follow
          currently repeat the values given in the register allocator
          section above; presumably the two copies should be kept in sync
          -- confirm before retuning one of them.  */
2204   {4, 4, 4},                            /* cost of loading integer registers
2205                                            in QImode, HImode and SImode.
2206                                            Relative to reg-reg move (2).  */
2207   {4, 4, 4},                            /* cost of storing integer registers */
2208   {12, 12, 12, 24, 48},                 /* cost of loading SSE register
2209                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2210   {12, 12, 12, 24, 48},                 /* cost of storing SSE register
2211                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2212   {24, 24, 24, 48, 96},                 /* cost of unaligned loads.  */
2213   {24, 24, 24, 48, 96},                 /* cost of unaligned stores.  */
2214   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
2215   20,                                   /* cost of moving SSE register to integer.  */
2216   12, 12,                               /* Gather load static, per_elt.  */
2217   12, 12,                               /* Gather store static, per_elt.  */
2218   8,                                    /* size of l1 cache.  */
2219   1024,                                 /* size of l2 cache.  */
2220   64,                                   /* size of prefetch block */
2221   8,                                    /* number of parallel prefetches */
2222   1,                                    /* Branch cost */
2223   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
2224   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2225   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
2226   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
2227   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
2228   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
2229
2230   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
2231   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2232   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
2233   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
2234   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
2235   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
2236   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
2237   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
2238   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
2239   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
2240   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2241   nocona_memcpy,                        /* memcpy strategy table defined above.  */
2242   nocona_memset,                        /* memset strategy table defined above.  */
2243   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2244   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2245   NULL,                                 /* Loop alignment.  */
2246   NULL,                                 /* Jump alignment.  */
2247   NULL,                                 /* Label alignment.  */
2248   NULL,                                 /* Func alignment.  */
2249 };
2250
/* String-operation algorithm table for memcpy under atom tuning (named
   to pair with atom_cost, defined after these tables).  Two stringop_algs
   entries are given.
   NOTE(review): each inner triple looks like {upper size bound, algorithm,
   noalign flag}, with -1 marking the open-ended last range, and the two
   entries presumably cover 32-bit and 64-bit code respectively -- confirm
   against the stringop_algs definition.  */
2251 static stringop_algs atom_memcpy[2] = {
2252   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2253   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2254              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation algorithm table for memset under atom tuning (named
   to pair with atom_cost, defined right after it).  Both entries end in a
   {-1, libcall} range.
   NOTE(review): the inner triples are presumably {upper size bound,
   algorithm, noalign flag} and the two entries presumably select between
   32-bit and 64-bit code -- confirm against the stringop_algs
   definition.  */
2255 static stringop_algs atom_memset[2] = {
2256   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2257              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2258   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2259              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2260 static const
2261 struct processor_costs atom_cost = {
2262   {
2263   /* Start of register allocator costs.  integer->integer move cost is 2. */
2264   6,                                    /* cost for loading QImode using movzbl */
2265   {6, 6, 6},                            /* cost of loading integer registers
2266                                            in QImode, HImode and SImode.
2267                                            Relative to reg-reg move (2).  */
2268   {6, 6, 6},                            /* cost of storing integer registers */
2269   4,                                    /* cost of reg,reg fld/fst */
2270   {6, 6, 18},                           /* cost of loading fp registers
2271                                            in SFmode, DFmode and XFmode */
2272   {14, 14, 24},                         /* cost of storing fp registers
2273                                            in SFmode, DFmode and XFmode */
2274   2,                                    /* cost of moving MMX register */
2275   {8, 8},                               /* cost of loading MMX registers
2276                                            in SImode and DImode */
2277   {10, 10},                             /* cost of storing MMX registers
2278                                            in SImode and DImode */
2279   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2280   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2281                                            in 32,64,128,256 and 512-bit */
2282   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2283                                            in 32,64,128,256 and 512-bit */
2284   8, 6,                         /* SSE->integer and integer->SSE moves */
2285   8, 6,                         /* mask->integer and integer->mask moves */
2286   {6, 6, 6},                            /* cost of loading mask register
2287                                            in QImode, HImode, SImode.  */
2288   {6, 6, 6},                    /* cost if storing mask register
2289                                            in QImode, HImode, SImode.  */
2290   2,                                    /* cost of moving mask register.  */
2291   /* End of register allocator costs.  */
2292   },
2293
2294   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2295   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2296   COSTS_N_INSNS (1),                    /* variable shift costs */
2297   COSTS_N_INSNS (1),                    /* constant shift costs */
2298   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2299    COSTS_N_INSNS (4),                   /*                               HI */
2300    COSTS_N_INSNS (3),                   /*                               SI */
2301    COSTS_N_INSNS (4),                   /*                               DI */
2302    COSTS_N_INSNS (2)},                  /*                            other */
2303   0,                                    /* cost of multiply per each bit set */
2304   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2305    COSTS_N_INSNS (26),                  /*                          HI */
2306    COSTS_N_INSNS (42),                  /*                          SI */
2307    COSTS_N_INSNS (74),                  /*                          DI */
2308    COSTS_N_INSNS (74)},                 /*                          other */
2309   COSTS_N_INSNS (1),                    /* cost of movsx */
2310   COSTS_N_INSNS (1),                    /* cost of movzx */
2311   8,                                    /* "large" insn */
2312   17,                                   /* MOVE_RATIO */
2313   6,                                    /* CLEAR_RATIO */
2314   {6, 6, 6},                            /* cost of loading integer registers
2315                                            in QImode, HImode and SImode.
2316                                            Relative to reg-reg move (2).  */
2317   {6, 6, 6},                            /* cost of storing integer registers */
2318   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
2319                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2320   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
2321                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2322   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
2323   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
2324   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2325   8,                                    /* cost of moving SSE register to integer.  */
2326   8, 8,                                 /* Gather load static, per_elt.  */
2327   8, 8,                                 /* Gather store static, per_elt.  */
2328   32,                                   /* size of l1 cache.  */
2329   256,                                  /* size of l2 cache.  */
2330   64,                                   /* size of prefetch block */
2331   6,                                    /* number of parallel prefetches */
2332   3,                                    /* Branch cost */
2333   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2334   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2335   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2336   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2337   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2338   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2339
2340   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2341   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2342   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2343   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2344   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2345   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2346   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
2347   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
2348   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
2349   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
2350   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2351   atom_memcpy,
2352   atom_memset,
2353   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2354   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2355   "16",                                 /* Loop alignment.  */
2356   "16:8:8",                             /* Jump alignment.  */
2357   "0:0:8",                              /* Label alignment.  */
2358   "16",                                 /* Func alignment.  */
2359 };
2360
2361 static stringop_algs slm_memcpy[2] = {  /* memcpy strategy table named by slm_cost.  */
2362   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},  /* NOTE(review): [0]/[1] presumably the 32-bit/64-bit variants -- confirm.  */
2363   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},  /* NOTE(review): triples look like {max size, algorithm, noalign} -- confirm.  */
2364              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* -1 presumably marks the catch-all entry -- confirm.  */
2365 static stringop_algs slm_memset[2] = {  /* memset strategy table named by slm_cost.  */
2366   {libcall, {{8, loop, false}, {15, unrolled_loop, false},  /* NOTE(review): [0] presumably the 32-bit variant -- confirm.  */
2367              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2368   {libcall, {{24, loop, false}, {32, unrolled_loop, false},  /* NOTE(review): [1] presumably the 64-bit variant -- confirm.  */
2369              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* -1 presumably marks the catch-all entry -- confirm.  */
2370 static const
2371 struct processor_costs slm_cost = {  /* Cost table for "slm" tuning (presumably Silvermont -- confirm).  */
2372   {
2373   /* Start of register allocator costs.  integer->integer move cost is 2. */
2374   8,                                    /* cost for loading QImode using movzbl */
2375   {8, 8, 8},                            /* cost of loading integer registers
2376                                            in QImode, HImode and SImode.
2377                                            Relative to reg-reg move (2).  */
2378   {6, 6, 6},                            /* cost of storing integer registers */
2379   2,                                    /* cost of reg,reg fld/fst */
2380   {8, 8, 18},                           /* cost of loading fp registers
2381                                            in SFmode, DFmode and XFmode */
2382   {6, 6, 18},                           /* cost of storing fp registers
2383                                            in SFmode, DFmode and XFmode */
2384   2,                                    /* cost of moving MMX register */
2385   {8, 8},                               /* cost of loading MMX registers
2386                                            in SImode and DImode */
2387   {6, 6},                               /* cost of storing MMX registers
2388                                            in SImode and DImode */
2389   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2390   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2391                                            in 32,64,128,256 and 512-bit */
2392   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2393                                            in 32,64,128,256 and 512-bit */
2394   8, 6,                         /* SSE->integer and integer->SSE moves */
2395   8, 6,                         /* mask->integer and integer->mask moves */
2396   {8, 8, 8},                    /* cost of loading mask register
2397                                            in QImode, HImode, SImode.  */
2398   {6, 6, 6},                    /* cost of storing mask register
2399                                            in QImode, HImode, SImode.  */
2400   2,                                    /* cost of moving mask register.  */
2401   /* End of register allocator costs.  */
2402   },
2403
2404   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2405   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2406   COSTS_N_INSNS (1),                    /* variable shift costs */
2407   COSTS_N_INSNS (1),                    /* constant shift costs */
2408   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2409    COSTS_N_INSNS (3),                   /*                               HI */
2410    COSTS_N_INSNS (3),                   /*                               SI */
2411    COSTS_N_INSNS (4),                   /*                               DI */
2412    COSTS_N_INSNS (2)},                  /*                            other */
2413   0,                                    /* cost of multiply per each bit set */
2414   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2415    COSTS_N_INSNS (26),                  /*                          HI */
2416    COSTS_N_INSNS (42),                  /*                          SI */
2417    COSTS_N_INSNS (74),                  /*                          DI */
2418    COSTS_N_INSNS (74)},                 /*                          other */
2419   COSTS_N_INSNS (1),                    /* cost of movsx */
2420   COSTS_N_INSNS (1),                    /* cost of movzx */
2421   8,                                    /* "large" insn */
2422   17,                                   /* MOVE_RATIO */
2423   6,                                    /* CLEAR_RATIO */
2424   {8, 8, 8},                            /* cost of loading integer registers
2425                                            in QImode, HImode and SImode.
2426                                            Relative to reg-reg move (2).  */
2427   {6, 6, 6},                            /* cost of storing integer registers */
2428   {8, 8, 8, 16, 32},                    /* cost of loading SSE register
2429                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2430   {8, 8, 8, 16, 32},                    /* cost of storing SSE register
2431                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2432   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
2433   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
2434   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2435   8,                                    /* cost of moving SSE register to integer.  */
2436   8, 8,                                 /* Gather load static, per_elt.  */
2437   8, 8,                                 /* Gather store static, per_elt.  */
2438   32,                                   /* size of l1 cache.  */
2439   256,                                  /* size of l2 cache.  */
2440   64,                                   /* size of prefetch block */
2441   6,                                    /* number of parallel prefetches */
2442   3,                                    /* Branch cost */
2443   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2444   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2445   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2446   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2447   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2448   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2449
2450   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2451   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2452   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2453   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2454   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2455   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2456   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
2457   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
2458   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
2459   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
2460   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2461   slm_memcpy,                           /* memcpy strategy table defined above.  */
2462   slm_memset,                           /* memset strategy table defined above.  */
2463   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2464   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2465   "16",                                 /* Loop alignment.  */
2466   "16:8:8",                             /* Jump alignment.  */
2467   "0:0:8",                              /* Label alignment.  */
2468   "16",                                 /* Func alignment.  */
2469 };
2470
2471 static stringop_algs intel_memcpy[2] = {  /* memcpy strategy table named by intel_cost.  */
2472   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},  /* NOTE(review): [0]/[1] presumably the 32-bit/64-bit variants -- confirm.  */
2473   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},  /* NOTE(review): triples look like {max size, algorithm, noalign} -- confirm.  */
2474              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* Values are the same as in slm_memcpy.  */
2475 static stringop_algs intel_memset[2] = {  /* memset strategy table named by intel_cost.  */
2476   {libcall, {{8, loop, false}, {15, unrolled_loop, false},  /* NOTE(review): [0] presumably the 32-bit variant -- confirm.  */
2477              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2478   {libcall, {{24, loop, false}, {32, unrolled_loop, false},  /* NOTE(review): [1] presumably the 64-bit variant -- confirm.  */
2479              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* Values are the same as in slm_memset.  */
2480 static const
2481 struct processor_costs intel_cost = {  /* Cost table for "intel" tuning.  */
2482   {
2483   /* Start of register allocator costs.  integer->integer move cost is 2. */
2484   6,                                 /* cost for loading QImode using movzbl */
2485   {4, 4, 4},                            /* cost of loading integer registers
2486                                            in QImode, HImode and SImode.
2487                                            Relative to reg-reg move (2).  */
2488   {6, 6, 6},                            /* cost of storing integer registers */
2489   2,                                    /* cost of reg,reg fld/fst */
2490   {6, 6, 8},                            /* cost of loading fp registers
2491                                            in SFmode, DFmode and XFmode */
2492   {6, 6, 10},                           /* cost of storing fp registers
2493                                            in SFmode, DFmode and XFmode */
2494   2,                                    /* cost of moving MMX register */
2495   {6, 6},                               /* cost of loading MMX registers
2496                                            in SImode and DImode */
2497   {6, 6},                               /* cost of storing MMX registers
2498                                            in SImode and DImode */
2499   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
2500   {6, 6, 6, 6, 6},                      /* cost of loading SSE registers
2501                                            in 32,64,128,256 and 512-bit */
2502   {6, 6, 6, 6, 6},                      /* cost of storing SSE registers
2503                                            in 32,64,128,256 and 512-bit */
2504   4, 4,                         /* SSE->integer and integer->SSE moves */
2505   4, 4,                         /* mask->integer and integer->mask moves */
2506   {4, 4, 4},                            /* cost of loading mask register
2507                                            in QImode, HImode, SImode.  */
2508   {6, 6, 6},                            /* cost of storing mask register
2509                                            in QImode, HImode, SImode.  */
2510   2,                                    /* cost of moving mask register.  */
2511   /* End of register allocator costs.  */
2512   },
2513
2514   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2515   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2516   COSTS_N_INSNS (1),                    /* variable shift costs */
2517   COSTS_N_INSNS (1),                    /* constant shift costs */
2518   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2519    COSTS_N_INSNS (3),                   /*                               HI */
2520    COSTS_N_INSNS (3),                   /*                               SI */
2521    COSTS_N_INSNS (4),                   /*                               DI */
2522    COSTS_N_INSNS (2)},                  /*                            other */
2523   0,                                    /* cost of multiply per each bit set */
2524   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2525    COSTS_N_INSNS (26),                  /*                          HI */
2526    COSTS_N_INSNS (42),                  /*                          SI */
2527    COSTS_N_INSNS (74),                  /*                          DI */
2528    COSTS_N_INSNS (74)},                 /*                          other */
2529   COSTS_N_INSNS (1),                    /* cost of movsx */
2530   COSTS_N_INSNS (1),                    /* cost of movzx */
2531   8,                                    /* "large" insn */
2532   17,                                   /* MOVE_RATIO */
2533   6,                                    /* CLEAR_RATIO */
2534   {4, 4, 4},                            /* cost of loading integer registers
2535                                            in QImode, HImode and SImode.
2536                                            Relative to reg-reg move (2).  */
2537   {6, 6, 6},                            /* cost of storing integer registers */
2538   {6, 6, 6, 6, 6},                      /* cost of loading SSE register
2539                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2540   {6, 6, 6, 6, 6},                      /* cost of storing SSE register
2541                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2542   {10, 10, 10, 10, 10},                 /* cost of unaligned loads.  */
2543   {10, 10, 10, 10, 10},                 /* cost of unaligned stores.  */
2544   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
2545   4,                                    /* cost of moving SSE register to integer.  */
2546   6, 6,                                 /* Gather load static, per_elt.  */
2547   6, 6,                                 /* Gather store static, per_elt.  */
2548   32,                                   /* size of l1 cache.  */
2549   256,                                  /* size of l2 cache.  */
2550   64,                                   /* size of prefetch block */
2551   6,                                    /* number of parallel prefetches */
2552   3,                                    /* Branch cost */
2553   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2554   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2555   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2556   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2557   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2558   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2559
2560   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2561   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2562   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
2563   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
2564   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2565   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2566   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
2567   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
2568   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
2569   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
2570   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2571   intel_memcpy,                         /* memcpy strategy table defined above.  */
2572   intel_memset,                         /* memset strategy table defined above.  */
2573   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2574   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2575   "16",                                 /* Loop alignment.  */
2576   "16:8:8",                             /* Jump alignment.  */
2577   "0:0:8",                              /* Label alignment.  */
2578   "16",                                 /* Func alignment.  */
2579 };
2580
2581 /* Generic should produce code tuned for Core-i7 (and newer chips)
2582    and btver1 (and newer chips).  */
2583
2584 static stringop_algs generic_memcpy[2] = {  /* memcpy strategy table named by generic_cost.  */
2585   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},  /* NOTE(review): [0] presumably the 32-bit variant -- confirm.  */
2586              {-1, libcall, false}}},  /* -1 presumably marks the catch-all entry -- confirm.  */
2587   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},  /* NOTE(review): [1] presumably the 64-bit variant -- confirm.  */
2588              {-1, libcall, false}}}};
2589 static stringop_algs generic_memset[2] = {  /* memset strategy table named by generic_cost.  */
2590   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},  /* NOTE(review): [0] presumably the 32-bit variant -- confirm.  */
2591              {-1, libcall, false}}},  /* -1 presumably marks the catch-all entry -- confirm.  */
2592   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},  /* NOTE(review): [1] presumably the 64-bit variant -- confirm.  */
2593              {-1, libcall, false}}}};  /* Values are the same as in generic_memcpy.  */
2594 static const
2595 struct processor_costs generic_cost = {  /* Cost table for generic tuning.  */
2596   {
2597   /* Start of register allocator costs.  integer->integer move cost is 2. */
2598   6,                                 /* cost for loading QImode using movzbl */
2599   {6, 6, 6},                            /* cost of loading integer registers
2600                                            in QImode, HImode and SImode.
2601                                            Relative to reg-reg move (2).  */
2602   {6, 6, 6},                            /* cost of storing integer registers */
2603   4,                                    /* cost of reg,reg fld/fst */
2604   {6, 6, 12},                           /* cost of loading fp registers
2605                                            in SFmode, DFmode and XFmode */
2606   {6, 6, 12},                           /* cost of storing fp registers
2607                                            in SFmode, DFmode and XFmode */
2608   2,                                    /* cost of moving MMX register */
2609   {6, 6},                               /* cost of loading MMX registers
2610                                            in SImode and DImode */
2611   {6, 6},                               /* cost of storing MMX registers
2612                                            in SImode and DImode */
2613   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2614   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
2615                                            in 32,64,128,256 and 512-bit */
2616   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
2617                                            in 32,64,128,256 and 512-bit */
2618   6, 6,                         /* SSE->integer and integer->SSE moves */
2619   6, 6,                         /* mask->integer and integer->mask moves */
2620   {6, 6, 6},                            /* cost of loading mask register
2621                                            in QImode, HImode, SImode.  */
2622   {6, 6, 6},                    /* cost of storing mask register
2623                                            in QImode, HImode, SImode.  */
2624   2,                                    /* cost of moving mask register.  */
2625   /* End of register allocator costs.  */
2626   },
2627
2628   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2629   /* Setting cost to 2 makes our current implementation of synth_mult result in
2630      use of unnecessary temporary registers causing regression on several
2631      SPECfp benchmarks.  */
2632   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2633   COSTS_N_INSNS (1),                    /* variable shift costs */
2634   COSTS_N_INSNS (1),                    /* constant shift costs */
2635   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2636    COSTS_N_INSNS (4),                   /*                               HI */
2637    COSTS_N_INSNS (3),                   /*                               SI */
2638    COSTS_N_INSNS (4),                   /*                               DI */
2639    COSTS_N_INSNS (4)},                  /*                            other */
2640   0,                                    /* cost of multiply per each bit set */
2641   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
2642    COSTS_N_INSNS (22),                  /*                          HI */
2643    COSTS_N_INSNS (30),                  /*                          SI */
2644    COSTS_N_INSNS (74),                  /*                          DI */
2645    COSTS_N_INSNS (74)},                 /*                          other */
2646   COSTS_N_INSNS (1),                    /* cost of movsx */
2647   COSTS_N_INSNS (1),                    /* cost of movzx */
2648   8,                                    /* "large" insn */
2649   17,                                   /* MOVE_RATIO */
2650   6,                                    /* CLEAR_RATIO */
2651   {6, 6, 6},                            /* cost of loading integer registers
2652                                            in QImode, HImode and SImode.
2653                                            Relative to reg-reg move (2).  */
2654   {6, 6, 6},                            /* cost of storing integer registers */
2655   {6, 6, 6, 10, 15},                    /* cost of loading SSE register
2656                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2657   {6, 6, 6, 10, 15},                    /* cost of storing SSE register
2658                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2659   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
2660   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
2661   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2662   6,                                    /* cost of moving SSE register to integer.  */
2663   18, 6,                                /* Gather load static, per_elt.  */
2664   18, 6,                                /* Gather store static, per_elt.  */
2665   32,                                   /* size of l1 cache.  */
2666   512,                                  /* size of l2 cache.  */
2667   64,                                   /* size of prefetch block */
2668   6,                                    /* number of parallel prefetches */
2669   /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
2670      value is increased to perhaps more appropriate value of 5.  */
2671   3,                                    /* Branch cost */
2672   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2673   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2674   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
2675   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2676   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2677   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
2678
2679   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2680   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2681   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2682   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2683   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2684   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2685   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2686   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2687   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2688   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2689   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
2690   generic_memcpy,                       /* memcpy strategy table defined above.  */
2691   generic_memset,                       /* memset strategy table defined above.  */
2692   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2693   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2694   "16:11:8",                            /* Loop alignment.  */
2695   "16:11:8",                            /* Jump alignment.  */
2696   "0:0:8",                              /* Label alignment.  */
2697   "16",                                 /* Func alignment.  */
2698 };
2699
2700 /* core_cost should produce code tuned for the Core family of CPUs.  */
2701 static stringop_algs core_memcpy[2] = {  /* memcpy strategy table; presumably consumed by core_cost -- confirm.  */
2702   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},  /* NOTE(review): [0]/[1] presumably the 32-bit/64-bit variants -- confirm.  */
2703   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},  /* NOTE(review): triples look like {max size, algorithm, noalign} -- confirm.  */
2704              {-1, libcall, false}}}};  /* -1 presumably marks the catch-all entry -- confirm.  */
2705 static stringop_algs core_memset[2] = {  /* memset strategy table; presumably consumed by core_cost -- confirm.  */
2706   {libcall, {{6, loop_1_byte, true},  /* NOTE(review): [0] presumably the 32-bit variant -- confirm.  */
2707              {24, loop, true},
2708              {8192, rep_prefix_4_byte, true},
2709              {-1, libcall, false}}},  /* -1 presumably marks the catch-all entry -- confirm.  */
2710   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},  /* NOTE(review): [1] presumably the 64-bit variant -- confirm.  */
2711              {-1, libcall, false}}}};
2712
2713 static const
2714 struct processor_costs core_cost = {
2715   {
2716   /* Start of register allocator costs.  integer->integer move cost is 2. */
2717   6,                                 /* cost for loading QImode using movzbl */
2718   {4, 4, 4},                            /* cost of loading integer registers
2719                                            in QImode, HImode and SImode.
2720                                            Relative to reg-reg move (2).  */
2721   {6, 6, 6},                            /* cost of storing integer registers */
2722   2,                                    /* cost of reg,reg fld/fst */
2723   {6, 6, 8},                            /* cost of loading fp registers
2724                                            in SFmode, DFmode and XFmode */
2725   {6, 6, 10},                           /* cost of storing fp registers
2726                                            in SFmode, DFmode and XFmode */
2727   2,                                    /* cost of moving MMX register */
2728   {6, 6},                               /* cost of loading MMX registers
2729                                            in SImode and DImode */
2730   {6, 6},                               /* cost of storing MMX registers
2731                                            in SImode and DImode */
2732   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2733   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
2734                                            in 32,64,128,256 and 512-bit */
2735   {6, 6, 6, 6, 12},                     /* cost of storing SSE registers
2736                                            in 32,64,128,256 and 512-bit */
2737   6, 6,                         /* SSE->integer and integer->SSE moves */
2738   6, 6,                         /* mask->integer and integer->mask moves */
2739   {4, 4, 4},                            /* cost of loading mask register
2740                                            in QImode, HImode, SImode.  */
2741   {6, 6, 6},                            /* cost if storing mask register
2742                                            in QImode, HImode, SImode.  */
2743   2,                                    /* cost of moving mask register.  */
2744   /* End of register allocator costs.  */
2745   },
2746
2747   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2748   /* On all chips taken into consideration lea is 2 cycles and more.  With
2749      this cost however our current implementation of synth_mult results in
2750      use of unnecessary temporary registers causing regression on several
2751      SPECfp benchmarks.  */
2752   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2753   COSTS_N_INSNS (1),                    /* variable shift costs */
2754   COSTS_N_INSNS (1),                    /* constant shift costs */
2755   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2756    COSTS_N_INSNS (4),                   /*                               HI */
2757    COSTS_N_INSNS (3),                   /*                               SI */
2758    /* Here we tune for Sandybridge or newer.  */
2759    COSTS_N_INSNS (3),                   /*                               DI */
2760    COSTS_N_INSNS (3)},                  /*                            other */
2761   0,                                    /* cost of multiply per each bit set */
2762   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2763      model is not realistic. We compensate by increasing the latencies a bit.  */
2764   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2765    COSTS_N_INSNS (11),                  /*                          HI */
2766    COSTS_N_INSNS (14),                  /*                          SI */
2767    COSTS_N_INSNS (81),                  /*                          DI */
2768    COSTS_N_INSNS (81)},                 /*                          other */
2769   COSTS_N_INSNS (1),                    /* cost of movsx */
2770   COSTS_N_INSNS (1),                    /* cost of movzx */
2771   8,                                    /* "large" insn */
2772   17,                                   /* MOVE_RATIO */
2773   6,                                    /* CLEAR_RATIO */
2774   {4, 4, 4},                            /* cost of loading integer registers
2775                                            in QImode, HImode and SImode.
2776                                            Relative to reg-reg move (2).  */
2777   {6, 6, 6},                            /* cost of storing integer registers */
2778   {6, 6, 6, 6, 12},                     /* cost of loading SSE register
2779                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2780   {6, 6, 6, 6, 12},                     /* cost of storing SSE register
2781                                            in 32bit, 64bit, 128bit, 256bit and 512bit */
2782   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
2783   {6, 6, 6, 6, 12},                     /* cost of unaligned stores.  */
2784   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2785   2,                                    /* cost of moving SSE register to integer.  */
2786   /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPD is 9 uops,
2787      rec. throughput 6 (FIXME: mnemonic repeated; presumably another form).
2788      So 5 uops statically and one uop per load.  */
2789   10, 6,                                /* Gather load static, per_elt.  */
2790   10, 6,                                /* Gather store static, per_elt.  */
2791   64,                                   /* size of l1 cache.  */
2792   512,                                  /* size of l2 cache.  */
2793   64,                                   /* size of prefetch block */
2794   6,                                    /* number of parallel prefetches */
2795   /* FIXME perhaps more appropriate value is 5.  */
2796   3,                                    /* Branch cost */
2797   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2798   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2799   /* 10-24 */
2800   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
2801   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2802   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2803   COSTS_N_INSNS (23),                   /* cost of FSQRT instruction.  */
2804
2805   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2806   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2807   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2808   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2809   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2810   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2811   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
2812   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
2813   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
2814   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
2815   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2816   core_memcpy,
2817   core_memset,
2818   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2819   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2820   "16:11:8",                            /* Loop alignment.  */
2821   "16:11:8",                            /* Jump alignment.  */
2822   "0:0:8",                              /* Label alignment.  */
2823   "16",                                 /* Func alignment.  */
2824 };
2825