gcc/config/i386/x86-tune-costs.h

   1 /* Costs of operations of individual x86 CPUs.
   2    Copyright (C) 1988-2019 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 3, or (at your option)
   9 any later version.
  10
  11 GCC is distributed in the hope that it will be useful,
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 GNU General Public License for more details.
  15
  16 Under Section 7 of GPL version 3, you are granted additional
  17 permissions described in the GCC Runtime Library Exception, version
  18 3.1, as published by the Free Software Foundation.
  19
  20 You should have received a copy of the GNU General Public License and
  21 a copy of the GCC Runtime Library Exception along with this program;
  22 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 <http://www.gnu.org/licenses/>.  */
  24 /* Processor costs (relative to an add) */
  25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes, so COSTS_N_BYTES (2) equals COSTS_N_INSNS (1).  */
  26 #define COSTS_N_BYTES(N) ((N) * 2)
  27
  28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}  /* placeholder entry: libcall in the leading field and in the single -1 row */
  29
  30 static stringop_algs ix86_size_memcpy[2] = {  /* memcpy table referenced by ix86_size_cost */
  31   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* both entries name rep_prefix_1_byte only */
  32   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  33 static stringop_algs ix86_size_memset[2] = {  /* memset table referenced by ix86_size_cost */
  34   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* both entries name rep_prefix_1_byte only */
  35   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
  36
  37 const
  38 struct processor_costs ix86_size_cost = {/* costs for tuning for size; instruction costs use COSTS_N_BYTES */
  39   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  40   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  41   COSTS_N_BYTES (2),                    /* variable shift costs */
  42   COSTS_N_BYTES (3),                    /* constant shift costs */
  43   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  44    COSTS_N_BYTES (3),                   /*                               HI */
  45    COSTS_N_BYTES (3),                   /*                               SI */
  46    COSTS_N_BYTES (3),                   /*                               DI */
  47    COSTS_N_BYTES (5)},                  /*                            other */
  48   0,                                    /* cost of multiply per each bit set */
  49   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  50    COSTS_N_BYTES (3),                   /*                          HI */
  51    COSTS_N_BYTES (3),                   /*                          SI */
  52    COSTS_N_BYTES (3),                   /*                          DI */
  53    COSTS_N_BYTES (5)},                  /*                          other */
  54   COSTS_N_BYTES (3),                    /* cost of movsx */
  55   COSTS_N_BYTES (3),                    /* cost of movzx */
  56   0,                                    /* "large" insn */
  57   2,                                    /* MOVE_RATIO */
  58
  59   /* All move costs are relative to integer->integer move times 2. */
  60   2,                                 /* cost for loading QImode using movzbl */
  61   {2, 2, 2},                            /* cost of loading integer registers
  62                                            in QImode, HImode and SImode.
  63                                            Relative to reg-reg move (2).  */
  64   {2, 2, 2},                            /* cost of storing integer registers */
  65   2,                                    /* cost of reg,reg fld/fst */
  66   {2, 2, 2},                            /* cost of loading fp registers
  67                                            in SFmode, DFmode and XFmode */
  68   {2, 2, 2},                            /* cost of storing fp registers
  69                                            in SFmode, DFmode and XFmode */
  70   3,                                    /* cost of moving MMX register */
  71   {3, 3},                               /* cost of loading MMX registers
  72                                            in SImode and DImode */
  73   {3, 3},                               /* cost of storing MMX registers
  74                                            in SImode and DImode */
  75   3, 3, 3,                              /* cost of moving XMM,YMM,ZMM register */
  76   {3, 3, 3, 3, 3},                      /* cost of loading SSE registers
  77                                            in 32,64,128,256 and 512-bit */
  78   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE load; five entries,
  79                                            presumably 32,64,128,256 and 512-bit */
  80   {3, 3, 3, 3, 3},                      /* cost of storing SSE registers
  81                                            in 32,64,128,256 and 512-bit */
  82   {3, 3, 3, 3, 3},                      /* cost of unaligned SSE store; five entries,
  83                                            presumably 32,64,128,256 and 512-bit */
  84   3, 3,                                 /* SSE->integer and integer->SSE moves */
  85   5, 0,                                 /* Gather load static, per_elt.  */
  86   5, 0,                                 /* Gather store static, per_elt.  */
  87   0,                                    /* size of l1 cache  */
  88   0,                                    /* size of l2 cache  */
  89   0,                                    /* size of prefetch block */
  90   0,                                    /* number of parallel prefetches */
  91   2,                                    /* Branch cost */
  92   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
  93   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
  94   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
  95   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
  96   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
  97   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
  98
  99   COSTS_N_BYTES (2),                    /* cost of cheap SSE instruction.  */
 100   COSTS_N_BYTES (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 101   COSTS_N_BYTES (2),                    /* cost of MULSS instruction.  */
 102   COSTS_N_BYTES (2),                    /* cost of MULSD instruction.  */
 103   COSTS_N_BYTES (2),                    /* cost of FMA SS instruction.  */
 104   COSTS_N_BYTES (2),                    /* cost of FMA SD instruction.  */
 105   COSTS_N_BYTES (2),                    /* cost of DIVSS instruction.  */
 106   COSTS_N_BYTES (2),                    /* cost of DIVSD instruction.  */
 107   COSTS_N_BYTES (2),                    /* cost of SQRTSS instruction.  */
 108   COSTS_N_BYTES (2),                    /* cost of SQRTSD instruction.  */
 109   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 110   ix86_size_memcpy,
 111   ix86_size_memset,
 112   COSTS_N_BYTES (1),                    /* cond_taken_branch_cost.  */
 113   COSTS_N_BYTES (1),                    /* cond_not_taken_branch_cost.  */
 114   NULL,                                 /* Loop alignment.  */
 115   NULL,                                 /* Jump alignment.  */
 116   NULL,                                 /* Label alignment.  */
 117   NULL,                                 /* Func alignment.  */
 118 };
 119
 120 /* Processor costs (relative to an add) */
 121 static stringop_algs i386_memcpy[2] = {  /* memcpy table referenced by i386_cost */
 122   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* entry 0: rep_prefix_1_byte only */
 123   DUMMY_STRINGOP_ALGS};  /* entry 1: libcall placeholder */
 124 static stringop_algs i386_memset[2] = {  /* memset table referenced by i386_cost */
 125   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},  /* entry 0: rep_prefix_1_byte only */
 126   DUMMY_STRINGOP_ALGS};  /* entry 1: libcall placeholder */
 127
 128 static const
 129 struct processor_costs i386_cost = {    /* 386 specific costs; the SSE add/mul/fma/div/sqrt entries repeat the x87 values */
 130   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 131   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 132   COSTS_N_INSNS (3),                    /* variable shift costs */
 133   COSTS_N_INSNS (2),                    /* constant shift costs */
 134   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 135    COSTS_N_INSNS (6),                   /*                               HI */
 136    COSTS_N_INSNS (6),                   /*                               SI */
 137    COSTS_N_INSNS (6),                   /*                               DI */
 138    COSTS_N_INSNS (6)},                  /*                            other */
 139   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 140   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 141    COSTS_N_INSNS (23),                  /*                          HI */
 142    COSTS_N_INSNS (23),                  /*                          SI */
 143    COSTS_N_INSNS (23),                  /*                          DI */
 144    COSTS_N_INSNS (23)},                 /*                          other */
 145   COSTS_N_INSNS (3),                    /* cost of movsx */
 146   COSTS_N_INSNS (2),                    /* cost of movzx */
 147   15,                                   /* "large" insn */
 148   3,                                    /* MOVE_RATIO */
 149
 150   /* All move costs are relative to integer->integer move times 2 and thus
 151      they are latency*2. */
 152   4,                                 /* cost for loading QImode using movzbl */
 153   {2, 4, 2},                            /* cost of loading integer registers
 154                                            in QImode, HImode and SImode.
 155                                            Relative to reg-reg move (2).  */
 156   {2, 4, 2},                            /* cost of storing integer registers */
 157   2,                                    /* cost of reg,reg fld/fst */
 158   {8, 8, 8},                            /* cost of loading fp registers
 159                                            in SFmode, DFmode and XFmode */
 160   {8, 8, 8},                            /* cost of storing fp registers
 161                                            in SFmode, DFmode and XFmode */
 162   2,                                    /* cost of moving MMX register */
 163   {4, 8},                               /* cost of loading MMX registers
 164                                            in SImode and DImode */
 165   {4, 8},                               /* cost of storing MMX registers
 166                                            in SImode and DImode */
 167   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 168   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 169                                            in 32,64,128,256 and 512-bit */
 170   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 171   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 172                                            in 32,64,128,256 and 512-bit */
 173   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 174   3, 3,                                 /* SSE->integer and integer->SSE moves */
 175   4, 4,                                 /* Gather load static, per_elt.  */
 176   4, 4,                                 /* Gather store static, per_elt.  */
 177   0,                                    /* size of l1 cache  */
 178   0,                                    /* size of l2 cache  */
 179   0,                                    /* size of prefetch block */
 180   0,                                    /* number of parallel prefetches */
 181   1,                                    /* Branch cost */
 182   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 183   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 184   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 185   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 186   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 187   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 188
 189   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 190   COSTS_N_INSNS (23),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
 191   COSTS_N_INSNS (27),                   /* cost of MULSS instruction.  */
 192   COSTS_N_INSNS (27),                   /* cost of MULSD instruction.  */
 193   COSTS_N_INSNS (27),                   /* cost of FMA SS instruction.  */
 194   COSTS_N_INSNS (27),                   /* cost of FMA SD instruction.  */
 195   COSTS_N_INSNS (88),                   /* cost of DIVSS instruction.  */
 196   COSTS_N_INSNS (88),                   /* cost of DIVSD instruction.  */
 197   COSTS_N_INSNS (122),                  /* cost of SQRTSS instruction.  */
 198   COSTS_N_INSNS (122),                  /* cost of SQRTSD instruction.  */
 199   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 200   i386_memcpy,
 201   i386_memset,
 202   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 203   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 204   "4",                                  /* Loop alignment.  */
 205   "4",                                  /* Jump alignment.  */
 206   NULL,                                 /* Label alignment.  */
 207   "4",                                  /* Func alignment.  */
 208 };
 209
 210 static stringop_algs i486_memcpy[2] = {  /* memcpy table referenced by i486_cost */
 211   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},  /* entry 0: rep_prefix_4_byte only */
 212   DUMMY_STRINGOP_ALGS};  /* entry 1: libcall placeholder */
 213 static stringop_algs i486_memset[2] = {  /* memset table referenced by i486_cost */
 214   {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},  /* entry 0: rep_prefix_4_byte only */
 215   DUMMY_STRINGOP_ALGS};  /* entry 1: libcall placeholder */
 216
 217 static const
 218 struct processor_costs i486_cost = {    /* 486 specific costs */
 219   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 220   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 221   COSTS_N_INSNS (3),                    /* variable shift costs */
 222   COSTS_N_INSNS (2),                    /* constant shift costs */
 223   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 224    COSTS_N_INSNS (12),                  /*                               HI */
 225    COSTS_N_INSNS (12),                  /*                               SI */
 226    COSTS_N_INSNS (12),                  /*                               DI */
 227    COSTS_N_INSNS (12)},                 /*                            other */
 228   1,                                    /* cost of multiply per each bit set.  NOTE(review): a raw 1 here, while i386_cost uses COSTS_N_INSNS (1) -- confirm intended.  */
 229   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 230    COSTS_N_INSNS (40),                  /*                          HI */
 231    COSTS_N_INSNS (40),                  /*                          SI */
 232    COSTS_N_INSNS (40),                  /*                          DI */
 233    COSTS_N_INSNS (40)},                 /*                          other */
 234   COSTS_N_INSNS (3),                    /* cost of movsx */
 235   COSTS_N_INSNS (2),                    /* cost of movzx */
 236   15,                                   /* "large" insn */
 237   3,                                    /* MOVE_RATIO */
 238
 239   /* All move costs are relative to integer->integer move times 2 and thus
 240      they are latency*2. */
 241   4,                                 /* cost for loading QImode using movzbl */
 242   {2, 4, 2},                            /* cost of loading integer registers
 243                                            in QImode, HImode and SImode.
 244                                            Relative to reg-reg move (2).  */
 245   {2, 4, 2},                            /* cost of storing integer registers */
 246   2,                                    /* cost of reg,reg fld/fst */
 247   {8, 8, 8},                            /* cost of loading fp registers
 248                                            in SFmode, DFmode and XFmode */
 249   {8, 8, 8},                            /* cost of storing fp registers
 250                                            in SFmode, DFmode and XFmode */
 251   2,                                    /* cost of moving MMX register */
 252   {4, 8},                               /* cost of loading MMX registers
 253                                            in SImode and DImode */
 254   {4, 8},                               /* cost of storing MMX registers
 255                                            in SImode and DImode */
 256   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 257   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 258                                            in 32,64,128,256 and 512-bit */
 259   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 260   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 261                                            in 32,64,128,256 and 512-bit */
 262   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 263   3, 3,                                 /* SSE->integer and integer->SSE moves */
 264   4, 4,                                 /* Gather load static, per_elt.  */
 265   4, 4,                                 /* Gather store static, per_elt.  */
 266   4,                                    /* size of l1 cache.  486 has 8kB cache
 267                                            shared for code and data, so 4kB is
 268                                            not really precise.  */
 269   4,                                    /* size of l2 cache  */
 270   0,                                    /* size of prefetch block */
 271   0,                                    /* number of parallel prefetches */
 272   1,                                    /* Branch cost */
 273   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 274   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 275   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 276   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 277   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 278   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 279
 280   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 281   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 282   COSTS_N_INSNS (16),                   /* cost of MULSS instruction.  */
 283   COSTS_N_INSNS (16),                   /* cost of MULSD instruction.  */
 284   COSTS_N_INSNS (16),                   /* cost of FMA SS instruction.  */
 285   COSTS_N_INSNS (16),                   /* cost of FMA SD instruction.  */
 286   COSTS_N_INSNS (73),                   /* cost of DIVSS instruction.  */
 287   COSTS_N_INSNS (74),                   /* cost of DIVSD instruction.  NOTE(review): 74, while FDIV and DIVSS are 73 -- confirm.  */
 288   COSTS_N_INSNS (83),                   /* cost of SQRTSS instruction.  */
 289   COSTS_N_INSNS (83),                   /* cost of SQRTSD instruction.  */
 290   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 291   i486_memcpy,
 292   i486_memset,
 293   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 294   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 295   "16",                                 /* Loop alignment.  */
 296   "16",                                 /* Jump alignment.  */
 297   "0:0:8",                              /* Label alignment.  */
 298   "16",                                 /* Func alignment.  */
 299 };
 300
 301 static stringop_algs pentium_memcpy[2] = {  /* memcpy table referenced by pentium_cost and lakemont_cost */
 302   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* rep_prefix_4_byte row bounded by 256, then libcall; presumably the bound is a block size -- confirm */
 303   DUMMY_STRINGOP_ALGS};  /* entry 1: libcall placeholder */
 304 static stringop_algs pentium_memset[2] = {  /* memset table referenced by pentium_cost and lakemont_cost */
 305   {libcall, {{-1, rep_prefix_4_byte, false}}},  /* entry 0: libcall in the leading field, single rep_prefix_4_byte row */
 306   DUMMY_STRINGOP_ALGS};  /* entry 1: libcall placeholder */
 307
 308 static const
 309 struct processor_costs pentium_cost = {  /* Pentium specific costs */
 310   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 311   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 312   COSTS_N_INSNS (4),                    /* variable shift costs */
 313   COSTS_N_INSNS (1),                    /* constant shift costs */
 314   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 315    COSTS_N_INSNS (11),                  /*                               HI */
 316    COSTS_N_INSNS (11),                  /*                               SI */
 317    COSTS_N_INSNS (11),                  /*                               DI */
 318    COSTS_N_INSNS (11)},                 /*                            other */
 319   0,                                    /* cost of multiply per each bit set */
 320   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 321    COSTS_N_INSNS (25),                  /*                          HI */
 322    COSTS_N_INSNS (25),                  /*                          SI */
 323    COSTS_N_INSNS (25),                  /*                          DI */
 324    COSTS_N_INSNS (25)},                 /*                          other */
 325   COSTS_N_INSNS (3),                    /* cost of movsx */
 326   COSTS_N_INSNS (2),                    /* cost of movzx */
 327   8,                                    /* "large" insn */
 328   6,                                    /* MOVE_RATIO */
 329
 330   /* All move costs are relative to integer->integer move times 2 and thus
 331      they are latency*2. */
 332   6,                                 /* cost for loading QImode using movzbl */
 333   {2, 4, 2},                            /* cost of loading integer registers
 334                                            in QImode, HImode and SImode.
 335                                            Relative to reg-reg move (2).  */
 336   {2, 4, 2},                            /* cost of storing integer registers */
 337   2,                                    /* cost of reg,reg fld/fst */
 338   {2, 2, 6},                            /* cost of loading fp registers
 339                                            in SFmode, DFmode and XFmode */
 340   {4, 4, 6},                            /* cost of storing fp registers
 341                                            in SFmode, DFmode and XFmode */
 342   8,                                    /* cost of moving MMX register */
 343   {8, 8},                               /* cost of loading MMX registers
 344                                            in SImode and DImode */
 345   {8, 8},                               /* cost of storing MMX registers
 346                                            in SImode and DImode */
 347   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 348   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 349                                            in 32,64,128,256 and 512-bit */
 350   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 351   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 352                                            in 32,64,128,256 and 512-bit */
 353   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 354   3, 3,                                 /* SSE->integer and integer->SSE moves */
 355   4, 4,                                 /* Gather load static, per_elt.  */
 356   4, 4,                                 /* Gather store static, per_elt.  */
 357   8,                                    /* size of l1 cache.  */
 358   8,                                    /* size of l2 cache  */
 359   0,                                    /* size of prefetch block */
 360   0,                                    /* number of parallel prefetches */
 361   2,                                    /* Branch cost */
 362   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 363   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 364   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 365   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 366   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 367   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 368
 369   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 370   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 371   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
 372   COSTS_N_INSNS (3),                    /* cost of MULSD instruction.  */
 373   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
 374   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
 375   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
 376   COSTS_N_INSNS (39),                   /* cost of DIVSD instruction.  */
 377   COSTS_N_INSNS (70),                   /* cost of SQRTSS instruction.  */
 378   COSTS_N_INSNS (70),                   /* cost of SQRTSD instruction.  */
 379   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 380   pentium_memcpy,
 381   pentium_memset,
 382   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 383   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 384   "16:8:8",                             /* Loop alignment.  */
 385   "16:8:8",                             /* Jump alignment.  */
 386   "0:0:8",                              /* Label alignment.  */
 387   "16",                                 /* Func alignment.  */
 388 };
 389
 390 static const
 391 struct processor_costs lakemont_cost = {  /* Lakemont specific costs; reuses the pentium_memcpy and pentium_memset tables */
 392   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 393   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction; one unit above the add cost */
 394   COSTS_N_INSNS (1),                    /* variable shift costs */
 395   COSTS_N_INSNS (1),                    /* constant shift costs */
 396   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 397    COSTS_N_INSNS (11),                  /*                               HI */
 398    COSTS_N_INSNS (11),                  /*                               SI */
 399    COSTS_N_INSNS (11),                  /*                               DI */
 400    COSTS_N_INSNS (11)},                 /*                            other */
 401   0,                                    /* cost of multiply per each bit set */
 402   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 403    COSTS_N_INSNS (25),                  /*                          HI */
 404    COSTS_N_INSNS (25),                  /*                          SI */
 405    COSTS_N_INSNS (25),                  /*                          DI */
 406    COSTS_N_INSNS (25)},                 /*                          other */
 407   COSTS_N_INSNS (3),                    /* cost of movsx */
 408   COSTS_N_INSNS (2),                    /* cost of movzx */
 409   8,                                    /* "large" insn */
 410   17,                                   /* MOVE_RATIO */
 411
 412   /* All move costs are relative to integer->integer move times 2 and thus
 413      they are latency*2. */
 414   6,                                 /* cost for loading QImode using movzbl */
 415   {2, 4, 2},                            /* cost of loading integer registers
 416                                            in QImode, HImode and SImode.
 417                                            Relative to reg-reg move (2).  */
 418   {2, 4, 2},                            /* cost of storing integer registers */
 419   2,                                    /* cost of reg,reg fld/fst */
 420   {2, 2, 6},                            /* cost of loading fp registers
 421                                            in SFmode, DFmode and XFmode */
 422   {4, 4, 6},                            /* cost of storing fp registers
 423                                            in SFmode, DFmode and XFmode */
 424   8,                                    /* cost of moving MMX register */
 425   {8, 8},                               /* cost of loading MMX registers
 426                                            in SImode and DImode */
 427   {8, 8},                               /* cost of storing MMX registers
 428                                            in SImode and DImode */
 429   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 430   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 431                                            in 32,64,128,256 and 512-bit */
 432   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 433   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 434                                            in 32,64,128,256 and 512-bit */
 435   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 436   3, 3,                                 /* SSE->integer and integer->SSE moves */
 437   4, 4,                                 /* Gather load static, per_elt.  */
 438   4, 4,                                 /* Gather store static, per_elt.  */
 439   8,                                    /* size of l1 cache.  */
 440   8,                                    /* size of l2 cache  */
 441   0,                                    /* size of prefetch block */
 442   0,                                    /* number of parallel prefetches */
 443   2,                                    /* Branch cost */
 444   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 445   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 446   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 447   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 448   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 449   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 450
 451   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 452   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 453   COSTS_N_INSNS (5),                    /* cost of MULSS instruction.  */
 454   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
 455   COSTS_N_INSNS (10),                   /* cost of FMA SS instruction.  */
 456   COSTS_N_INSNS (10),                   /* cost of FMA SD instruction.  */
 457   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
 458   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
 459   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 460   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
 461   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 462   pentium_memcpy,
 463   pentium_memset,
 464   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 465   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 466   "16:8:8",                             /* Loop alignment.  */
 467   "16:8:8",                             /* Jump alignment.  */
 468   "0:0:8",                              /* Label alignment.  */
 469   "16",                                 /* Func alignment.  */
 470 };
 471
 472 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 473    (we ensure the alignment).  For small blocks an inline loop is still a
 474    noticeable win; for bigger blocks either rep movsl or rep movsb is
 475    the way to go.  Rep movsb apparently has a more expensive startup time in
 476    the CPU, but after 4K the difference is down in the noise.
        The first entry selects loop up to 128, unrolled_loop up to 1024,
        rep_prefix_4_byte up to 8192 and rep_prefix_1_byte for the -1 entry;
        the second entry is DUMMY_STRINGOP_ALGS (libcall only).
        NOTE(review): presumably the thresholds are block sizes in bytes, -1
        means "no upper bound", and entries [0]/[1] are for 32-bit/64-bit
        code -- confirm against the stringop_algs declaration.  */
 477 static stringop_algs pentiumpro_memcpy[2] = {
 478   {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
 479                        {8192, rep_prefix_4_byte, false},
 480                        {-1, rep_prefix_1_byte, false}}},
 481   DUMMY_STRINGOP_ALGS};
 482 static stringop_algs pentiumpro_memset[2] = {
       /* memset strategy table referenced by pentiumpro_cost.  The first entry
          selects unrolled_loop up to 1024, rep_prefix_4_byte up to 8192 and
          libcall for the -1 entry; the second entry is DUMMY_STRINGOP_ALGS
          (libcall only).
          NOTE(review): presumably the thresholds are block sizes in bytes and
          -1 means "no upper bound" -- confirm against the stringop_algs
          declaration.  */
 483   {rep_prefix_4_byte, {{1024, unrolled_loop, false},
 484                        {8192, rep_prefix_4_byte, false},
 485                        {-1, libcall, false}}},
 486   DUMMY_STRINGOP_ALGS};
 487 static const
 488 struct processor_costs pentiumpro_cost = {
       /* Cost table for PentiumPro tuning.  Every initializer below is
          positional, so the order must match the field order of struct
          processor_costs (declared outside this chunk); the trailing comment
          on each line names the field that value fills.  Instruction costs
          go through COSTS_N_INSNS and are relative to an add (see the comment
          at the top of this file).
          NOTE(review): the units of the cache-size/prefetch fields and the
          exact syntax of the alignment strings are not visible here --
          confirm against the struct declaration.  */
 489   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 490   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 491   COSTS_N_INSNS (1),                    /* variable shift costs */
 492   COSTS_N_INSNS (1),                    /* constant shift costs */
 493   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 494    COSTS_N_INSNS (4),                   /*                               HI */
 495    COSTS_N_INSNS (4),                   /*                               SI */
 496    COSTS_N_INSNS (4),                   /*                               DI */
 497    COSTS_N_INSNS (4)},                  /*                            other */
 498   0,                                    /* cost of multiply per each bit set */
 499   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 500    COSTS_N_INSNS (17),                  /*                          HI */
 501    COSTS_N_INSNS (17),                  /*                          SI */
 502    COSTS_N_INSNS (17),                  /*                          DI */
 503    COSTS_N_INSNS (17)},                 /*                          other */
 504   COSTS_N_INSNS (1),                    /* cost of movsx */
 505   COSTS_N_INSNS (1),                    /* cost of movzx */
 506   8,                                    /* "large" insn */
 507   6,                                    /* MOVE_RATIO */
 508
 509   /* All move costs are relative to integer->integer move times 2 and thus
 510      they are latency*2. */
 511   2,                                 /* cost for loading QImode using movzbl */
 512   {4, 4, 4},                            /* cost of loading integer registers
 513                                            in QImode, HImode and SImode.
 514                                            Relative to reg-reg move (2).  */
 515   {2, 2, 2},                            /* cost of storing integer registers */
 516   2,                                    /* cost of reg,reg fld/fst */
 517   {2, 2, 6},                            /* cost of loading fp registers
 518                                            in SFmode, DFmode and XFmode */
 519   {4, 4, 6},                            /* cost of storing fp registers
 520                                            in SFmode, DFmode and XFmode */
 521   2,                                    /* cost of moving MMX register */
 522   {2, 2},                               /* cost of loading MMX registers
 523                                            in SImode and DImode */
 524   {2, 2},                               /* cost of storing MMX registers
 525                                            in SImode and DImode */
 526   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 527   {4, 8, 16, 32, 64},                   /* cost of loading SSE registers
 528                                            in 32,64,128,256 and 512-bit */
 529   {4, 8, 16, 32, 64},                   /* cost of unaligned loads.  */
 530   {4, 8, 16, 32, 64},                   /* cost of storing SSE registers
 531                                            in 32,64,128,256 and 512-bit */
 532   {4, 8, 16, 32, 64},                   /* cost of unaligned stores.  */
 533   3, 3,                                 /* SSE->integer and integer->SSE moves */
 534   4, 4,                                 /* Gather load static, per_elt.  */
 535   4, 4,                                 /* Gather store static, per_elt.  */
 536   8,                                    /* size of l1 cache.  */
 537   256,                                  /* size of l2 cache  */
 538   32,                                   /* size of prefetch block */
 539   6,                                    /* number of parallel prefetches */
 540   2,                                    /* Branch cost */
 541   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 542   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 543   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 544   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 545   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 546   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 547
 548   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 549   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 550   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 551   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 552   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
 553   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
 554   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
 555   COSTS_N_INSNS (18),                   /* cost of DIVSD instruction.  */
 556   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
 557   COSTS_N_INSNS (31),                   /* cost of SQRTSD instruction.  */
 558   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 559   pentiumpro_memcpy,                    /* memcpy strategy table (defined above).  */
 560   pentiumpro_memset,                    /* memset strategy table (defined above).  */
 561   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 562   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 563   "16",                                 /* Loop alignment.  */
 564   "16:11:8",                            /* Jump alignment.  */
 565   "0:0:8",                              /* Label alignment.  */
 566   "16",                                 /* Func alignment.  */
 567 };
 568
 569 static stringop_algs geode_memcpy[2] = {
       /* memcpy strategy table referenced by geode_cost.  The first entry
          selects rep_prefix_4_byte up to 256 and libcall for the -1 entry;
          the second entry is DUMMY_STRINGOP_ALGS (libcall only).
          NOTE(review): presumably 256 is a block size in bytes and -1 means
          "no upper bound" -- confirm against the stringop_algs declaration.  */
 570   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 571   DUMMY_STRINGOP_ALGS};
 572 static stringop_algs geode_memset[2] = {
       /* memset strategy table referenced by geode_cost; it holds the same
          values as geode_memcpy: rep_prefix_4_byte up to 256, libcall for the
          -1 entry, and DUMMY_STRINGOP_ALGS (libcall only) as second entry.
          NOTE(review): presumably 256 is a block size in bytes and -1 means
          "no upper bound" -- confirm against the stringop_algs declaration.  */
 573   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 574   DUMMY_STRINGOP_ALGS};
 575 static const
 576 struct processor_costs geode_cost = {
       /* Cost table for Geode tuning.  Every initializer below is positional,
          so the order must match the field order of struct processor_costs
          (declared outside this chunk); the trailing comment on each line
          names the field that value fills.  Instruction costs go through
          COSTS_N_INSNS and are relative to an add (see the comment at the top
          of this file).
          NOTE(review): the units of the cache-size/prefetch fields are not
          visible here -- confirm against the struct declaration.  */
 577   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 578   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 579   COSTS_N_INSNS (2),                    /* variable shift costs */
 580   COSTS_N_INSNS (1),                    /* constant shift costs */
 581   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 582    COSTS_N_INSNS (4),                   /*                               HI */
 583    COSTS_N_INSNS (7),                   /*                               SI */
 584    COSTS_N_INSNS (7),                   /*                               DI */
 585    COSTS_N_INSNS (7)},                  /*                            other */
 586   0,                                    /* cost of multiply per each bit set */
 587   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 588    COSTS_N_INSNS (23),                  /*                          HI */
 589    COSTS_N_INSNS (39),                  /*                          SI */
 590    COSTS_N_INSNS (39),                  /*                          DI */
 591    COSTS_N_INSNS (39)},                 /*                          other */
 592   COSTS_N_INSNS (1),                    /* cost of movsx */
 593   COSTS_N_INSNS (1),                    /* cost of movzx */
 594   8,                                    /* "large" insn */
 595   4,                                    /* MOVE_RATIO */
 596
 597   /* All move costs are relative to integer->integer move times 2 and thus
 598      they are latency*2. */
 599   2,                                 /* cost for loading QImode using movzbl */
 600   {2, 2, 2},                            /* cost of loading integer registers
 601                                            in QImode, HImode and SImode.
 602                                            Relative to reg-reg move (2).  */
 603   {2, 2, 2},                            /* cost of storing integer registers */
 604   2,                                    /* cost of reg,reg fld/fst */
 605   {2, 2, 2},                            /* cost of loading fp registers
 606                                            in SFmode, DFmode and XFmode */
 607   {4, 6, 6},                            /* cost of storing fp registers
 608                                            in SFmode, DFmode and XFmode */
 609
 610   2,                                    /* cost of moving MMX register */
 611   {2, 2},                               /* cost of loading MMX registers
 612                                            in SImode and DImode */
 613   {2, 2},                               /* cost of storing MMX registers
 614                                            in SImode and DImode */
 615   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 616   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 617                                            in 32,64,128,256 and 512-bit */
 618   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 619   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 620                                            in 32,64,128,256 and 512-bit */
 621   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 622   6, 6,                                 /* SSE->integer and integer->SSE moves */
 623   2, 2,                                 /* Gather load static, per_elt.  */
 624   2, 2,                                 /* Gather store static, per_elt.  */
 625   64,                                   /* size of l1 cache.  */
 626   128,                                  /* size of l2 cache.  */
 627   32,                                   /* size of prefetch block */
 628   1,                                    /* number of parallel prefetches */
 629   1,                                    /* Branch cost */
 630   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 631   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 632   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 633   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 634   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 635   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 636
 637   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 638   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 639   COSTS_N_INSNS (11),                   /* cost of MULSS instruction.  */
 640   COSTS_N_INSNS (11),                   /* cost of MULSD instruction.  */
 641   COSTS_N_INSNS (17),                   /* cost of FMA SS instruction.  */
 642   COSTS_N_INSNS (17),                   /* cost of FMA SD instruction.  */
 643   COSTS_N_INSNS (47),                   /* cost of DIVSS instruction.  */
 644   COSTS_N_INSNS (47),                   /* cost of DIVSD instruction.  */
 645   COSTS_N_INSNS (54),                   /* cost of SQRTSS instruction.  */
 646   COSTS_N_INSNS (54),                   /* cost of SQRTSD instruction.  */
 647   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 648   geode_memcpy,                         /* memcpy strategy table (defined above).  */
 649   geode_memset,                         /* memset strategy table (defined above).  */
 650   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 651   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
       /* No alignment strings are supplied for Geode.  NOTE(review): presumably
          NULL leaves the corresponding alignment at its default -- confirm
          with the code that consumes these fields.  */
 652   NULL,                                 /* Loop alignment.  */
 653   NULL,                                 /* Jump alignment.  */
 654   NULL,                                 /* Label alignment.  */
 655   NULL,                                 /* Func alignment.  */
 656 };
 657
 658 static stringop_algs k6_memcpy[2] = {
       /* memcpy strategy table referenced by k6_cost.  The first entry selects
          rep_prefix_4_byte up to 256 and libcall for the -1 entry; the second
          entry is DUMMY_STRINGOP_ALGS (libcall only).
          NOTE(review): presumably 256 is a block size in bytes and -1 means
          "no upper bound" -- confirm against the stringop_algs declaration.  */
 659   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 660   DUMMY_STRINGOP_ALGS};
 661 static stringop_algs k6_memset[2] = {
       /* memset strategy table referenced by k6_cost; it holds the same values
          as k6_memcpy: rep_prefix_4_byte up to 256, libcall for the -1 entry,
          and DUMMY_STRINGOP_ALGS (libcall only) as second entry.
          NOTE(review): presumably 256 is a block size in bytes and -1 means
          "no upper bound" -- confirm against the stringop_algs declaration.  */
 662   {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 663   DUMMY_STRINGOP_ALGS};
 664 static const
 665 struct processor_costs k6_cost = {
       /* Cost table for K6 tuning.  Every initializer below is positional, so
          the order must match the field order of struct processor_costs
          (declared outside this chunk); the trailing comment on each line
          names the field that value fills.  Instruction costs go through
          COSTS_N_INSNS and are relative to an add (see the comment at the top
          of this file).
          NOTE(review): the units of the cache-size/prefetch fields and the
          exact syntax of the alignment strings are not visible here --
          confirm against the struct declaration.  */
 666   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 667   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 668   COSTS_N_INSNS (1),                    /* variable shift costs */
 669   COSTS_N_INSNS (1),                    /* constant shift costs */
 670   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 671    COSTS_N_INSNS (3),                   /*                               HI */
 672    COSTS_N_INSNS (3),                   /*                               SI */
 673    COSTS_N_INSNS (3),                   /*                               DI */
 674    COSTS_N_INSNS (3)},                  /*                            other */
 675   0,                                    /* cost of multiply per each bit set */
 676   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 677    COSTS_N_INSNS (18),                  /*                          HI */
 678    COSTS_N_INSNS (18),                  /*                          SI */
 679    COSTS_N_INSNS (18),                  /*                          DI */
 680    COSTS_N_INSNS (18)},                 /*                          other */
 681   COSTS_N_INSNS (2),                    /* cost of movsx */
 682   COSTS_N_INSNS (2),                    /* cost of movzx */
 683   8,                                    /* "large" insn */
 684   4,                                    /* MOVE_RATIO */
 685
 686   /* All move costs are relative to integer->integer move times 2 and thus
 687      they are latency*2. */
 688   3,                                 /* cost for loading QImode using movzbl */
 689   {4, 5, 4},                            /* cost of loading integer registers
 690                                            in QImode, HImode and SImode.
 691                                            Relative to reg-reg move (2).  */
 692   {2, 3, 2},                            /* cost of storing integer registers */
 693   4,                                    /* cost of reg,reg fld/fst */
 694   {6, 6, 6},                            /* cost of loading fp registers
 695                                            in SFmode, DFmode and XFmode */
 696   {4, 4, 4},                            /* cost of storing fp registers
 697                                            in SFmode, DFmode and XFmode */
 698   2,                                    /* cost of moving MMX register */
 699   {2, 2},                               /* cost of loading MMX registers
 700                                            in SImode and DImode */
 701   {2, 2},                               /* cost of storing MMX registers
 702                                            in SImode and DImode */
 703   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 704   {2, 2, 8, 16, 32},                    /* cost of loading SSE registers
 705                                            in 32,64,128,256 and 512-bit */
 706   {2, 2, 8, 16, 32},                    /* cost of unaligned loads.  */
 707   {2, 2, 8, 16, 32},                    /* cost of storing SSE registers
 708                                            in 32,64,128,256 and 512-bit */
 709   {2, 2, 8, 16, 32},                    /* cost of unaligned stores.  */
 710   6, 6,                                 /* SSE->integer and integer->SSE moves */
 711   2, 2,                                 /* Gather load static, per_elt.  */
 712   2, 2,                                 /* Gather store static, per_elt.  */
 713   32,                                   /* size of l1 cache.  */
 714   32,                                   /* size of l2 cache.  Some models
 715                                            have integrated l2 cache, but
 716                                            optimizing for k6 is not important
 717                                            enough to worry about that.  */
 718   32,                                   /* size of prefetch block */
 719   1,                                    /* number of parallel prefetches */
 720   1,                                    /* Branch cost */
 721   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 722   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 723   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 724   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 725   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 726   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 727
 728   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
 729   COSTS_N_INSNS (2),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 730   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
 731   COSTS_N_INSNS (2),                    /* cost of MULSD instruction.  */
 732   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
 733   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
 734   COSTS_N_INSNS (56),                   /* cost of DIVSS instruction.  */
 735   COSTS_N_INSNS (56),                   /* cost of DIVSD instruction.  */
 736   COSTS_N_INSNS (56),                   /* cost of SQRTSS instruction.  */
 737   COSTS_N_INSNS (56),                   /* cost of SQRTSD instruction.  */
 738   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 739   k6_memcpy,                            /* memcpy strategy table (defined above).  */
 740   k6_memset,                            /* memset strategy table (defined above).  */
 741   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 742   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 743   "32:8:8",                             /* Loop alignment.  */
 744   "32:8:8",                             /* Jump alignment.  */
 745   "0:0:8",                              /* Label alignment.  */
 746   "32",                                 /* Func alignment.  */
 747 };
 748
 749 /* For some reason, Athlon deals better with the REP prefix (relative to
 750    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
 751    and 128 bytes for memset.
        The first entry selects rep_prefix_4_byte up to 2048 and libcall for
        the -1 entry; the second entry is DUMMY_STRINGOP_ALGS (libcall only).
        NOTE(review): presumably 2048 is a block size in bytes and -1 means
        "no upper bound" -- confirm against the stringop_algs declaration.  */
 752 static stringop_algs athlon_memcpy[2] = {
 753   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 754   DUMMY_STRINGOP_ALGS};
 755 static stringop_algs athlon_memset[2] = {
       /* memset strategy table referenced by athlon_cost; it holds the same
          values as athlon_memcpy: rep_prefix_4_byte up to 2048, libcall for
          the -1 entry, and DUMMY_STRINGOP_ALGS (libcall only) as second entry.
          NOTE(review): presumably 2048 is a block size in bytes and -1 means
          "no upper bound" -- confirm against the stringop_algs declaration.  */
 756   {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 757   DUMMY_STRINGOP_ALGS};
 758 static const
 759 struct processor_costs athlon_cost = {
       /* Cost table for Athlon tuning.  Every initializer below is positional,
          so the order must match the field order of struct processor_costs
          (declared outside this chunk); the trailing comment on each line
          names the field that value fills.  Instruction costs go through
          COSTS_N_INSNS and are relative to an add (see the comment at the top
          of this file).
          NOTE(review): the units of the cache-size/prefetch fields and the
          exact syntax of the alignment strings are not visible here --
          confirm against the struct declaration.  */
 760   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 761   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 762   COSTS_N_INSNS (1),                    /* variable shift costs */
 763   COSTS_N_INSNS (1),                    /* constant shift costs */
 764   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 765    COSTS_N_INSNS (5),                   /*                               HI */
 766    COSTS_N_INSNS (5),                   /*                               SI */
 767    COSTS_N_INSNS (5),                   /*                               DI */
 768    COSTS_N_INSNS (5)},                  /*                            other */
 769   0,                                    /* cost of multiply per each bit set */
 770   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 771    COSTS_N_INSNS (26),                  /*                          HI */
 772    COSTS_N_INSNS (42),                  /*                          SI */
 773    COSTS_N_INSNS (74),                  /*                          DI */
 774    COSTS_N_INSNS (74)},                 /*                          other */
 775   COSTS_N_INSNS (1),                    /* cost of movsx */
 776   COSTS_N_INSNS (1),                    /* cost of movzx */
 777   8,                                    /* "large" insn */
 778   9,                                    /* MOVE_RATIO */
 779
 780   /* All move costs are relative to integer->integer move times 2 and thus
 781      they are latency*2. */
 782   4,                                 /* cost for loading QImode using movzbl */
 783   {3, 4, 3},                            /* cost of loading integer registers
 784                                            in QImode, HImode and SImode.
 785                                            Relative to reg-reg move (2).  */
 786   {3, 4, 3},                            /* cost of storing integer registers */
 787   4,                                    /* cost of reg,reg fld/fst */
 788   {4, 4, 12},                           /* cost of loading fp registers
 789                                            in SFmode, DFmode and XFmode */
 790   {6, 6, 8},                            /* cost of storing fp registers
 791                                            in SFmode, DFmode and XFmode */
 792   2,                                    /* cost of moving MMX register */
 793   {4, 4},                               /* cost of loading MMX registers
 794                                            in SImode and DImode */
 795   {4, 4},                               /* cost of storing MMX registers
 796                                            in SImode and DImode */
 797   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 798   {4, 4, 12, 12, 24},                   /* cost of loading SSE registers
 799                                            in 32,64,128,256 and 512-bit */
 800   {4, 4, 12, 12, 24},                   /* cost of unaligned loads.  */
 801   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
 802                                            in 32,64,128,256 and 512-bit */
 803   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
 804   5, 5,                                 /* SSE->integer and integer->SSE moves */
 805   4, 4,                                 /* Gather load static, per_elt.  */
 806   4, 4,                                 /* Gather store static, per_elt.  */
 807   64,                                   /* size of l1 cache.  */
 808   256,                                  /* size of l2 cache.  */
 809   64,                                   /* size of prefetch block */
 810   6,                                    /* number of parallel prefetches */
 811   5,                                    /* Branch cost */
 812   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 813   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 814   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
 815   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 816   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 817   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 818
 819   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 820   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 821   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 822   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 823   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 824   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 825   /* 11-16  NOTE(review): presumably the DIVSS latency range in cycles, of
          which the upper bound is used below -- confirm.  */
 826   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 827   COSTS_N_INSNS (24),                   /* cost of DIVSD instruction.  */
 828   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 829   COSTS_N_INSNS (19),                   /* cost of SQRTSD instruction.  */
 830   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 831   athlon_memcpy,                        /* memcpy strategy table (defined above).  */
 832   athlon_memset,                        /* memset strategy table (defined above).  */
 833   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 834   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
 835   "16:8:8",                             /* Loop alignment.  */
 836   "16:8:8",                             /* Jump alignment.  */
 837   "0:0:8",                              /* Label alignment.  */
 838   "16",                                 /* Func alignment.  */
 839 };
 840
 841 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
 842    small blocks it is better to use a loop.  For large blocks, libcall can
 843    do non-temporal accesses and beat inline code considerably.
        The first entry selects loop up to 6, unrolled_loop up to 14 and
        rep_prefix_4_byte for the -1 entry; the second entry selects loop up
        to 16, rep_prefix_8_byte up to 8192 and libcall for the -1 entry.
        NOTE(review): presumably the thresholds are block sizes in bytes, -1
        means "no upper bound", and entries [0]/[1] are for 32-bit/64-bit
        code -- confirm against the stringop_algs declaration.  */
 844 static stringop_algs k8_memcpy[2] = {
 845   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 846              {-1, rep_prefix_4_byte, false}}},
 847   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 848              {-1, libcall, false}}}};
 849 static stringop_algs k8_memset[2] = {
       /* memset strategy table referenced by k8_cost.  The first entry selects
          loop up to 8, unrolled_loop up to 24, rep_prefix_4_byte up to 2048
          and libcall for the -1 entry; the second entry selects unrolled_loop
          up to 48, rep_prefix_8_byte up to 8192 and libcall for the -1 entry.
          NOTE(review): presumably the thresholds are block sizes in bytes, -1
          means "no upper bound", and entries [0]/[1] are for 32-bit/64-bit
          code -- confirm against the stringop_algs declaration.  */
 850   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 851              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 852   {libcall, {{48, unrolled_loop, false},
 853              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
 854 static const
 855 struct processor_costs k8_cost = {
       /* Cost table for K8 tuning.  Every initializer below is positional, so
          the order must match the field order of struct processor_costs
          (declared outside this chunk); the trailing comment on each line
          names the field that value fills.  Instruction costs go through
          COSTS_N_INSNS and are relative to an add (see the comment at the top
          of this file).
          NOTE(review): the units of the cache-size/prefetch fields and the
          exact syntax of the alignment strings are not visible here --
          confirm against the struct declaration.  */
 856   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 857   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 858   COSTS_N_INSNS (1),                    /* variable shift costs */
 859   COSTS_N_INSNS (1),                    /* constant shift costs */
 860   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 861    COSTS_N_INSNS (4),                   /*                               HI */
 862    COSTS_N_INSNS (3),                   /*                               SI */
 863    COSTS_N_INSNS (4),                   /*                               DI */
 864    COSTS_N_INSNS (5)},                  /*                            other */
 865   0,                                    /* cost of multiply per each bit set */
 866   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 867    COSTS_N_INSNS (26),                  /*                          HI */
 868    COSTS_N_INSNS (42),                  /*                          SI */
 869    COSTS_N_INSNS (74),                  /*                          DI */
 870    COSTS_N_INSNS (74)},                 /*                          other */
 871   COSTS_N_INSNS (1),                    /* cost of movsx */
 872   COSTS_N_INSNS (1),                    /* cost of movzx */
 873   8,                                    /* "large" insn */
 874   9,                                    /* MOVE_RATIO */
 875
 876   /* All move costs are relative to integer->integer move times 2 and thus
 877      they are latency*2. */
 878   4,                                 /* cost for loading QImode using movzbl */
 879   {3, 4, 3},                            /* cost of loading integer registers
 880                                            in QImode, HImode and SImode.
 881                                            Relative to reg-reg move (2).  */
 882   {3, 4, 3},                            /* cost of storing integer registers */
 883   4,                                    /* cost of reg,reg fld/fst */
 884   {4, 4, 12},                           /* cost of loading fp registers
 885                                            in SFmode, DFmode and XFmode */
 886   {6, 6, 8},                            /* cost of storing fp registers
 887                                            in SFmode, DFmode and XFmode */
 888   2,                                    /* cost of moving MMX register */
 889   {3, 3},                               /* cost of loading MMX registers
 890                                            in SImode and DImode */
 891   {4, 4},                               /* cost of storing MMX registers
 892                                            in SImode and DImode */
 893   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 894   {4, 3, 12, 12, 24},                   /* cost of loading SSE registers
 895                                            in 32,64,128,256 and 512-bit */
 896   {4, 3, 12, 12, 24},                   /* cost of unaligned loads.  */
 897   {4, 4, 10, 10, 20},                   /* cost of storing SSE registers
 898                                            in 32,64,128,256 and 512-bit */
 899   {4, 4, 10, 10, 20},                   /* cost of unaligned stores.  */
 900   5, 5,                                 /* SSE->integer and integer->SSE moves */
 901   4, 4,                                 /* Gather load static, per_elt.  */
 902   4, 4,                                 /* Gather store static, per_elt.  */
 903   64,                                   /* size of l1 cache.  */
 904   512,                                  /* size of l2 cache.  */
 905   64,                                   /* size of prefetch block */
 906   /* New AMD processors never drop prefetches; if they cannot be performed
 907      immediately, they are queued.  We set number of simultaneous prefetches
 908      to a large constant to reflect this (it is probably not a good idea to
 909      leave the number of prefetches completely unlimited, as their execution
 910      also takes some time).  */
 911   100,                                  /* number of parallel prefetches */
 912   3,                                    /* Branch cost */
 913   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 914   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 915   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 916   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 917   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 918   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 919
 920   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
 921   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
 922   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
 923   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
 924   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
 925   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
 926   /* 11-16  NOTE(review): presumably the DIVSS latency range in cycles, of
          which the upper bound is used below -- confirm.  */
 927   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
 928   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
 929   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
 930   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
 931   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
 932   k8_memcpy,                            /* memcpy strategy table (defined above).  */
 933   k8_memset,                            /* memset strategy table (defined above).  */
 934   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
 935   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
 936   "16:8:8",                             /* Loop alignment.  */
 937   "16:8:8",                             /* Jump alignment.  */
 938   "0:0:8",                              /* Label alignment.  */
 939   "16",                                 /* Func alignment.  */
 940 };
 941
 942 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
 943    for very small blocks it is better to use a loop.  For large blocks, a
 944    libcall can use non-temporal accesses and beat inline code considerably.  */
 945 static stringop_algs amdfam10_memcpy[2] = {
 946   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
 947              {-1, rep_prefix_4_byte, false}}},
 948   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
 949              {-1, libcall, false}}}};
 950 static stringop_algs amdfam10_memset[2] = {  /* AMDFAM10 memset algorithms.  */
 951   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
 952              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
 953   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
 954              {-1, libcall, false}}}};  /* -1 presumably means "no size limit" -- TODO confirm.  */
 955 struct processor_costs amdfam10_cost = {  /* AMDFAM10 specific costs */
 956   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 957   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 958   COSTS_N_INSNS (1),                    /* variable shift costs */
 959   COSTS_N_INSNS (1),                    /* constant shift costs */
 960   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 961    COSTS_N_INSNS (4),                   /*                               HI */
 962    COSTS_N_INSNS (3),                   /*                               SI */
 963    COSTS_N_INSNS (4),                   /*                               DI */
 964    COSTS_N_INSNS (5)},                  /*                            other */
 965   0,                                    /* cost of multiply per each bit set */
 966   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
 967    COSTS_N_INSNS (35),                  /*                          HI */
 968    COSTS_N_INSNS (51),                  /*                          SI */
 969    COSTS_N_INSNS (83),                  /*                          DI */
 970    COSTS_N_INSNS (83)},                 /*                          other */
 971   COSTS_N_INSNS (1),                    /* cost of movsx */
 972   COSTS_N_INSNS (1),                    /* cost of movzx */
 973   8,                                    /* "large" insn */
 974   9,                                    /* MOVE_RATIO */
 975
 976   /* All move costs are relative to integer->integer move times 2 and thus
 977      they are latency*2.  */
 978   4,                                 /* cost for loading QImode using movzbl */
 979   {3, 4, 3},                            /* cost of loading integer registers
 980                                            in QImode, HImode and SImode.
 981                                            Relative to reg-reg move (2).  */
 982   {3, 4, 3},                            /* cost of storing integer registers */
 983   4,                                    /* cost of reg,reg fld/fst */
 984   {4, 4, 12},                           /* cost of loading fp registers
 985                                            in SFmode, DFmode and XFmode */
 986   {6, 6, 8},                            /* cost of storing fp registers
 987                                            in SFmode, DFmode and XFmode */
 988   2,                                    /* cost of moving MMX register */
 989   {3, 3},                               /* cost of loading MMX registers
 990                                            in SImode and DImode */
 991   {4, 4},                               /* cost of storing MMX registers
 992                                            in SImode and DImode */
 993   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
 994   {4, 4, 3, 6, 12},                     /* cost of loading SSE registers
 995                                            in 32,64,128,256 and 512-bit */
 996   {4, 4, 3, 7, 12},                     /* cost of unaligned loads.  */
 997   {4, 4, 5, 10, 20},                    /* cost of storing SSE registers
 998                                            in 32,64,128,256 and 512-bit */
 999   {4, 4, 5, 10, 20},                    /* cost of unaligned stores.  */
1000   3, 3,                                 /* SSE->integer and integer->SSE moves */
1001                                         /* On K8:
1002                                             MOVD reg64, xmmreg Double FSTORE 4
1003                                             MOVD reg32, xmmreg Double FSTORE 4
1004                                            On AMDFAM10:
1005                                             MOVD reg64, xmmreg Double FADD 3
1006                                                                1/1  1/1
1007                                             MOVD reg32, xmmreg Double FADD 3
1008                                                                1/1  1/1 */
1009   4, 4,                                 /* Gather load static, per_elt.  */
1010   4, 4,                                 /* Gather store static, per_elt.  */
1011   64,                                   /* size of l1 cache.  */
1012   512,                                  /* size of l2 cache.  */
1013   64,                                   /* size of prefetch block */
1014   /* New AMD processors never drop prefetches; if they cannot be performed
1015      immediately, they are queued.  We set the number of simultaneous
1016      prefetches to a large constant to reflect this (leaving the number of
1017      prefetches completely unlimited is probably not a good idea, as their
1018      execution also takes some time).  */
1019   100,                                  /* number of parallel prefetches */
1020   2,                                    /* Branch cost */
1021   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1022   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1023   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1024   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1025   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1026   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1027
1028   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1029   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1030   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1031   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1032   COSTS_N_INSNS (8),                    /* cost of FMA SS instruction.  */
1033   COSTS_N_INSNS (8),                    /* cost of FMA SD instruction.  */
1034   /* 11-16 (presumably the DIVSS latency range; the upper bound is used -- TODO confirm).  */
1035   COSTS_N_INSNS (16),                   /* cost of DIVSS instruction.  */
1036   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
1037   COSTS_N_INSNS (19),                   /* cost of SQRTSS instruction.  */
1038   COSTS_N_INSNS (27),                   /* cost of SQRTSD instruction.  */
1039   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1040   amdfam10_memcpy,
1041   amdfam10_memset,
1042   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1043   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1044   "32:25:8",                            /* Loop alignment.  */
1045   "32:8:8",                             /* Jump alignment.  */
1046   "0:0:8",                              /* Label alignment.  */
1047   "32",                                 /* Func alignment.  */
1048 };
1049
1050 /*  BDVER has an optimized REP instruction for medium-sized blocks, but for
1051     very small blocks it is better to use a loop.  For large blocks, a libcall
1052     can use non-temporal accesses and beat inline code considerably.  */
1053 static stringop_algs bdver_memcpy[2] = {
1054   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1055              {-1, rep_prefix_4_byte, false}}},
1056   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1057              {-1, libcall, false}}}};
1058 static stringop_algs bdver_memset[2] = {  /* BDVER memset algorithms.  */
1059   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1060              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1061   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1062              {-1, libcall, false}}}};  /* -1 presumably means "no size limit" -- TODO confirm.  */
1063
1064 const struct processor_costs bdver_cost = {  /* BDVER specific costs */
1065   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1066   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1067   COSTS_N_INSNS (1),                    /* variable shift costs */
1068   COSTS_N_INSNS (1),                    /* constant shift costs */
1069   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
1070    COSTS_N_INSNS (4),                   /*                               HI */
1071    COSTS_N_INSNS (4),                   /*                               SI */
1072    COSTS_N_INSNS (6),                   /*                               DI */
1073    COSTS_N_INSNS (6)},                  /*                            other */
1074   0,                                    /* cost of multiply per each bit set */
1075   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1076    COSTS_N_INSNS (35),                  /*                          HI */
1077    COSTS_N_INSNS (51),                  /*                          SI */
1078    COSTS_N_INSNS (83),                  /*                          DI */
1079    COSTS_N_INSNS (83)},                 /*                          other */
1080   COSTS_N_INSNS (1),                    /* cost of movsx */
1081   COSTS_N_INSNS (1),                    /* cost of movzx */
1082   8,                                    /* "large" insn */
1083   9,                                    /* MOVE_RATIO */
1084
1085   /* All move costs are relative to integer->integer move times 2 and thus
1086      they are latency*2.  */
1087   8,                                 /* cost for loading QImode using movzbl */
1088   {8, 8, 8},                            /* cost of loading integer registers
1089                                            in QImode, HImode and SImode.
1090                                            Relative to reg-reg move (2).  */
1091   {8, 8, 8},                            /* cost of storing integer registers */
1092   4,                                    /* cost of reg,reg fld/fst */
1093   {12, 12, 28},                         /* cost of loading fp registers
1094                                            in SFmode, DFmode and XFmode */
1095   {10, 10, 18},                         /* cost of storing fp registers
1096                                            in SFmode, DFmode and XFmode */
1097   4,                                    /* cost of moving MMX register */
1098   {12, 12},                             /* cost of loading MMX registers
1099                                            in SImode and DImode */
1100   {10, 10},                             /* cost of storing MMX registers
1101                                            in SImode and DImode */
1102   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1103   {12, 12, 10, 40, 60},                 /* cost of loading SSE registers
1104                                            in 32,64,128,256 and 512-bit */
1105   {12, 12, 10, 40, 60},                 /* cost of unaligned loads.  */
1106   {10, 10, 10, 40, 60},                 /* cost of storing SSE registers
1107                                            in 32,64,128,256 and 512-bit */
1108   {10, 10, 10, 40, 60},                 /* cost of unaligned stores.  */
1109   16, 20,                               /* SSE->integer and integer->SSE moves */
1110   12, 12,                               /* Gather load static, per_elt.  */
1111   10, 10,                               /* Gather store static, per_elt.  */
1112   16,                                   /* size of l1 cache.  */
1113   2048,                                 /* size of l2 cache.  */
1114   64,                                   /* size of prefetch block */
1115   /* New AMD processors never drop prefetches; if they cannot be performed
1116      immediately, they are queued.  We set the number of simultaneous
1117      prefetches to a large constant to reflect this (leaving the number of
1118      prefetches completely unlimited is probably not a good idea, as their
1119      execution also takes some time).  */
1120   100,                                  /* number of parallel prefetches */
1121   2,                                    /* Branch cost */
1122   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1123   COSTS_N_INSNS (6),                    /* cost of FMUL instruction.  */
1124   COSTS_N_INSNS (42),                   /* cost of FDIV instruction.  */
1125   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1126   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1127   COSTS_N_INSNS (52),                   /* cost of FSQRT instruction.  */
1128
1129   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1130   COSTS_N_INSNS (6),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1131   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1132   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1133   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1134   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1135   /* 9-24 (presumably the DIVSS latency range; the upper bound is used -- TODO confirm).  */
1136   COSTS_N_INSNS (24),                   /* cost of DIVSS instruction.  */
1137   /* 9-27 (presumably the DIVSD latency range; the upper bound is used -- TODO confirm).  */
1138   COSTS_N_INSNS (27),                   /* cost of DIVSD instruction.  */
1139   COSTS_N_INSNS (15),                   /* cost of SQRTSS instruction.  */
1140   COSTS_N_INSNS (26),                   /* cost of SQRTSD instruction.  */
1141   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1142   bdver_memcpy,
1143   bdver_memset,
1144   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1145   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1146   "16:11:8",                            /* Loop alignment.  */
1147   "16:8:8",                             /* Jump alignment.  */
1148   "0:0:8",                              /* Label alignment.  */
1149   "11",                                 /* Func alignment.  NOTE(review): odd value, the sibling tables use "16"/"32" -- confirm.  */
1150 };
1151
1152
1153 /*  ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1154     very small blocks it is better to use a loop.  For large blocks, a libcall
1155     can use non-temporal accesses and beat inline code considerably.  */
1156 static stringop_algs znver1_memcpy[2] = {
1157   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1158              {-1, rep_prefix_4_byte, false}}},
1159   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1160              {-1, libcall, false}}}};
1161 static stringop_algs znver1_memset[2] = {  /* ZNVER1 memset algorithms.  */
1162   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1163              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1164   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1165              {-1, libcall, false}}}};  /* -1 presumably means "no size limit" -- TODO confirm.  */
1166 struct processor_costs znver1_cost = {  /* ZNVER1 specific costs */
1167   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1168   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1169   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1170   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1171   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1172    COSTS_N_INSNS (3),                   /*                               HI.  */
1173    COSTS_N_INSNS (3),                   /*                               SI.  */
1174    COSTS_N_INSNS (3),                   /*                               DI.  */
1175    COSTS_N_INSNS (3)},                  /*                            other.  */
1176   0,                                    /* cost of multiply per each bit
1177                                             set.  */
1178    /* Depending on parameters, idiv can get faster on ryzen.  This is the upper
1179       bound.  */
1180   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1181    COSTS_N_INSNS (22),                  /*                          HI.  */
1182    COSTS_N_INSNS (30),                  /*                          SI.  */
1183    COSTS_N_INSNS (45),                  /*                          DI.  */
1184    COSTS_N_INSNS (45)},                 /*                          other.  */
1185   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1186   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1187   8,                                    /* "large" insn.  */
1188   9,                                    /* MOVE_RATIO.  */
1189
1190   /* All move costs are relative to integer->integer move times 2 and thus
1191      they are latency*2.  */
1192
1193   /* reg-reg moves are done by renaming and thus they are even cheaper than
1194      1 cycle.  Because reg-reg move cost is 2 and the following tables correspond
1195      to doubles of latencies, we do not model this correctly.  It does not
1196      seem to make practical difference to bump prices up even more.  */
1197   6,                                    /* cost for loading QImode using
1198                                            movzbl.  */
1199   {6, 6, 6},                            /* cost of loading integer registers
1200                                            in QImode, HImode and SImode.
1201                                            Relative to reg-reg move (2).  */
1202   {8, 8, 8},                            /* cost of storing integer
1203                                            registers.  */
1204   2,                                    /* cost of reg,reg fld/fst.  */
1205   {6, 6, 16},                           /* cost of loading fp registers
1206                                            in SFmode, DFmode and XFmode.  */
1207   {8, 8, 16},                           /* cost of storing fp registers
1208                                            in SFmode, DFmode and XFmode.  */
1209   2,                                    /* cost of moving MMX register.  */
1210   {6, 6},                               /* cost of loading MMX registers
1211                                            in SImode and DImode.  */
1212   {8, 8},                               /* cost of storing MMX registers
1213                                            in SImode and DImode.  */
1214   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM register.  */
1215   {6, 6, 6, 12, 24},                    /* cost of loading SSE registers
1216                                            in 32,64,128,256 and 512-bit.  */
1217   {6, 6, 6, 12, 24},                    /* cost of unaligned loads.  */
1218   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
1219                                            in 32,64,128,256 and 512-bit.  */
1220   {8, 8, 8, 16, 32},                    /* cost of unaligned stores.  */
1221   6, 6,                                 /* SSE->integer and integer->SSE moves.  */
1222   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1223      throughput 12 (NOTE(review): same mnemonic twice, presumably two vector
1224      sizes -- confirm).  Approx 9 uops are size-independent; each load is 7 uops.  */
1225   18, 8,                                /* Gather load static, per_elt.  */
1226   18, 10,                               /* Gather store static, per_elt.  */
1227   32,                                   /* size of l1 cache.  */
1228   512,                                  /* size of l2 cache.  */
1229   64,                                   /* size of prefetch block.  */
1230   /* New AMD processors never drop prefetches; if they cannot be performed
1231      immediately, they are queued.  We set the number of simultaneous
1232      prefetches to a large constant to reflect this (leaving the number of
1233      prefetches completely unlimited is probably not a good idea, as their
1234      execution also takes some time).  */
1235   100,                                  /* number of parallel prefetches.  */
1236   3,                                    /* Branch cost.  */
1237   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1238   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1239   /* Latency of fdiv is 8-15.  */
1240   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1241   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1242   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1243   /* Latency of fsqrt is 4-10.  */
1244   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1245
1246   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1247   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1248   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1249   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1250   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1251   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1252   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1253   /* 9-13 (presumably the DIVSD latency range; the upper bound is used -- TODO confirm).  */
1254   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1255   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1256   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1257   /* Zen can execute 4 integer operations per cycle.  FP operations take 3
1258      cycles and it can execute 2 integer additions and 2 multiplications, thus
1259      reassociation may make sense up to a width of 6.  SPEC2k6 benchmarks
1260      suggest that 4 works better than 6, probably due to register pressure.
1261
1262      Integer vector operations are taken by FP unit and execute 3 vector
1263      plus/minus operations per cycle but only one multiply.  This is adjusted
1264      in ix86_reassociation_width.  */
1265   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1266   znver1_memcpy,
1267   znver1_memset,
1268   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1269   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1270   "16",                                 /* Loop alignment.  */
1271   "16",                                 /* Jump alignment.  */
1272   "0:0:8",                              /* Label alignment.  */
1273   "16",                                 /* Func alignment.  */
1274 };
1275
1276 /*  ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
1277     very small blocks it is better to use a loop.  For large blocks, a libcall
1278     can use non-temporal accesses and beat inline code considerably.  */
1279 static stringop_algs znver2_memcpy[2] = {
1280   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1281              {-1, rep_prefix_4_byte, false}}},
1282   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1283              {-1, libcall, false}}}};
1284 static stringop_algs znver2_memset[2] = {  /* ZNVER2 memset algorithms.  */
1285   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1286              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1287   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1288              {-1, libcall, false}}}};  /* -1 presumably means "no size limit" -- TODO confirm.  */
1289
1290 struct processor_costs znver2_cost = {  /* ZNVER2 specific costs */
1291   COSTS_N_INSNS (1),                    /* cost of an add instruction.  */
1292   COSTS_N_INSNS (1),                    /* cost of a lea instruction.  */
1293   COSTS_N_INSNS (1),                    /* variable shift costs.  */
1294   COSTS_N_INSNS (1),                    /* constant shift costs.  */
1295   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI.  */
1296    COSTS_N_INSNS (3),                   /*                               HI.  */
1297    COSTS_N_INSNS (3),                   /*                               SI.  */
1298    COSTS_N_INSNS (3),                   /*                               DI.  */
1299    COSTS_N_INSNS (3)},                  /*                            other.  */
1300   0,                                    /* cost of multiply per each bit
1301                                            set.  */
1302    /* Depending on parameters, idiv can get faster on ryzen.  This is the upper
1303       bound.  */
1304   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI.  */
1305    COSTS_N_INSNS (22),                  /*                          HI.  */
1306    COSTS_N_INSNS (30),                  /*                          SI.  */
1307    COSTS_N_INSNS (45),                  /*                          DI.  */
1308    COSTS_N_INSNS (45)},                 /*                          other.  */
1309   COSTS_N_INSNS (1),                    /* cost of movsx.  */
1310   COSTS_N_INSNS (1),                    /* cost of movzx.  */
1311   8,                                    /* "large" insn.  */
1312   9,                                    /* MOVE_RATIO.  */
1313
1314   /* All move costs are relative to integer->integer move times 2 and thus
1315      they are latency*2.  */
1316
1317   /* reg-reg moves are done by renaming and thus they are even cheaper than
1318      1 cycle.  Because reg-reg move cost is 2 and the following tables
1319      correspond to doubles of latencies, we do not model this correctly.  It
1320      does not seem to make practical difference to bump prices up even more.  */
1321   6,                                    /* cost for loading QImode using
1322                                            movzbl.  */
1323   {6, 6, 6},                            /* cost of loading integer registers
1324                                            in QImode, HImode and SImode.
1325                                            Relative to reg-reg move (2).  */
1326   {8, 8, 8},                            /* cost of storing integer
1327                                            registers.  */
1328   2,                                    /* cost of reg,reg fld/fst.  */
1329   {6, 6, 16},                           /* cost of loading fp registers
1330                                            in SFmode, DFmode and XFmode.  */
1331   {8, 8, 16},                           /* cost of storing fp registers
1332                                            in SFmode, DFmode and XFmode.  */
1333   2,                                    /* cost of moving MMX register.  */
1334   {6, 6},                               /* cost of loading MMX registers
1335                                            in SImode and DImode.  */
1336   {8, 8},                               /* cost of storing MMX registers
1337                                            in SImode and DImode.  */
1338   2, 3, 6,                              /* cost of moving XMM,YMM,ZMM
1339                                            register.  */
1340   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
1341                                            in 32,64,128,256 and 512-bit.  */
1342   {6, 6, 6, 10, 20},                    /* cost of unaligned loads.  */
1343   {8, 8, 8, 8, 16},                     /* cost of storing SSE registers
1344                                            in 32,64,128,256 and 512-bit.  */
1345   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1346   6, 6,                                 /* SSE->integer and integer->SSE
1347                                            moves.  */
1348   /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1349      throughput 12 (NOTE(review): same mnemonic twice, presumably two vector
1350      sizes -- confirm).  Approx 9 uops are size-independent; each load is 7 uops.  */
1351   18, 8,                                /* Gather load static, per_elt.  */
1352   18, 10,                               /* Gather store static, per_elt.  */
1353   32,                                   /* size of l1 cache.  */
1354   512,                                  /* size of l2 cache.  */
1355   64,                                   /* size of prefetch block.  */
1356   /* New AMD processors never drop prefetches; if they cannot be performed
1357      immediately, they are queued.  We set the number of simultaneous
1358      prefetches to a large constant to reflect this (leaving the number of
1359      prefetches completely unlimited is probably not a good idea, as their
1360      execution also takes some time).  */
1361   100,                                  /* number of parallel prefetches.  */
1362   3,                                    /* Branch cost.  */
1363   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1364   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
1365   /* Latency of fdiv is 8-15.  */
1366   COSTS_N_INSNS (15),                   /* cost of FDIV instruction.  */
1367   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1368   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1369   /* Latency of fsqrt is 4-10.  */
1370   COSTS_N_INSNS (10),                   /* cost of FSQRT instruction.  */
1371
1372   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1373   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1374   COSTS_N_INSNS (3),                    /* cost of MULSS instruction.  */
1375   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1376   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1377   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1378   COSTS_N_INSNS (10),                   /* cost of DIVSS instruction.  */
1379   /* 9-13 (presumably the DIVSD latency range; the upper bound is used -- TODO confirm).  */
1380   COSTS_N_INSNS (13),                   /* cost of DIVSD instruction.  */
1381   COSTS_N_INSNS (10),                   /* cost of SQRTSS instruction.  */
1382   COSTS_N_INSNS (15),                   /* cost of SQRTSD instruction.  */
1383   /* Zen can execute 4 integer operations per cycle.  FP operations
1384      take 3 cycles and it can execute 2 integer additions and 2
1385      multiplications, thus reassociation may make sense up to a width of
1386      6.  SPEC2k6 benchmarks suggest that 4 works better than 6, probably
1387      due to register pressure.
1388
1389      Integer vector operations are taken by FP unit and execute 3 vector
1390      plus/minus operations per cycle but only one multiply.  This is adjusted
1391      in ix86_reassociation_width.  */
1392   4, 4, 3, 6,                           /* reassoc int, fp, vec_int, vec_fp.  */
1393   znver2_memcpy,
1394   znver2_memset,
1395   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
1396   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
1397   "16",                                 /* Loop alignment.  */
1398   "16",                                 /* Jump alignment.  */
1399   "0:0:8",                              /* Label alignment.  */
1400   "16",                                 /* Func alignment.  */
1401 };
1402
1403 /* skylake_cost should produce code tuned for Skylake family of CPUs.  */
/* memcpy expansion strategy table referenced by skylake_cost.  It holds two
   stringop_algs entries, each a leading algorithm followed by a list of
   triples terminated by a -1 entry.
   NOTE(review): the triples look like {max block size, algorithm, noalign
   flag} and the two entries look like the 32-bit and 64-bit variants --
   confirm against the stringop_algs declaration, which is not in view.  */
1404 static stringop_algs skylake_memcpy[2] =   {
1405   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1406   {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1407              {-1, libcall, false}}}};
1408
/* memset expansion strategy table referenced by skylake_cost.  Same layout
   as the memcpy tables in this file: two stringop_algs entries, each a
   leading algorithm plus a list of triples terminated by a -1 entry.
   NOTE(review): the first member of each triple is presumably an upper
   bound on the block size -- confirm against the stringop_algs
   declaration, which is not in view.  */
1409 static stringop_algs skylake_memset[2] = {
1410   {libcall, {{6, loop_1_byte, true},
1411              {24, loop, true},
1412              {8192, rep_prefix_4_byte, true},
1413              {-1, libcall, false}}},
1414   {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1415              {-1, libcall, false}}}};
1416
/* Operation cost table for Skylake tuning.  The initializers are positional,
   so their order has to match struct processor_costs (declared outside this
   chunk); the trailing comment on each entry names the field it fills.
   COSTS_N_INSNS values are relative to an add, per the note at the top of
   the file.  */
1417 static const
1418 struct processor_costs skylake_cost = {
1419   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1420   COSTS_N_INSNS (1)+1,          /* cost of a lea instruction (add cost plus 1) */
1421   COSTS_N_INSNS (1),                    /* variable shift costs */
1422   COSTS_N_INSNS (1),                    /* constant shift costs */
1423   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1424    COSTS_N_INSNS (4),                   /*                               HI */
1425    COSTS_N_INSNS (3),                   /*                               SI */
1426    COSTS_N_INSNS (3),                   /*                               DI */
1427    COSTS_N_INSNS (3)},                  /*                            other */
1428   0,                                    /* cost of multiply per each bit set */
1429   /* Expanding div/mod currently doesn't consider parallelism. So the cost
1430      model is not realistic. We compensate by increasing the latencies a bit.  */
1431   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
1432    COSTS_N_INSNS (11),                  /*                          HI */
1433    COSTS_N_INSNS (14),                  /*                          SI */
1434    COSTS_N_INSNS (76),                  /*                          DI */
1435    COSTS_N_INSNS (76)},                 /*                          other */
1436   COSTS_N_INSNS (1),                    /* cost of movsx */
1437   COSTS_N_INSNS (0),                    /* cost of movzx (costed as free) */
1438   8,                                    /* "large" insn */
1439   17,                                   /* MOVE_RATIO */
1440
1441   6,                                 /* cost for loading QImode using movzbl */
1442   {4, 4, 4},                            /* cost of loading integer registers
1443                                            in QImode, HImode and SImode.
1444                                            Relative to reg-reg move (2).  */
1445   {6, 6, 3},                            /* cost of storing integer registers */
1446   2,                                    /* cost of reg,reg fld/fst */
1447   {6, 6, 8},                            /* cost of loading fp registers
1448                                            in SFmode, DFmode and XFmode */
1449   {6, 6, 10},                           /* cost of storing fp registers
1450                                            in SFmode, DFmode and XFmode */
1451   2,                                    /* cost of moving MMX register */
1452   {6, 6},                               /* cost of loading MMX registers
1453                                            in SImode and DImode */
1454   {6, 6},                               /* cost of storing MMX registers
1455                                            in SImode and DImode */
1456   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
1457   {6, 6, 6, 10, 20},                    /* cost of loading SSE registers
1458                                            in 32,64,128,256 and 512-bit */
1459   {6, 6, 6, 10, 20},                    /* cost of unaligned loads.  */
1460   {8, 8, 8, 12, 24},                    /* cost of storing SSE registers
1461                                            in 32,64,128,256 and 512-bit */
1462   {8, 8, 8, 8, 16},                     /* cost of unaligned stores.  */
1463   2, 2,                                 /* SSE->integer and integer->SSE moves */
1464   20, 8,                                /* Gather load static, per_elt.  */
1465   22, 10,                               /* Gather store static, per_elt.  */
1466   64,                                   /* size of l1 cache.  */
1467   512,                                  /* size of l2 cache.  */
1468   64,                                   /* size of prefetch block */
1469   6,                                    /* number of parallel prefetches */
1470   3,                                    /* Branch cost */
1471   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
1472   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1473   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1474   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
1475   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
1476   COSTS_N_INSNS (20),                   /* cost of FSQRT instruction.  */
1477
1478   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1479   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1480   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1481   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1482   COSTS_N_INSNS (4),                    /* cost of FMA SS instruction.  */
1483   COSTS_N_INSNS (4),                    /* cost of FMA SD instruction.  */
1484   COSTS_N_INSNS (11),                   /* cost of DIVSS instruction.  */
1485   COSTS_N_INSNS (14),                   /* cost of DIVSD instruction.  */
1486   COSTS_N_INSNS (12),                   /* cost of SQRTSS instruction.  */
1487   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
1488   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
1489   skylake_memcpy,
1490   skylake_memset,
1491   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1492   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1493   "16:11:8",                            /* Loop alignment.  */
1494   "16:11:8",                            /* Jump alignment.  */
1495   "0:0:8",                              /* Label alignment.  */
1496   "16",                                 /* Func alignment.  */
1497 };
1498   /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1499      very small blocks it is better to use a loop.  For large blocks, libcall
1500      can do non-temporal accesses and beat inline code considerably.  */
/* memcpy expansion strategy table referenced by btver1_cost.  Both entries
   use `loop' (and, in the first entry, `unrolled_loop') for the smallest
   thresholds and a REP-prefixed move above them; only the second entry
   ends in a libcall.
   NOTE(review): triples are presumably {max block size, algorithm, noalign
   flag} with -1 meaning "no upper bound" -- confirm against the
   stringop_algs declaration, which is not in view.  */
1501 static stringop_algs btver1_memcpy[2] = {
1502   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1503              {-1, rep_prefix_4_byte, false}}},
1504   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1505              {-1, libcall, false}}}};
/* memset expansion strategy table referenced by btver1_cost.  Each of the
   two entries walks from loop-based expansion through a REP-prefixed store
   to a final libcall entry.
   NOTE(review): triples are presumably {max block size, algorithm, noalign
   flag} -- confirm against the stringop_algs declaration, which is not in
   view.  */
1506 static stringop_algs btver1_memset[2] = {
1507   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1508              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1509   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1510              {-1, libcall, false}}}};
/* Operation cost table for BTVER1 tuning.  The initializers are positional,
   so their order has to match struct processor_costs (declared outside this
   chunk); the trailing comment on each entry names the field it fills.
   NOTE(review): defined without `static', unlike skylake_cost,
   pentium4_cost and nocona_cost in this file -- confirm whether that is
   intentional.  */
1511 const struct processor_costs btver1_cost = {
1512   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1513   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1514   COSTS_N_INSNS (1),                    /* variable shift costs */
1515   COSTS_N_INSNS (1),                    /* constant shift costs */
1516   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1517    COSTS_N_INSNS (4),                   /*                               HI */
1518    COSTS_N_INSNS (3),                   /*                               SI */
1519    COSTS_N_INSNS (4),                   /*                               DI */
1520    COSTS_N_INSNS (5)},                  /*                            other */
1521   0,                                    /* cost of multiply per each bit set */
1522   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1523    COSTS_N_INSNS (35),                  /*                          HI */
1524    COSTS_N_INSNS (51),                  /*                          SI */
1525    COSTS_N_INSNS (83),                  /*                          DI */
1526    COSTS_N_INSNS (83)},                 /*                          other */
1527   COSTS_N_INSNS (1),                    /* cost of movsx */
1528   COSTS_N_INSNS (1),                    /* cost of movzx */
1529   8,                                    /* "large" insn */
1530   9,                                    /* MOVE_RATIO */
1531
1532   /* All move costs are relative to integer->integer move times 2 and thus
1533      they are latency*2. */
1534   8,                                 /* cost for loading QImode using movzbl */
1535   {6, 8, 6},                            /* cost of loading integer registers
1536                                            in QImode, HImode and SImode.
1537                                            Relative to reg-reg move (2).  */
1538   {6, 8, 6},                            /* cost of storing integer registers */
1539   4,                                    /* cost of reg,reg fld/fst */
1540   {12, 12, 28},                         /* cost of loading fp registers
1541                                            in SFmode, DFmode and XFmode */
1542   {12, 12, 38},                         /* cost of storing fp registers
1543                                            in SFmode, DFmode and XFmode */
1544   4,                                    /* cost of moving MMX register */
1545   {10, 10},                             /* cost of loading MMX registers
1546                                            in SImode and DImode */
1547   {12, 12},                             /* cost of storing MMX registers
1548                                            in SImode and DImode */
1549   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1550   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
1551                                            in 32,64,128,256 and 512-bit */
1552   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
1553   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
1554                                            in 32,64,128,256 and 512-bit */
1555   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
1556   14, 14,                               /* SSE->integer and integer->SSE moves */
1557   10, 10,                               /* Gather load static, per_elt.  */
1558   10, 10,                               /* Gather store static, per_elt.  */
1559   32,                                   /* size of l1 cache.  */
1560   512,                                  /* size of l2 cache.  */
1561   64,                                   /* size of prefetch block */
1562   100,                                  /* number of parallel prefetches */
1563   2,                                    /* Branch cost */
1564   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1565   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1566   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1567   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1568   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1569   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1570
1571   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1572   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1573   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1574   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1575   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1576   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1577   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1578   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
1579   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
1580   COSTS_N_INSNS (48),                   /* cost of SQRTSD instruction.  */
1581   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1582   btver1_memcpy,
1583   btver1_memset,
1584   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1585   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1586   "16:11:8",                            /* Loop alignment.  */
1587   "16:8:8",                             /* Jump alignment.  */
1588   "0:0:8",                              /* Label alignment.  */
1589   "11",                                 /* Func alignment.  */
1590 };
1591
/* memcpy expansion strategy table referenced by btver2_cost.  Two
   stringop_algs entries, each a leading algorithm plus a list of triples
   terminated by a -1 entry.
   NOTE(review): triples are presumably {max block size, algorithm, noalign
   flag} -- confirm against the stringop_algs declaration, which is not in
   view.  */
1592 static stringop_algs btver2_memcpy[2] = {
1593   {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1594              {-1, rep_prefix_4_byte, false}}},
1595   {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1596              {-1, libcall, false}}}};
/* memset expansion strategy table referenced by btver2_cost.  Two
   stringop_algs entries, each a leading algorithm plus a list of triples
   terminated by a -1 libcall entry.
   NOTE(review): triples are presumably {max block size, algorithm, noalign
   flag} -- confirm against the stringop_algs declaration, which is not in
   view.  */
1597 static stringop_algs btver2_memset[2] = {
1598   {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1599              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1600   {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1601              {-1, libcall, false}}}};
/* Operation cost table for BTVER2 tuning.  The initializers are positional,
   so their order has to match struct processor_costs (declared outside this
   chunk); the trailing comment on each entry names the field it fills.
   NOTE(review): defined without `static', unlike skylake_cost,
   pentium4_cost and nocona_cost in this file -- confirm whether that is
   intentional.  */
1602 const struct processor_costs btver2_cost = {
1603   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1604   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
1605   COSTS_N_INSNS (1),                    /* variable shift costs */
1606   COSTS_N_INSNS (1),                    /* constant shift costs */
1607   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1608    COSTS_N_INSNS (4),                   /*                               HI */
1609    COSTS_N_INSNS (3),                   /*                               SI */
1610    COSTS_N_INSNS (4),                   /*                               DI */
1611    COSTS_N_INSNS (5)},                  /*                            other */
1612   0,                                    /* cost of multiply per each bit set */
1613   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
1614    COSTS_N_INSNS (35),                  /*                          HI */
1615    COSTS_N_INSNS (51),                  /*                          SI */
1616    COSTS_N_INSNS (83),                  /*                          DI */
1617    COSTS_N_INSNS (83)},                 /*                          other */
1618   COSTS_N_INSNS (1),                    /* cost of movsx */
1619   COSTS_N_INSNS (1),                    /* cost of movzx */
1620   8,                                    /* "large" insn */
1621   9,                                    /* MOVE_RATIO */
1622
1623   /* All move costs are relative to integer->integer move times 2 and thus
1624      they are latency*2. */
1625   8,                                 /* cost for loading QImode using movzbl */
1626   {8, 8, 6},                            /* cost of loading integer registers
1627                                            in QImode, HImode and SImode.
1628                                            Relative to reg-reg move (2).  */
1629   {8, 8, 6},                            /* cost of storing integer registers */
1630   4,                                    /* cost of reg,reg fld/fst */
1631   {12, 12, 28},                         /* cost of loading fp registers
1632                                            in SFmode, DFmode and XFmode */
1633   {12, 12, 38},                         /* cost of storing fp registers
1634                                            in SFmode, DFmode and XFmode */
1635   4,                                    /* cost of moving MMX register */
1636   {10, 10},                             /* cost of loading MMX registers
1637                                            in SImode and DImode */
1638   {12, 12},                             /* cost of storing MMX registers
1639                                            in SImode and DImode */
1640   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1641   {10, 10, 12, 48, 96},                 /* cost of loading SSE registers
1642                                            in 32,64,128,256 and 512-bit */
1643   {10, 10, 12, 48, 96},                 /* cost of unaligned loads.  */
1644   {10, 10, 12, 48, 96},                 /* cost of storing SSE registers
1645                                            in 32,64,128,256 and 512-bit */
1646   {10, 10, 12, 48, 96},                 /* cost of unaligned stores.  */
1647   14, 14,                               /* SSE->integer and integer->SSE moves */
1648   10, 10,                               /* Gather load static, per_elt.  */
1649   10, 10,                               /* Gather store static, per_elt.  */
1650   32,                                   /* size of l1 cache.  */
1651   2048,                                 /* size of l2 cache.  */
1652   64,                                   /* size of prefetch block */
1653   100,                                  /* number of parallel prefetches */
1654   2,                                    /* Branch cost */
1655   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
1656   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
1657   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
1658   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1659   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1660   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
1661
1662   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1663   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1664   COSTS_N_INSNS (2),                    /* cost of MULSS instruction.  */
1665   COSTS_N_INSNS (4),                    /* cost of MULSD instruction.  */
1666   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
1667   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
1668   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
1669   COSTS_N_INSNS (19),                   /* cost of DIVSD instruction.  */
1670   COSTS_N_INSNS (16),                   /* cost of SQRTSS instruction.  */
1671   COSTS_N_INSNS (21),                   /* cost of SQRTSD instruction.  */
1672   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1673   btver2_memcpy,
1674   btver2_memset,
1675   COSTS_N_INSNS (2),                    /* cond_taken_branch_cost.  */
1676   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1677   "16:11:8",                            /* Loop alignment.  */
1678   "16:8:8",                             /* Jump alignment.  */
1679   "0:0:8",                              /* Label alignment.  */
1680   "11",                                 /* Func alignment.  */
1681 };
1682
/* memcpy expansion strategy table referenced by pentium4_cost.  Only the
   first entry is tuned; the second is the DUMMY_STRINGOP_ALGS placeholder
   defined at the top of the file (libcall only).
   NOTE(review): triples are presumably {max block size, algorithm, noalign
   flag} -- confirm against the stringop_algs declaration, which is not in
   view.  */
1683 static stringop_algs pentium4_memcpy[2] = {
1684   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1685   DUMMY_STRINGOP_ALGS};
/* memset expansion strategy table referenced by pentium4_cost.  Only the
   first entry is tuned; the second is the DUMMY_STRINGOP_ALGS placeholder
   defined at the top of the file (libcall only).
   NOTE(review): triples are presumably {max block size, algorithm, noalign
   flag} -- confirm against the stringop_algs declaration, which is not in
   view.  */
1686 static stringop_algs pentium4_memset[2] = {
1687   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1688              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1689   DUMMY_STRINGOP_ALGS};
1690
/* Operation cost table named for the Pentium 4.  The initializers are
   positional, so their order has to match struct processor_costs (declared
   outside this chunk); the trailing comment on each entry names the field
   it fills.  Multiply and divide costs are the same for every mode here,
   and all four alignment strings are NULL.  */
1691 static const
1692 struct processor_costs pentium4_cost = {
1693   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1694   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
1695   COSTS_N_INSNS (4),                    /* variable shift costs */
1696   COSTS_N_INSNS (4),                    /* constant shift costs */
1697   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
1698    COSTS_N_INSNS (15),                  /*                               HI */
1699    COSTS_N_INSNS (15),                  /*                               SI */
1700    COSTS_N_INSNS (15),                  /*                               DI */
1701    COSTS_N_INSNS (15)},                 /*                            other */
1702   0,                                    /* cost of multiply per each bit set */
1703   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
1704    COSTS_N_INSNS (56),                  /*                          HI */
1705    COSTS_N_INSNS (56),                  /*                          SI */
1706    COSTS_N_INSNS (56),                  /*                          DI */
1707    COSTS_N_INSNS (56)},                 /*                          other */
1708   COSTS_N_INSNS (1),                    /* cost of movsx */
1709   COSTS_N_INSNS (1),                    /* cost of movzx */
1710   16,                                   /* "large" insn */
1711   6,                                    /* MOVE_RATIO */
1712
1713   /* All move costs are relative to integer->integer move times 2 and thus
1714      they are latency*2. */
1715   5,                                 /* cost for loading QImode using movzbl */
1716   {4, 5, 4},                            /* cost of loading integer registers
1717                                            in QImode, HImode and SImode.
1718                                            Relative to reg-reg move (2).  */
1719   {2, 3, 2},                            /* cost of storing integer registers */
1720   12,                                   /* cost of reg,reg fld/fst */
1721   {14, 14, 14},                         /* cost of loading fp registers
1722                                            in SFmode, DFmode and XFmode */
1723   {14, 14, 14},                         /* cost of storing fp registers
1724                                            in SFmode, DFmode and XFmode */
1725   12,                                   /* cost of moving MMX register */
1726   {16, 16},                             /* cost of loading MMX registers
1727                                            in SImode and DImode */
1728   {16, 16},                             /* cost of storing MMX registers
1729                                            in SImode and DImode */
1730   12, 24, 48,                           /* cost of moving XMM,YMM,ZMM register */
1731   {16, 16, 16, 32, 64},                 /* cost of loading SSE registers
1732                                            in 32,64,128,256 and 512-bit */
1733   {32, 32, 32, 64, 128},                /* cost of unaligned loads.  */
1734   {16, 16, 16, 32, 64},                 /* cost of storing SSE registers
1735                                            in 32,64,128,256 and 512-bit */
1736   {32, 32, 32, 64, 128},                /* cost of unaligned stores.  */
1737   20, 12,                               /* SSE->integer and integer->SSE moves */
1738   16, 16,                               /* Gather load static, per_elt.  */
1739   16, 16,                               /* Gather store static, per_elt.  */
1740   8,                                    /* size of l1 cache.  */
1741   256,                                  /* size of l2 cache.  */
1742   64,                                   /* size of prefetch block */
1743   6,                                    /* number of parallel prefetches */
1744   2,                                    /* Branch cost */
1745   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
1746   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
1747   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
1748   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
1749   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
1750   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
1751
1752   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1753   COSTS_N_INSNS (4),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1754   COSTS_N_INSNS (6),                    /* cost of MULSS instruction.  */
1755   COSTS_N_INSNS (6),                    /* cost of MULSD instruction.  */
1756   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1757   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1758   COSTS_N_INSNS (23),                   /* cost of DIVSS instruction.  */
1759   COSTS_N_INSNS (38),                   /* cost of DIVSD instruction.  */
1760   COSTS_N_INSNS (23),                   /* cost of SQRTSS instruction.  */
1761   COSTS_N_INSNS (38),                   /* cost of SQRTSD instruction.  */
1762   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1763   pentium4_memcpy,
1764   pentium4_memset,
1765   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1766   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1767   NULL,                                 /* Loop alignment.  */
1768   NULL,                                 /* Jump alignment.  */
1769   NULL,                                 /* Label alignment.  */
1770   NULL,                                 /* Func alignment.  */
1771 };
1772
/* memcpy expansion strategy table referenced by nocona_cost.  Two
   stringop_algs entries, each a leading algorithm plus a list of triples
   terminated by a -1 entry.
   NOTE(review): triples are presumably {max block size, algorithm, noalign
   flag} -- confirm against the stringop_algs declaration, which is not in
   view.  */
1773 static stringop_algs nocona_memcpy[2] = {
1774   {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1775   {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1776              {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1777
/* memset expansion strategy table referenced by nocona_cost.  Two
   stringop_algs entries, each a leading algorithm plus a list of triples
   terminated by a -1 libcall entry.
   NOTE(review): triples are presumably {max block size, algorithm, noalign
   flag} -- confirm against the stringop_algs declaration, which is not in
   view.  */
1778 static stringop_algs nocona_memset[2] = {
1779   {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1780              {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1781   {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1782              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1783
/* Operation cost table named for Nocona.  The initializers are positional,
   so their order has to match struct processor_costs (declared outside this
   chunk); the trailing comment on each entry names the field it fills.
   Multiply and divide costs are the same for every mode here, and all four
   alignment strings are NULL.  */
1784 static const
1785 struct processor_costs nocona_cost = {
1786   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1787   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
1788   COSTS_N_INSNS (1),                    /* variable shift costs */
1789   COSTS_N_INSNS (1),                    /* constant shift costs */
1790   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
1791    COSTS_N_INSNS (10),                  /*                               HI */
1792    COSTS_N_INSNS (10),                  /*                               SI */
1793    COSTS_N_INSNS (10),                  /*                               DI */
1794    COSTS_N_INSNS (10)},                 /*                            other */
1795   0,                                    /* cost of multiply per each bit set */
1796   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
1797    COSTS_N_INSNS (66),                  /*                          HI */
1798    COSTS_N_INSNS (66),                  /*                          SI */
1799    COSTS_N_INSNS (66),                  /*                          DI */
1800    COSTS_N_INSNS (66)},                 /*                          other */
1801   COSTS_N_INSNS (1),                    /* cost of movsx */
1802   COSTS_N_INSNS (1),                    /* cost of movzx */
1803   16,                                   /* "large" insn */
1804   17,                                   /* MOVE_RATIO */
1805
1806   /* All move costs are relative to integer->integer move times 2 and thus
1807      they are latency*2. */
1808   4,                                 /* cost for loading QImode using movzbl */
1809   {4, 4, 4},                            /* cost of loading integer registers
1810                                            in QImode, HImode and SImode.
1811                                            Relative to reg-reg move (2).  */
1812   {4, 4, 4},                            /* cost of storing integer registers */
1813   12,                                   /* cost of reg,reg fld/fst */
1814   {14, 14, 14},                         /* cost of loading fp registers
1815                                            in SFmode, DFmode and XFmode */
1816   {14, 14, 14},                         /* cost of storing fp registers
1817                                            in SFmode, DFmode and XFmode */
1818   14,                                   /* cost of moving MMX register */
1819   {12, 12},                             /* cost of loading MMX registers
1820                                            in SImode and DImode */
1821   {12, 12},                             /* cost of storing MMX registers
1822                                            in SImode and DImode */
1823   6, 12, 24,                            /* cost of moving XMM,YMM,ZMM register */
1824   {12, 12, 12, 24, 48},                 /* cost of loading SSE registers
1825                                            in 32,64,128,256 and 512-bit */
1826   {24, 24, 24, 48, 96},                 /* cost of unaligned loads.  */
1827   {12, 12, 12, 24, 48},                 /* cost of storing SSE registers
1828                                            in 32,64,128,256 and 512-bit */
1829   {24, 24, 24, 48, 96},                 /* cost of unaligned stores.  */
1830   20, 12,                               /* SSE->integer and integer->SSE moves */
1831   12, 12,                               /* Gather load static, per_elt.  */
1832   12, 12,                               /* Gather store static, per_elt.  */
1833   8,                                    /* size of l1 cache.  */
1834   1024,                                 /* size of l2 cache.  */
1835   64,                                   /* size of prefetch block */
1836   8,                                    /* number of parallel prefetches */
1837   1,                                    /* Branch cost */
1838   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
1839   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1840   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
1841   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
1842   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
1843   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
1844
1845   COSTS_N_INSNS (2),                    /* cost of cheap SSE instruction.  */
1846   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1847   COSTS_N_INSNS (7),                    /* cost of MULSS instruction.  */
1848   COSTS_N_INSNS (7),                    /* cost of MULSD instruction.  */
1849   COSTS_N_INSNS (7),                    /* cost of FMA SS instruction.  */
1850   COSTS_N_INSNS (7),                    /* cost of FMA SD instruction.  */
1851   COSTS_N_INSNS (32),                   /* cost of DIVSS instruction.  */
1852   COSTS_N_INSNS (40),                   /* cost of DIVSD instruction.  */
1853   COSTS_N_INSNS (32),                   /* cost of SQRTSS instruction.  */
1854   COSTS_N_INSNS (41),                   /* cost of SQRTSD instruction.  */
1855   1, 1, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
1856   nocona_memcpy,
1857   nocona_memset,
1858   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1859   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1860   NULL,                                 /* Loop alignment.  */
1861   NULL,                                 /* Jump alignment.  */
1862   NULL,                                 /* Label alignment.  */
1863   NULL,                                 /* Func alignment.  */
1864 };
1865
1866 static stringop_algs atom_memcpy[2] = {  /* Referenced by atom_cost (slot right after its reassoc values).  */
1867   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},  /* NOTE(review): presumably the 32-bit variant; triples look like {size limit, algorithm, flag} with -1 as catch-all -- confirm against the stringop_algs declaration.  */
1868   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1869              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
1870 static stringop_algs atom_memset[2] = {  /* Referenced by atom_cost (slot right after atom_memcpy).  */
1871   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1872              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* NOTE(review): presumably the 32-bit variant -- confirm against the stringop_algs declaration.  */
1873   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1874              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
1875 static const
1876 struct processor_costs atom_cost = {  /* atom cost table: insn costs are relative to an add, move costs are latency*2; every slot is labelled inline.  */
1877   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1878   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
1879   COSTS_N_INSNS (1),                    /* variable shift costs */
1880   COSTS_N_INSNS (1),                    /* constant shift costs */
1881   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1882    COSTS_N_INSNS (4),                   /*                               HI */
1883    COSTS_N_INSNS (3),                   /*                               SI */
1884    COSTS_N_INSNS (4),                   /*                               DI */
1885    COSTS_N_INSNS (2)},                  /*                            other */
1886   0,                                    /* cost of multiply per each bit set */
1887   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1888    COSTS_N_INSNS (26),                  /*                          HI */
1889    COSTS_N_INSNS (42),                  /*                          SI */
1890    COSTS_N_INSNS (74),                  /*                          DI */
1891    COSTS_N_INSNS (74)},                 /*                          other */
1892   COSTS_N_INSNS (1),                    /* cost of movsx */
1893   COSTS_N_INSNS (1),                    /* cost of movzx */
1894   8,                                    /* "large" insn */
1895   17,                                   /* MOVE_RATIO */
1896
1897   /* All move costs are relative to integer->integer move times 2 and thus
1898      they are latency*2. */
1899   6,                                    /* cost for loading QImode using movzbl */
1900   {6, 6, 6},                            /* cost of loading integer registers
1901                                            in QImode, HImode and SImode.
1902                                            Relative to reg-reg move (2).  */
1903   {6, 6, 6},                            /* cost of storing integer registers */
1904   4,                                    /* cost of reg,reg fld/fst */
1905   {6, 6, 18},                           /* cost of loading fp registers
1906                                            in SFmode, DFmode and XFmode */
1907   {14, 14, 24},                         /* cost of storing fp registers
1908                                            in SFmode, DFmode and XFmode */
1909   2,                                    /* cost of moving MMX register */
1910   {8, 8},                               /* cost of loading MMX registers
1911                                            in SImode and DImode */
1912   {10, 10},                             /* cost of storing MMX registers
1913                                            in SImode and DImode */
1914   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
1915   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
1916                                            in 32,64,128,256 and 512-bit */
1917   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
1918   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
1919                                            in 32,64,128,256 and 512-bit */
1920   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
1921   8, 6,                                 /* SSE->integer and integer->SSE moves */
1922   8, 8,                                 /* Gather load static, per_elt.  */
1923   8, 8,                                 /* Gather store static, per_elt.  */
1924   32,                                   /* size of l1 cache.  */
1925   256,                                  /* size of l2 cache.  */
1926   64,                                   /* size of prefetch block */
1927   6,                                    /* number of parallel prefetches */
1928   3,                                    /* Branch cost */
1929   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
1930   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
1931   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
1932   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
1933   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
1934   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
1935
1936   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
1937   COSTS_N_INSNS (5),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
1938   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
1939   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
1940   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
1941   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
1942   COSTS_N_INSNS (31),                   /* cost of DIVSS instruction.  */
1943   COSTS_N_INSNS (60),                   /* cost of DIVSD instruction.  */
1944   COSTS_N_INSNS (31),                   /* cost of SQRTSS instruction.  */
1945   COSTS_N_INSNS (63),                   /* cost of SQRTSD instruction.  */
1946   2, 2, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
1947   atom_memcpy,                          /* memcpy stringop_algs table.  */
1948   atom_memset,                          /* memset stringop_algs table.  */
1949   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
1950   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
1951   "16",                                 /* Loop alignment.  */
1952   "16:8:8",                             /* Jump alignment.  */
1953   "0:0:8",                              /* Label alignment.  */
1954   "16",                                 /* Func alignment.  */
1955 };
1956
1957 static stringop_algs slm_memcpy[2] = {  /* Referenced by slm_cost; values are identical to atom_memcpy.  */
1958   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},  /* NOTE(review): presumably the 32-bit variant -- confirm against the stringop_algs declaration.  */
1959   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1960              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
1961 static stringop_algs slm_memset[2] = {  /* Referenced by slm_cost; values are identical to atom_memset.  */
1962   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1963              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* NOTE(review): presumably the 32-bit variant -- confirm against the stringop_algs declaration.  */
1964   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1965              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
1966 static const
1967 struct processor_costs slm_cost = {  /* slm cost table: insn costs are relative to an add, move costs are latency*2; every slot is labelled inline.  */
1968   COSTS_N_INSNS (1),                    /* cost of an add instruction */
1969   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
1970   COSTS_N_INSNS (1),                    /* variable shift costs */
1971   COSTS_N_INSNS (1),                    /* constant shift costs */
1972   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
1973    COSTS_N_INSNS (3),                   /*                               HI */
1974    COSTS_N_INSNS (3),                   /*                               SI */
1975    COSTS_N_INSNS (4),                   /*                               DI */
1976    COSTS_N_INSNS (2)},                  /*                            other */
1977   0,                                    /* cost of multiply per each bit set */
1978   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
1979    COSTS_N_INSNS (26),                  /*                          HI */
1980    COSTS_N_INSNS (42),                  /*                          SI */
1981    COSTS_N_INSNS (74),                  /*                          DI */
1982    COSTS_N_INSNS (74)},                 /*                          other */
1983   COSTS_N_INSNS (1),                    /* cost of movsx */
1984   COSTS_N_INSNS (1),                    /* cost of movzx */
1985   8,                                    /* "large" insn */
1986   17,                                   /* MOVE_RATIO */
1987
1988   /* All move costs are relative to integer->integer move times 2 and thus
1989      they are latency*2. */
1990   8,                                    /* cost for loading QImode using movzbl */
1991   {8, 8, 8},                            /* cost of loading integer registers
1992                                            in QImode, HImode and SImode.
1993                                            Relative to reg-reg move (2).  */
1994   {6, 6, 6},                            /* cost of storing integer registers */
1995   2,                                    /* cost of reg,reg fld/fst */
1996   {8, 8, 18},                           /* cost of loading fp registers
1997                                            in SFmode, DFmode and XFmode */
1998   {6, 6, 18},                           /* cost of storing fp registers
1999                                            in SFmode, DFmode and XFmode */
2000   2,                                    /* cost of moving MMX register */
2001   {8, 8},                               /* cost of loading MMX registers
2002                                            in SImode and DImode */
2003   {6, 6},                               /* cost of storing MMX registers
2004                                            in SImode and DImode */
2005   2, 4, 8,                              /* cost of moving XMM,YMM,ZMM register */
2006   {8, 8, 8, 16, 32},                    /* cost of loading SSE registers
2007                                            in 32,64,128,256 and 512-bit */
2008   {16, 16, 16, 32, 64},                 /* cost of unaligned loads.  */
2009   {8, 8, 8, 16, 32},                    /* cost of storing SSE registers
2010                                            in 32,64,128,256 and 512-bit */
2011   {16, 16, 16, 32, 64},                 /* cost of unaligned stores.  */
2012   8, 6,                                 /* SSE->integer and integer->SSE moves */
2013   8, 8,                                 /* Gather load static, per_elt.  */
2014   8, 8,                                 /* Gather store static, per_elt.  */
2015   32,                                   /* size of l1 cache.  */
2016   256,                                  /* size of l2 cache.  */
2017   64,                                   /* size of prefetch block */
2018   6,                                    /* number of parallel prefetches */
2019   3,                                    /* Branch cost */
2020   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2021   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2022   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2023   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2024   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2025   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2026
2027   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2028   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2029   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2030   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2031   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2032   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2033   COSTS_N_INSNS (39),                   /* cost of DIVSS instruction.  */
2034   COSTS_N_INSNS (69),                   /* cost of DIVSD instruction.  */
2035   COSTS_N_INSNS (20),                   /* cost of SQRTSS instruction.  */
2036   COSTS_N_INSNS (35),                   /* cost of SQRTSD instruction.  */
2037   1, 2, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2038   slm_memcpy,                           /* memcpy stringop_algs table.  */
2039   slm_memset,                           /* memset stringop_algs table.  */
2040   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2041   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2042   "16",                                 /* Loop alignment.  */
2043   "16:8:8",                             /* Jump alignment.  */
2044   "0:0:8",                              /* Label alignment.  */
2045   "16",                                 /* Func alignment.  */
2046 };
2047
2048 static stringop_algs intel_memcpy[2] = {  /* Referenced by intel_cost; values are identical to atom_memcpy.  */
2049   {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},  /* NOTE(review): presumably the 32-bit variant -- confirm against the stringop_algs declaration.  */
2050   {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2051              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
2052 static stringop_algs intel_memset[2] = {  /* Referenced by intel_cost; values are identical to atom_memset.  */
2053   {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2054              {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},  /* NOTE(review): presumably the 32-bit variant -- confirm against the stringop_algs declaration.  */
2055   {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2056              {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
2057 static const
2058 struct processor_costs intel_cost = {  /* intel cost table: insn costs are relative to an add, move costs are latency*2; every slot is labelled inline.  */
2059   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2060   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2061   COSTS_N_INSNS (1),                    /* variable shift costs */
2062   COSTS_N_INSNS (1),                    /* constant shift costs */
2063   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2064    COSTS_N_INSNS (3),                   /*                               HI */
2065    COSTS_N_INSNS (3),                   /*                               SI */
2066    COSTS_N_INSNS (4),                   /*                               DI */
2067    COSTS_N_INSNS (2)},                  /*                            other */
2068   0,                                    /* cost of multiply per each bit set */
2069   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
2070    COSTS_N_INSNS (26),                  /*                          HI */
2071    COSTS_N_INSNS (42),                  /*                          SI */
2072    COSTS_N_INSNS (74),                  /*                          DI */
2073    COSTS_N_INSNS (74)},                 /*                          other */
2074   COSTS_N_INSNS (1),                    /* cost of movsx */
2075   COSTS_N_INSNS (1),                    /* cost of movzx */
2076   8,                                    /* "large" insn */
2077   17,                                   /* MOVE_RATIO */
2078
2079   /* All move costs are relative to integer->integer move times 2 and thus
2080      they are latency*2. */
2081   6,                                 /* cost for loading QImode using movzbl */
2082   {4, 4, 4},                            /* cost of loading integer registers
2083                                            in QImode, HImode and SImode.
2084                                            Relative to reg-reg move (2).  */
2085   {6, 6, 6},                            /* cost of storing integer registers */
2086   2,                                    /* cost of reg,reg fld/fst */
2087   {6, 6, 8},                            /* cost of loading fp registers
2088                                            in SFmode, DFmode and XFmode */
2089   {6, 6, 10},                           /* cost of storing fp registers
2090                                            in SFmode, DFmode and XFmode */
2091   2,                                    /* cost of moving MMX register */
2092   {6, 6},                               /* cost of loading MMX registers
2093                                            in SImode and DImode */
2094   {6, 6},                               /* cost of storing MMX registers
2095                                            in SImode and DImode */
2096   2, 2, 2,                              /* cost of moving XMM,YMM,ZMM register */
2097   {6, 6, 6, 6, 6},                      /* cost of loading SSE registers
2098                                            in 32,64,128,256 and 512-bit */
2099   {10, 10, 10, 10, 10},                 /* cost of unaligned loads.  */
2100   {6, 6, 6, 6, 6},                      /* cost of storing SSE registers
2101                                            in 32,64,128,256 and 512-bit */
2102   {10, 10, 10, 10, 10},                 /* cost of unaligned stores.  */
2103   4, 4,                                 /* SSE->integer and integer->SSE moves */
2104   6, 6,                                 /* Gather load static, per_elt.  */
2105   6, 6,                                 /* Gather store static, per_elt.  */
2106   32,                                   /* size of l1 cache.  */
2107   256,                                  /* size of l2 cache.  */
2108   64,                                   /* size of prefetch block */
2109   6,                                    /* number of parallel prefetches */
2110   3,                                    /* Branch cost */
2111   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
2112   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
2113   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
2114   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
2115   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
2116   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
2117
2118   COSTS_N_INSNS (8),                    /* cost of cheap SSE instruction.  */
2119   COSTS_N_INSNS (8),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2120   COSTS_N_INSNS (8),                    /* cost of MULSS instruction.  */
2121   COSTS_N_INSNS (8),                    /* cost of MULSD instruction.  */
2122   COSTS_N_INSNS (6),                    /* cost of FMA SS instruction.  */
2123   COSTS_N_INSNS (6),                    /* cost of FMA SD instruction.  */
2124   COSTS_N_INSNS (20),                   /* cost of DIVSS instruction.  */
2125   COSTS_N_INSNS (20),                   /* cost of DIVSD instruction.  */
2126   COSTS_N_INSNS (40),                   /* cost of SQRTSS instruction.  */
2127   COSTS_N_INSNS (40),                   /* cost of SQRTSD instruction.  */
2128   1, 4, 1, 1,                           /* reassoc int, fp, vec_int, vec_fp.  */
2129   intel_memcpy,                         /* memcpy stringop_algs table.  */
2130   intel_memset,                         /* memset stringop_algs table.  */
2131   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2132   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2133   "16",                                 /* Loop alignment.  */
2134   "16:8:8",                             /* Jump alignment.  */
2135   "0:0:8",                              /* Label alignment.  */
2136   "16",                                 /* Func alignment.  */
2137 };
2138
2139 /* Generic should produce code tuned for Core-i7 (and newer chips)
2140    and btver1 (and newer chips).  */
2141
2142 static stringop_algs generic_memcpy[2] = {  /* Referenced by generic_cost (slot right after its reassoc values).  */
2143   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2144              {-1, libcall, false}}},  /* NOTE(review): presumably the 32-bit variant -- confirm against the stringop_algs declaration.  */
2145   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2146              {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
2147 static stringop_algs generic_memset[2] = {  /* Referenced by generic_cost; values are identical to generic_memcpy.  */
2148   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2149              {-1, libcall, false}}},  /* NOTE(review): presumably the 32-bit variant -- confirm against the stringop_algs declaration.  */
2150   {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2151              {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
2152 static const
2153 struct processor_costs generic_cost = {  /* generic cost table: insn costs are relative to an add, move costs are latency*2; every slot is labelled inline.  */
2154   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2155   /* Setting cost to 2 makes our current implementation of synth_mult result in
2156      use of unnecessary temporary registers causing regression on several
2157      SPECfp benchmarks.  */
2158   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2159   COSTS_N_INSNS (1),                    /* variable shift costs */
2160   COSTS_N_INSNS (1),                    /* constant shift costs */
2161   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2162    COSTS_N_INSNS (4),                   /*                               HI */
2163    COSTS_N_INSNS (3),                   /*                               SI */
2164    COSTS_N_INSNS (4),                   /*                               DI */
2165    COSTS_N_INSNS (4)},                  /*                            other */
2166   0,                                    /* cost of multiply per each bit set */
2167   {COSTS_N_INSNS (16),                  /* cost of a divide/mod for QI */
2168    COSTS_N_INSNS (22),                  /*                          HI */
2169    COSTS_N_INSNS (30),                  /*                          SI */
2170    COSTS_N_INSNS (74),                  /*                          DI */
2171    COSTS_N_INSNS (74)},                 /*                          other */
2172   COSTS_N_INSNS (1),                    /* cost of movsx */
2173   COSTS_N_INSNS (1),                    /* cost of movzx */
2174   8,                                    /* "large" insn */
2175   17,                                   /* MOVE_RATIO */
2176
2177   /* All move costs are relative to integer->integer move times 2 and thus
2178      they are latency*2. */
2179   6,                                 /* cost for loading QImode using movzbl */
2180   {6, 6, 6},                            /* cost of loading integer registers
2181                                            in QImode, HImode and SImode.
2182                                            Relative to reg-reg move (2).  */
2183   {6, 6, 6},                            /* cost of storing integer registers */
2184   4,                                    /* cost of reg,reg fld/fst */
2185   {6, 6, 12},                           /* cost of loading fp registers
2186                                            in SFmode, DFmode and XFmode */
2187   {6, 6, 12},                           /* cost of storing fp registers
2188                                            in SFmode, DFmode and XFmode */
2189   2,                                    /* cost of moving MMX register */
2190   {6, 6},                               /* cost of loading MMX registers
2191                                            in SImode and DImode */
2192   {6, 6},                               /* cost of storing MMX registers
2193                                            in SImode and DImode */
2194   2, 3, 4,                              /* cost of moving XMM,YMM,ZMM register */
2195   {6, 6, 6, 10, 15},                    /* cost of loading SSE registers
2196                                            in 32,64,128,256 and 512-bit */
2197   {6, 6, 6, 10, 15},                    /* cost of unaligned loads.  */
2198   {6, 6, 6, 10, 15},                    /* cost of storing SSE registers
2199                                            in 32,64,128,256 and 512-bit */
2200   {6, 6, 6, 10, 15},                    /* cost of unaligned stores.  */
2201   6, 6,                                 /* SSE->integer and integer->SSE moves */
2202   18, 6,                                /* Gather load static, per_elt.  */
2203   18, 6,                                /* Gather store static, per_elt.  */
2204   32,                                   /* size of l1 cache.  */
2205   512,                                  /* size of l2 cache.  */
2206   64,                                   /* size of prefetch block */
2207   6,                                    /* number of parallel prefetches */
2208   /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2209      value is increased to perhaps more appropriate value of 5.  */
2210   3,                                    /* Branch cost */
2211   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2212   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2213   COSTS_N_INSNS (17),                   /* cost of FDIV instruction.  */
2214   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2215   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2216   COSTS_N_INSNS (14),                   /* cost of FSQRT instruction.  */
2217
2218   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2219   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2220   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2221   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2222   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2223   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2224   COSTS_N_INSNS (13),                   /* cost of DIVSS instruction.  */
2225   COSTS_N_INSNS (17),                   /* cost of DIVSD instruction.  */
2226   COSTS_N_INSNS (14),                   /* cost of SQRTSS instruction.  */
2227   COSTS_N_INSNS (18),                   /* cost of SQRTSD instruction.  */
2228   1, 4, 3, 3,                           /* reassoc int, fp, vec_int, vec_fp.  */
2229   generic_memcpy,                       /* memcpy stringop_algs table.  */
2230   generic_memset,                       /* memset stringop_algs table.  */
2231   COSTS_N_INSNS (4),                    /* cond_taken_branch_cost.  */
2232   COSTS_N_INSNS (2),                    /* cond_not_taken_branch_cost.  */
2233   "16:11:8",                            /* Loop alignment.  */
2234   "16:11:8",                            /* Jump alignment.  */
2235   "0:0:8",                              /* Label alignment.  */
2236   "16",                                 /* Func alignment.  */
2237 };
2238
2239 /* core_cost should produce code tuned for the Core family of CPUs.  */
2240 static stringop_algs core_memcpy[2] = {  /* Referenced by core_cost (slot right after its reassoc values).  */
2241   {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},  /* NOTE(review): presumably the 32-bit variant; the third member is true here, unlike the atom/slm/intel/generic tables -- confirm its meaning against the stringop_algs declaration.  */
2242   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2243              {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
2244 static stringop_algs core_memset[2] = {  /* Referenced by core_cost (slot right after core_memcpy).  */
2245   {libcall, {{6, loop_1_byte, true},
2246              {24, loop, true},
2247              {8192, rep_prefix_4_byte, true},
2248              {-1, libcall, false}}},  /* NOTE(review): presumably the 32-bit variant -- confirm against the stringop_algs declaration.  */
2249   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2250              {-1, libcall, false}}}};  /* NOTE(review): presumably the 64-bit variant -- confirm.  */
2251
2252 static const
2253 struct processor_costs core_cost = {  /* core cost table: insn costs are relative to an add, move costs are latency*2; every slot is labelled inline.  */
2254   COSTS_N_INSNS (1),                    /* cost of an add instruction */
2255   /* On all chips taken into consideration lea is 2 cycles and more.  With
2256      this cost however our current implementation of synth_mult results in
2257      use of unnecessary temporary registers causing regression on several
2258      SPECfp benchmarks.  */
2259   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
2260   COSTS_N_INSNS (1),                    /* variable shift costs */
2261   COSTS_N_INSNS (1),                    /* constant shift costs */
2262   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
2263    COSTS_N_INSNS (4),                   /*                               HI */
2264    COSTS_N_INSNS (3),                   /*                               SI */
2265    /* Here we tune for Sandybridge or newer.  */
2266    COSTS_N_INSNS (3),                   /*                               DI */
2267    COSTS_N_INSNS (3)},                  /*                            other */
2268   0,                                    /* cost of multiply per each bit set */
2269   /* Expanding div/mod currently doesn't consider parallelism. So the cost
2270      model is not realistic. We compensate by increasing the latencies a bit.  */
2271   {COSTS_N_INSNS (11),                  /* cost of a divide/mod for QI */
2272    COSTS_N_INSNS (11),                  /*                          HI */
2273    COSTS_N_INSNS (14),                  /*                          SI */
2274    COSTS_N_INSNS (81),                  /*                          DI */
2275    COSTS_N_INSNS (81)},                 /*                          other */
2276   COSTS_N_INSNS (1),                    /* cost of movsx */
2277   COSTS_N_INSNS (1),                    /* cost of movzx */
2278   8,                                    /* "large" insn */
2279   17,                                   /* MOVE_RATIO */
2280
2281   /* All move costs are relative to integer->integer move times 2 and thus
2282      they are latency*2. */
2283   6,                                 /* cost for loading QImode using movzbl */
2284   {4, 4, 4},                            /* cost of loading integer registers
2285                                            in QImode, HImode and SImode.
2286                                            Relative to reg-reg move (2).  */
2287   {6, 6, 6},                            /* cost of storing integer registers */
2288   2,                                    /* cost of reg,reg fld/fst */
2289   {6, 6, 8},                            /* cost of loading fp registers
2290                                            in SFmode, DFmode and XFmode */
2291   {6, 6, 10},                           /* cost of storing fp registers
2292                                            in SFmode, DFmode and XFmode */
2293   2,                                    /* cost of moving MMX register */
2294   {6, 6},                               /* cost of loading MMX registers
2295                                            in SImode and DImode */
2296   {6, 6},                               /* cost of storing MMX registers
2297                                            in SImode and DImode */
2298   2, 2, 4,                              /* cost of moving XMM,YMM,ZMM register */
2299   {6, 6, 6, 6, 12},                     /* cost of loading SSE registers
2300                                            in 32,64,128,256 and 512-bit */
2301   {6, 6, 6, 6, 12},                     /* cost of unaligned loads.  */
2302   {6, 6, 6, 6, 12},                     /* cost of storing SSE registers
2303                                            in 32,64,128,256 and 512-bit */
2304   {6, 6, 6, 6, 12},                     /* cost of unaligned stores.  */
2305   2, 2,                                 /* SSE->integer and integer->SSE moves */
2306   /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2307      rec. throughput 6.
2308      So 5 uops statically and one uop per load.  NOTE(review): VGATHERDPD is named twice above with different figures; one is presumably a different gather form -- confirm.  */
2309   10, 6,                                /* Gather load static, per_elt.  */
2310   10, 6,                                /* Gather store static, per_elt.  */
2311   64,                                   /* size of l1 cache.  */
2312   512,                                  /* size of l2 cache.  */
2313   64,                                   /* size of prefetch block */
2314   6,                                    /* number of parallel prefetches */
2315   /* FIXME perhaps more appropriate value is 5.  */
2316   3,                                    /* Branch cost */
2317   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
2318   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
2319   /* 10-24 */
2320   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
2321   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
2322   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
2323   COSTS_N_INSNS (23),                   /* cost of FSQRT instruction.  */
2324
2325   COSTS_N_INSNS (1),                    /* cost of cheap SSE instruction.  */
2326   COSTS_N_INSNS (3),                    /* cost of ADDSS/SD SUBSS/SD insns.  */
2327   COSTS_N_INSNS (4),                    /* cost of MULSS instruction.  */
2328   COSTS_N_INSNS (5),                    /* cost of MULSD instruction.  */
2329   COSTS_N_INSNS (5),                    /* cost of FMA SS instruction.  */
2330   COSTS_N_INSNS (5),                    /* cost of FMA SD instruction.  */
2331   COSTS_N_INSNS (18),                   /* cost of DIVSS instruction.  */
2332   COSTS_N_INSNS (32),                   /* cost of DIVSD instruction.  */
2333   COSTS_N_INSNS (30),                   /* cost of SQRTSS instruction.  */
2334   COSTS_N_INSNS (58),                   /* cost of SQRTSD instruction.  */
2335   1, 4, 2, 2,                           /* reassoc int, fp, vec_int, vec_fp.  */
2336   core_memcpy,                          /* memcpy stringop_algs table.  */
2337   core_memset,                          /* memset stringop_algs table.  */
2338   COSTS_N_INSNS (3),                    /* cond_taken_branch_cost.  */
2339   COSTS_N_INSNS (1),                    /* cond_not_taken_branch_cost.  */
2340   "16:11:8",                            /* Loop alignment.  */
2341   "16:11:8",                            /* Jump alignment.  */
2342   "0:0:8",                              /* Label alignment.  */
2343   "16",                                 /* Func alignment.  */
2344 };
2345