]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/i386/x86-tune-costs.h
Match: Support form 2 for vector signed integer .SAT_ADD
[thirdparty/gcc.git] / gcc / config / i386 / x86-tune-costs.h
CommitLineData
df41dbaf 1/* Costs of operations of individual x86 CPUs.
a945c346 2 Copyright (C) 1988-2024 Free Software Foundation, Inc.
64766e8d 3
df41dbaf
JH
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 3, or (at your option)
9any later version.
10
11GCC is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16Under Section 7 of GPL version 3, you are granted additional
17permissions described in the GCC Runtime Library Exception, version
183.1, as published by the Free Software Foundation.
19
20You should have received a copy of the GNU General Public License and
21a copy of the GCC Runtime Library Exception along with this program;
22see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23<http://www.gnu.org/licenses/>. */
64766e8d
JH
24/* Processor costs (relative to an add) */
25/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   COSTS_N_BYTES (N) puts a size of N bytes on that same scale: it expands
   to N * 2, so a 2-byte add costs 4, the same value as COSTS_N_INSNS (1)
   under the assumption above.  */
26#define COSTS_N_BYTES(N) ((N) * 2)
27
/* Placeholder stringop_algs initializer: libcall as the first member and a
   single {-1, libcall, false} entry.  The per-CPU memcpy/memset tables in
   this file use it for their second slot.  */
28#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29
/* memcpy strategy table referenced by ix86_size_cost.  Both slots name
   rep_prefix_1_byte and carry a single {-1, rep_prefix_1_byte, false}
   entry.  */
30static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy table referenced by ix86_size_cost.  Both slots name
   rep_prefix_1_byte and carry a single {-1, rep_prefix_1_byte, false}
   entry.  */
33static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36
/* Cost table used when tuning for size.  Most instruction costs after the
   register-allocator section are written with COSTS_N_BYTES instead of
   COSTS_N_INSNS, all four alignment strings are NULL, and the string
   operation tables are ix86_size_memcpy / ix86_size_memset.  */
37const
38struct processor_costs ix86_size_cost = {/* costs for tuning for size */
72bb85f8 39 {
d321551c
L
40 /* Start of register allocator costs. integer->integer move cost is 2. */
41 2, /* cost for loading QImode using movzbl */
42 {2, 2, 2}, /* cost of loading integer registers
43 in QImode, HImode and SImode.
44 Relative to reg-reg move (2). */
45 {2, 2, 2}, /* cost of storing integer registers */
46 2, /* cost of reg,reg fld/fst */
47 {2, 2, 2}, /* cost of loading fp registers
48 in SFmode, DFmode and XFmode */
49 {2, 2, 2}, /* cost of storing fp registers
50 in SFmode, DFmode and XFmode */
51 3, /* cost of moving MMX register */
52 {3, 3}, /* cost of loading MMX registers
53 in SImode and DImode */
54 {3, 3}, /* cost of storing MMX registers
55 in SImode and DImode */
56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
58 in 32,64,128,256 and 512-bit */
59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
60 in 32,64,128,256 and 512-bit */
ecc3135a 61 3, 3, /* SSE->integer and integer->SSE moves */
62 3, 3, /* mask->integer and integer->mask moves */
00cb3494
L
63 {2, 2, 2}, /* cost of loading mask register
64 in QImode, HImode, SImode. */
65 {2, 2, 2}, /* cost of storing mask register
66 in QImode, HImode, SImode. */
67 2, /* cost of moving mask register. */
d321551c 68 /* End of register allocator costs. */
72bb85f8 69 },
d321551c 70
64766e8d
JH
71 COSTS_N_BYTES (2), /* cost of an add instruction */
72 COSTS_N_BYTES (3), /* cost of a lea instruction */
73 COSTS_N_BYTES (2), /* variable shift costs */
74 COSTS_N_BYTES (3), /* constant shift costs */
75 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
76 COSTS_N_BYTES (3), /* HI */
77 COSTS_N_BYTES (3), /* SI */
78 COSTS_N_BYTES (3), /* DI */
79 COSTS_N_BYTES (5)}, /* other */
80 0, /* cost of multiply per each bit set */
81 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
82 COSTS_N_BYTES (3), /* HI */
83 COSTS_N_BYTES (3), /* SI */
84 COSTS_N_BYTES (3), /* DI */
85 COSTS_N_BYTES (5)}, /* other */
86 COSTS_N_BYTES (3), /* cost of movsx */
87 COSTS_N_BYTES (3), /* cost of movzx */
88 0, /* "large" insn */
89 2, /* MOVE_RATIO */
25e22b19 90 2, /* CLEAR_RATIO */
64766e8d
JH
91 {2, 2, 2}, /* cost of loading integer registers
92 in QImode, HImode and SImode.
93 Relative to reg-reg move (2). */
94 {2, 2, 2}, /* cost of storing integer registers */
d321551c
L
95 {3, 3, 3, 3, 3}, /* cost of loading SSE register
96 in 32bit, 64bit, 128bit, 256bit and 512bit */
97 {3, 3, 3, 3, 3}, /* cost of storing SSE register
98 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf
JH
99 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load; five entries,
100 presumably 32bit, 64bit, 128bit, 256bit and 512bit like the aligned
 table above (this comment used to list only three widths) - confirm */
d321551c 101 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store; five entries,
df41dbaf 102 presumably 32bit, 64bit, 128bit, 256bit and 512bit like the aligned
 table above (this comment used to list only three widths) - confirm */
d321551c
L
103 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
104 3, /* cost of moving SSE register to integer. */
a4fe6139
JH
105 5, 0, /* Gather load static, per_elt. */
106 5, 0, /* Gather store static, per_elt. */
64766e8d
JH
107 0, /* size of l1 cache */
108 0, /* size of l2 cache */
109 0, /* size of prefetch block */
110 0, /* number of parallel prefetches */
111 2, /* Branch cost */
112 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
113 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
114 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
115 COSTS_N_BYTES (2), /* cost of FABS instruction. */
116 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
117 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
6065f444 118
c53c148c 119 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
6065f444
JH
120 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
121 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
122 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
c53c148c
JH
123 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
124 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
6065f444
JH
125 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
126 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
127 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
128 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
64766e8d
JH
129 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
130 ix86_size_memcpy,
131 ix86_size_memset,
f6fd8f2b
JH
132 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
133 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
134 NULL, /* Loop alignment. */
135 NULL, /* Jump alignment. */
136 NULL, /* Label alignment. */
137 NULL, /* Func alignment. */
071e428c
HW
138 4, /* Small unroll limit. */
139 2, /* Small unroll factor. */
64766e8d
JH
140};
141
142/* Processor costs (relative to an add) */
/* memcpy strategy table referenced by i386_cost: rep_prefix_1_byte in the
   first slot, DUMMY_STRINGOP_ALGS (libcall) in the second.  */
143static stringop_algs i386_memcpy[2] = {
144 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
145 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i386_cost: rep_prefix_1_byte in the
   first slot, DUMMY_STRINGOP_ALGS (libcall) in the second.  */
146static stringop_algs i386_memset[2] = {
147 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
148 DUMMY_STRINGOP_ALGS};
149
/* Cost table for the 386.  Instruction costs are written with COSTS_N_INSNS;
   cache and prefetch parameters are all 0; string operations use
   i386_memcpy / i386_memset.  */
150static const
151struct processor_costs i386_cost = { /* 386 specific costs */
72bb85f8 152 {
d321551c
L
153 /* Start of register allocator costs. integer->integer move cost is 2. */
154 4, /* cost for loading QImode using movzbl */
155 {2, 4, 2}, /* cost of loading integer registers
156 in QImode, HImode and SImode.
157 Relative to reg-reg move (2). */
158 {2, 4, 2}, /* cost of storing integer registers */
159 2, /* cost of reg,reg fld/fst */
160 {8, 8, 8}, /* cost of loading fp registers
161 in SFmode, DFmode and XFmode */
162 {8, 8, 8}, /* cost of storing fp registers
163 in SFmode, DFmode and XFmode */
164 2, /* cost of moving MMX register */
165 {4, 8}, /* cost of loading MMX registers
166 in SImode and DImode */
167 {4, 8}, /* cost of storing MMX registers
168 in SImode and DImode */
169 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
170 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
171 in 32,64,128,256 and 512-bit */
172 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
173 in 32,64,128,256 and 512-bit */
ecc3135a 174 3, 3, /* SSE->integer and integer->SSE moves */
175 3, 3, /* mask->integer and integer->mask moves */
00cb3494
L
176 {2, 4, 2}, /* cost of loading mask register
177 in QImode, HImode, SImode. */
178 {2, 4, 2}, /* cost of storing mask register
179 in QImode, HImode, SImode. */
180 2, /* cost of moving mask register. */
d321551c 181 /* End of register allocator costs. */
72bb85f8 182 },
d321551c 183
64766e8d
JH
184 COSTS_N_INSNS (1), /* cost of an add instruction */
185 COSTS_N_INSNS (1), /* cost of a lea instruction */
186 COSTS_N_INSNS (3), /* variable shift costs */
187 COSTS_N_INSNS (2), /* constant shift costs */
188 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
189 COSTS_N_INSNS (6), /* HI */
190 COSTS_N_INSNS (6), /* SI */
191 COSTS_N_INSNS (6), /* DI */
192 COSTS_N_INSNS (6)}, /* other */
193 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
194 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
195 COSTS_N_INSNS (23), /* HI */
196 COSTS_N_INSNS (23), /* SI */
197 COSTS_N_INSNS (23), /* DI */
198 COSTS_N_INSNS (23)}, /* other */
199 COSTS_N_INSNS (3), /* cost of movsx */
200 COSTS_N_INSNS (2), /* cost of movzx */
201 15, /* "large" insn */
202 3, /* MOVE_RATIO */
25e22b19 203 3, /* CLEAR_RATIO */
64766e8d
JH
204 {2, 4, 2}, /* cost of loading integer registers
205 in QImode, HImode and SImode.
206 Relative to reg-reg move (2). */
207 {2, 4, 2}, /* cost of storing integer registers */
d321551c
L
208 {4, 8, 16, 32, 64}, /* cost of loading SSE register
209 in 32bit, 64bit, 128bit, 256bit and 512bit */
210 {4, 8, 16, 32, 64}, /* cost of storing SSE register
211 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 212 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
df41dbaf 213 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
d321551c
L
214 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
215 3, /* cost of moving SSE register to integer. */
a4fe6139
JH
216 4, 4, /* Gather load static, per_elt. */
217 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
218 0, /* size of l1 cache */
219 0, /* size of l2 cache */
220 0, /* size of prefetch block */
221 0, /* number of parallel prefetches */
222 1, /* Branch cost */
223 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
224 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
225 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
226 COSTS_N_INSNS (22), /* cost of FABS instruction. */
227 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
228 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
6065f444 229
c53c148c 230 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
231 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
232 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
233 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
c53c148c
JH
234 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
235 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
6065f444
JH
236 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
237 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
238 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
239 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
64766e8d
JH
240 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
241 i386_memcpy,
242 i386_memset,
f6fd8f2b
JH
243 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
244 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
245 "4", /* Loop alignment. */
246 "4", /* Jump alignment. */
247 NULL, /* Label alignment. */
248 "4", /* Func alignment. */
071e428c
HW
249 4, /* Small unroll limit. */
250 2, /* Small unroll factor. */
64766e8d
JH
251};
252
/* memcpy strategy table referenced by i486_cost: rep_prefix_4_byte in the
   first slot, DUMMY_STRINGOP_ALGS (libcall) in the second.  */
253static stringop_algs i486_memcpy[2] = {
254 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
255 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i486_cost: rep_prefix_4_byte in the
   first slot, DUMMY_STRINGOP_ALGS (libcall) in the second.  */
256static stringop_algs i486_memset[2] = {
257 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
258 DUMMY_STRINGOP_ALGS};
259
/* Cost table for the 486.  Instruction costs are written with COSTS_N_INSNS;
   string operations use i486_memcpy / i486_memset.  */
260static const
261struct processor_costs i486_cost = { /* 486 specific costs */
72bb85f8 262 {
d321551c
L
263 /* Start of register allocator costs. integer->integer move cost is 2. */
264 4, /* cost for loading QImode using movzbl */
265 {2, 4, 2}, /* cost of loading integer registers
266 in QImode, HImode and SImode.
267 Relative to reg-reg move (2). */
268 {2, 4, 2}, /* cost of storing integer registers */
269 2, /* cost of reg,reg fld/fst */
270 {8, 8, 8}, /* cost of loading fp registers
271 in SFmode, DFmode and XFmode */
272 {8, 8, 8}, /* cost of storing fp registers
273 in SFmode, DFmode and XFmode */
274 2, /* cost of moving MMX register */
275 {4, 8}, /* cost of loading MMX registers
276 in SImode and DImode */
277 {4, 8}, /* cost of storing MMX registers
278 in SImode and DImode */
279 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
280 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
281 in 32,64,128,256 and 512-bit */
282 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
283 in 32,64,128,256 and 512-bit */
ecc3135a 284 3, 3, /* SSE->integer and integer->SSE moves */
285 3, 3, /* mask->integer and integer->mask moves */
00cb3494
L
286 {2, 4, 2}, /* cost of loading mask register
287 in QImode, HImode, SImode. */
288 {2, 4, 2}, /* cost of storing mask register
289 in QImode, HImode, SImode. */
290 2, /* cost of moving mask register. */
d321551c 291 /* End of register allocator costs. */
72bb85f8 292 },
d321551c 293
64766e8d
JH
294 COSTS_N_INSNS (1), /* cost of an add instruction */
295 COSTS_N_INSNS (1), /* cost of a lea instruction */
296 COSTS_N_INSNS (3), /* variable shift costs */
297 COSTS_N_INSNS (2), /* constant shift costs */
298 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
299 COSTS_N_INSNS (12), /* HI */
300 COSTS_N_INSNS (12), /* SI */
301 COSTS_N_INSNS (12), /* DI */
302 COSTS_N_INSNS (12)}, /* other */
303 1, /* cost of multiply per each bit set.
 NOTE(review): a bare 1 here, whereas i386_cost writes this slot as
 COSTS_N_INSNS (1) - confirm the unit is intended. */
304 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
305 COSTS_N_INSNS (40), /* HI */
306 COSTS_N_INSNS (40), /* SI */
307 COSTS_N_INSNS (40), /* DI */
308 COSTS_N_INSNS (40)}, /* other */
309 COSTS_N_INSNS (3), /* cost of movsx */
310 COSTS_N_INSNS (2), /* cost of movzx */
311 15, /* "large" insn */
312 3, /* MOVE_RATIO */
25e22b19 313 3, /* CLEAR_RATIO */
64766e8d
JH
314 {2, 4, 2}, /* cost of loading integer registers
315 in QImode, HImode and SImode.
316 Relative to reg-reg move (2). */
317 {2, 4, 2}, /* cost of storing integer registers */
d321551c
L
318 {4, 8, 16, 32, 64}, /* cost of loading SSE register
319 in 32bit, 64bit, 128bit, 256bit and 512bit */
320 {4, 8, 16, 32, 64}, /* cost of storing SSE register
321 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 322 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
df41dbaf 323 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
d321551c
L
324 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
325 3, /* cost of moving SSE register to integer. */
a4fe6139
JH
326 4, 4, /* Gather load static, per_elt. */
327 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
328 4, /* size of l1 cache. 486 has 8kB cache
329 shared for code and data, so 4kB is
330 not really precise. */
331 4, /* size of l2 cache */
332 0, /* size of prefetch block */
333 0, /* number of parallel prefetches */
334 1, /* Branch cost */
335 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
336 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
337 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
338 COSTS_N_INSNS (3), /* cost of FABS instruction. */
339 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
340 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
6065f444 341
c53c148c 342 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
343 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
344 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
345 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
c53c148c
JH
346 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
347 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
6065f444
JH
348 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
349 COSTS_N_INSNS (74), /* cost of DIVSD instruction.
 NOTE(review): 74 here versus 73 for DIVSS and FDIV in this table -
 confirm this is not a typo. */
350 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
351 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
64766e8d
JH
352 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
353 i486_memcpy,
354 i486_memset,
f6fd8f2b
JH
355 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
356 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
357 "16", /* Loop alignment. */
358 "16", /* Jump alignment. */
359 "0:0:8", /* Label alignment. */
360 "16", /* Func alignment. */
071e428c
HW
361 4, /* Small unroll limit. */
362 2, /* Small unroll factor. */
64766e8d
JH
363};
364
/* memcpy strategy table referenced by pentium_cost and lakemont_cost.  First
   slot: libcall as the leading member, then {256, rep_prefix_4_byte} followed
   by {-1, libcall} (the leading numbers are presumably size upper bounds -
   confirm).  Second slot: DUMMY_STRINGOP_ALGS.  */
365static stringop_algs pentium_memcpy[2] = {
366 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
367 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium_cost and lakemont_cost.  First
   slot: libcall as the leading member with a single {-1, rep_prefix_4_byte}
   entry.  Second slot: DUMMY_STRINGOP_ALGS.  */
368static stringop_algs pentium_memset[2] = {
369 {libcall, {{-1, rep_prefix_4_byte, false}}},
370 DUMMY_STRINGOP_ALGS};
371
/* Cost table named for the Pentium.  Instruction costs are written with
   COSTS_N_INSNS; string operations use pentium_memcpy / pentium_memset.  */
372static const
373struct processor_costs pentium_cost = {
72bb85f8 374 {
d321551c
L
375 /* Start of register allocator costs. integer->integer move cost is 2. */
376 6, /* cost for loading QImode using movzbl */
377 {2, 4, 2}, /* cost of loading integer registers
378 in QImode, HImode and SImode.
379 Relative to reg-reg move (2). */
380 {2, 4, 2}, /* cost of storing integer registers */
381 2, /* cost of reg,reg fld/fst */
382 {2, 2, 6}, /* cost of loading fp registers
383 in SFmode, DFmode and XFmode */
384 {4, 4, 6}, /* cost of storing fp registers
385 in SFmode, DFmode and XFmode */
386 8, /* cost of moving MMX register */
387 {8, 8}, /* cost of loading MMX registers
388 in SImode and DImode */
389 {8, 8}, /* cost of storing MMX registers
390 in SImode and DImode */
391 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
392 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
393 in 32,64,128,256 and 512-bit */
394 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
395 in 32,64,128,256 and 512-bit */
ecc3135a 396 3, 3, /* SSE->integer and integer->SSE moves */
397 3, 3, /* mask->integer and integer->mask moves */
00cb3494
L
398 {2, 4, 2}, /* cost of loading mask register
399 in QImode, HImode, SImode. */
400 {2, 4, 2}, /* cost of storing mask register
401 in QImode, HImode, SImode. */
402 2, /* cost of moving mask register. */
d321551c 403 /* End of register allocator costs. */
72bb85f8 404 },
d321551c 405
64766e8d
JH
406 COSTS_N_INSNS (1), /* cost of an add instruction */
407 COSTS_N_INSNS (1), /* cost of a lea instruction */
408 COSTS_N_INSNS (4), /* variable shift costs */
409 COSTS_N_INSNS (1), /* constant shift costs */
410 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
411 COSTS_N_INSNS (11), /* HI */
412 COSTS_N_INSNS (11), /* SI */
413 COSTS_N_INSNS (11), /* DI */
414 COSTS_N_INSNS (11)}, /* other */
415 0, /* cost of multiply per each bit set */
416 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
417 COSTS_N_INSNS (25), /* HI */
418 COSTS_N_INSNS (25), /* SI */
419 COSTS_N_INSNS (25), /* DI */
420 COSTS_N_INSNS (25)}, /* other */
421 COSTS_N_INSNS (3), /* cost of movsx */
422 COSTS_N_INSNS (2), /* cost of movzx */
423 8, /* "large" insn */
424 6, /* MOVE_RATIO */
25e22b19 425 6, /* CLEAR_RATIO */
64766e8d
JH
426 {2, 4, 2}, /* cost of loading integer registers
427 in QImode, HImode and SImode.
428 Relative to reg-reg move (2). */
429 {2, 4, 2}, /* cost of storing integer registers */
d321551c
L
430 {4, 8, 16, 32, 64}, /* cost of loading SSE register
431 in 32bit, 64bit, 128bit, 256bit and 512bit */
432 {4, 8, 16, 32, 64}, /* cost of storing SSE register
433 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 434 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
df41dbaf 435 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
d321551c
L
436 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
437 3, /* cost of moving SSE register to integer. */
a4fe6139
JH
438 4, 4, /* Gather load static, per_elt. */
439 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
440 8, /* size of l1 cache. */
441 8, /* size of l2 cache */
442 0, /* size of prefetch block */
443 0, /* number of parallel prefetches */
444 2, /* Branch cost */
445 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
446 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
447 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
448 COSTS_N_INSNS (1), /* cost of FABS instruction. */
449 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
450 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
6065f444 451
c53c148c 452 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
453 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
454 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
455 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
c53c148c
JH
456 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
457 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
458 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
459 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
460 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
461 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
64766e8d
JH
462 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
463 pentium_memcpy,
464 pentium_memset,
f6fd8f2b
JH
465 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
466 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
467 "16:8:8", /* Loop alignment. */
468 "16:8:8", /* Jump alignment. */
469 "0:0:8", /* Label alignment. */
470 "16", /* Func alignment. */
071e428c
HW
471 4, /* Small unroll limit. */
472 2, /* Small unroll factor. */
64766e8d
JH
473};
474
/* Cost table named for Lakemont.  It has no string-operation tables of its
   own: it reuses pentium_memcpy / pentium_memset.  The lea cost is written
   as COSTS_N_INSNS (1) + 1, i.e. slightly above the add cost.  */
475static const
476struct processor_costs lakemont_cost = {
72bb85f8 477 {
d321551c
L
478 /* Start of register allocator costs. integer->integer move cost is 2. */
479 6, /* cost for loading QImode using movzbl */
480 {2, 4, 2}, /* cost of loading integer registers
481 in QImode, HImode and SImode.
482 Relative to reg-reg move (2). */
483 {2, 4, 2}, /* cost of storing integer registers */
484 2, /* cost of reg,reg fld/fst */
485 {2, 2, 6}, /* cost of loading fp registers
486 in SFmode, DFmode and XFmode */
487 {4, 4, 6}, /* cost of storing fp registers
488 in SFmode, DFmode and XFmode */
489 8, /* cost of moving MMX register */
490 {8, 8}, /* cost of loading MMX registers
491 in SImode and DImode */
492 {8, 8}, /* cost of storing MMX registers
493 in SImode and DImode */
494 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
495 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
496 in 32,64,128,256 and 512-bit */
497 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
498 in 32,64,128,256 and 512-bit */
ecc3135a 499 3, 3, /* SSE->integer and integer->SSE moves */
500 3, 3, /* mask->integer and integer->mask moves */
00cb3494
L
501 {2, 4, 2}, /* cost of loading mask register
502 in QImode, HImode, SImode. */
503 {2, 4, 2}, /* cost of storing mask register
504 in QImode, HImode, SImode. */
505 2, /* cost of moving mask register. */
d321551c 506 /* End of register allocator costs. */
72bb85f8 507 },
d321551c 508
64766e8d
JH
509 COSTS_N_INSNS (1), /* cost of an add instruction */
510 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
511 COSTS_N_INSNS (1), /* variable shift costs */
512 COSTS_N_INSNS (1), /* constant shift costs */
513 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
514 COSTS_N_INSNS (11), /* HI */
515 COSTS_N_INSNS (11), /* SI */
516 COSTS_N_INSNS (11), /* DI */
517 COSTS_N_INSNS (11)}, /* other */
518 0, /* cost of multiply per each bit set */
519 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
520 COSTS_N_INSNS (25), /* HI */
521 COSTS_N_INSNS (25), /* SI */
522 COSTS_N_INSNS (25), /* DI */
523 COSTS_N_INSNS (25)}, /* other */
524 COSTS_N_INSNS (3), /* cost of movsx */
525 COSTS_N_INSNS (2), /* cost of movzx */
526 8, /* "large" insn */
527 17, /* MOVE_RATIO */
25e22b19 528 6, /* CLEAR_RATIO */
64766e8d
JH
529 {2, 4, 2}, /* cost of loading integer registers
530 in QImode, HImode and SImode.
531 Relative to reg-reg move (2). */
532 {2, 4, 2}, /* cost of storing integer registers */
d321551c
L
533 {4, 8, 16, 32, 64}, /* cost of loading SSE register
534 in 32bit, 64bit, 128bit, 256bit and 512bit */
535 {4, 8, 16, 32, 64}, /* cost of storing SSE register
536 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 537 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
df41dbaf 538 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
d321551c
L
539 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
540 3, /* cost of moving SSE register to integer. */
a4fe6139
JH
541 4, 4, /* Gather load static, per_elt. */
542 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
543 8, /* size of l1 cache. */
544 8, /* size of l2 cache */
545 0, /* size of prefetch block */
546 0, /* number of parallel prefetches */
547 2, /* Branch cost */
548 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
549 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
550 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
551 COSTS_N_INSNS (1), /* cost of FABS instruction. */
552 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
553 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
6065f444 554
c53c148c 555 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
556 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
557 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
558 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
559 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
560 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
6065f444
JH
561 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
562 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
563 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
564 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
64766e8d
JH
565 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
566 pentium_memcpy,
567 pentium_memset,
f6fd8f2b
JH
568 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
569 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
570 "16:8:8", /* Loop alignment. */
571 "16:8:8", /* Jump alignment. */
572 "0:0:8", /* Label alignment. */
573 "16", /* Func alignment. */
071e428c
HW
574 4, /* Small unroll limit. */
575 2, /* Small unroll factor. */
64766e8d
JH
576};
577
578/* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
   (we ensure the alignment).  For small blocks an inline loop is still a
   noticeable win; for bigger blocks either rep movsl or rep movsb is the
   way to go.  Rep movsb apparently has a more expensive startup time in the
   CPU, but after 4K the difference is down in the noise.

   The first slot below lists loop at 128, unrolled_loop at 1024,
   rep_prefix_4_byte at 8192 and rep_prefix_1_byte at -1 (the leading numbers
   are presumably size upper bounds, with -1 as the catch-all - confirm).
   The second slot is DUMMY_STRINGOP_ALGS.  */
583static stringop_algs pentiumpro_memcpy[2] = {
584 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
585 {8192, rep_prefix_4_byte, false},
586 {-1, rep_prefix_1_byte, false}}},
587 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentiumpro_cost.  The first slot lists
   unrolled_loop at 1024, rep_prefix_4_byte at 8192 and libcall at -1 (the
   leading numbers are presumably size upper bounds - confirm).  The second
   slot is DUMMY_STRINGOP_ALGS.  */
588static stringop_algs pentiumpro_memset[2] = {
589 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
590 {8192, rep_prefix_4_byte, false},
591 {-1, libcall, false}}},
592 DUMMY_STRINGOP_ALGS};
/* Cost table named for the PentiumPro.  Instruction costs are written with
   COSTS_N_INSNS; the prefetch block size and parallel prefetch count are
   non-zero here (32 and 6); string operations use pentiumpro_memcpy /
   pentiumpro_memset.  */
593static const
594struct processor_costs pentiumpro_cost = {
72bb85f8 595 {
d321551c
L
596 /* Start of register allocator costs. integer->integer move cost is 2. */
597 2, /* cost for loading QImode using movzbl */
598 {4, 4, 4}, /* cost of loading integer registers
599 in QImode, HImode and SImode.
600 Relative to reg-reg move (2). */
601 {2, 2, 2}, /* cost of storing integer registers */
602 2, /* cost of reg,reg fld/fst */
603 {2, 2, 6}, /* cost of loading fp registers
604 in SFmode, DFmode and XFmode */
605 {4, 4, 6}, /* cost of storing fp registers
606 in SFmode, DFmode and XFmode */
607 2, /* cost of moving MMX register */
608 {2, 2}, /* cost of loading MMX registers
609 in SImode and DImode */
610 {2, 2}, /* cost of storing MMX registers
611 in SImode and DImode */
612 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
613 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
614 in 32,64,128,256 and 512-bit */
615 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
616 in 32,64,128,256 and 512-bit */
ecc3135a 617 3, 3, /* SSE->integer and integer->SSE moves */
618 3, 3, /* mask->integer and integer->mask moves */
00cb3494
L
619 {4, 4, 4}, /* cost of loading mask register
620 in QImode, HImode, SImode. */
621 {2, 2, 2}, /* cost of storing mask register
622 in QImode, HImode, SImode. */
623 2, /* cost of moving mask register. */
d321551c 624 /* End of register allocator costs. */
72bb85f8 625 },
d321551c 626
64766e8d
JH
627 COSTS_N_INSNS (1), /* cost of an add instruction */
628 COSTS_N_INSNS (1), /* cost of a lea instruction */
629 COSTS_N_INSNS (1), /* variable shift costs */
630 COSTS_N_INSNS (1), /* constant shift costs */
631 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
632 COSTS_N_INSNS (4), /* HI */
633 COSTS_N_INSNS (4), /* SI */
634 COSTS_N_INSNS (4), /* DI */
635 COSTS_N_INSNS (4)}, /* other */
636 0, /* cost of multiply per each bit set */
637 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
638 COSTS_N_INSNS (17), /* HI */
639 COSTS_N_INSNS (17), /* SI */
640 COSTS_N_INSNS (17), /* DI */
641 COSTS_N_INSNS (17)}, /* other */
642 COSTS_N_INSNS (1), /* cost of movsx */
643 COSTS_N_INSNS (1), /* cost of movzx */
644 8, /* "large" insn */
645 6, /* MOVE_RATIO */
25e22b19 646 6, /* CLEAR_RATIO */
64766e8d
JH
647 {4, 4, 4}, /* cost of loading integer registers
648 in QImode, HImode and SImode.
649 Relative to reg-reg move (2). */
650 {2, 2, 2}, /* cost of storing integer registers */
d321551c
L
651 {4, 8, 16, 32, 64}, /* cost of loading SSE register
652 in 32bit, 64bit, 128bit, 256bit and 512bit */
653 {4, 8, 16, 32, 64}, /* cost of storing SSE register
654 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 655 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
df41dbaf 656 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
d321551c
L
657 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
658 3, /* cost of moving SSE register to integer. */
a4fe6139
JH
659 4, 4, /* Gather load static, per_elt. */
660 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
661 8, /* size of l1 cache. */
662 256, /* size of l2 cache */
663 32, /* size of prefetch block */
664 6, /* number of parallel prefetches */
665 2, /* Branch cost */
666 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
667 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
668 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
669 COSTS_N_INSNS (2), /* cost of FABS instruction. */
670 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
671 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
6065f444 672
c53c148c 673 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
674 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
675 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
676 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
677 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
678 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
6065f444
JH
679 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
680 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
681 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
682 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
64766e8d
JH
683 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
684 pentiumpro_memcpy,
685 pentiumpro_memset,
f6fd8f2b
JH
686 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
687 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
688 "16", /* Loop alignment. */
689 "16:11:8", /* Jump alignment. */
690 "0:0:8", /* Label alignment. */
691 "16", /* Func alignment. */
071e428c
HW
692 4, /* Small unroll limit. */
693 2, /* Small unroll factor. */
64766e8d
JH
694};
695
/* memcpy strategy table for the Geode entries.  First slot: libcall as the
   leading member, then {256, rep_prefix_4_byte} followed by {-1, libcall}
   (the leading numbers are presumably size upper bounds - confirm).  Second
   slot: DUMMY_STRINGOP_ALGS.  */
696static stringop_algs geode_memcpy[2] = {
697 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
698 DUMMY_STRINGOP_ALGS};
/* memset strategy table for the Geode entries.  First slot: libcall as the
   leading member, then {256, rep_prefix_4_byte} followed by {-1, libcall}
   (the leading numbers are presumably size upper bounds - confirm).  Second
   slot: DUMMY_STRINGOP_ALGS.  */
699static stringop_algs geode_memset[2] = {
700 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
701 DUMMY_STRINGOP_ALGS};
/* Processor cost table for Geode tuning.  The initializer is positional, so
   the order of entries must match the field order of struct processor_costs
   (declared elsewhere).  Instruction costs are written with COSTS_N_INSNS;
   the register-allocator move costs in the first brace group are relative
   to a reg-reg move cost of 2, as noted inline.  */
702static const
703struct processor_costs geode_cost = {
72bb85f8 704 {
d321551c
L
705 /* Start of register allocator costs. integer->integer move cost is 2. */
706 2, /* cost for loading QImode using movzbl */
707 {2, 2, 2}, /* cost of loading integer registers
708 in QImode, HImode and SImode.
709 Relative to reg-reg move (2). */
710 {2, 2, 2}, /* cost of storing integer registers */
711 2, /* cost of reg,reg fld/fst */
712 {2, 2, 2}, /* cost of loading fp registers
713 in SFmode, DFmode and XFmode */
714 {4, 6, 6}, /* cost of storing fp registers
715 in SFmode, DFmode and XFmode */
716 2, /* cost of moving MMX register */
717 {2, 2}, /* cost of loading MMX registers
718 in SImode and DImode */
719 {2, 2}, /* cost of storing MMX registers
720 in SImode and DImode */
721 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
722 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
723 in 32,64,128,256 and 512-bit */
724 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
725 in 32,64,128,256 and 512-bit */
ecc3135a 726 6, 6, /* SSE->integer and integer->SSE moves */
727 6, 6, /* mask->integer and integer->mask moves */
00cb3494
L
728 {2, 2, 2}, /* cost of loading mask register
729 in QImode, HImode, SImode. */
730 {2, 2, 2}, /* cost of storing mask register
731 in QImode, HImode, SImode. */
732 2, /* cost of moving mask register. */
d321551c 733 /* End of register allocator costs. */
72bb85f8 734 },
d321551c 735
64766e8d
JH
736 COSTS_N_INSNS (1), /* cost of an add instruction */
737 COSTS_N_INSNS (1), /* cost of a lea instruction */
738 COSTS_N_INSNS (2), /* variable shift costs */
739 COSTS_N_INSNS (1), /* constant shift costs */
740 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
741 COSTS_N_INSNS (4), /* HI */
742 COSTS_N_INSNS (7), /* SI */
743 COSTS_N_INSNS (7), /* DI */
744 COSTS_N_INSNS (7)}, /* other */
745 0, /* cost of multiply per each bit set */
746 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
747 COSTS_N_INSNS (23), /* HI */
748 COSTS_N_INSNS (39), /* SI */
749 COSTS_N_INSNS (39), /* DI */
750 COSTS_N_INSNS (39)}, /* other */
751 COSTS_N_INSNS (1), /* cost of movsx */
752 COSTS_N_INSNS (1), /* cost of movzx */
753 8, /* "large" insn */
754 4, /* MOVE_RATIO */
25e22b19 755 4, /* CLEAR_RATIO */
df41dbaf 756 {2, 2, 2}, /* cost of loading integer registers
64766e8d
JH
757 in QImode, HImode and SImode.
758 Relative to reg-reg move (2). */
df41dbaf 759 {2, 2, 2}, /* cost of storing integer registers */
d321551c
L
760 {2, 2, 8, 16, 32}, /* cost of loading SSE register
761 in 32bit, 64bit, 128bit, 256bit and 512bit */
762 {2, 2, 8, 16, 32}, /* cost of storing SSE register
763 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 764 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
df41dbaf 765 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
d321551c
L
766 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
767 6, /* cost of moving SSE register to integer. */
a4fe6139
JH
768 2, 2, /* Gather load static, per_elt. */
769 2, 2, /* Gather store static, per_elt. */
64766e8d
JH
770 64, /* size of l1 cache. */
771 128, /* size of l2 cache. */
772 32, /* size of prefetch block */
773 1, /* number of parallel prefetches */
774 1, /* Branch cost */
775 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
776 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
777 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
778 COSTS_N_INSNS (1), /* cost of FABS instruction. */
779 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
780 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
6065f444 781
c53c148c 782 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
783 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
784 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
785 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
c53c148c
JH
786 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
787 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
6065f444
JH
788 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
789 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
790 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
791 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
64766e8d
JH
792 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
793 geode_memcpy,
794 geode_memset,
f6fd8f2b
JH
795 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
796 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
797 NULL, /* Loop alignment. */
798 NULL, /* Jump alignment. */
799 NULL, /* Label alignment. */
800 NULL, /* Func alignment. */
071e428c
HW
801 4, /* Small unroll limit. */
802 2, /* Small unroll factor. */
64766e8d
JH
803};
804
/* memcpy expansion strategy table referenced by k6_cost.
   NOTE(review): znver1_memcpy in this file annotates element 0 as 32-bit
   tuning and element 1 as 64-bit tuning; presumably the same layout applies
   here, with the second slot left as DUMMY_STRINGOP_ALGS (libcall only)
   -- confirm against the stringop_algs declaration.  */
805static stringop_algs k6_memcpy[2] = {
806 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
807 DUMMY_STRINGOP_ALGS};
/* memset expansion strategy table referenced by k6_cost; its entries mirror
   k6_memcpy.  NOTE(review): presumably element 0 is 32-bit tuning and
   element 1 is 64-bit tuning (as annotated on znver1_memset) -- confirm.  */
808static stringop_algs k6_memset[2] = {
809 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
810 DUMMY_STRINGOP_ALGS};
/* Processor cost table for K6 tuning.  The initializer is positional, so
   the order of entries must match the field order of struct processor_costs
   (declared elsewhere).  Instruction costs are written with COSTS_N_INSNS;
   the register-allocator move costs in the first brace group are relative
   to a reg-reg move cost of 2, as noted inline.  */
811static const
812struct processor_costs k6_cost = {
72bb85f8 813 {
d321551c
L
814 /* Start of register allocator costs. integer->integer move cost is 2. */
815 3, /* cost for loading QImode using movzbl */
816 {4, 5, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 3, 2}, /* cost of storing integer registers */
820 4, /* cost of reg,reg fld/fst */
821 {6, 6, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 4}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
831 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
832 in 32,64,128,256 and 512-bit */
833 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
834 in 32,64,128,256 and 512-bit */
ecc3135a 835 6, 6, /* SSE->integer and integer->SSE moves */
836 6, 6, /* mask->integer and integer->mask moves */
00cb3494
L
837 {4, 5, 4}, /* cost of loading mask register
838 in QImode, HImode, SImode. */
839 {2, 3, 2}, /* cost of storing mask register
840 in QImode, HImode, SImode. */
841 2, /* cost of moving mask register. */
d321551c 842 /* End of register allocator costs. */
72bb85f8 843 },
d321551c 844
64766e8d
JH
845 COSTS_N_INSNS (1), /* cost of an add instruction */
846 COSTS_N_INSNS (2), /* cost of a lea instruction */
847 COSTS_N_INSNS (1), /* variable shift costs */
848 COSTS_N_INSNS (1), /* constant shift costs */
849 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
850 COSTS_N_INSNS (3), /* HI */
851 COSTS_N_INSNS (3), /* SI */
852 COSTS_N_INSNS (3), /* DI */
853 COSTS_N_INSNS (3)}, /* other */
854 0, /* cost of multiply per each bit set */
855 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
856 COSTS_N_INSNS (18), /* HI */
857 COSTS_N_INSNS (18), /* SI */
858 COSTS_N_INSNS (18), /* DI */
859 COSTS_N_INSNS (18)}, /* other */
860 COSTS_N_INSNS (2), /* cost of movsx */
861 COSTS_N_INSNS (2), /* cost of movzx */
862 8, /* "large" insn */
863 4, /* MOVE_RATIO */
25e22b19 864 4, /* CLEAR_RATIO */
64766e8d
JH
865 {4, 5, 4}, /* cost of loading integer registers
866 in QImode, HImode and SImode.
867 Relative to reg-reg move (2). */
868 {2, 3, 2}, /* cost of storing integer registers */
d321551c
L
869 {2, 2, 8, 16, 32}, /* cost of loading SSE register
870 in 32bit, 64bit, 128bit, 256bit and 512bit */
871 {2, 2, 8, 16, 32}, /* cost of storing SSE register
872 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 873 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
df41dbaf 874 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
d321551c
L
875 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
876 6, /* cost of moving SSE register to integer. */
a4fe6139
JH
877 2, 2, /* Gather load static, per_elt. */
878 2, 2, /* Gather store static, per_elt. */
64766e8d
JH
879 32, /* size of l1 cache. */
880 32, /* size of l2 cache. Some models
881 have integrated l2 cache, but
882 optimizing for k6 is not important
883 enough to worry about that. */
884 32, /* size of prefetch block */
885 1, /* number of parallel prefetches */
886 1, /* Branch cost */
887 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
888 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
889 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
890 COSTS_N_INSNS (2), /* cost of FABS instruction. */
891 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
892 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
6065f444 893
c53c148c 894 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
895 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
896 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
897 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
c53c148c
JH
898 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
899 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
6065f444
JH
900 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
901 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
902 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
903 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
64766e8d
JH
904 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
905 k6_memcpy,
906 k6_memset,
f6fd8f2b
JH
907 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
908 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
909 "32:8:8", /* Loop alignment. */
910 "32:8:8", /* Jump alignment. */
911 "0:0:8", /* Label alignment. */
912 "32", /* Func alignment. */
071e428c
HW
913 4, /* Small unroll limit. */
914 2, /* Small unroll factor. */
64766e8d
JH
915};
916
917/* For some reason, Athlon deals better with REP prefix (relative to loops)
918 compared to K8. Alignment becomes important after 8 bytes for memcpy and
919 128 bytes for memset. */
/* memcpy expansion strategy table referenced by athlon_cost.
   NOTE(review): znver1_memcpy in this file annotates element 0 as 32-bit
   tuning and element 1 as 64-bit tuning; presumably the same layout applies
   here, with the second slot left as DUMMY_STRINGOP_ALGS (libcall only)
   -- confirm against the stringop_algs declaration.  */
920static stringop_algs athlon_memcpy[2] = {
921 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
922 DUMMY_STRINGOP_ALGS};
/* memset expansion strategy table referenced by athlon_cost; its entries
   mirror athlon_memcpy.  NOTE(review): presumably element 0 is 32-bit tuning
   and element 1 is 64-bit tuning (as annotated on znver1_memset) -- confirm.  */
923static stringop_algs athlon_memset[2] = {
924 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
925 DUMMY_STRINGOP_ALGS};
/* Processor cost table for Athlon tuning.  The initializer is positional, so
   the order of entries must match the field order of struct processor_costs
   (declared elsewhere).  Instruction costs are written with COSTS_N_INSNS;
   the register-allocator move costs in the first brace group are relative
   to a reg-reg move cost of 2, as noted inline.  */
926static const
927struct processor_costs athlon_cost = {
72bb85f8 928 {
d321551c
L
929 /* Start of register allocator costs. integer->integer move cost is 2. */
930 4, /* cost for loading QImode using movzbl */
931 {3, 4, 3}, /* cost of loading integer registers
932 in QImode, HImode and SImode.
933 Relative to reg-reg move (2). */
934 {3, 4, 3}, /* cost of storing integer registers */
935 4, /* cost of reg,reg fld/fst */
936 {4, 4, 12}, /* cost of loading fp registers
937 in SFmode, DFmode and XFmode */
938 {6, 6, 8}, /* cost of storing fp registers
939 in SFmode, DFmode and XFmode */
940 2, /* cost of moving MMX register */
941 {4, 4}, /* cost of loading MMX registers
942 in SImode and DImode */
943 {4, 4}, /* cost of storing MMX registers
944 in SImode and DImode */
945 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
946 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
947 in 32,64,128,256 and 512-bit */
948 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
949 in 32,64,128,256 and 512-bit */
ecc3135a 950 5, 5, /* SSE->integer and integer->SSE moves */
951 5, 5, /* mask->integer and integer->mask moves */
00cb3494
L
952 {3, 4, 3}, /* cost of loading mask register
953 in QImode, HImode, SImode. */
954 {3, 4, 3}, /* cost of storing mask register
955 in QImode, HImode, SImode. */
956 2, /* cost of moving mask register. */
d321551c 957 /* End of register allocator costs. */
72bb85f8 958 },
d321551c 959
64766e8d
JH
960 COSTS_N_INSNS (1), /* cost of an add instruction */
961 COSTS_N_INSNS (2), /* cost of a lea instruction */
962 COSTS_N_INSNS (1), /* variable shift costs */
963 COSTS_N_INSNS (1), /* constant shift costs */
964 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
965 COSTS_N_INSNS (5), /* HI */
966 COSTS_N_INSNS (5), /* SI */
967 COSTS_N_INSNS (5), /* DI */
968 COSTS_N_INSNS (5)}, /* other */
969 0, /* cost of multiply per each bit set */
970 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
971 COSTS_N_INSNS (26), /* HI */
972 COSTS_N_INSNS (42), /* SI */
973 COSTS_N_INSNS (74), /* DI */
974 COSTS_N_INSNS (74)}, /* other */
975 COSTS_N_INSNS (1), /* cost of movsx */
976 COSTS_N_INSNS (1), /* cost of movzx */
977 8, /* "large" insn */
978 9, /* MOVE_RATIO */
25e22b19 979 6, /* CLEAR_RATIO */
64766e8d
JH
980 {3, 4, 3}, /* cost of loading integer registers
981 in QImode, HImode and SImode.
982 Relative to reg-reg move (2). */
983 {3, 4, 3}, /* cost of storing integer registers */
d321551c
L
984 {4, 4, 12, 12, 24}, /* cost of loading SSE register
985 in 32bit, 64bit, 128bit, 256bit and 512bit */
986 {4, 4, 10, 10, 20}, /* cost of storing SSE register
987 in 32bit, 64bit, 128bit, 256bit and 512bit */
b7167993 988 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
b7167993 989 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
d321551c
L
990 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
991 5, /* cost of moving SSE register to integer. */
a4fe6139
JH
992 4, 4, /* Gather load static, per_elt. */
993 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
994 64, /* size of l1 cache. */
995 256, /* size of l2 cache. */
996 64, /* size of prefetch block */
997 6, /* number of parallel prefetches */
998 5, /* Branch cost */
999 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1000 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1001 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1002 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1003 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1004 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1005
c53c148c 1006 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1007 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1008 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1009 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1010 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1011 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
1012 /* 11-16 */
1013 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1014 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
1015 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1016 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
64766e8d
JH
1017 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1018 athlon_memcpy,
1019 athlon_memset,
f6fd8f2b
JH
1020 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1021 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1022 "16:8:8", /* Loop alignment. */
1023 "16:8:8", /* Jump alignment. */
1024 "0:0:8", /* Label alignment. */
1025 "16", /* Func alignment. */
071e428c
HW
1026 4, /* Small unroll limit. */
1027 2, /* Small unroll factor. */
64766e8d
JH
1028};
1029
1030/* K8 has an optimized REP instruction for medium-sized blocks, but for very
1031 small blocks it is better to use a loop. For large blocks, libcall can
1032 do non-temporal accesses and beat inline considerably. */
/* NOTE(review): znver1_memcpy in this file annotates element 0 as 32-bit
   tuning and element 1 as 64-bit tuning; presumably the same layout applies
   to the two entries below -- confirm against the stringop_algs
   declaration.  */
1033static stringop_algs k8_memcpy[2] = {
1034 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1035 {-1, rep_prefix_4_byte, false}}},
1036 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1037 {-1, libcall, false}}}};
/* memset expansion strategy table referenced by k8_cost.
   NOTE(review): presumably element 0 is 32-bit tuning and element 1 is
   64-bit tuning (as annotated on znver1_memset in this file) -- confirm.  */
1038static stringop_algs k8_memset[2] = {
1039 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1040 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1041 {libcall, {{48, unrolled_loop, false},
1042 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Processor cost table for K8 tuning.  The initializer is positional, so
   the order of entries must match the field order of struct processor_costs
   (declared elsewhere).  Instruction costs are written with COSTS_N_INSNS;
   the register-allocator move costs in the first brace group are relative
   to a reg-reg move cost of 2, as noted inline.  */
1043static const
1044struct processor_costs k8_cost = {
72bb85f8 1045 {
d321551c
L
1046 /* Start of register allocator costs. integer->integer move cost is 2. */
1047 4, /* cost for loading QImode using movzbl */
1048 {3, 4, 3}, /* cost of loading integer registers
1049 in QImode, HImode and SImode.
1050 Relative to reg-reg move (2). */
1051 {3, 4, 3}, /* cost of storing integer registers */
1052 4, /* cost of reg,reg fld/fst */
1053 {4, 4, 12}, /* cost of loading fp registers
1054 in SFmode, DFmode and XFmode */
1055 {6, 6, 8}, /* cost of storing fp registers
1056 in SFmode, DFmode and XFmode */
1057 2, /* cost of moving MMX register */
1058 {3, 3}, /* cost of loading MMX registers
1059 in SImode and DImode */
1060 {4, 4}, /* cost of storing MMX registers
1061 in SImode and DImode */
1062 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1063 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
1064 in 32,64,128,256 and 512-bit */
1065 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
1066 in 32,64,128,256 and 512-bit */
ecc3135a 1067 5, 5, /* SSE->integer and integer->SSE moves */
1068 5, 5, /* mask->integer and integer->mask moves */
00cb3494
L
1069 {3, 4, 3}, /* cost of loading mask register
1070 in QImode, HImode, SImode. */
1071 {3, 4, 3}, /* cost of storing mask register
1072 in QImode, HImode, SImode. */
1073 2, /* cost of moving mask register. */
d321551c 1074 /* End of register allocator costs. */
72bb85f8 1075 },
d321551c 1076
64766e8d
JH
1077 COSTS_N_INSNS (1), /* cost of an add instruction */
1078 COSTS_N_INSNS (2), /* cost of a lea instruction */
1079 COSTS_N_INSNS (1), /* variable shift costs */
1080 COSTS_N_INSNS (1), /* constant shift costs */
1081 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1082 COSTS_N_INSNS (4), /* HI */
1083 COSTS_N_INSNS (3), /* SI */
1084 COSTS_N_INSNS (4), /* DI */
1085 COSTS_N_INSNS (5)}, /* other */
1086 0, /* cost of multiply per each bit set */
1087 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1088 COSTS_N_INSNS (26), /* HI */
1089 COSTS_N_INSNS (42), /* SI */
1090 COSTS_N_INSNS (74), /* DI */
1091 COSTS_N_INSNS (74)}, /* other */
1092 COSTS_N_INSNS (1), /* cost of movsx */
1093 COSTS_N_INSNS (1), /* cost of movzx */
1094 8, /* "large" insn */
1095 9, /* MOVE_RATIO */
25e22b19 1096 6, /* CLEAR_RATIO */
64766e8d
JH
1097 {3, 4, 3}, /* cost of loading integer registers
1098 in QImode, HImode and SImode.
1099 Relative to reg-reg move (2). */
1100 {3, 4, 3}, /* cost of storing integer registers */
d321551c
L
1101 {4, 3, 12, 12, 24}, /* cost of loading SSE register
1102 in 32bit, 64bit, 128bit, 256bit and 512bit */
1103 {4, 4, 10, 10, 20}, /* cost of storing SSE register
1104 in 32bit, 64bit, 128bit, 256bit and 512bit */
b7167993 1105 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
b7167993 1106 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
d321551c
L
1107 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1108 5, /* cost of moving SSE register to integer. */
a4fe6139
JH
1109 4, 4, /* Gather load static, per_elt. */
1110 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
1111 64, /* size of l1 cache. */
1112 512, /* size of l2 cache. */
1113 64, /* size of prefetch block */
1114 /* New AMD processors never drop prefetches; if they cannot be performed
1115 immediately, they are queued. We set number of simultaneous prefetches
1116 to a large constant to reflect this (it probably is not a good idea not
1117 to limit number of prefetches at all, as their execution also takes some
1118 time). */
1119 100, /* number of parallel prefetches */
1120 3, /* Branch cost */
1121 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1122 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1123 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1124 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1125 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1126 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1127
c53c148c 1128 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1129 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1130 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1131 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1132 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1133 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
1134 /* 11-16 */
1135 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1136 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1137 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1138 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
64766e8d
JH
1139 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1140 k8_memcpy,
1141 k8_memset,
f6fd8f2b
JH
1142 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1143 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1144 "16:8:8", /* Loop alignment. */
1145 "16:8:8", /* Jump alignment. */
1146 "0:0:8", /* Label alignment. */
1147 "16", /* Func alignment. */
071e428c
HW
1148 4, /* Small unroll limit. */
1149 2, /* Small unroll factor. */
64766e8d
JH
1150};
1151
1152/* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1153 very small blocks it is better to use a loop. For large blocks, libcall can
1154 do non-temporal accesses and beat inline considerably. */
/* NOTE(review): znver1_memcpy in this file annotates element 0 as 32-bit
   tuning and element 1 as 64-bit tuning; presumably the same layout applies
   to the two entries below -- confirm against the stringop_algs
   declaration.  */
1155static stringop_algs amdfam10_memcpy[2] = {
1156 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1157 {-1, rep_prefix_4_byte, false}}},
1158 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1159 {-1, libcall, false}}}};
/* memset expansion strategy table referenced by amdfam10_cost.
   NOTE(review): presumably element 0 is 32-bit tuning and element 1 is
   64-bit tuning (as annotated on znver1_memset in this file) -- confirm.  */
1160static stringop_algs amdfam10_memset[2] = {
1161 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1162 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1163 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1164 {-1, libcall, false}}}};
/* Processor cost table for AMDFAM10 tuning.  The initializer is positional,
   so the order of entries must match the field order of struct
   processor_costs (declared elsewhere).  Unlike the geode/k6/athlon/k8
   tables, this object is defined without `static' or `const'.
   NOTE(review): the 256-bit entry of "cost of unaligned loads" is 7 while
   the corresponding aligned SSE load cost is 6 and every other width
   matches -- confirm this difference is intentional.  */
1165struct processor_costs amdfam10_cost = {
72bb85f8 1166 {
d321551c 1167 /* Start of register allocator costs. integer->integer move cost is 2. */
64766e8d
JH
1168 4, /* cost for loading QImode using movzbl */
1169 {3, 4, 3}, /* cost of loading integer registers
1170 in QImode, HImode and SImode.
1171 Relative to reg-reg move (2). */
1172 {3, 4, 3}, /* cost of storing integer registers */
1173 4, /* cost of reg,reg fld/fst */
1174 {4, 4, 12}, /* cost of loading fp registers
1175 in SFmode, DFmode and XFmode */
1176 {6, 6, 8}, /* cost of storing fp registers
1177 in SFmode, DFmode and XFmode */
1178 2, /* cost of moving MMX register */
1179 {3, 3}, /* cost of loading MMX registers
1180 in SImode and DImode */
1181 {4, 4}, /* cost of storing MMX registers
1182 in SImode and DImode */
df41dbaf
JH
1183 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1184 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
1185 in 32,64,128,256 and 512-bit */
df41dbaf
JH
1186 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
1187 in 32,64,128,256 and 512-bit */
ecc3135a 1188 3, 3, /* SSE->integer and integer->SSE moves */
1189 3, 3, /* mask->integer and integer->mask moves */
00cb3494
L
1190 {3, 4, 3}, /* cost of loading mask register
1191 in QImode, HImode, SImode. */
1192 {3, 4, 3}, /* cost of storing mask register
1193 in QImode, HImode, SImode. */
1194 2, /* cost of moving mask register. */
d321551c 1195
64766e8d
JH
1196 /* On K8:
1197 MOVD reg64, xmmreg Double FSTORE 4
1198 MOVD reg32, xmmreg Double FSTORE 4
1199 On AMDFAM10:
1200 MOVD reg64, xmmreg Double FADD 3
1201 1/1 1/1
1202 MOVD reg32, xmmreg Double FADD 3
1203 1/1 1/1 */
d321551c 1204 /* End of register allocator costs. */
72bb85f8 1205 },
d321551c
L
1206
1207 COSTS_N_INSNS (1), /* cost of an add instruction */
1208 COSTS_N_INSNS (2), /* cost of a lea instruction */
1209 COSTS_N_INSNS (1), /* variable shift costs */
1210 COSTS_N_INSNS (1), /* constant shift costs */
1211 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1212 COSTS_N_INSNS (4), /* HI */
1213 COSTS_N_INSNS (3), /* SI */
1214 COSTS_N_INSNS (4), /* DI */
1215 COSTS_N_INSNS (5)}, /* other */
1216 0, /* cost of multiply per each bit set */
1217 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1218 COSTS_N_INSNS (35), /* HI */
1219 COSTS_N_INSNS (51), /* SI */
1220 COSTS_N_INSNS (83), /* DI */
1221 COSTS_N_INSNS (83)}, /* other */
1222 COSTS_N_INSNS (1), /* cost of movsx */
1223 COSTS_N_INSNS (1), /* cost of movzx */
1224 8, /* "large" insn */
1225 9, /* MOVE_RATIO */
25e22b19 1226 6, /* CLEAR_RATIO */
d321551c
L
1227 {3, 4, 3}, /* cost of loading integer registers
1228 in QImode, HImode and SImode.
1229 Relative to reg-reg move (2). */
1230 {3, 4, 3}, /* cost of storing integer registers */
1231 {4, 4, 3, 6, 12}, /* cost of loading SSE register
1232 in 32bit, 64bit, 128bit, 256bit and 512bit */
1233 {4, 4, 5, 10, 20}, /* cost of storing SSE register
1234 in 32bit, 64bit, 128bit, 256bit and 512bit */
1235 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
1236 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1237 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1238 3, /* cost of moving SSE register to integer. */
a4fe6139
JH
1239 4, 4, /* Gather load static, per_elt. */
1240 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
1241 64, /* size of l1 cache. */
1242 512, /* size of l2 cache. */
1243 64, /* size of prefetch block */
1244 /* New AMD processors never drop prefetches; if they cannot be performed
1245 immediately, they are queued. We set number of simultaneous prefetches
1246 to a large constant to reflect this (it probably is not a good idea not
1247 to limit number of prefetches at all, as their execution also takes some
1248 time). */
1249 100, /* number of parallel prefetches */
1250 2, /* Branch cost */
1251 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1252 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1253 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1254 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1255 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1256 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1257
c53c148c 1258 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1259 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1260 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1261 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1262 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1263 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
1264 /* 11-16 */
1265 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1266 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1267 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1268 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
64766e8d
JH
1269 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1270 amdfam10_memcpy,
1271 amdfam10_memset,
f6fd8f2b
JH
1272 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1273 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1274 "32:25:8", /* Loop alignment. */
1275 "32:8:8", /* Jump alignment. */
1276 "0:0:8", /* Label alignment. */
1277 "32", /* Func alignment. */
071e428c
HW
1278 4, /* Small unroll limit. */
1279 2, /* Small unroll factor. */
64766e8d
JH
1280};
1281
c727b835 1282/* BDVER has an optimized REP instruction for medium-sized blocks, but for
64766e8d
JH
1283 very small blocks it is better to use a loop. For large blocks, libcall
1284 can do non-temporal accesses and beat inline considerably. */
/* NOTE(review): znver1_memcpy in this file annotates element 0 as 32-bit
   tuning and element 1 as 64-bit tuning; presumably the same layout applies
   to the two entries below -- confirm against the stringop_algs
   declaration.  */
c727b835 1285static stringop_algs bdver_memcpy[2] = {
64766e8d
JH
1286 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1287 {-1, rep_prefix_4_byte, false}}},
1288 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1289 {-1, libcall, false}}}};
/* memset expansion strategy table referenced by bdver_cost.
   NOTE(review): presumably element 0 is 32-bit tuning and element 1 is
   64-bit tuning (as annotated on znver1_memset in this file) -- confirm.  */
c727b835 1290static stringop_algs bdver_memset[2] = {
64766e8d
JH
1291 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1292 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1293 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1294 {-1, libcall, false}}}};
1295
/* Processor cost table for BDVER tuning.  The initializer is positional, so
   the order of entries must match the field order of struct processor_costs
   (declared elsewhere).  This object is `const' but, unlike the
   geode/k6/athlon/k8 tables, not `static'.
   NOTE(review): Func alignment is "11" here, whereas the other tables in
   this part of the file use "16" or "32" (or NULL) -- confirm this value is
   intentional.  */
c727b835 1296const struct processor_costs bdver_cost = {
72bb85f8 1297 {
d321551c
L
1298 /* Start of register allocator costs. integer->integer move cost is 2. */
1299 8, /* cost for loading QImode using movzbl */
1300 {8, 8, 8}, /* cost of loading integer registers
1301 in QImode, HImode and SImode.
1302 Relative to reg-reg move (2). */
1303 {8, 8, 8}, /* cost of storing integer registers */
1304 4, /* cost of reg,reg fld/fst */
1305 {12, 12, 28}, /* cost of loading fp registers
1306 in SFmode, DFmode and XFmode */
1307 {10, 10, 18}, /* cost of storing fp registers
1308 in SFmode, DFmode and XFmode */
1309 4, /* cost of moving MMX register */
1310 {12, 12}, /* cost of loading MMX registers
1311 in SImode and DImode */
1312 {10, 10}, /* cost of storing MMX registers
1313 in SImode and DImode */
1314 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1315 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1316 in 32,64,128,256 and 512-bit */
1317 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1318 in 32,64,128,256 and 512-bit */
1319 16, 20, /* SSE->integer and integer->SSE moves */
ecc3135a 1320 16, 20, /* mask->integer and integer->mask moves */
00cb3494
L
1321 {8, 8, 8}, /* cost of loading mask register
1322 in QImode, HImode, SImode. */
1323 {8, 8, 8}, /* cost of storing mask register
1324 in QImode, HImode, SImode. */
1325 2, /* cost of moving mask register. */
d321551c 1326 /* End of register allocator costs. */
72bb85f8 1327 },
d321551c 1328
64766e8d
JH
1329 COSTS_N_INSNS (1), /* cost of an add instruction */
1330 COSTS_N_INSNS (1), /* cost of a lea instruction */
1331 COSTS_N_INSNS (1), /* variable shift costs */
1332 COSTS_N_INSNS (1), /* constant shift costs */
1333 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1334 COSTS_N_INSNS (4), /* HI */
1335 COSTS_N_INSNS (4), /* SI */
1336 COSTS_N_INSNS (6), /* DI */
1337 COSTS_N_INSNS (6)}, /* other */
1338 0, /* cost of multiply per each bit set */
1339 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1340 COSTS_N_INSNS (35), /* HI */
1341 COSTS_N_INSNS (51), /* SI */
1342 COSTS_N_INSNS (83), /* DI */
1343 COSTS_N_INSNS (83)}, /* other */
1344 COSTS_N_INSNS (1), /* cost of movsx */
1345 COSTS_N_INSNS (1), /* cost of movzx */
1346 8, /* "large" insn */
1347 9, /* MOVE_RATIO */
25e22b19 1348 6, /* CLEAR_RATIO */
df41dbaf 1349 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
1350 in QImode, HImode and SImode.
1351 Relative to reg-reg move (2). */
df41dbaf 1352 {8, 8, 8}, /* cost of storing integer registers */
d321551c
L
1353 {12, 12, 10, 40, 60}, /* cost of loading SSE register
1354 in 32bit, 64bit, 128bit, 256bit and 512bit */
1355 {10, 10, 10, 40, 60}, /* cost of storing SSE register
1356 in 32bit, 64bit, 128bit, 256bit and 512bit */
b7167993 1357 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
b7167993 1358 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
d321551c
L
1359 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1360 16, /* cost of moving SSE register to integer. */
a4fe6139
JH
1361 12, 12, /* Gather load static, per_elt. */
1362 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1363 16, /* size of l1 cache. */
1364 2048, /* size of l2 cache. */
1365 64, /* size of prefetch block */
1366 /* New AMD processors never drop prefetches; if they cannot be performed
1367 immediately, they are queued. We set number of simultaneous prefetches
1368 to a large constant to reflect this (it probably is not a good idea not
1369 to limit number of prefetches at all, as their execution also takes some
1370 time). */
1371 100, /* number of parallel prefetches */
1372 2, /* Branch cost */
1373 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1374 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1375 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1376 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1377 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1378 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1379
c53c148c 1380 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1381 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1382 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1383 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1384 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1385 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1386 /* 9-24 */
1387 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1388 /* 9-27 */
1389 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1390 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1391 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d 1392 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
c727b835
RB
1393 bdver_memcpy,
1394 bdver_memset,
f6fd8f2b
JH
1395 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1396 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1397 "16:11:8", /* Loop alignment. */
1398 "16:8:8", /* Jump alignment. */
1399 "0:0:8", /* Label alignment. */
1400 "11", /* Func alignment. */
071e428c
HW
1401 4, /* Small unroll limit. */
1402 2, /* Small unroll factor. */
64766e8d
JH
1403};
1404
1405
1406/* ZNVER1 has optimized REP instruction for medium-sized blocks, but for
1407 very small blocks it is better to use a loop. For large blocks, libcall
1408 can do non-temporal accesses and beat inline considerably. */
/* NOTE(review): each {N, alg, flag} entry presumably selects ALG for blocks of up to N bytes, with -1 as the catch-all - confirm against the stringop_algs definition. */
1409static stringop_algs znver1_memcpy[2] = {
da346efd
ML
1410 /* 32-bit tuning. */
1411 {libcall, {{6, loop, false},
1412 {14, unrolled_loop, false},
dc65aba7 1413 {-1, libcall, false}}},
da346efd
ML
1414 /* 64-bit tuning. */
1415 {libcall, {{16, loop, false},
dc65aba7 1416 {128, rep_prefix_8_byte, false},
64766e8d
JH
1417 {-1, libcall, false}}}};
/* memset strategy lists for ZNVER1: element 0 is the 32-bit tuning, element 1 the 64-bit tuning. */
1418static stringop_algs znver1_memset[2] = {
da346efd
ML
1419 /* 32-bit tuning. */
1420 {libcall, {{8, loop, false},
1421 {24, unrolled_loop, false},
dc65aba7 1422 {128, rep_prefix_4_byte, false},
da346efd
ML
1423 {-1, libcall, false}}},
1424 /* 64-bit tuning. */
1425 {libcall, {{48, unrolled_loop, false},
dc65aba7 1426 {128, rep_prefix_8_byte, false},
64766e8d
JH
1427 {-1, libcall, false}}}};
/* Cost table named for znver1. The initializer is positional (no designators), so entries must stay in the field order of struct processor_costs. */
1428struct processor_costs znver1_cost = {
72bb85f8 1429 {
d321551c
L
1430 /* Start of register allocator costs. integer->integer move cost is 2. */
1431
1432 /* reg-reg moves are done by renaming and thus they are even cheaper than
1433 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1434 to doubles of latencies, we do not model this correctly. It does not
1435 seem to make practical difference to bump prices up even more. */
1436 6, /* cost for loading QImode using
1437 movzbl. */
1438 {6, 6, 6}, /* cost of loading integer registers
1439 in QImode, HImode and SImode.
1440 Relative to reg-reg move (2). */
1441 {8, 8, 8}, /* cost of storing integer
1442 registers. */
1443 2, /* cost of reg,reg fld/fst. */
1444 {6, 6, 16}, /* cost of loading fp registers
1445 in SFmode, DFmode and XFmode. */
1446 {8, 8, 16}, /* cost of storing fp registers
1447 in SFmode, DFmode and XFmode. */
1448 2, /* cost of moving MMX register. */
1449 {6, 6}, /* cost of loading MMX registers
1450 in SImode and DImode. */
1451 {8, 8}, /* cost of storing MMX registers
1452 in SImode and DImode. */
1453 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1454 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1455 in 32,64,128,256 and 512-bit. */
1456 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1457 in 32,64,128,256 and 512-bit. */
ecc3135a 1458 6, 6, /* SSE->integer and integer->SSE moves. */
1459 8, 8, /* mask->integer and integer->mask moves */
00cb3494
L
1460 {6, 6, 6}, /* cost of loading mask register
1461 in QImode, HImode, SImode. */
1462 {8, 8, 8}, /* cost of storing mask register
1463 in QImode, HImode, SImode. */
1464 2, /* cost of moving mask register. */
d321551c 1465 /* End of register allocator costs. */
72bb85f8 1466 },
d321551c 1467
64766e8d
JH
1468 COSTS_N_INSNS (1), /* cost of an add instruction. */
1469 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1470 COSTS_N_INSNS (1), /* variable shift costs. */
1471 COSTS_N_INSNS (1), /* constant shift costs. */
1472 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1473 COSTS_N_INSNS (3), /* HI. */
1474 COSTS_N_INSNS (3), /* SI. */
6065f444
JH
1475 COSTS_N_INSNS (3), /* DI. */
1476 COSTS_N_INSNS (3)}, /* other. */
64766e8d
JH
1477 0, /* cost of multiply per each bit
1478 set. */
6065f444
JH
1479 /* Depending on parameters, idiv can get faster on ryzen. This is an upper
1480 bound. */
1481 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1482 COSTS_N_INSNS (22), /* HI. */
1483 COSTS_N_INSNS (30), /* SI. */
1484 COSTS_N_INSNS (45), /* DI. */
1485 COSTS_N_INSNS (45)}, /* other. */
64766e8d
JH
1486 COSTS_N_INSNS (1), /* cost of movsx. */
1487 COSTS_N_INSNS (1), /* cost of movzx. */
1488 8, /* "large" insn. */
1489 9, /* MOVE_RATIO. */
25e22b19 1490 6, /* CLEAR_RATIO */
01118373 1491 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
1492 in QImode, HImode and SImode.
1493 Relative to reg-reg move (2). */
01118373 1494 {8, 8, 8}, /* cost of storing integer
64766e8d 1495 registers. */
d321551c
L
1496 {6, 6, 6, 12, 24}, /* cost of loading SSE register
1497 in 32bit, 64bit, 128bit, 256bit and 512bit */
1498 {8, 8, 8, 16, 32}, /* cost of storing SSE register
1499 in 32bit, 64bit, 128bit, 256bit and 512bit */
b7167993 1500 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
b7167993 1501 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
d321551c
L
1502 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1503 6, /* cost of moving SSE register to integer. */
a4fe6139
JH
1504 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1505 throughput 12. Approx 9 uops do not depend on vector size and every load
1506 is 7 uops. NOTE(review): VGATHERDPD is named twice with different figures; the second is presumably VGATHERDPS (cf. the znver3_cost comment) or a wider VGATHERDPD form - confirm. */
1507 18, 8, /* Gather load static, per_elt. */
1508 18, 10, /* Gather store static, per_elt. */
64766e8d
JH
1509 32, /* size of l1 cache. */
1510 512, /* size of l2 cache. */
1511 64, /* size of prefetch block. */
1512 /* New AMD processors never drop prefetches; if they cannot be performed
1513 immediately, they are queued. We set number of simultaneous prefetches
1514 to a large constant to reflect this (it probably is not a good idea not
1515 to limit number of prefetches at all, as their execution also takes some
1516 time). */
1517 100, /* number of parallel prefetches. */
1518 3, /* Branch cost. */
6065f444
JH
1519 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1520 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1521 /* Latency of fdiv is 8-15. */
1522 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1523 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1524 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1525 /* Latency of fsqrt is 4-10. */
1526 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1527
c53c148c 1528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1530 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1532 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1533 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1534 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1535 /* 9-13 */
1536 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1537 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1538 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
64766e8d
JH
1539 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1540 and it can execute 2 integer additions and 2 multiplications thus
1541 reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest
1542 that 4 works better than 6 probably due to register pressure.
1543
1544 Integer vector operations are taken by FP unit and execute 3 vector
1545 plus/minus operations per cycle but only one multiply. This is adjusted
1546 in ix86_reassociation_width. */
1547 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1548 znver1_memcpy,
1549 znver1_memset,
f6fd8f2b
JH
1550 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1551 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1552 "16", /* Loop alignment. */
1553 "16", /* Jump alignment. */
1554 "0:0:8", /* Label alignment. */
1555 "16", /* Func alignment. */
071e428c
HW
1556 4, /* Small unroll limit. */
1557 2, /* Small unroll factor. */
64766e8d
JH
1558};
1559
2901f42f
VK
1560/* ZNVER2 has optimized REP instruction for medium-sized blocks, but for
1561 very small blocks it is better to use a loop. For large blocks, libcall
1562 can do non-temporal accesses and beat inline considerably. */
/* NOTE(review): each {N, alg, flag} entry presumably selects ALG for blocks of up to N bytes, with -1 as the catch-all - confirm against the stringop_algs definition. */
1563static stringop_algs znver2_memcpy[2] = {
da346efd
ML
1564 /* 32-bit tuning. */
1565 {libcall, {{6, loop, false},
1566 {14, unrolled_loop, false},
dc65aba7 1567 {-1, libcall, false}}},
da346efd
ML
1568 /* 64-bit tuning. */
1569 {libcall, {{16, loop, false},
1570 {64, rep_prefix_4_byte, false},
2901f42f
VK
1571 {-1, libcall, false}}}};
/* memset strategy lists for ZNVER2: element 0 is the 32-bit tuning, element 1 the 64-bit tuning. */
1572static stringop_algs znver2_memset[2] = {
da346efd
ML
1573 /* 32-bit tuning. */
1574 {libcall, {{8, loop, false},
1575 {24, unrolled_loop, false},
dc65aba7 1576 {128, rep_prefix_4_byte, false},
da346efd
ML
1577 {-1, libcall, false}}},
1578 /* 64-bit tuning. */
1579 {libcall, {{24, rep_prefix_4_byte, false},
1580 {128, rep_prefix_8_byte, false},
2901f42f
VK
1581 {-1, libcall, false}}}};
1582
/* Cost table named for znver2. The initializer is positional (no designators), so entries must stay in the field order of struct processor_costs. */
1583struct processor_costs znver2_cost = {
72bb85f8 1584 {
d321551c 1585 /* Start of register allocator costs. integer->integer move cost is 2. */
2901f42f 1586
5b32a181
JH
1587 /* reg-reg moves are done by renaming and thus they are even cheaper than
1588 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1589 to doubles of latencies, we do not model this correctly. It does not
1590 seem to make practical difference to bump prices up even more. */
1591 6, /* cost for loading QImode using
1592 movzbl. */
1593 {6, 6, 6}, /* cost of loading integer registers
1594 in QImode, HImode and SImode.
1595 Relative to reg-reg move (2). */
1596 {8, 8, 8}, /* cost of storing integer
1597 registers. */
1598 2, /* cost of reg,reg fld/fst. */
1599 {6, 6, 16}, /* cost of loading fp registers
1600 in SFmode, DFmode and XFmode. */
1601 {8, 8, 16}, /* cost of storing fp registers
1602 in SFmode, DFmode and XFmode. */
1603 2, /* cost of moving MMX register. */
1604 {6, 6}, /* cost of loading MMX registers
1605 in SImode and DImode. */
1606 {8, 8}, /* cost of storing MMX registers
1607 in SImode and DImode. */
1608 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1609 register. */
1610 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1611 in 32,64,128,256 and 512-bit. */
1612 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1613 in 32,64,128,256 and 512-bit. */
1614 6, 6, /* SSE->integer and integer->SSE
1615 moves. */
1616 8, 8, /* mask->integer and integer->mask moves */
1617 {6, 6, 6}, /* cost of loading mask register
1618 in QImode, HImode, SImode. */
1619 {8, 8, 8}, /* cost of storing mask register
1620 in QImode, HImode, SImode. */
1621 2, /* cost of moving mask register. */
1622 /* End of register allocator costs. */
1623 },
1624
1625 COSTS_N_INSNS (1), /* cost of an add instruction. */
1626 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1627 COSTS_N_INSNS (1), /* variable shift costs. */
1628 COSTS_N_INSNS (1), /* constant shift costs. */
1629 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1630 COSTS_N_INSNS (3), /* HI. */
1631 COSTS_N_INSNS (3), /* SI. */
1632 COSTS_N_INSNS (3), /* DI. */
1633 COSTS_N_INSNS (3)}, /* other. */
1634 0, /* cost of multiply per each bit
1635 set. */
1636 /* Depending on parameters, idiv can get faster on ryzen. This is an upper
1637 bound. */
1638 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1639 COSTS_N_INSNS (22), /* HI. */
1640 COSTS_N_INSNS (30), /* SI. */
1641 COSTS_N_INSNS (45), /* DI. */
1642 COSTS_N_INSNS (45)}, /* other. */
1643 COSTS_N_INSNS (1), /* cost of movsx. */
1644 COSTS_N_INSNS (1), /* cost of movzx. */
1645 8, /* "large" insn. */
1646 9, /* MOVE_RATIO. */
1647 6, /* CLEAR_RATIO */
1648 {6, 6, 6}, /* cost of loading integer registers
1649 in QImode, HImode and SImode.
1650 Relative to reg-reg move (2). */
1651 {8, 8, 8}, /* cost of storing integer
1652 registers. */
1653 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1654 in 32bit, 64bit, 128bit, 256bit and 512bit */
1655 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1656 in 32bit, 64bit, 128bit, 256bit and 512bit */
1657 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1658 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1659 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1660 register. */
1661 6, /* cost of moving SSE register to integer. */
1662 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1663 throughput 12. Approx 9 uops do not depend on vector size and every load
1664 is 7 uops. NOTE(review): VGATHERDPD is named twice with different figures; the second is presumably VGATHERDPS (cf. the znver3_cost comment) or a wider VGATHERDPD form - confirm. */
1665 18, 8, /* Gather load static, per_elt. */
1666 18, 10, /* Gather store static, per_elt. */
1667 32, /* size of l1 cache. */
1668 512, /* size of l2 cache. */
1669 64, /* size of prefetch block. */
1670 /* New AMD processors never drop prefetches; if they cannot be performed
1671 immediately, they are queued. We set number of simultaneous prefetches
1672 to a large constant to reflect this (it probably is not a good idea not
1673 to limit number of prefetches at all, as their execution also takes some
1674 time). */
1675 100, /* number of parallel prefetches. */
1676 3, /* Branch cost. */
1677 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1678 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1679 /* Latency of fdiv is 8-15. */
1680 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1681 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1682 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1683 /* Latency of fsqrt is 4-10. */
1684 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1685
1686 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1687 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1688 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1689 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1690 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1691 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1692 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1693 /* 9-13. */
1694 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1695 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1696 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1697 /* Zen can execute 4 integer operations per cycle. FP operations
1698 take 3 cycles and it can execute 2 integer additions and 2
1699 multiplications thus reassociation may make sense up to width of 6.
1700 SPEC2k6 benchmarks suggest
1701 that 4 works better than 6 probably due to register pressure.
1702
1703 Integer vector operations are taken by FP unit and execute 3 vector
1704 plus/minus operations per cycle but only one multiply. This is adjusted
1705 in ix86_reassociation_width. */
1706 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1707 znver2_memcpy,
1708 znver2_memset,
1709 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1710 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1711 "16", /* Loop alignment. */
1712 "16", /* Jump alignment. */
1713 "0:0:8", /* Label alignment. */
1714 "16", /* Func alignment. */
071e428c
HW
1715 4, /* Small unroll limit. */
1716 2, /* Small unroll factor. */
5b32a181
JH
1717};
1718
/* Cost table named for znver3. The initializer is positional (no designators), so entries must stay in the field order of struct processor_costs. */
1719struct processor_costs znver3_cost = {
1720 {
1721 /* Start of register allocator costs. integer->integer move cost is 2. */
1722
bf3b532b
TJ
1723 /* reg-reg moves are done by renaming and thus they are even cheaper than
1724 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1725 to doubles of latencies, we do not model this correctly. It does not
1726 seem to make practical difference to bump prices up even more. */
1727 6, /* cost for loading QImode using
1728 movzbl. */
1729 {6, 6, 6}, /* cost of loading integer registers
1730 in QImode, HImode and SImode.
1731 Relative to reg-reg move (2). */
1732 {8, 8, 8}, /* cost of storing integer
1733 registers. */
1734 2, /* cost of reg,reg fld/fst. */
1735 {6, 6, 16}, /* cost of loading fp registers
1736 in SFmode, DFmode and XFmode. */
1737 {8, 8, 16}, /* cost of storing fp registers
1738 in SFmode, DFmode and XFmode. */
1739 2, /* cost of moving MMX register. */
1740 {6, 6}, /* cost of loading MMX registers
1741 in SImode and DImode. */
1742 {8, 8}, /* cost of storing MMX registers
1743 in SImode and DImode. */
1744 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1745 register. */
1746 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1747 in 32,64,128,256 and 512-bit. */
1748 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1749 in 32,64,128,256 and 512-bit. */
1750 6, 6, /* SSE->integer and integer->SSE
1751 moves. */
1752 8, 8, /* mask->integer and integer->mask moves */
1753 {6, 6, 6}, /* cost of loading mask register
1754 in QImode, HImode, SImode. */
1755 {8, 8, 8}, /* cost of storing mask register
1756 in QImode, HImode, SImode. */
1757 2, /* cost of moving mask register. */
1758 /* End of register allocator costs. */
1759 },
1760
1761 COSTS_N_INSNS (1), /* cost of an add instruction. */
1762 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1763 COSTS_N_INSNS (1), /* variable shift costs. */
1764 COSTS_N_INSNS (1), /* constant shift costs. */
1765 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1766 COSTS_N_INSNS (3), /* HI. */
1767 COSTS_N_INSNS (3), /* SI. */
1768 COSTS_N_INSNS (3), /* DI. */
1769 COSTS_N_INSNS (3)}, /* other. */
1770 0, /* cost of multiply per each bit
1771 set. */
1772 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
1773 COSTS_N_INSNS (10), /* HI. */
1774 COSTS_N_INSNS (12), /* SI. */
1775 COSTS_N_INSNS (17), /* DI. */
1776 COSTS_N_INSNS (17)}, /* other. */
1777 COSTS_N_INSNS (1), /* cost of movsx. */
1778 COSTS_N_INSNS (1), /* cost of movzx. */
1779 8, /* "large" insn. */
1780 9, /* MOVE_RATIO. */
1781 6, /* CLEAR_RATIO */
1782 {6, 6, 6}, /* cost of loading integer registers
1783 in QImode, HImode and SImode.
1784 Relative to reg-reg move (2). */
1785 {8, 8, 8}, /* cost of storing integer
1786 registers. */
1787 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1788 in 32bit, 64bit, 128bit, 256bit and 512bit */
1789 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1790 in 32bit, 64bit, 128bit, 256bit and 512bit */
1791 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1792 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1793 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1794 register. */
1795 6, /* cost of moving SSE register to integer. */
1796 /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1797 throughput 9. Approx 7 uops do not depend on vector size and every load
1798 is 4 uops. */
1799 14, 8, /* Gather load static, per_elt. */
1800 14, 10, /* Gather store static, per_elt. */
1801 32, /* size of l1 cache. */
1802 512, /* size of l2 cache. */
1803 64, /* size of prefetch block. */
1804 /* New AMD processors never drop prefetches; if they cannot be performed
1805 immediately, they are queued. We set number of simultaneous prefetches
1806 to a large constant to reflect this (it probably is not a good idea not
1807 to limit number of prefetches at all, as their execution also takes some
1808 time). */
1809 100, /* number of parallel prefetches. */
1810 3, /* Branch cost. */
1811 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1812 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1813 /* Latency of fdiv is 8-15. */
1814 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1815 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1816 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1817 /* Latency of fsqrt is 4-10. */
1818 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1819
1820 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1821 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1822 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1823 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1824 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1825 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1826 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1827 /* 9-13. */
1828 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1829 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1830 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1831 /* Zen can execute 4 integer operations per cycle. FP operations
1832 take 3 cycles and it can execute 2 integer additions and 2
1833 multiplications thus reassociation may make sense up to width of 6.
1834 SPEC2k6 benchmarks suggest
1835 that 4 works better than 6 probably due to register pressure.
1836
1837 Integer vector operations are taken by FP unit and execute 3 vector
1838 plus/minus operations per cycle but only one multiply. This is adjusted
1839 in ix86_reassociation_width. */
1840 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1841 znver2_memcpy, /* This table uses the znver2 memcpy strategy table. */
1842 znver2_memset, /* This table uses the znver2 memset strategy table. */
1843 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1844 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1845 "16", /* Loop alignment. */
1846 "16", /* Jump alignment. */
1847 "0:0:8", /* Label alignment. */
1848 "16", /* Func alignment. */
071e428c
HW
1849 4, /* Small unroll limit. */
1850 2, /* Small unroll factor. */
bf3b532b
TJ
1851};
1852
1853/* This table was described as a replica of the znver3_cost table, but several entries (e.g. fp and SSE load/store costs, divide costs, l2 cache size) differ from it. The initializer is positional, so entries must stay in the field order of struct processor_costs. */
1854struct processor_costs znver4_cost = {
1855 {
1856 /* Start of register allocator costs. integer->integer move cost is 2. */
1857
2901f42f
VK
1858 /* reg-reg moves are done by renaming and thus they are even cheaper than
1859 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1860 to doubles of latencies, we do not model this correctly. It does not
1861 seem to make practical difference to bump prices up even more. */
1862 6, /* cost for loading QImode using
1863 movzbl. */
1864 {6, 6, 6}, /* cost of loading integer registers
1865 in QImode, HImode and SImode.
1866 Relative to reg-reg move (2). */
1867 {8, 8, 8}, /* cost of storing integer
1868 registers. */
1869 2, /* cost of reg,reg fld/fst. */
bbe04bad 1870 {14, 14, 17}, /* cost of loading fp registers
2901f42f 1871 in SFmode, DFmode and XFmode. */
bbe04bad 1872 {12, 12, 16}, /* cost of storing fp registers
2901f42f
VK
1873 in SFmode, DFmode and XFmode. */
1874 2, /* cost of moving MMX register. */
1875 {6, 6}, /* cost of loading MMX registers
1876 in SImode and DImode. */
1877 {8, 8}, /* cost of storing MMX registers
1878 in SImode and DImode. */
187dd65d 1879 2, 2, 3, /* cost of moving XMM,YMM,ZMM
2901f42f 1880 register. */
bbe04bad 1881 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2901f42f 1882 in 32,64,128,256 and 512-bit. */
bbe04bad 1883 {8, 8, 8, 12, 12}, /* cost of storing SSE registers
2901f42f 1884 in 32,64,128,256 and 512-bit. */
bbe04bad 1885 6, 8, /* SSE->integer and integer->SSE
2901f42f 1886 moves. */
bbe04bad 1887 8, 8, /* mask->integer and integer->mask moves */
00cb3494
L
1888 {6, 6, 6}, /* cost of loading mask register
1889 in QImode, HImode, SImode. */
1890 {8, 8, 8}, /* cost of storing mask register
1891 in QImode, HImode, SImode. */
1892 2, /* cost of moving mask register. */
d321551c 1893 /* End of register allocator costs. */
72bb85f8 1894 },
d321551c
L
1895
1896 COSTS_N_INSNS (1), /* cost of an add instruction. */
bbe04bad 1897 /* TODO: Lea with 3 components has cost 2. */
d321551c
L
1898 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1899 COSTS_N_INSNS (1), /* variable shift costs. */
1900 COSTS_N_INSNS (1), /* constant shift costs. */
1901 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1902 COSTS_N_INSNS (3), /* HI. */
1903 COSTS_N_INSNS (3), /* SI. */
1904 COSTS_N_INSNS (3), /* DI. */
1905 COSTS_N_INSNS (3)}, /* other. */
1906 0, /* cost of multiply per each bit
1907 set. */
bbe04bad
JH
1908 {COSTS_N_INSNS (12), /* cost of a divide/mod for QI. */
1909 COSTS_N_INSNS (13), /* HI. */
1910 COSTS_N_INSNS (13), /* SI. */
1911 COSTS_N_INSNS (18), /* DI. */
1912 COSTS_N_INSNS (18)}, /* other. */
d321551c
L
1913 COSTS_N_INSNS (1), /* cost of movsx. */
1914 COSTS_N_INSNS (1), /* cost of movzx. */
1915 8, /* "large" insn. */
1916 9, /* MOVE_RATIO. */
25e22b19 1917 6, /* CLEAR_RATIO */
d321551c
L
1918 {6, 6, 6}, /* cost of loading integer registers
1919 in QImode, HImode and SImode.
1920 Relative to reg-reg move (2). */
1921 {8, 8, 8}, /* cost of storing integer
1922 registers. */
bbe04bad 1923 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
d321551c 1924 in 32bit, 64bit, 128bit, 256bit and 512bit */
bbe04bad 1925 {8, 8, 8, 12, 12}, /* cost of storing SSE register
d321551c 1926 in 32bit, 64bit, 128bit, 256bit and 512bit */
1e3aa9c9
RB
1927 {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
1928 {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
bbe04bad 1929 2, 2, 2, /* cost of moving XMM,YMM,ZMM
d321551c
L
1930 register. */
1931 6, /* cost of moving SSE register to integer. */
bbe04bad
JH
1932 /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1933 throughput 5. Approx 7 uops do not depend on vector size and every load
1934 is 5 uops. */
1935 14, 10, /* Gather load static, per_elt. */
1936 14, 20, /* Gather store static, per_elt. */
2901f42f 1937 32, /* size of l1 cache. */
bbe04bad 1938 1024, /* size of l2 cache. */
2901f42f
VK
1939 64, /* size of prefetch block. */
1940 /* New AMD processors never drop prefetches; if they cannot be performed
1941 immediately, they are queued. We set number of simultaneous prefetches
1942 to a large constant to reflect this (it probably is not a good idea not
1943 to limit number of prefetches at all, as their execution also takes some
1944 time). */
1945 100, /* number of parallel prefetches. */
1946 3, /* Branch cost. */
bbe04bad
JH
1947 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
1948 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2901f42f
VK
1949 /* Latency of fdiv is 8-15. */
1950 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1951 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1952 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1953 /* Latency of fsqrt is 4-10. NOTE(review): the cost used below is 25, outside this range - confirm which is current. */
bbe04bad 1954 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
2901f42f
VK
1955
1956 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1957 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1958 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
187dd65d 1959 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
bbe04bad
JH
1960 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1961 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1962 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2901f42f
VK
1963 /* 9-13. */
1964 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
bbe04bad
JH
1965 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1966 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
2901f42f
VK
1967 /* Zen can execute 4 integer operations per cycle. FP operations
1968 take 3 cycles and it can execute 2 integer additions and 2
1969 multiplications thus reassociation may make sense up to width of 6.
1970 SPEC2k6 benchmarks suggest
1971 that 4 works better than 6 probably due to register pressure.
1972
1973 Integer vector operations are taken by FP unit and execute 3 vector
1974 plus/minus operations per cycle but only one multiply. This is adjusted
1975 in ix86_reassociation_width. */
1976 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1977 znver2_memcpy, /* This table uses the znver2 memcpy strategy table. */
1978 znver2_memset, /* This table uses the znver2 memset strategy table. */
1979 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1980 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1981 "16", /* Loop alignment. */
1982 "16", /* Jump alignment. */
d0aa0af9
JH
1983 "0:0:8", /* Label alignment. */
1984 "16", /* Func alignment. */
1985 4, /* Small unroll limit. */
1986 2, /* Small unroll factor. */
1987};
1988
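1989/* This table was described as a replica of the znver4_cost table, but some entries (e.g. divide costs, "large" insn size, l1 cache size) differ from it. */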
1990struct processor_costs znver5_cost = {
1991 {
1992 /* Start of register allocator costs. integer->integer move cost is 2. */
1993
1994 /* reg-reg moves are done by renaming and thus they are even cheaper than
1995 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1996 to doubles of latencies, we do not model this correctly. It does not
1997 seem to make practical difference to bump prices up even more. */
1998 6, /* cost for loading QImode using
1999 movzbl. */
2000 {6, 6, 6}, /* cost of loading integer registers
2001 in QImode, HImode and SImode.
2002 Relative to reg-reg move (2). */
2003 {8, 8, 8}, /* cost of storing integer
2004 registers. */
2005 2, /* cost of reg,reg fld/fst. */
2006 {14, 14, 17}, /* cost of loading fp registers
2007 in SFmode, DFmode and XFmode. */
2008 {12, 12, 16}, /* cost of storing fp registers
2009 in SFmode, DFmode and XFmode. */
2010 2, /* cost of moving MMX register. */
2011 {6, 6}, /* cost of loading MMX registers
2012 in SImode and DImode. */
2013 {8, 8}, /* cost of storing MMX registers
2014 in SImode and DImode. */
2015 2, 2, 3, /* cost of moving XMM,YMM,ZMM
2016 register. */
2017 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2018 in 32,64,128,256 and 512-bit. */
2019 {8, 8, 8, 12, 12}, /* cost of storing SSE registers
2020 in 32,64,128,256 and 512-bit. */
2021 6, 8, /* SSE->integer and integer->SSE
2022 moves. */
2023 8, 8, /* mask->integer and integer->mask moves */
2024 {6, 6, 6}, /* cost of loading mask register
2025 in QImode, HImode, SImode. */
2026 {8, 8, 8}, /* cost if storing mask register
2027 in QImode, HImode, SImode. */
2028 2, /* cost of moving mask register. */
2029 /* End of register allocator costs. */
2030 },
2031
2032 COSTS_N_INSNS (1), /* cost of an add instruction. */
2033 /* TODO: Lea with 3 components has cost 2. */
2034 COSTS_N_INSNS (1), /* cost of a lea instruction. */
2035 COSTS_N_INSNS (1), /* variable shift costs. */
2036 COSTS_N_INSNS (1), /* constant shift costs. */
4292297a 2037 /* mul has latency 3, executes in 3 integer units. */
d0aa0af9
JH
2038 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
2039 COSTS_N_INSNS (3), /* HI. */
2040 COSTS_N_INSNS (3), /* SI. */
2041 COSTS_N_INSNS (3), /* DI. */
2042 COSTS_N_INSNS (3)}, /* other. */
2043 0, /* cost of multiply per each bit
2044 set. */
4292297a
JH
2045 /* integer divide has latency of 8 cycles
2046 plus 1 for every 9 bits of quotient. */
d0aa0af9
JH
2047 {COSTS_N_INSNS (10), /* cost of a divide/mod for QI. */
2048 COSTS_N_INSNS (11), /* HI. */
2049 COSTS_N_INSNS (13), /* SI. */
2050 COSTS_N_INSNS (16), /* DI. */
2051 COSTS_N_INSNS (16)}, /* other. */
2052 COSTS_N_INSNS (1), /* cost of movsx. */
2053 COSTS_N_INSNS (1), /* cost of movzx. */
4292297a 2054 15, /* "large" insn. */
d0aa0af9
JH
2055 9, /* MOVE_RATIO. */
2056 6, /* CLEAR_RATIO */
2057 {6, 6, 6}, /* cost of loading integer registers
2058 in QImode, HImode and SImode.
2059 Relative to reg-reg move (2). */
2060 {8, 8, 8}, /* cost of storing integer
2061 registers. */
2062 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2063 in 32bit, 64bit, 128bit, 256bit and 512bit */
2064 {8, 8, 8, 12, 12}, /* cost of storing SSE register
2065 in 32bit, 64bit, 128bit, 256bit and 512bit */
89639379
RB
2066 {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
2067 {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
d0aa0af9
JH
2068 2, 2, 2, /* cost of moving XMM,YMM,ZMM
2069 register. */
2070 6, /* cost of moving SSE register to integer. */
4292297a
JH
2071
2072 /* TODO: gather and scatter instructions are currently disabled in
2073 x86-tune.def. In some cases they are however a win, see PR116582
2074 We however need good cost model for them. */
d0aa0af9
JH
2075 14, 10, /* Gather load static, per_elt. */
2076 14, 20, /* Gather store static, per_elt. */
4292297a 2077 48, /* size of l1 cache. */
d0aa0af9
JH
2078 1024, /* size of l2 cache. */
2079 64, /* size of prefetch block. */
2080 /* New AMD processors never drop prefetches; if they cannot be performed
2081 immediately, they are queued. We set number of simultaneous prefetches
2082 to a large constant to reflect this (it probably is not a good idea not
2083 to limit number of prefetches at all, as their execution also takes some
2084 time). */
2085 100, /* number of parallel prefetches. */
2086 3, /* Branch cost. */
4292297a
JH
2087 /* TODO x87 latencies are still based on znver4.
2088 Probably not very important these days. */
d0aa0af9
JH
2089 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
2090 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2091 /* Latency of fdiv is 8-15. */
2092 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
2093 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2094 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2095 /* Latency of fsqrt is 4-10. */
2096 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
2097
4292297a 2098 /* SSE instructions have typical throughput 4 and latency 1. */
d0aa0af9 2099 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
4292297a
JH
2100 /* ADDSS has throughput 2 and latency 2
2101 (in some cases when source is another addition). */
2102 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
2103 /* MULSS has throughput 2 and latency 3. */
d0aa0af9
JH
2104 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
2105 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
4292297a 2106 /* FMA had throughput 2 and latency 4. */
d0aa0af9
JH
2107 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2108 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
4292297a 2109 /* DIVSS has throughput 0.4 and latency 10. */
d0aa0af9 2110 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
4292297a 2111 /* DIVSD has throughput 0.25 and latency 13. */
d0aa0af9 2112 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
4292297a 2113 /* SQRTSS has throughput 0.22 and latency 14. */
d0aa0af9 2114 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
4292297a 2115 /* SQRTSD has throughput 0.13 and latency 20. */
d0aa0af9 2116 COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */
f0ab3de6
JH
2117 /* Zen5 can execute:
2118 - integer ops: 6 per cycle, at most 3 multiplications.
2119 latency 1 for additions, 3 for multiplications (pipelined)
2120
2121 Setting width of 9 for multiplication is probably excessive
2122 for register pressure.
2123 - fp ops: 2 additions per cycle, latency 2-3
2124 2 multiplications per cycle, latency 3
2125 - vector integer ops: 4 additions, latency 1
2126 2 multiplications, latency 4
2127 We increase width to 6 for multiplications
2128 in ix86_reassociation_width. */
2129 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */
d0aa0af9
JH
2130 znver2_memcpy,
2131 znver2_memset,
2132 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2133 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2134 "16", /* Loop alignment. */
2135 "16", /* Jump alignment. */
2901f42f
VK
2136 "0:0:8", /* Label alignment. */
2137 "16", /* Func alignment. */
071e428c
HW
2138 4, /* Small unroll limit. */
2139 2, /* Small unroll factor. */
2901f42f
VK
2140};
2141
c234d831
UB
2142/* skylake_cost should produce code tuned for Skylake family of CPUs. */
2143static stringop_algs skylake_memcpy[2] = {
a32452a5
L
2144 {libcall,
2145 {{256, rep_prefix_1_byte, true},
2146 {256, loop, false},
2147 {-1, libcall, false}}},
2148 {libcall,
2149 {{256, rep_prefix_1_byte, true},
2150 {256, loop, false},
2151 {-1, libcall, false}}}};
c234d831
UB
2152
2153static stringop_algs skylake_memset[2] = {
a32452a5
L
2154 {libcall,
2155 {{256, rep_prefix_1_byte, true},
2156 {256, loop, false},
2157 {-1, libcall, false}}},
2158 {libcall,
2159 {{256, rep_prefix_1_byte, true},
2160 {256, loop, false},
2161 {-1, libcall, false}}}};
c234d831
UB
2162
2163static const
2164struct processor_costs skylake_cost = {
72bb85f8 2165 {
d321551c
L
2166 /* Start of register allocator costs. integer->integer move cost is 2. */
2167 6, /* cost for loading QImode using movzbl */
2168 {4, 4, 4}, /* cost of loading integer registers
2169 in QImode, HImode and SImode.
2170 Relative to reg-reg move (2). */
7706f2f3 2171 {6, 6, 6}, /* cost of storing integer registers */
d321551c
L
2172 2, /* cost of reg,reg fld/fst */
2173 {6, 6, 8}, /* cost of loading fp registers
2174 in SFmode, DFmode and XFmode */
2175 {6, 6, 10}, /* cost of storing fp registers
2176 in SFmode, DFmode and XFmode */
2177 2, /* cost of moving MMX register */
2178 {6, 6}, /* cost of loading MMX registers
2179 in SImode and DImode */
2180 {6, 6}, /* cost of storing MMX registers
2181 in SImode and DImode */
2182 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2183 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2184 in 32,64,128,256 and 512-bit */
2185 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2186 in 32,64,128,256 and 512-bit */
ecc3135a 2187 6, 6, /* SSE->integer and integer->SSE moves */
657612fb 2188 6, 6, /* mask->integer and integer->mask moves */
ecc3135a 2189 {8, 8, 8}, /* cost of loading mask register
00cb3494 2190 in QImode, HImode, SImode. */
ecc3135a 2191 {6, 6, 6}, /* cost if storing mask register
00cb3494 2192 in QImode, HImode, SImode. */
16516644 2193 3, /* cost of moving mask register. */
d321551c 2194 /* End of register allocator costs. */
72bb85f8 2195 },
d321551c 2196
c234d831
UB
2197 COSTS_N_INSNS (1), /* cost of an add instruction */
2198 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
2199 COSTS_N_INSNS (1), /* variable shift costs */
2200 COSTS_N_INSNS (1), /* constant shift costs */
2201 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
bc00de07 2202 COSTS_N_INSNS (3), /* HI */
c234d831 2203 COSTS_N_INSNS (3), /* SI */
a2ef9558
MT
2204 COSTS_N_INSNS (3), /* DI */
2205 COSTS_N_INSNS (3)}, /* other */
c234d831 2206 0, /* cost of multiply per each bit set */
02308bd3
MT
2207 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2208 model is not realistic. We compensate by increasing the latencies a bit. */
2209 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2210 COSTS_N_INSNS (11), /* HI */
2211 COSTS_N_INSNS (14), /* SI */
c234d831
UB
2212 COSTS_N_INSNS (76), /* DI */
2213 COSTS_N_INSNS (76)}, /* other */
2214 COSTS_N_INSNS (1), /* cost of movsx */
2215 COSTS_N_INSNS (0), /* cost of movzx */
2216 8, /* "large" insn */
2217 17, /* MOVE_RATIO */
a32452a5 2218 17, /* CLEAR_RATIO */
269edf4e 2219 {6, 6, 6}, /* cost of loading integer registers
c234d831
UB
2220 in QImode, HImode and SImode.
2221 Relative to reg-reg move (2). */
269edf4e
CL
2222 {8, 8, 8}, /* cost of storing integer registers */
2223 {8, 8, 8, 8, 16}, /* cost of loading SSE register
d321551c 2224 in 32bit, 64bit, 128bit, 256bit and 512bit */
d3152981 2225 {8, 8, 8, 8, 16}, /* cost of storing SSE register
d321551c 2226 in 32bit, 64bit, 128bit, 256bit and 512bit */
269edf4e 2227 {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
c234d831 2228 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
d321551c 2229 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
83858ba1 2230 6, /* cost of moving SSE register to integer. */
c234d831
UB
2231 20, 8, /* Gather load static, per_elt. */
2232 22, 10, /* Gather store static, per_elt. */
2233 64, /* size of l1 cache. */
2234 512, /* size of l2 cache. */
2235 64, /* size of prefetch block */
2236 6, /* number of parallel prefetches */
2237 3, /* Branch cost */
2238 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2240 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2241 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2242 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2243 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2244
2245 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2246 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2247 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2248 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2249 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2250 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2251 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2252 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2253 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2254 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2255 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2256 skylake_memcpy,
2257 skylake_memset,
2258 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2259 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2260 "16:11:8", /* Loop alignment. */
2261 "16:11:8", /* Jump alignment. */
2262 "0:0:8", /* Label alignment. */
2263 "16", /* Func alignment. */
071e428c
HW
2264 4, /* Small unroll limit. */
2265 2, /* Small unroll factor. */
c234d831 2266};
bf24f4ec
L
2267
2268/* icelake_cost should produce code tuned for Icelake family of CPUs.
2269 NB: rep_prefix_1_byte is used only for known size. */
2270
2271static stringop_algs icelake_memcpy[2] = {
2272 {libcall,
2273 {{256, rep_prefix_1_byte, true},
2274 {256, loop, false},
2275 {-1, libcall, false}}},
2276 {libcall,
2277 {{256, rep_prefix_1_byte, true},
2278 {256, loop, false},
2279 {-1, libcall, false}}}};
2280
2281static stringop_algs icelake_memset[2] = {
2282 {libcall,
2283 {{256, rep_prefix_1_byte, true},
2284 {256, loop, false},
2285 {-1, libcall, false}}},
2286 {libcall,
2287 {{256, rep_prefix_1_byte, true},
2288 {256, loop, false},
2289 {-1, libcall, false}}}};
2290
2291static const
2292struct processor_costs icelake_cost = {
2293 {
2294 /* Start of register allocator costs. integer->integer move cost is 2. */
2295 6, /* cost for loading QImode using movzbl */
2296 {4, 4, 4}, /* cost of loading integer registers
2297 in QImode, HImode and SImode.
2298 Relative to reg-reg move (2). */
2299 {6, 6, 6}, /* cost of storing integer registers */
2300 2, /* cost of reg,reg fld/fst */
2301 {6, 6, 8}, /* cost of loading fp registers
2302 in SFmode, DFmode and XFmode */
2303 {6, 6, 10}, /* cost of storing fp registers
2304 in SFmode, DFmode and XFmode */
2305 2, /* cost of moving MMX register */
2306 {6, 6}, /* cost of loading MMX registers
2307 in SImode and DImode */
2308 {6, 6}, /* cost of storing MMX registers
2309 in SImode and DImode */
2310 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2311 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2312 in 32,64,128,256 and 512-bit */
2313 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2314 in 32,64,128,256 and 512-bit */
2315 6, 6, /* SSE->integer and integer->SSE moves */
657612fb 2316 6, 6, /* mask->integer and integer->mask moves */
bf24f4ec
L
2317 {8, 8, 8}, /* cost of loading mask register
2318 in QImode, HImode, SImode. */
2319 {6, 6, 6}, /* cost if storing mask register
2320 in QImode, HImode, SImode. */
2321 3, /* cost of moving mask register. */
2322 /* End of register allocator costs. */
2323 },
2324
2325 COSTS_N_INSNS (1), /* cost of an add instruction */
2326 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
2327 COSTS_N_INSNS (1), /* variable shift costs */
2328 COSTS_N_INSNS (1), /* constant shift costs */
2329 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
bc00de07 2330 COSTS_N_INSNS (3), /* HI */
bf24f4ec
L
2331 COSTS_N_INSNS (3), /* SI */
2332 COSTS_N_INSNS (3), /* DI */
2333 COSTS_N_INSNS (3)}, /* other */
2334 0, /* cost of multiply per each bit set */
2335 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2336 model is not realistic. We compensate by increasing the latencies a bit. */
2337 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2338 COSTS_N_INSNS (11), /* HI */
2339 COSTS_N_INSNS (14), /* SI */
2340 COSTS_N_INSNS (76), /* DI */
2341 COSTS_N_INSNS (76)}, /* other */
2342 COSTS_N_INSNS (1), /* cost of movsx */
2343 COSTS_N_INSNS (0), /* cost of movzx */
2344 8, /* "large" insn */
2345 17, /* MOVE_RATIO */
2346 17, /* CLEAR_RATIO */
269edf4e 2347 {6, 6, 6}, /* cost of loading integer registers
bf24f4ec
L
2348 in QImode, HImode and SImode.
2349 Relative to reg-reg move (2). */
269edf4e
CL
2350 {8, 8, 8}, /* cost of storing integer registers */
2351 {8, 8, 8, 8, 16}, /* cost of loading SSE register
bf24f4ec 2352 in 32bit, 64bit, 128bit, 256bit and 512bit */
d3152981 2353 {8, 8, 8, 8, 16}, /* cost of storing SSE register
bf24f4ec 2354 in 32bit, 64bit, 128bit, 256bit and 512bit */
269edf4e 2355 {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
bf24f4ec
L
2356 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2357 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2358 6, /* cost of moving SSE register to integer. */
2359 20, 8, /* Gather load static, per_elt. */
2360 22, 10, /* Gather store static, per_elt. */
2361 64, /* size of l1 cache. */
2362 512, /* size of l2 cache. */
2363 64, /* size of prefetch block */
2364 6, /* number of parallel prefetches */
2365 3, /* Branch cost */
2366 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2367 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2368 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2369 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2370 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2371 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2372
2373 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2374 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2375 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2376 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2377 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2378 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2379 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2380 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2381 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2382 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2383 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2384 icelake_memcpy,
2385 icelake_memset,
2386 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2387 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2388 "16:11:8", /* Loop alignment. */
2389 "16:11:8", /* Jump alignment. */
2390 "0:0:8", /* Label alignment. */
2391 "16", /* Func alignment. */
071e428c
HW
2392 4, /* Small unroll limit. */
2393 2, /* Small unroll factor. */
bf24f4ec
L
2394};
2395
4f442a3b
CL
2396/* alderlake_cost should produce code tuned for alderlake family of CPUs. */
2397static stringop_algs alderlake_memcpy[2] = {
2398 {libcall,
2399 {{256, rep_prefix_1_byte, true},
2400 {256, loop, false},
2401 {-1, libcall, false}}},
2402 {libcall,
2403 {{256, rep_prefix_1_byte, true},
2404 {256, loop, false},
2405 {-1, libcall, false}}}};
2406static stringop_algs alderlake_memset[2] = {
2407 {libcall,
2408 {{256, rep_prefix_1_byte, true},
2409 {256, loop, false},
2410 {-1, libcall, false}}},
2411 {libcall,
2412 {{256, rep_prefix_1_byte, true},
2413 {256, loop, false},
2414 {-1, libcall, false}}}};
2415static const
2416struct processor_costs alderlake_cost = {
2417 {
2418 /* Start of register allocator costs. integer->integer move cost is 2. */
2419 6, /* cost for loading QImode using movzbl */
2420 {6, 6, 6}, /* cost of loading integer registers
2421 in QImode, HImode and SImode.
2422 Relative to reg-reg move (2). */
2423 {6, 6, 6}, /* cost of storing integer registers */
2424 4, /* cost of reg,reg fld/fst */
2425 {6, 6, 12}, /* cost of loading fp registers
2426 in SFmode, DFmode and XFmode */
2427 {6, 6, 12}, /* cost of storing fp registers
2428 in SFmode, DFmode and XFmode */
2429 2, /* cost of moving MMX register */
2430 {6, 6}, /* cost of loading MMX registers
2431 in SImode and DImode */
2432 {6, 6}, /* cost of storing MMX registers
2433 in SImode and DImode */
2434 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2435 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2436 in 32,64,128,256 and 512-bit */
2437 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2438 in 32,64,128,256 and 512-bit */
2439 6, 6, /* SSE->integer and integer->SSE moves */
2440 6, 6, /* mask->integer and integer->mask moves */
2441 {6, 6, 6}, /* cost of loading mask register
2442 in QImode, HImode, SImode. */
2443 {6, 6, 6}, /* cost if storing mask register
2444 in QImode, HImode, SImode. */
2445 2, /* cost of moving mask register. */
2446 /* End of register allocator costs. */
2447 },
2448
2449 COSTS_N_INSNS (1), /* cost of an add instruction */
2450 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2451 COSTS_N_INSNS (1), /* variable shift costs */
2452 COSTS_N_INSNS (1), /* constant shift costs */
2453 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
bc00de07 2454 COSTS_N_INSNS (3), /* HI */
4f442a3b 2455 COSTS_N_INSNS (3), /* SI */
bc00de07 2456 COSTS_N_INSNS (3), /* DI */
4f442a3b
CL
2457 COSTS_N_INSNS (4)}, /* other */
2458 0, /* cost of multiply per each bit set */
2459 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2460 COSTS_N_INSNS (22), /* HI */
2461 COSTS_N_INSNS (30), /* SI */
2462 COSTS_N_INSNS (74), /* DI */
2463 COSTS_N_INSNS (74)}, /* other */
2464 COSTS_N_INSNS (1), /* cost of movsx */
2465 COSTS_N_INSNS (1), /* cost of movzx */
2466 8, /* "large" insn */
2467 17, /* MOVE_RATIO */
2468 17, /* CLEAR_RATIO */
2469 {6, 6, 6}, /* cost of loading integer registers
2470 in QImode, HImode and SImode.
2471 Relative to reg-reg move (2). */
269edf4e
CL
2472 {8, 8, 8}, /* cost of storing integer registers */
2473 {8, 8, 8, 10, 15}, /* cost of loading SSE register
4f442a3b 2474 in 32bit, 64bit, 128bit, 256bit and 512bit */
269edf4e 2475 {8, 8, 8, 10, 15}, /* cost of storing SSE register
4f442a3b 2476 in 32bit, 64bit, 128bit, 256bit and 512bit */
269edf4e
CL
2477 {8, 8, 8, 10, 15}, /* cost of unaligned loads. */
2478 {8, 8, 8, 10, 15}, /* cost of unaligned storess. */
4f442a3b
CL
2479 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2480 6, /* cost of moving SSE register to integer. */
2481 18, 6, /* Gather load static, per_elt. */
2482 18, 6, /* Gather store static, per_elt. */
2483 32, /* size of l1 cache. */
2484 512, /* size of l2 cache. */
2485 64, /* size of prefetch block */
2486 6, /* number of parallel prefetches */
2487 3, /* Branch cost */
2488 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2489 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2490 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2491 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2492 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2493 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2494
2495 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2496 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2497 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2498 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2499 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2500 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2501 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2502 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2503 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2504 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2505 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2506 alderlake_memcpy,
2507 alderlake_memset,
2508 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2509 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2510 "16:11:8", /* Loop alignment. */
2511 "16:11:8", /* Jump alignment. */
2512 "0:0:8", /* Label alignment. */
2513 "16", /* Func alignment. */
071e428c
HW
2514 4, /* Small unroll limit. */
2515 2, /* Small unroll factor. */
4f442a3b
CL
2516};
2517
64766e8d
JH
2518 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
2519 very small blocks it is better to use loop. For large blocks, libcall can
2520 do non-temporal accesses and beat inline considerably. */
2521static stringop_algs btver1_memcpy[2] = {
2522 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2523 {-1, rep_prefix_4_byte, false}}},
2524 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2525 {-1, libcall, false}}}};
2526static stringop_algs btver1_memset[2] = {
2527 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2528 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2529 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2530 {-1, libcall, false}}}};
2531const struct processor_costs btver1_cost = {
72bb85f8 2532 {
d321551c
L
2533 /* Start of register allocator costs. integer->integer move cost is 2. */
2534 8, /* cost for loading QImode using movzbl */
2535 {6, 8, 6}, /* cost of loading integer registers
2536 in QImode, HImode and SImode.
2537 Relative to reg-reg move (2). */
2538 {6, 8, 6}, /* cost of storing integer registers */
2539 4, /* cost of reg,reg fld/fst */
2540 {12, 12, 28}, /* cost of loading fp registers
2541 in SFmode, DFmode and XFmode */
2542 {12, 12, 38}, /* cost of storing fp registers
2543 in SFmode, DFmode and XFmode */
2544 4, /* cost of moving MMX register */
2545 {10, 10}, /* cost of loading MMX registers
2546 in SImode and DImode */
2547 {12, 12}, /* cost of storing MMX registers
2548 in SImode and DImode */
2549 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2550 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2551 in 32,64,128,256 and 512-bit */
2552 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2553 in 32,64,128,256 and 512-bit */
2554 14, 14, /* SSE->integer and integer->SSE moves */
ecc3135a 2555 14, 14, /* mask->integer and integer->mask moves */
00cb3494
L
2556 {6, 8, 6}, /* cost of loading mask register
2557 in QImode, HImode, SImode. */
2558 {6, 8, 6}, /* cost if storing mask register
2559 in QImode, HImode, SImode. */
2560 2, /* cost of moving mask register. */
d321551c 2561 /* End of register allocator costs. */
72bb85f8 2562 },
d321551c 2563
64766e8d
JH
2564 COSTS_N_INSNS (1), /* cost of an add instruction */
2565 COSTS_N_INSNS (2), /* cost of a lea instruction */
2566 COSTS_N_INSNS (1), /* variable shift costs */
2567 COSTS_N_INSNS (1), /* constant shift costs */
2568 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2569 COSTS_N_INSNS (4), /* HI */
2570 COSTS_N_INSNS (3), /* SI */
2571 COSTS_N_INSNS (4), /* DI */
2572 COSTS_N_INSNS (5)}, /* other */
2573 0, /* cost of multiply per each bit set */
2574 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2575 COSTS_N_INSNS (35), /* HI */
2576 COSTS_N_INSNS (51), /* SI */
2577 COSTS_N_INSNS (83), /* DI */
2578 COSTS_N_INSNS (83)}, /* other */
2579 COSTS_N_INSNS (1), /* cost of movsx */
2580 COSTS_N_INSNS (1), /* cost of movzx */
2581 8, /* "large" insn */
2582 9, /* MOVE_RATIO */
25e22b19 2583 6, /* CLEAR_RATIO */
df41dbaf 2584 {6, 8, 6}, /* cost of loading integer registers
64766e8d
JH
2585 in QImode, HImode and SImode.
2586 Relative to reg-reg move (2). */
df41dbaf 2587 {6, 8, 6}, /* cost of storing integer registers */
d321551c
L
2588 {10, 10, 12, 48, 96}, /* cost of loading SSE register
2589 in 32bit, 64bit, 128bit, 256bit and 512bit */
2590 {10, 10, 12, 48, 96}, /* cost of storing SSE register
2591 in 32bit, 64bit, 128bit, 256bit and 512bit */
b7167993 2592 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
b7167993 2593 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
d321551c
L
2594 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2595 14, /* cost of moving SSE register to integer. */
a4fe6139
JH
2596 10, 10, /* Gather load static, per_elt. */
2597 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
2598 32, /* size of l1 cache. */
2599 512, /* size of l2 cache. */
2600 64, /* size of prefetch block */
2601 100, /* number of parallel prefetches */
2602 2, /* Branch cost */
2603 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2604 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2605 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2606 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2607 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2608 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 2609
c53c148c 2610 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2611 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2612 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2613 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
2614 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2615 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
2616 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2617 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2618 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2619 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
64766e8d
JH
2620 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2621 btver1_memcpy,
2622 btver1_memset,
f6fd8f2b
JH
2623 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2624 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2625 "16:11:8", /* Loop alignment. */
2626 "16:8:8", /* Jump alignment. */
2627 "0:0:8", /* Label alignment. */
2628 "11", /* Func alignment. */
071e428c
HW
2629 4, /* Small unroll limit. */
2630 2, /* Small unroll factor. */
64766e8d
JH
2631};
2632
2633static stringop_algs btver2_memcpy[2] = {
2634 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2635 {-1, rep_prefix_4_byte, false}}},
2636 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2637 {-1, libcall, false}}}};
2638static stringop_algs btver2_memset[2] = {
2639 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2640 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2641 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2642 {-1, libcall, false}}}};
2643const struct processor_costs btver2_cost = {
72bb85f8 2644 {
d321551c
L
2645 /* Start of register allocator costs. integer->integer move cost is 2. */
2646 8, /* cost for loading QImode using movzbl */
2647 {8, 8, 6}, /* cost of loading integer registers
2648 in QImode, HImode and SImode.
2649 Relative to reg-reg move (2). */
2650 {8, 8, 6}, /* cost of storing integer registers */
2651 4, /* cost of reg,reg fld/fst */
2652 {12, 12, 28}, /* cost of loading fp registers
2653 in SFmode, DFmode and XFmode */
2654 {12, 12, 38}, /* cost of storing fp registers
2655 in SFmode, DFmode and XFmode */
2656 4, /* cost of moving MMX register */
2657 {10, 10}, /* cost of loading MMX registers
2658 in SImode and DImode */
2659 {12, 12}, /* cost of storing MMX registers
2660 in SImode and DImode */
2661 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2662 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2663 in 32,64,128,256 and 512-bit */
2664 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2665 in 32,64,128,256 and 512-bit */
2666 14, 14, /* SSE->integer and integer->SSE moves */
ecc3135a 2667 14, 14, /* mask->integer and integer->mask moves */
00cb3494
L
2668 {8, 8, 6}, /* cost of loading mask register
2669 in QImode, HImode, SImode. */
2670 {8, 8, 6}, /* cost if storing mask register
2671 in QImode, HImode, SImode. */
2672 2, /* cost of moving mask register. */
d321551c 2673 /* End of register allocator costs. */
72bb85f8 2674 },
d321551c 2675
64766e8d
JH
2676 COSTS_N_INSNS (1), /* cost of an add instruction */
2677 COSTS_N_INSNS (2), /* cost of a lea instruction */
2678 COSTS_N_INSNS (1), /* variable shift costs */
2679 COSTS_N_INSNS (1), /* constant shift costs */
2680 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2681 COSTS_N_INSNS (4), /* HI */
2682 COSTS_N_INSNS (3), /* SI */
2683 COSTS_N_INSNS (4), /* DI */
2684 COSTS_N_INSNS (5)}, /* other */
2685 0, /* cost of multiply per each bit set */
2686 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2687 COSTS_N_INSNS (35), /* HI */
2688 COSTS_N_INSNS (51), /* SI */
2689 COSTS_N_INSNS (83), /* DI */
2690 COSTS_N_INSNS (83)}, /* other */
2691 COSTS_N_INSNS (1), /* cost of movsx */
2692 COSTS_N_INSNS (1), /* cost of movzx */
2693 8, /* "large" insn */
2694 9, /* MOVE_RATIO */
25e22b19 2695 6, /* CLEAR_RATIO */
df41dbaf 2696 {8, 8, 6}, /* cost of loading integer registers
64766e8d
JH
2697 in QImode, HImode and SImode.
2698 Relative to reg-reg move (2). */
df41dbaf 2699 {8, 8, 6}, /* cost of storing integer registers */
d321551c
L
2700 {10, 10, 12, 48, 96}, /* cost of loading SSE register
2701 in 32bit, 64bit, 128bit, 256bit and 512bit */
2702 {10, 10, 12, 48, 96}, /* cost of storing SSE register
2703 in 32bit, 64bit, 128bit, 256bit and 512bit */
b7167993 2704 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
b7167993 2705 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
d321551c
L
2706 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2707 14, /* cost of moving SSE register to integer. */
a4fe6139
JH
2708 10, 10, /* Gather load static, per_elt. */
2709 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
2710 32, /* size of l1 cache. */
2711 2048, /* size of l2 cache. */
2712 64, /* size of prefetch block */
2713 100, /* number of parallel prefetches */
2714 2, /* Branch cost */
2715 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2716 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2717 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2718 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2719 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2720 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 2721
c53c148c 2722 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2723 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2724 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2725 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
2726 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2727 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
2728 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2729 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
2730 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
2731 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
64766e8d
JH
2732 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2733 btver2_memcpy,
2734 btver2_memset,
f6fd8f2b
JH
2735 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2736 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2737 "16:11:8", /* Loop alignment. */
2738 "16:8:8", /* Jump alignment. */
2739 "0:0:8", /* Label alignment. */
2740 "11", /* Func alignment. */
071e428c
HW
2741 4, /* Small unroll limit. */
2742 2, /* Small unroll factor. */
64766e8d
JH
2743};
2744
2745static stringop_algs pentium4_memcpy[2] = {
2746 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2747 DUMMY_STRINGOP_ALGS};
2748static stringop_algs pentium4_memset[2] = {
2749 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2750 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2751 DUMMY_STRINGOP_ALGS};
2752
/* Operation-cost table pentium4_cost.  Entries written with COSTS_N_INSNS
   are scaled instruction costs; the first braced group holds the costs
   marked for the register allocator.  Uses the pentium4_memcpy and
   pentium4_memset strategy tables.
   NOTE(review): presumably selected when tuning for Pentium 4 -- confirm
   at the use site.  */
2753static const
2754struct processor_costs pentium4_cost = {
72bb85f8 2755 {
d321551c 2756 /* Start of register allocator costs. integer->integer move cost is 2. */
df41dbaf 2757 5, /* cost for loading QImode using movzbl */
64766e8d
JH
2758 {4, 5, 4}, /* cost of loading integer registers
2759 in QImode, HImode and SImode.
2760 Relative to reg-reg move (2). */
2761 {2, 3, 2}, /* cost of storing integer registers */
df41dbaf
JH
2762 12, /* cost of reg,reg fld/fst */
2763 {14, 14, 14}, /* cost of loading fp registers
64766e8d 2764 in SFmode, DFmode and XFmode */
df41dbaf 2765 {14, 14, 14}, /* cost of storing fp registers
64766e8d 2766 in SFmode, DFmode and XFmode */
df41dbaf
JH
2767 12, /* cost of moving MMX register */
2768 {16, 16}, /* cost of loading MMX registers
64766e8d 2769 in SImode and DImode */
df41dbaf 2770 {16, 16}, /* cost of storing MMX registers
64766e8d 2771 in SImode and DImode */
df41dbaf
JH
2772 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2773 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
2774 in 32,64,128,256 and 512-bit */
d321551c
L
2775 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
2776 in 32,64,128,256 and 512-bit */
2777 20, 12, /* SSE->integer and integer->SSE moves */
ecc3135a 2778 20, 12, /* mask->integer and integer->mask moves */
00cb3494
L
2779 {4, 5, 4}, /* cost of loading mask register
2780 in QImode, HImode, SImode. */
2781 {2, 3, 2}, /* cost of storing mask register
2782 in QImode, HImode, SImode. */
2783 2, /* cost of moving mask register. */
d321551c 2784 /* End of register allocator costs. */
72bb85f8 2785 },
d321551c
L
2786
2787 COSTS_N_INSNS (1), /* cost of an add instruction */
2788 COSTS_N_INSNS (3), /* cost of a lea instruction */
2789 COSTS_N_INSNS (4), /* variable shift costs */
2790 COSTS_N_INSNS (4), /* constant shift costs */
2791 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
2792 COSTS_N_INSNS (15), /* HI */
2793 COSTS_N_INSNS (15), /* SI */
2794 COSTS_N_INSNS (15), /* DI */
2795 COSTS_N_INSNS (15)}, /* other */
2796 0, /* cost of multiply per each bit set */
2797 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
2798 COSTS_N_INSNS (56), /* HI */
2799 COSTS_N_INSNS (56), /* SI */
2800 COSTS_N_INSNS (56), /* DI */
2801 COSTS_N_INSNS (56)}, /* other */
2802 COSTS_N_INSNS (1), /* cost of movsx */
2803 COSTS_N_INSNS (1), /* cost of movzx */
2804 16, /* "large" insn */
2805 6, /* MOVE_RATIO */
25e22b19 2806 6, /* CLEAR_RATIO */
d321551c
L
2807 {4, 5, 4}, /* cost of loading integer registers
2808 in QImode, HImode and SImode.
2809 Relative to reg-reg move (2). */
2810 {2, 3, 2}, /* cost of storing integer registers */
2811 {16, 16, 16, 32, 64}, /* cost of loading SSE register
2812 in 32bit, 64bit, 128bit, 256bit and 512bit */
2813 {16, 16, 16, 32, 64}, /* cost of storing SSE register
2814 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 2815 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
df41dbaf 2816 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
d321551c
L
2817 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2818 20, /* cost of moving SSE register to integer. */
a4fe6139
JH
2819 16, 16, /* Gather load static, per_elt. */
2820 16, 16, /* Gather store static, per_elt. */
64766e8d
JH
2821 8, /* size of l1 cache. */
2822 256, /* size of l2 cache. */
2823 64, /* size of prefetch block */
2824 6, /* number of parallel prefetches */
2825 2, /* Branch cost */
2826 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
2827 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2828 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
2829 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2830 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2831 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
6065f444 2832
c53c148c 2833 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
2834 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2835 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
2836 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
2837 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2838 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
2839 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
2840 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
2841 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
2842 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
64766e8d
JH
2843 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2844 pentium4_memcpy,
2845 pentium4_memset,
f6fd8f2b
JH
2846 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2847 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2848 NULL, /* Loop alignment. */
2849 NULL, /* Jump alignment. */
2850 NULL, /* Label alignment. */
2851 NULL, /* Func alignment. */
071e428c
HW
2852 4, /* Small unroll limit. */
2853 2, /* Small unroll factor. */
64766e8d
JH
2854};
2855
/* Strategy table nocona_memcpy, referenced from nocona_cost.  Each of the
   two entries gives one algorithm followed by {size, algorithm, flag}
   triples, the last triple having size -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and size is an upper bound in bytes -- confirm against the
   stringop_algs declaration.  */
2856static stringop_algs nocona_memcpy[2] = {
2857 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2858 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2859 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2860
/* Strategy table nocona_memset, referenced from nocona_cost.  Same layout
   as the memcpy tables: two entries, each one algorithm followed by
   {size, algorithm, flag} triples ending with size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
2861static stringop_algs nocona_memset[2] = {
2862 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2863 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2864 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2865 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2866
/* Operation-cost table nocona_cost.  Entries written with COSTS_N_INSNS
   are scaled instruction costs; the first braced group holds the costs
   marked for the register allocator.  Uses the nocona_memcpy and
   nocona_memset strategy tables.
   NOTE(review): presumably selected when tuning for Nocona -- confirm at
   the use site.  */
2867static const
2868struct processor_costs nocona_cost = {
72bb85f8 2869 {
d321551c
L
2870 /* Start of register allocator costs. integer->integer move cost is 2. */
2871 4, /* cost for loading QImode using movzbl */
2872 {4, 4, 4}, /* cost of loading integer registers
2873 in QImode, HImode and SImode.
2874 Relative to reg-reg move (2). */
2875 {4, 4, 4}, /* cost of storing integer registers */
2876 12, /* cost of reg,reg fld/fst */
2877 {14, 14, 14}, /* cost of loading fp registers
2878 in SFmode, DFmode and XFmode */
2879 {14, 14, 14}, /* cost of storing fp registers
2880 in SFmode, DFmode and XFmode */
2881 14, /* cost of moving MMX register */
2882 {12, 12}, /* cost of loading MMX registers
2883 in SImode and DImode */
2884 {12, 12}, /* cost of storing MMX registers
2885 in SImode and DImode */
2886 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2887 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2888 in 32,64,128,256 and 512-bit */
2889 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2890 in 32,64,128,256 and 512-bit */
2891 20, 12, /* SSE->integer and integer->SSE moves */
ecc3135a 2892 20, 12, /* mask->integer and integer->mask moves */
00cb3494
L
2893 {4, 4, 4}, /* cost of loading mask register
2894 in QImode, HImode, SImode. */
2895 {4, 4, 4}, /* cost of storing mask register
2896 in QImode, HImode, SImode. */
2897 2, /* cost of moving mask register. */
d321551c 2898 /* End of register allocator costs. */
72bb85f8 2899 },
d321551c 2900
64766e8d
JH
2901 COSTS_N_INSNS (1), /* cost of an add instruction */
2902 COSTS_N_INSNS (1), /* cost of a lea instruction */
2903 COSTS_N_INSNS (1), /* variable shift costs */
2904 COSTS_N_INSNS (1), /* constant shift costs */
2905 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
2906 COSTS_N_INSNS (10), /* HI */
2907 COSTS_N_INSNS (10), /* SI */
2908 COSTS_N_INSNS (10), /* DI */
2909 COSTS_N_INSNS (10)}, /* other */
2910 0, /* cost of multiply per each bit set */
2911 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
2912 COSTS_N_INSNS (66), /* HI */
2913 COSTS_N_INSNS (66), /* SI */
2914 COSTS_N_INSNS (66), /* DI */
2915 COSTS_N_INSNS (66)}, /* other */
2916 COSTS_N_INSNS (1), /* cost of movsx */
2917 COSTS_N_INSNS (1), /* cost of movzx */
2918 16, /* "large" insn */
2919 17, /* MOVE_RATIO */
25e22b19 2920 6, /* CLEAR_RATIO */
64766e8d
JH
2921 {4, 4, 4}, /* cost of loading integer registers
2922 in QImode, HImode and SImode.
2923 Relative to reg-reg move (2). */
2924 {4, 4, 4}, /* cost of storing integer registers */
d321551c
L
2925 {12, 12, 12, 24, 48}, /* cost of loading SSE register
2926 in 32bit, 64bit, 128bit, 256bit and 512bit */
2927 {12, 12, 12, 24, 48}, /* cost of storing SSE register
2928 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 2929 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
df41dbaf 2930 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
d321551c
L
2931 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2932 20, /* cost of moving SSE register to integer. */
a4fe6139
JH
2933 12, 12, /* Gather load static, per_elt. */
2934 12, 12, /* Gather store static, per_elt. */
64766e8d
JH
2935 8, /* size of l1 cache. */
2936 1024, /* size of l2 cache. */
2937 64, /* size of prefetch block */
2938 8, /* number of parallel prefetches */
2939 1, /* Branch cost */
2940 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2941 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2942 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2943 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2944 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2945 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
6065f444 2946
c53c148c 2947 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
2948 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2949 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2950 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
c53c148c
JH
2951 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2952 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
6065f444
JH
2953 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2954 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2955 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2956 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
64766e8d
JH
2957 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2958 nocona_memcpy,
2959 nocona_memset,
f6fd8f2b
JH
2960 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2961 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2962 NULL, /* Loop alignment. */
2963 NULL, /* Jump alignment. */
2964 NULL, /* Label alignment. */
2965 NULL, /* Func alignment. */
071e428c
HW
2966 4, /* Small unroll limit. */
2967 2, /* Small unroll factor. */
64766e8d
JH
2968};
2969
/* Strategy table atom_memcpy, referenced from atom_cost.  Two entries,
   each one algorithm followed by {size, algorithm, flag} triples ending
   with size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
2970static stringop_algs atom_memcpy[2] = {
2971 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2972 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2973 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Strategy table atom_memset, referenced from atom_cost.  Two entries,
   each one algorithm followed by {size, algorithm, flag} triples ending
   with size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
2974static stringop_algs atom_memset[2] = {
2975 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2976 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2977 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2978 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation-cost table atom_cost.  Entries written with COSTS_N_INSNS are
   scaled instruction costs; the first braced group holds the costs marked
   for the register allocator.  Uses the atom_memcpy and atom_memset
   strategy tables and supplies explicit alignment strings.
   NOTE(review): presumably selected when tuning for Atom -- confirm at the
   use site.  */
2979static const
2980struct processor_costs atom_cost = {
72bb85f8 2981 {
d321551c
L
2982 /* Start of register allocator costs. integer->integer move cost is 2. */
2983 6, /* cost for loading QImode using movzbl */
2984 {6, 6, 6}, /* cost of loading integer registers
2985 in QImode, HImode and SImode.
2986 Relative to reg-reg move (2). */
2987 {6, 6, 6}, /* cost of storing integer registers */
2988 4, /* cost of reg,reg fld/fst */
2989 {6, 6, 18}, /* cost of loading fp registers
2990 in SFmode, DFmode and XFmode */
2991 {14, 14, 24}, /* cost of storing fp registers
2992 in SFmode, DFmode and XFmode */
2993 2, /* cost of moving MMX register */
2994 {8, 8}, /* cost of loading MMX registers
2995 in SImode and DImode */
2996 {10, 10}, /* cost of storing MMX registers
2997 in SImode and DImode */
2998 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2999 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
3000 in 32,64,128,256 and 512-bit */
3001 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
3002 in 32,64,128,256 and 512-bit */
ecc3135a 3003 8, 6, /* SSE->integer and integer->SSE moves */
3004 8, 6, /* mask->integer and integer->mask moves */
00cb3494
L
3005 {6, 6, 6}, /* cost of loading mask register
3006 in QImode, HImode, SImode. */
3007 {6, 6, 6}, /* cost of storing mask register
3008 in QImode, HImode, SImode. */
3009 2, /* cost of moving mask register. */
d321551c 3010 /* End of register allocator costs. */
72bb85f8 3011 },
d321551c 3012
64766e8d
JH
3013 COSTS_N_INSNS (1), /* cost of an add instruction */
3014 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3015 COSTS_N_INSNS (1), /* variable shift costs */
3016 COSTS_N_INSNS (1), /* constant shift costs */
3017 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3018 COSTS_N_INSNS (4), /* HI */
3019 COSTS_N_INSNS (3), /* SI */
3020 COSTS_N_INSNS (4), /* DI */
3021 COSTS_N_INSNS (2)}, /* other */
3022 0, /* cost of multiply per each bit set */
3023 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3024 COSTS_N_INSNS (26), /* HI */
3025 COSTS_N_INSNS (42), /* SI */
3026 COSTS_N_INSNS (74), /* DI */
3027 COSTS_N_INSNS (74)}, /* other */
3028 COSTS_N_INSNS (1), /* cost of movsx */
3029 COSTS_N_INSNS (1), /* cost of movzx */
3030 8, /* "large" insn */
3031 17, /* MOVE_RATIO */
25e22b19 3032 6, /* CLEAR_RATIO */
df41dbaf 3033 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
3034 in QImode, HImode and SImode.
3035 Relative to reg-reg move (2). */
df41dbaf 3036 {6, 6, 6}, /* cost of storing integer registers */
d321551c
L
3037 {8, 8, 8, 16, 32}, /* cost of loading SSE register
3038 in 32bit, 64bit, 128bit, 256bit and 512bit */
3039 {8, 8, 8, 16, 32}, /* cost of storing SSE register
3040 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 3041 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
df41dbaf 3042 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
d321551c
L
3043 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3044 8, /* cost of moving SSE register to integer. */
a4fe6139
JH
3045 8, 8, /* Gather load static, per_elt. */
3046 8, 8, /* Gather store static, per_elt. */
64766e8d
JH
3047 32, /* size of l1 cache. */
3048 256, /* size of l2 cache. */
3049 64, /* size of prefetch block */
3050 6, /* number of parallel prefetches */
3051 3, /* Branch cost */
3052 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3053 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3054 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3055 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3056 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3057 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 3058
c53c148c 3059 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
3060 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
3061 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3062 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
3063 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3064 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
3065 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
3066 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
3067 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
3068 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
64766e8d
JH
3069 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
3070 atom_memcpy,
3071 atom_memset,
f6fd8f2b
JH
3072 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3073 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
3074 "16", /* Loop alignment. */
3075 "16:8:8", /* Jump alignment. */
3076 "0:0:8", /* Label alignment. */
3077 "16", /* Func alignment. */
071e428c
HW
3078 4, /* Small unroll limit. */
3079 2, /* Small unroll factor. */
64766e8d
JH
3080};
3081
/* Strategy table slm_memcpy, referenced from slm_cost.  Two entries, each
   one algorithm followed by {size, algorithm, flag} triples ending with
   size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
3082static stringop_algs slm_memcpy[2] = {
3083 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3084 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3085 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Strategy table slm_memset, referenced from slm_cost.  Two entries, each
   one algorithm followed by {size, algorithm, flag} triples ending with
   size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
3086static stringop_algs slm_memset[2] = {
3087 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3088 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3089 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3090 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation-cost table slm_cost.  Entries written with COSTS_N_INSNS are
   scaled instruction costs; the first braced group holds the costs marked
   for the register allocator.  Uses the slm_memcpy and slm_memset strategy
   tables and supplies explicit alignment strings.
   NOTE(review): presumably selected when tuning for Silvermont -- confirm
   at the use site.  */
3091static const
3092struct processor_costs slm_cost = {
72bb85f8 3093 {
d321551c
L
3094 /* Start of register allocator costs. integer->integer move cost is 2. */
3095 8, /* cost for loading QImode using movzbl */
3096 {8, 8, 8}, /* cost of loading integer registers
3097 in QImode, HImode and SImode.
3098 Relative to reg-reg move (2). */
3099 {6, 6, 6}, /* cost of storing integer registers */
3100 2, /* cost of reg,reg fld/fst */
3101 {8, 8, 18}, /* cost of loading fp registers
3102 in SFmode, DFmode and XFmode */
3103 {6, 6, 18}, /* cost of storing fp registers
3104 in SFmode, DFmode and XFmode */
3105 2, /* cost of moving MMX register */
3106 {8, 8}, /* cost of loading MMX registers
3107 in SImode and DImode */
3108 {6, 6}, /* cost of storing MMX registers
3109 in SImode and DImode */
3110 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3111 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
3112 in 32,64,128,256 and 512-bit */
3113 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
3114 in 32,64,128,256 and 512-bit */
ecc3135a 3115 8, 6, /* SSE->integer and integer->SSE moves */
3116 8, 6, /* mask->integer and integer->mask moves */
00cb3494
L
3117 {8, 8, 8}, /* cost of loading mask register
3118 in QImode, HImode, SImode. */
3119 {6, 6, 6}, /* cost of storing mask register
3120 in QImode, HImode, SImode. */
3121 2, /* cost of moving mask register. */
d321551c 3122 /* End of register allocator costs. */
72bb85f8 3123 },
d321551c 3124
64766e8d
JH
3125 COSTS_N_INSNS (1), /* cost of an add instruction */
3126 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3127 COSTS_N_INSNS (1), /* variable shift costs */
3128 COSTS_N_INSNS (1), /* constant shift costs */
3129 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3130 COSTS_N_INSNS (3), /* HI */
3131 COSTS_N_INSNS (3), /* SI */
3132 COSTS_N_INSNS (4), /* DI */
3133 COSTS_N_INSNS (2)}, /* other */
3134 0, /* cost of multiply per each bit set */
3135 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3136 COSTS_N_INSNS (26), /* HI */
3137 COSTS_N_INSNS (42), /* SI */
3138 COSTS_N_INSNS (74), /* DI */
3139 COSTS_N_INSNS (74)}, /* other */
3140 COSTS_N_INSNS (1), /* cost of movsx */
3141 COSTS_N_INSNS (1), /* cost of movzx */
3142 8, /* "large" insn */
3143 17, /* MOVE_RATIO */
25e22b19 3144 6, /* CLEAR_RATIO */
df41dbaf 3145 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
3146 in QImode, HImode and SImode.
3147 Relative to reg-reg move (2). */
df41dbaf 3148 {6, 6, 6}, /* cost of storing integer registers */
d321551c
L
3149 {8, 8, 8, 16, 32}, /* cost of loading SSE register
3150 in 32bit, 64bit, 128bit, 256bit and 512bit */
3151 {8, 8, 8, 16, 32}, /* cost of storing SSE register
3152 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 3153 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
df41dbaf 3154 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
d321551c
L
3155 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3156 8, /* cost of moving SSE register to integer. */
a4fe6139
JH
3157 8, 8, /* Gather load static, per_elt. */
3158 8, 8, /* Gather store static, per_elt. */
64766e8d
JH
3159 32, /* size of l1 cache. */
3160 256, /* size of l2 cache. */
3161 64, /* size of prefetch block */
3162 6, /* number of parallel prefetches */
3163 3, /* Branch cost */
3164 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3165 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3166 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3167 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3168 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3169 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 3170
c53c148c 3171 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
3172 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3173 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3174 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
3175 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3176 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
3177 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
3178 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
3179 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3180 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
64766e8d
JH
3181 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
3182 slm_memcpy,
3183 slm_memset,
f6fd8f2b
JH
3184 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3185 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
3186 "16", /* Loop alignment. */
3187 "16:8:8", /* Jump alignment. */
3188 "0:0:8", /* Label alignment. */
3189 "16", /* Func alignment. */
071e428c
HW
3190 4, /* Small unroll limit. */
3191 2, /* Small unroll factor. */
64766e8d
JH
3192};
3193
c3a2437f
L
/* Strategy table tremont_memcpy, referenced from tremont_cost.  Both
   entries are identical: one algorithm followed by {size, algorithm, flag}
   triples ending with size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
3194static stringop_algs tremont_memcpy[2] = {
3195 {libcall,
3196 {{256, rep_prefix_1_byte, true},
3197 {256, loop, false},
3198 {-1, libcall, false}}},
3199 {libcall,
3200 {{256, rep_prefix_1_byte, true},
3201 {256, loop, false},
3202 {-1, libcall, false}}}};
/* Strategy table tremont_memset, referenced from tremont_cost.  Both
   entries are identical: one algorithm followed by {size, algorithm, flag}
   triples ending with size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
3203static stringop_algs tremont_memset[2] = {
3204 {libcall,
3205 {{256, rep_prefix_1_byte, true},
3206 {256, loop, false},
3207 {-1, libcall, false}}},
3208 {libcall,
3209 {{256, rep_prefix_1_byte, true},
3210 {256, loop, false},
3211 {-1, libcall, false}}}};
/* Operation-cost table tremont_cost.  Entries written with COSTS_N_INSNS
   are scaled instruction costs; the first braced group holds the costs
   marked for the register allocator.  Uses the tremont_memcpy and
   tremont_memset strategy tables and supplies explicit alignment strings.
   NOTE(review): presumably selected when tuning for Tremont -- confirm at
   the use site.  */
3212static const
3213struct processor_costs tremont_cost = {
3214 {
3215 /* Start of register allocator costs. integer->integer move cost is 2. */
3216 6, /* cost for loading QImode using movzbl */
3217 {6, 6, 6}, /* cost of loading integer registers
3218 in QImode, HImode and SImode.
3219 Relative to reg-reg move (2). */
3220 {6, 6, 6}, /* cost of storing integer registers */
3221 4, /* cost of reg,reg fld/fst */
3222 {6, 6, 12}, /* cost of loading fp registers
3223 in SFmode, DFmode and XFmode */
3224 {6, 6, 12}, /* cost of storing fp registers
3225 in SFmode, DFmode and XFmode */
3226 2, /* cost of moving MMX register */
3227 {6, 6}, /* cost of loading MMX registers
3228 in SImode and DImode */
3229 {6, 6}, /* cost of storing MMX registers
3230 in SImode and DImode */
3231 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3232 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3233 in 32,64,128,256 and 512-bit */
3234 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3235 in 32,64,128,256 and 512-bit */
3236 6, 6, /* SSE->integer and integer->SSE moves */
3237 6, 6, /* mask->integer and integer->mask moves */
3238 {6, 6, 6}, /* cost of loading mask register
3239 in QImode, HImode, SImode. */
3240 {6, 6, 6}, /* cost of storing mask register
3241 in QImode, HImode, SImode. */
3242 2, /* cost of moving mask register. */
3243 /* End of register allocator costs. */
3244 },
3245
3246 COSTS_N_INSNS (1), /* cost of an add instruction */
3247 /* Setting cost to 2 makes our current implementation of synth_mult result in
3248 use of unnecessary temporary registers causing regression on several
3249 SPECfp benchmarks. */
3250 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3251 COSTS_N_INSNS (1), /* variable shift costs */
3252 COSTS_N_INSNS (1), /* constant shift costs */
3253 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
bc00de07 3254 COSTS_N_INSNS (3), /* HI */
c3a2437f 3255 COSTS_N_INSNS (3), /* SI */
bc00de07 3256 COSTS_N_INSNS (3), /* DI */
c3a2437f
L
3257 COSTS_N_INSNS (4)}, /* other */
3258 0, /* cost of multiply per each bit set */
3259 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3260 COSTS_N_INSNS (22), /* HI */
3261 COSTS_N_INSNS (30), /* SI */
3262 COSTS_N_INSNS (74), /* DI */
3263 COSTS_N_INSNS (74)}, /* other */
3264 COSTS_N_INSNS (1), /* cost of movsx */
3265 COSTS_N_INSNS (1), /* cost of movzx */
3266 8, /* "large" insn */
3267 17, /* MOVE_RATIO */
3268 17, /* CLEAR_RATIO */
3269 {6, 6, 6}, /* cost of loading integer registers
3270 in QImode, HImode and SImode.
3271 Relative to reg-reg move (2). */
3272 {6, 6, 6}, /* cost of storing integer registers */
3273 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3274 in 32bit, 64bit, 128bit, 256bit and 512bit */
3275 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3276 in 32bit, 64bit, 128bit, 256bit and 512bit */
3277 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3278 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3279 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3280 6, /* cost of moving SSE register to integer. */
3281 18, 6, /* Gather load static, per_elt. */
3282 18, 6, /* Gather store static, per_elt. */
3283 32, /* size of l1 cache. */
3284 512, /* size of l2 cache. */
3285 64, /* size of prefetch block */
3286 6, /* number of parallel prefetches */
3287 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
3288 value is increased to perhaps more appropriate value of 5.
   NOTE(review): this remark names K8, not this table's CPU; it looks
   carried over from another table -- confirm it still applies here. */
3289 3, /* Branch cost */
3290 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3291 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3292 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3293 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3294 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3295 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3296
3297 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3298 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3299 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3300 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3301 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3302 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3303 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3304 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3305 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3306 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3307 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3308 tremont_memcpy,
3309 tremont_memset,
3310 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3311 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3312 "16:11:8", /* Loop alignment. */
3313 "16:11:8", /* Jump alignment. */
3314 "0:0:8", /* Label alignment. */
3315 "16", /* Func alignment. */
071e428c
HW
3316 4, /* Small unroll limit. */
3317 2, /* Small unroll factor. */
c3a2437f
L
3318};
3319
64766e8d
JH
/* Strategy table intel_memcpy, referenced from intel_cost.  Two entries,
   each one algorithm followed by {size, algorithm, flag} triples ending
   with size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
3320static stringop_algs intel_memcpy[2] = {
3321 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3322 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3323 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Strategy table intel_memset, referenced from intel_cost.  Two entries,
   each one algorithm followed by {size, algorithm, flag} triples ending
   with size -1.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
3324static stringop_algs intel_memset[2] = {
3325 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3326 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3327 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3328 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation-cost table intel_cost.  Entries written with COSTS_N_INSNS are
   scaled instruction costs; the first braced group holds the costs marked
   for the register allocator.  Uses the intel_memcpy and intel_memset
   strategy tables and supplies explicit alignment strings.
   NOTE(review): presumably a generic Intel tuning table -- confirm at the
   use site.  */
3329static const
3330struct processor_costs intel_cost = {
72bb85f8 3331 {
d321551c
L
3332 /* Start of register allocator costs. integer->integer move cost is 2. */
3333 6, /* cost for loading QImode using movzbl */
3334 {4, 4, 4}, /* cost of loading integer registers
3335 in QImode, HImode and SImode.
3336 Relative to reg-reg move (2). */
3337 {6, 6, 6}, /* cost of storing integer registers */
3338 2, /* cost of reg,reg fld/fst */
3339 {6, 6, 8}, /* cost of loading fp registers
3340 in SFmode, DFmode and XFmode */
3341 {6, 6, 10}, /* cost of storing fp registers
3342 in SFmode, DFmode and XFmode */
3343 2, /* cost of moving MMX register */
3344 {6, 6}, /* cost of loading MMX registers
3345 in SImode and DImode */
3346 {6, 6}, /* cost of storing MMX registers
3347 in SImode and DImode */
3348 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
3349 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
3350 in 32,64,128,256 and 512-bit */
3351 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
3352 in 32,64,128,256 and 512-bit */
ecc3135a 3353 4, 4, /* SSE->integer and integer->SSE moves */
3354 4, 4, /* mask->integer and integer->mask moves */
00cb3494
L
3355 {4, 4, 4}, /* cost of loading mask register
3356 in QImode, HImode, SImode. */
3357 {6, 6, 6}, /* cost of storing mask register
3358 in QImode, HImode, SImode. */
3359 2, /* cost of moving mask register. */
d321551c 3360 /* End of register allocator costs. */
72bb85f8 3361 },
d321551c 3362
64766e8d
JH
3363 COSTS_N_INSNS (1), /* cost of an add instruction */
3364 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3365 COSTS_N_INSNS (1), /* variable shift costs */
3366 COSTS_N_INSNS (1), /* constant shift costs */
3367 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3368 COSTS_N_INSNS (3), /* HI */
3369 COSTS_N_INSNS (3), /* SI */
3370 COSTS_N_INSNS (4), /* DI */
3371 COSTS_N_INSNS (2)}, /* other */
3372 0, /* cost of multiply per each bit set */
3373 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3374 COSTS_N_INSNS (26), /* HI */
3375 COSTS_N_INSNS (42), /* SI */
3376 COSTS_N_INSNS (74), /* DI */
3377 COSTS_N_INSNS (74)}, /* other */
3378 COSTS_N_INSNS (1), /* cost of movsx */
3379 COSTS_N_INSNS (1), /* cost of movzx */
3380 8, /* "large" insn */
3381 17, /* MOVE_RATIO */
25e22b19 3382 6, /* CLEAR_RATIO */
64766e8d
JH
3383 {4, 4, 4}, /* cost of loading integer registers
3384 in QImode, HImode and SImode.
3385 Relative to reg-reg move (2). */
af863030 3386 {6, 6, 6}, /* cost of storing integer registers */
d321551c
L
3387 {6, 6, 6, 6, 6}, /* cost of loading SSE register
3388 in 32bit, 64bit, 128bit, 256bit and 512bit */
3389 {6, 6, 6, 6, 6}, /* cost of storing SSE register
3390 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 3391 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
df41dbaf 3392 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
d321551c
L
3393 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
3394 4, /* cost of moving SSE register to integer. */
a4fe6139
JH
3395 6, 6, /* Gather load static, per_elt. */
3396 6, 6, /* Gather store static, per_elt. */
64766e8d
JH
3397 32, /* size of l1 cache. */
3398 256, /* size of l2 cache. */
3399 64, /* size of prefetch block */
3400 6, /* number of parallel prefetches */
3401 3, /* Branch cost */
3402 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3403 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3404 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3405 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3406 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3407 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 3408
3ff59baa 3409 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
3410 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
3411 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
3412 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
c53c148c
JH
3413 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3414 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
3415 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
3416 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
3417 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
3418 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
64766e8d
JH
3419 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
3420 intel_memcpy,
3421 intel_memset,
f6fd8f2b
JH
3422 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3423 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
3424 "16", /* Loop alignment. */
3425 "16:8:8", /* Jump alignment. */
3426 "0:0:8", /* Label alignment. */
3427 "16", /* Func alignment. */
071e428c
HW
3428 4, /* Small unroll limit. */
3429 2, /* Small unroll factor. */
64766e8d
JH
3430};
3431
a239aff8
M
3432/* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */
/* Strategy table lujiazui_memcpy.  Two entries, each one algorithm
   followed by {size, algorithm, flag} triples ending with size -1; only
   the first triple of the second entry sets the flag to true.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
3433static stringop_algs lujiazui_memcpy[2] = {
3434 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3435 {-1, libcall, false}}},
3436 {libcall, {{12, unrolled_loop, true}, {32, loop, false},
3437 {6144, rep_prefix_8_byte, false},
3438 {-1, libcall, false}}}};
/* Strategy table lujiazui_memset.  Two entries, each one algorithm
   followed by {size, algorithm, flag} triples ending with size -1; only
   the first triple of the second entry sets the flag to true.
   NOTE(review): meaning of the two entries and of the flag is not visible
   here -- confirm against the stringop_algs declaration.  */
3439static stringop_algs lujiazui_memset[2] = {
3440 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3441 {-1, libcall, false}}},
3442 {libcall, {{12, loop, true}, {32, loop, false},
3443 {640, rep_prefix_8_byte, false},
3444 {-1, libcall, false}}}};
3445static const
3446struct processor_costs lujiazui_cost = {
3447 {
3448 /* Start of register allocator costs. integer->integer move cost is 2. */
3449 6, /* cost for loading QImode using movzbl. */
3450 {6, 6, 6}, /* cost of loading integer registers
3451 in QImode, HImode and SImode.
3452 Relative to reg-reg move (2). */
3453 {6, 6, 6}, /* cost of storing integer registers. */
3454 2, /* cost of reg,reg fld/fst. */
3455 {6, 6, 8}, /* cost of loading fp registers
3456 in SFmode, DFmode and XFmode. */
3457 {6, 6, 8}, /* cost of storing fp registers
3458 in SFmode, DFmode and XFmode. */
3459 2, /* cost of moving MMX register. */
3460 {6, 6}, /* cost of loading MMX registers
3461 in SImode and DImode. */
3462 {6, 6}, /* cost of storing MMX registers
3463 in SImode and DImode. */
3464 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3465 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3466 in 32,64,128,256 and 512-bit. */
3467 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3468 in 32,64,128,256 and 512-bit. */
3469 6, 6, /* SSE->integer and integer->SSE moves. */
3470 6, 6, /* mask->integer and integer->mask moves. */
3471 {6, 6, 6}, /* cost of loading mask register
3472 in QImode, HImode, SImode. */
3473 {6, 6, 6}, /* cost if storing mask register
3474 in QImode, HImode, SImode. */
3475 2, /* cost of moving mask register. */
3476 /* End of register allocator costs. */
3477 },
3478
3479 COSTS_N_INSNS (1), /* cost of an add instruction. */
3480 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction. */
3481 COSTS_N_INSNS (1), /* variable shift costs. */
3482 COSTS_N_INSNS (1), /* constant shift costs. */
3483 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3484 COSTS_N_INSNS (3), /* HI. */
3485 COSTS_N_INSNS (3), /* SI. */
3486 COSTS_N_INSNS (12), /* DI. */
3487 COSTS_N_INSNS (14)}, /* other. */
3488 0, /* cost of multiply per each bit set. */
3489 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI. */
3490 COSTS_N_INSNS (24), /* HI. */
3491 COSTS_N_INSNS (24), /* SI. */
3492 COSTS_N_INSNS (150), /* DI. */
3493 COSTS_N_INSNS (152)}, /* other. */
3494 COSTS_N_INSNS (1), /* cost of movsx. */
3495 COSTS_N_INSNS (1), /* cost of movzx. */
3496 8, /* "large" insn. */
3497 17, /* MOVE_RATIO. */
3498 6, /* CLEAR_RATIO. */
3499 {6, 6, 6}, /* cost of loading integer registers
3500 in QImode, HImode and SImode.
3501 Relative to reg-reg move (2). */
3502 {6, 6, 6}, /* cost of storing integer registers. */
3503 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3504 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3505 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3506 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3507 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3508 {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
3509 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3510 6, /* cost of moving SSE register to integer. */
3511 18, 6, /* Gather load static, per_elt. */
3512 18, 6, /* Gather store static, per_elt. */
3513 32, /* size of l1 cache. */
3514 4096, /* size of l2 cache. */
3515 64, /* size of prefetch block. */
3516 /* Lujiazui processor never drop prefetches, like AMD processors. */
3517 100, /* number of parallel prefetches. */
3518 3, /* Branch cost. */
3519 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3520 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
3521 COSTS_N_INSNS (22), /* cost of FDIV instruction. */
3522 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3523 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3524 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3525
3526 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3527 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3528 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3529 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
3530 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3531 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3532 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3533 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3534 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
3535 COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */
3536 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3537 lujiazui_memcpy,
3538 lujiazui_memset,
3539 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3540 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3541 "16:11:8", /* Loop alignment. */
3542 "16:11:8", /* Jump alignment. */
3543 "0:0:8", /* Label alignment. */
3544 "16", /* Func alignment. */
071e428c
HW
3545 4, /* Small unroll limit. */
3546 2, /* Small unroll factor. */
a239aff8
M
3547};
3548
94c0b26f
M
3549/* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */
3550static stringop_algs yongfeng_memcpy[2] = {
3551 {libcall, {{6, unrolled_loop, true}, {256, unrolled_loop, false},
3552 {-1, libcall, false}}},
3553 {libcall, {{8, loop, false}, {512, unrolled_loop, false},
3554 {-1, libcall, false}}}};
3555static stringop_algs yongfeng_memset[2] = {
3556 {libcall, {{6, loop_1_byte, false}, {128, loop, false},
3557 {-1, libcall, false}}},
3558 {libcall, {{2, rep_prefix_4_byte, false}, {64, loop, false},
3559 {1024, vector_loop, false},
3560 {-1, libcall, false}}}};
3561static const
3562struct processor_costs yongfeng_cost = {
3563 {
3564 /* Start of register allocator costs. integer->integer move cost is 2. */
3565 8, /* cost for loading QImode using movzbl. */
3566 {8, 8, 8}, /* cost of loading integer registers
3567 in QImode, HImode and SImode.
3568 Relative to reg-reg move (2). */
3569 {8, 8, 8}, /* cost of storing integer registers. */
3570 2, /* cost of reg,reg fld/fst. */
3571 {8, 8, 8}, /* cost of loading fp registers
3572 in SFmode, DFmode and XFmode. */
3573 {8, 8, 8}, /* cost of storing fp registers
3574 in SFmode, DFmode and XFmode. */
3575 2, /* cost of moving MMX register. */
3576 {8, 8}, /* cost of loading MMX registers
3577 in SImode and DImode. */
3578 {8, 8}, /* cost of storing MMX registers
3579 in SImode and DImode. */
3580 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3581 {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3582 in 32,64,128,256 and 512-bit. */
3583 {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3584 in 32,64,128,256 and 512-bit. */
3585 8, 8, /* SSE->integer and integer->SSE moves. */
3586 8, 8, /* mask->integer and integer->mask moves. */
3587 {8, 8, 8}, /* cost of loading mask register
3588 in QImode, HImode, SImode. */
3589 {8, 8, 8}, /* cost if storing mask register
3590 in QImode, HImode, SImode. */
3591 2, /* cost of moving mask register. */
3592 /* End of register allocator costs. */
3593 },
3594
3595 COSTS_N_INSNS (1), /* cost of an add instruction. */
3596 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3597 COSTS_N_INSNS (1), /* variable shift costs. */
3598 COSTS_N_INSNS (1), /* constant shift costs. */
3599 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3600 COSTS_N_INSNS (3), /* HI. */
3601 COSTS_N_INSNS (2), /* SI. */
3602 COSTS_N_INSNS (2), /* DI. */
3603 COSTS_N_INSNS (3)}, /* other. */
3604 0, /* cost of multiply per each bit set. */
3605 {COSTS_N_INSNS (8), /* cost of a divide/mod for QI. */
3606 COSTS_N_INSNS (9), /* HI. */
3607 COSTS_N_INSNS (8), /* SI. */
3608 COSTS_N_INSNS (41), /* DI. */
3609 COSTS_N_INSNS (41)}, /* other. */
3610 COSTS_N_INSNS (1), /* cost of movsx. */
3611 COSTS_N_INSNS (1), /* cost of movzx. */
3612 8, /* "large" insn. */
3613 17, /* MOVE_RATIO. */
3614 6, /* CLEAR_RATIO. */
3615 {8, 8, 8}, /* cost of loading integer registers
3616 in QImode, HImode and SImode.
3617 Relative to reg-reg move (2). */
3618 {8, 8, 8}, /* cost of storing integer registers. */
3619 {8, 8, 8, 12, 15}, /* cost of loading SSE register
3620 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3621 {8, 8, 8, 12, 15}, /* cost of storing SSE register
3622 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3623 {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3624 {8, 8, 8, 12, 15}, /* cost of unaligned storess. */
3625 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3626 8, /* cost of moving SSE register to integer. */
3627 18, 6, /* Gather load static, per_elt. */
3628 18, 6, /* Gather store static, per_elt. */
3629 32, /* size of l1 cache. */
3630 256, /* size of l2 cache. */
3631 64, /* size of prefetch block. */
3632 12, /* number of parallel prefetches. */
3633 3, /* Branch cost. */
3634 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3635 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3636 COSTS_N_INSNS (14), /* cost of FDIV instruction. */
3637 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3638 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3639 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3640
3641 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3642 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3643 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3644 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3645 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3646 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3647 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
3648 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3649 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3650 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3651 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
3652 yongfeng_memcpy,
3653 yongfeng_memset,
3654 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3655 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3656 "16:11:8", /* Loop alignment. */
3657 "16:11:8", /* Jump alignment. */
3658 "0:0:8", /* Label alignment. */
3659 "16", /* Func alignment. */
3660 4, /* Small unroll limit. */
3661 2, /* Small unroll factor. */
3662};
3663
6f6ea27d 3664/* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. */
3665static stringop_algs shijidadao_memcpy[2] = {
3666 {libcall, {{8, unrolled_loop, true}, {256, unrolled_loop, false},
3667 {-1, libcall, false}}},
3668 {libcall, {{10, loop, true}, {256, unrolled_loop, false},
3669 {-1, libcall, false}}}};
3670static stringop_algs shijidadao_memset[2] = {
3671 {libcall, {{4, loop, true}, {128, unrolled_loop, false},
3672 {-1, libcall, false}}},
3673 {libcall, {{1, rep_prefix_4_byte, false}, {14, loop, true},
3674 {1024, vector_loop, false},
3675 {-1, libcall, false}}}};
3676static const
3677struct processor_costs shijidadao_cost = {
3678 {
3679 /* Start of register allocator costs. integer->integer move cost is 2. */
3680 8, /* cost for loading QImode using movzbl. */
3681 {8, 8, 8}, /* cost of loading integer registers
3682 in QImode, HImode and SImode.
3683 Relative to reg-reg move (2). */
3684 {8, 8, 8}, /* cost of storing integer registers. */
3685 2, /* cost of reg,reg fld/fst. */
3686 {8, 8, 8}, /* cost of loading fp registers
3687 in SFmode, DFmode and XFmode. */
3688 {8, 8, 8}, /* cost of storing fp registers
3689 in SFmode, DFmode and XFmode. */
3690 2, /* cost of moving MMX register. */
3691 {8, 8}, /* cost of loading MMX registers
3692 in SImode and DImode. */
3693 {8, 8}, /* cost of storing MMX registers
3694 in SImode and DImode. */
3695 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3696 {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3697 in 32,64,128,256 and 512-bit. */
3698 {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3699 in 32,64,128,256 and 512-bit. */
3700 8, 8, /* SSE->integer and integer->SSE moves. */
3701 8, 8, /* mask->integer and integer->mask moves. */
3702 {8, 8, 8}, /* cost of loading mask register
3703 in QImode, HImode, SImode. */
3704 {8, 8, 8}, /* cost if storing mask register
3705 in QImode, HImode, SImode. */
3706 2, /* cost of moving mask register. */
3707 /* End of register allocator costs. */
3708 },
3709
3710 COSTS_N_INSNS (1), /* cost of an add instruction. */
3711 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3712 COSTS_N_INSNS (1), /* variable shift costs. */
3713 COSTS_N_INSNS (1), /* constant shift costs. */
3714 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3715 COSTS_N_INSNS (3), /* HI. */
3716 COSTS_N_INSNS (2), /* SI. */
3717 COSTS_N_INSNS (2), /* DI. */
3718 COSTS_N_INSNS (3)}, /* other. */
3719 0, /* cost of multiply per each bit set. */
3720 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
3721 COSTS_N_INSNS (10), /* HI. */
3722 COSTS_N_INSNS (9), /* SI. */
3723 COSTS_N_INSNS (50), /* DI. */
3724 COSTS_N_INSNS (50)}, /* other. */
3725 COSTS_N_INSNS (1), /* cost of movsx. */
3726 COSTS_N_INSNS (1), /* cost of movzx. */
3727 8, /* "large" insn. */
3728 17, /* MOVE_RATIO. */
3729 6, /* CLEAR_RATIO. */
3730 {8, 8, 8}, /* cost of loading integer registers
3731 in QImode, HImode and SImode.
3732 Relative to reg-reg move (2). */
3733 {8, 8, 8}, /* cost of storing integer registers. */
3734 {8, 8, 8, 12, 15}, /* cost of loading SSE register
3735 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3736 {8, 8, 8, 12, 15}, /* cost of storing SSE register
3737 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3738 {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3739 {8, 8, 8, 12, 15}, /* cost of unaligned storess. */
3740 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3741 8, /* cost of moving SSE register to integer. */
3742 18, 6, /* Gather load static, per_elt. */
3743 18, 6, /* Gather store static, per_elt. */
3744 32, /* size of l1 cache. */
3745 256, /* size of l2 cache. */
3746 64, /* size of prefetch block. */
3747 12, /* number of parallel prefetches. */
3748 3, /* Branch cost. */
3749 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3750 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3751 COSTS_N_INSNS (13), /* cost of FDIV instruction. */
3752 COSTS_N_INSNS (2), /* cost of FABS instruction. */
3753 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
3754 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3755
3756 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3757 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3758 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3759 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3760 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3761 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3762 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
3763 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3764 COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */
3765 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3766 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
3767 shijidadao_memcpy,
3768 shijidadao_memset,
3769 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3770 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3771 "16:11:8", /* Loop alignment. */
3772 "16:11:8", /* Jump alignment. */
3773 "0:0:8", /* Label alignment. */
3774 "16", /* Func alignment. */
3775 4, /* Small unroll limit. */
3776 2, /* Small unroll factor. */
3777};
3778
3779
94c0b26f 3780
64766e8d
JH
3781/* Generic should produce code tuned for Core-i7 (and newer chips)
3782 and btver1 (and newer chips). */
3783
3784static stringop_algs generic_memcpy[2] = {
3785 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3786 {-1, libcall, false}}},
3787 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3788 {-1, libcall, false}}}};
3789static stringop_algs generic_memset[2] = {
3790 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3791 {-1, libcall, false}}},
3792 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3793 {-1, libcall, false}}}};
3794static const
3795struct processor_costs generic_cost = {
72bb85f8 3796 {
d321551c
L
3797 /* Start of register allocator costs. integer->integer move cost is 2. */
3798 6, /* cost for loading QImode using movzbl */
3799 {6, 6, 6}, /* cost of loading integer registers
3800 in QImode, HImode and SImode.
3801 Relative to reg-reg move (2). */
3802 {6, 6, 6}, /* cost of storing integer registers */
3803 4, /* cost of reg,reg fld/fst */
3804 {6, 6, 12}, /* cost of loading fp registers
3805 in SFmode, DFmode and XFmode */
3806 {6, 6, 12}, /* cost of storing fp registers
3807 in SFmode, DFmode and XFmode */
3808 2, /* cost of moving MMX register */
3809 {6, 6}, /* cost of loading MMX registers
3810 in SImode and DImode */
3811 {6, 6}, /* cost of storing MMX registers
3812 in SImode and DImode */
3813 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3814 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3815 in 32,64,128,256 and 512-bit */
3816 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3817 in 32,64,128,256 and 512-bit */
ecc3135a 3818 6, 6, /* SSE->integer and integer->SSE moves */
3819 6, 6, /* mask->integer and integer->mask moves */
00cb3494
L
3820 {6, 6, 6}, /* cost of loading mask register
3821 in QImode, HImode, SImode. */
3822 {6, 6, 6}, /* cost if storing mask register
3823 in QImode, HImode, SImode. */
3824 2, /* cost of moving mask register. */
d321551c 3825 /* End of register allocator costs. */
72bb85f8 3826 },
d321551c 3827
64766e8d 3828 COSTS_N_INSNS (1), /* cost of an add instruction */
ef9eec0b 3829 /* Setting cost to 2 makes our current implementation of synth_mult result in
64766e8d
JH
3830 use of unnecessary temporary registers causing regression on several
3831 SPECfp benchmarks. */
3832 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3833 COSTS_N_INSNS (1), /* variable shift costs */
3834 COSTS_N_INSNS (1), /* constant shift costs */
3835 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
bc00de07 3836 COSTS_N_INSNS (3), /* HI */
64766e8d 3837 COSTS_N_INSNS (3), /* SI */
bc00de07 3838 COSTS_N_INSNS (3), /* DI */
7c080ade 3839 COSTS_N_INSNS (4)}, /* other */
64766e8d 3840 0, /* cost of multiply per each bit set */
7c080ade
JH
3841 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3842 COSTS_N_INSNS (22), /* HI */
3843 COSTS_N_INSNS (30), /* SI */
64766e8d
JH
3844 COSTS_N_INSNS (74), /* DI */
3845 COSTS_N_INSNS (74)}, /* other */
3846 COSTS_N_INSNS (1), /* cost of movsx */
3847 COSTS_N_INSNS (1), /* cost of movzx */
3848 8, /* "large" insn */
3849 17, /* MOVE_RATIO */
25e22b19 3850 6, /* CLEAR_RATIO */
d555138e 3851 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
3852 in QImode, HImode and SImode.
3853 Relative to reg-reg move (2). */
af863030 3854 {6, 6, 6}, /* cost of storing integer registers */
d321551c
L
3855 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3856 in 32bit, 64bit, 128bit, 256bit and 512bit */
3857 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3858 in 32bit, 64bit, 128bit, 256bit and 512bit */
7c080ade 3859 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
7c080ade 3860 {6, 6, 6, 10, 15}, /* cost of unaligned storess. */
d321551c
L
3861 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3862 6, /* cost of moving SSE register to integer. */
7c080ade
JH
3863 18, 6, /* Gather load static, per_elt. */
3864 18, 6, /* Gather store static, per_elt. */
64766e8d
JH
3865 32, /* size of l1 cache. */
3866 512, /* size of l2 cache. */
3867 64, /* size of prefetch block */
3868 6, /* number of parallel prefetches */
3869 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
3870 value is increased to perhaps more appropriate value of 5. */
3871 3, /* Branch cost */
ef9eec0b 3872 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
7c080ade 3873 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
e8e3054e 3874 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
ef9eec0b
JH
3875 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3876 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
e8e3054e 3877 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
6065f444 3878
ef9eec0b
JH
3879 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3880 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3881 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3882 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3883 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3884 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
e8e3054e
JH
3885 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3886 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3887 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3888 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
7c080ade 3889 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
64766e8d
JH
3890 generic_memcpy,
3891 generic_memset,
e8e3054e
JH
3892 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3893 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
00ed5424 3894 "16", /* Loop alignment. */
7dc58b50
ML
3895 "16:11:8", /* Jump alignment. */
3896 "0:0:8", /* Label alignment. */
3897 "16", /* Func alignment. */
071e428c
HW
3898 4, /* Small unroll limit. */
3899 2, /* Small unroll factor. */
64766e8d
JH
3900};
3901
3902/* core_cost should produce code tuned for Core familly of CPUs. */
3903static stringop_algs core_memcpy[2] = {
3904 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
3905 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
3906 {-1, libcall, false}}}};
3907static stringop_algs core_memset[2] = {
3908 {libcall, {{6, loop_1_byte, true},
3909 {24, loop, true},
3910 {8192, rep_prefix_4_byte, true},
3911 {-1, libcall, false}}},
3912 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
3913 {-1, libcall, false}}}};
3914
3915static const
3916struct processor_costs core_cost = {
72bb85f8 3917 {
d321551c
L
3918 /* Start of register allocator costs. integer->integer move cost is 2. */
3919 6, /* cost for loading QImode using movzbl */
3920 {4, 4, 4}, /* cost of loading integer registers
3921 in QImode, HImode and SImode.
3922 Relative to reg-reg move (2). */
3923 {6, 6, 6}, /* cost of storing integer registers */
3924 2, /* cost of reg,reg fld/fst */
3925 {6, 6, 8}, /* cost of loading fp registers
3926 in SFmode, DFmode and XFmode */
3927 {6, 6, 10}, /* cost of storing fp registers
3928 in SFmode, DFmode and XFmode */
3929 2, /* cost of moving MMX register */
3930 {6, 6}, /* cost of loading MMX registers
3931 in SImode and DImode */
3932 {6, 6}, /* cost of storing MMX registers
3933 in SImode and DImode */
3934 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
3935 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
3936 in 32,64,128,256 and 512-bit */
3937 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
3938 in 32,64,128,256 and 512-bit */
ecc3135a 3939 6, 6, /* SSE->integer and integer->SSE moves */
3940 6, 6, /* mask->integer and integer->mask moves */
00cb3494
L
3941 {4, 4, 4}, /* cost of loading mask register
3942 in QImode, HImode, SImode. */
3943 {6, 6, 6}, /* cost if storing mask register
3944 in QImode, HImode, SImode. */
3945 2, /* cost of moving mask register. */
d321551c 3946 /* End of register allocator costs. */
72bb85f8 3947 },
d321551c 3948
64766e8d
JH
3949 COSTS_N_INSNS (1), /* cost of an add instruction */
3950 /* On all chips taken into consideration lea is 2 cycles and more. With
3951 this cost however our current implementation of synth_mult results in
3952 use of unnecessary temporary registers causing regression on several
3953 SPECfp benchmarks. */
3954 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3955 COSTS_N_INSNS (1), /* variable shift costs */
3956 COSTS_N_INSNS (1), /* constant shift costs */
3957 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3958 COSTS_N_INSNS (4), /* HI */
3959 COSTS_N_INSNS (3), /* SI */
a2ef9558
MT
3960 /* Here we tune for Sandybridge or newer. */
3961 COSTS_N_INSNS (3), /* DI */
3962 COSTS_N_INSNS (3)}, /* other */
64766e8d 3963 0, /* cost of multiply per each bit set */
02308bd3
MT
3964 /* Expanding div/mod currently doesn't consider parallelism. So the cost
3965 model is not realistic. We compensate by increasing the latencies a bit. */
3966 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
3967 COSTS_N_INSNS (11), /* HI */
3968 COSTS_N_INSNS (14), /* SI */
ffa3ce53
JH
3969 COSTS_N_INSNS (81), /* DI */
3970 COSTS_N_INSNS (81)}, /* other */
64766e8d
JH
3971 COSTS_N_INSNS (1), /* cost of movsx */
3972 COSTS_N_INSNS (1), /* cost of movzx */
3973 8, /* "large" insn */
3974 17, /* MOVE_RATIO */
25e22b19 3975 6, /* CLEAR_RATIO */
64766e8d
JH
3976 {4, 4, 4}, /* cost of loading integer registers
3977 in QImode, HImode and SImode.
3978 Relative to reg-reg move (2). */
ffa3ce53 3979 {6, 6, 6}, /* cost of storing integer registers */
d321551c
L
3980 {6, 6, 6, 6, 12}, /* cost of loading SSE register
3981 in 32bit, 64bit, 128bit, 256bit and 512bit */
3982 {6, 6, 6, 6, 12}, /* cost of storing SSE register
3983 in 32bit, 64bit, 128bit, 256bit and 512bit */
df41dbaf 3984 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
df41dbaf 3985 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
d321551c
L
3986 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
3987 2, /* cost of moving SSE register to integer. */
a4fe6139
JH
3988 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
3989 rec. throughput 6.
3990 So 5 uops statically and one uops per load. */
3991 10, 6, /* Gather load static, per_elt. */
3992 10, 6, /* Gather store static, per_elt. */
64766e8d
JH
3993 64, /* size of l1 cache. */
3994 512, /* size of l2 cache. */
3995 64, /* size of prefetch block */
3996 6, /* number of parallel prefetches */
3997 /* FIXME perhaps more appropriate value is 5. */
3998 3, /* Branch cost */
ef9eec0b
JH
3999 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
4000 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
ffa3ce53 4001 /* 10-24 */
ef9eec0b
JH
4002 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
4003 COSTS_N_INSNS (1), /* cost of FABS instruction. */
4004 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
ffa3ce53 4005 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
6065f444 4006
c53c148c 4007 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
4008 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
4009 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
4010 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
4011 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
4012 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
4013 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
4014 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
4015 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
4016 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
64766e8d
JH
4017 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
4018 core_memcpy,
4019 core_memset,
f6fd8f2b
JH
4020 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
4021 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
4022 "16:11:8", /* Loop alignment. */
4023 "16:11:8", /* Jump alignment. */
4024 "0:0:8", /* Label alignment. */
4025 "16", /* Func alignment. */
071e428c
HW
4026 4, /* Small unroll limit. */
4027 2, /* Small unroll factor. */
64766e8d
JH
4028};
4029