1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2019 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes,
   so COSTS_N_BYTES (2), the size of one add, equals COSTS_N_INSNS (1).  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder stringop_algs initializer: libcall as the leading algorithm and
   a single {-1, libcall, false} bucket.  The per-CPU memcpy/memset tables
   below use it as their second element.  */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
30 static stringop_algs ix86_size_memcpy
[2] = {
31 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}},
32 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}}};
33 static stringop_algs ix86_size_memset
[2] = {
34 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}},
35 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}}};
/* Cost table used when tuning for code size.  This is a positional
   initializer, so entry order matters; the trailing comment on each entry
   names what it fills.  Instruction costs (add, lea, shifts, multiply,
   divide, FP and SSE operations, branches) are written with COSTS_N_BYTES.
   NOTE(review): this copy shows no closing brace for the initializer and
   its embedded numbering skips values, so some entries may be missing --
   compare with the struct processor_costs declaration before editing.  */
38 struct processor_costs ix86_size_cost
= {/* costs for tuning for size */
40 /* Start of register allocator costs. integer->integer move cost is 2. */
41 2, /* cost for loading QImode using movzbl */
42 {2, 2, 2}, /* cost of loading integer registers
43 in QImode, HImode and SImode.
44 Relative to reg-reg move (2). */
45 {2, 2, 2}, /* cost of storing integer registers */
46 2, /* cost of reg,reg fld/fst */
47 {2, 2, 2}, /* cost of loading fp registers
48 in SFmode, DFmode and XFmode */
49 {2, 2, 2}, /* cost of storing fp registers
50 in SFmode, DFmode and XFmode */
51 3, /* cost of moving MMX register */
52 {3, 3}, /* cost of loading MMX registers
53 in SImode and DImode */
54 {3, 3}, /* cost of storing MMX registers
55 in SImode and DImode */
56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
58 in 32,64,128,256 and 512-bit */
59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
60 in 32,64,128,256 and 512-bit */
61 3, 3, /* SSE->integer and integer->SSE moves */
62 /* End of register allocator costs. */
65 COSTS_N_BYTES (2), /* cost of an add instruction */
66 COSTS_N_BYTES (3), /* cost of a lea instruction */
67 COSTS_N_BYTES (2), /* variable shift costs */
68 COSTS_N_BYTES (3), /* constant shift costs */
69 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
70 COSTS_N_BYTES (3), /* HI */
71 COSTS_N_BYTES (3), /* SI */
72 COSTS_N_BYTES (3), /* DI */
73 COSTS_N_BYTES (5)}, /* other */
74 0, /* cost of multiply per each bit set */
75 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
76 COSTS_N_BYTES (3), /* HI */
77 COSTS_N_BYTES (3), /* SI */
78 COSTS_N_BYTES (3), /* DI */
79 COSTS_N_BYTES (5)}, /* other */
80 COSTS_N_BYTES (3), /* cost of movsx */
81 COSTS_N_BYTES (3), /* cost of movzx */
85 {2, 2, 2}, /* cost of loading integer registers
86 in QImode, HImode and SImode.
87 Relative to reg-reg move (2). */
88 {2, 2, 2}, /* cost of storing integer registers */
89 {3, 3, 3, 3, 3}, /* cost of loading SSE register
90 in 32bit, 64bit, 128bit, 256bit and 512bit */
91 {3, 3, 3, 3, 3}, /* cost of storing SSE register
92 in 32bit, 64bit, 128bit, 256bit and 512bit */
93 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load; five entries,
94 presumably 32bit, 64bit, 128bit, 256bit and 512bit like the
   aligned tables above (the earlier text named only three sizes)
   -- TODO confirm */
95 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store; five entries,
96 presumably 32bit, 64bit, 128bit, 256bit and 512bit like the
   aligned tables above (the earlier text named only three sizes)
   -- TODO confirm */
97 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
98 3, /* cost of moving SSE register to integer. */
99 5, 0, /* Gather load static, per_elt. */
100 5, 0, /* Gather store static, per_elt. */
101 0, /* size of l1 cache */
102 0, /* size of l2 cache */
103 0, /* size of prefetch block */
104 0, /* number of parallel prefetches */
106 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
107 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
108 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
109 COSTS_N_BYTES (2), /* cost of FABS instruction. */
110 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
111 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
113 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
114 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
115 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
116 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
117 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
118 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
119 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
120 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
121 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
122 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
123 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
126 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
127 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
128 NULL
, /* Loop alignment. */
129 NULL
, /* Jump alignment. */
130 NULL
, /* Label alignment. */
131 NULL
, /* Func alignment. */
134 /* Processor costs (relative to an add) */
135 static stringop_algs i386_memcpy
[2] = {
136 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}},
137 DUMMY_STRINGOP_ALGS
};
138 static stringop_algs i386_memset
[2] = {
139 {rep_prefix_1_byte
, {{-1, rep_prefix_1_byte
, false}}},
140 DUMMY_STRINGOP_ALGS
};
/* Cost table for the 386.  This is a positional initializer, so entry order
   matters; the trailing comment on each entry names what it fills.
   Instruction costs are written with COSTS_N_INSNS.
   NOTE(review): this copy shows no closing brace for the initializer and
   its embedded numbering skips values, so some entries may be missing --
   compare with the struct processor_costs declaration before editing.  */
143 struct processor_costs i386_cost
= { /* 386 specific costs */
145 /* Start of register allocator costs. integer->integer move cost is 2. */
146 4, /* cost for loading QImode using movzbl */
147 {2, 4, 2}, /* cost of loading integer registers
148 in QImode, HImode and SImode.
149 Relative to reg-reg move (2). */
150 {2, 4, 2}, /* cost of storing integer registers */
151 2, /* cost of reg,reg fld/fst */
152 {8, 8, 8}, /* cost of loading fp registers
153 in SFmode, DFmode and XFmode */
154 {8, 8, 8}, /* cost of storing fp registers
155 in SFmode, DFmode and XFmode */
156 2, /* cost of moving MMX register */
157 {4, 8}, /* cost of loading MMX registers
158 in SImode and DImode */
159 {4, 8}, /* cost of storing MMX registers
160 in SImode and DImode */
161 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
162 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
163 in 32,64,128,256 and 512-bit */
164 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
165 in 32,64,128,256 and 512-bit */
166 3, 3, /* SSE->integer and integer->SSE moves */
167 /* End of register allocator costs. */
170 COSTS_N_INSNS (1), /* cost of an add instruction */
171 COSTS_N_INSNS (1), /* cost of a lea instruction */
172 COSTS_N_INSNS (3), /* variable shift costs */
173 COSTS_N_INSNS (2), /* constant shift costs */
174 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
175 COSTS_N_INSNS (6), /* HI */
176 COSTS_N_INSNS (6), /* SI */
177 COSTS_N_INSNS (6), /* DI */
178 COSTS_N_INSNS (6)}, /* other */
179 COSTS_N_INSNS (1), /* cost of multiply per each bit set.
   NOTE(review): wrapped in COSTS_N_INSNS here, whereas the other
   CPU tables in this file give this entry as a bare 0 or 1 --
   confirm the intended unit.  */
180 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
181 COSTS_N_INSNS (23), /* HI */
182 COSTS_N_INSNS (23), /* SI */
183 COSTS_N_INSNS (23), /* DI */
184 COSTS_N_INSNS (23)}, /* other */
185 COSTS_N_INSNS (3), /* cost of movsx */
186 COSTS_N_INSNS (2), /* cost of movzx */
187 15, /* "large" insn */
190 {2, 4, 2}, /* cost of loading integer registers
191 in QImode, HImode and SImode.
192 Relative to reg-reg move (2). */
193 {2, 4, 2}, /* cost of storing integer registers */
194 {4, 8, 16, 32, 64}, /* cost of loading SSE register
195 in 32bit, 64bit, 128bit, 256bit and 512bit */
196 {4, 8, 16, 32, 64}, /* cost of storing SSE register
197 in 32bit, 64bit, 128bit, 256bit and 512bit */
198 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
199 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
200 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
201 3, /* cost of moving SSE register to integer. */
202 4, 4, /* Gather load static, per_elt. */
203 4, 4, /* Gather store static, per_elt. */
204 0, /* size of l1 cache */
205 0, /* size of l2 cache */
206 0, /* size of prefetch block */
207 0, /* number of parallel prefetches */
209 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
210 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
211 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
212 COSTS_N_INSNS (22), /* cost of FABS instruction. */
213 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
214 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
216 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
217 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
218 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
219 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
220 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
221 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
222 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
223 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
224 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
225 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
226 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
229 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
230 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
231 "4", /* Loop alignment. */
232 "4", /* Jump alignment. */
233 NULL
, /* Label alignment. */
234 "4", /* Func alignment. */
237 static stringop_algs i486_memcpy
[2] = {
238 {rep_prefix_4_byte
, {{-1, rep_prefix_4_byte
, false}}},
239 DUMMY_STRINGOP_ALGS
};
240 static stringop_algs i486_memset
[2] = {
241 {rep_prefix_4_byte
, {{-1, rep_prefix_4_byte
, false}}},
242 DUMMY_STRINGOP_ALGS
};
/* Cost table for the 486.  This is a positional initializer, so entry order
   matters; the trailing comment on each entry names what it fills.
   Instruction costs are written with COSTS_N_INSNS.
   NOTE(review): this copy shows no closing brace for the initializer and
   its embedded numbering skips values, so some entries may be missing --
   compare with the struct processor_costs declaration before editing.  */
245 struct processor_costs i486_cost
= { /* 486 specific costs */
247 /* Start of register allocator costs. integer->integer move cost is 2. */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
264 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
265 in 32,64,128,256 and 512-bit */
266 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
267 in 32,64,128,256 and 512-bit */
268 3, 3, /* SSE->integer and integer->SSE moves */
269 /* End of register allocator costs. */
272 COSTS_N_INSNS (1), /* cost of an add instruction */
273 COSTS_N_INSNS (1), /* cost of a lea instruction */
274 COSTS_N_INSNS (3), /* variable shift costs */
275 COSTS_N_INSNS (2), /* constant shift costs */
276 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
277 COSTS_N_INSNS (12), /* HI */
278 COSTS_N_INSNS (12), /* SI */
279 COSTS_N_INSNS (12), /* DI */
280 COSTS_N_INSNS (12)}, /* other */
281 1, /* cost of multiply per each bit set */
282 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
283 COSTS_N_INSNS (40), /* HI */
284 COSTS_N_INSNS (40), /* SI */
285 COSTS_N_INSNS (40), /* DI */
286 COSTS_N_INSNS (40)}, /* other */
287 COSTS_N_INSNS (3), /* cost of movsx */
288 COSTS_N_INSNS (2), /* cost of movzx */
289 15, /* "large" insn */
292 {2, 4, 2}, /* cost of loading integer registers
293 in QImode, HImode and SImode.
294 Relative to reg-reg move (2). */
295 {2, 4, 2}, /* cost of storing integer registers */
296 {4, 8, 16, 32, 64}, /* cost of loading SSE register
297 in 32bit, 64bit, 128bit, 256bit and 512bit */
298 {4, 8, 16, 32, 64}, /* cost of storing SSE register
299 in 32bit, 64bit, 128bit, 256bit and 512bit */
300 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
301 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
302 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
303 3, /* cost of moving SSE register to integer. */
304 4, 4, /* Gather load static, per_elt. */
305 4, 4, /* Gather store static, per_elt. */
306 4, /* size of l1 cache. 486 has 8kB cache
307 shared for code and data, so 4kB is
308 not really precise. */
309 4, /* size of l2 cache */
310 0, /* size of prefetch block */
311 0, /* number of parallel prefetches */
313 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
314 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
315 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
316 COSTS_N_INSNS (3), /* cost of FABS instruction. */
317 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
318 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
320 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
321 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
322 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
323 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
324 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
325 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
326 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
327 COSTS_N_INSNS (74), /* cost of DIVSD instruction.
   NOTE(review): 74 here versus 73 for FDIV and DIVSS above --
   confirm the difference is intentional.  */
328 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
329 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
330 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
333 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
334 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
335 "16", /* Loop alignment. */
336 "16", /* Jump alignment. */
337 "0:0:8", /* Label alignment. */
338 "16", /* Func alignment. */
341 static stringop_algs pentium_memcpy
[2] = {
342 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
343 DUMMY_STRINGOP_ALGS
};
344 static stringop_algs pentium_memset
[2] = {
345 {libcall
, {{-1, rep_prefix_4_byte
, false}}},
346 DUMMY_STRINGOP_ALGS
};
/* Cost table for the Pentium.  This is a positional initializer, so entry
   order matters; the trailing comment on each entry names what it fills.
   Instruction costs are written with COSTS_N_INSNS.
   NOTE(review): this copy shows no closing brace for the initializer and
   its embedded numbering skips values, so some entries may be missing --
   compare with the struct processor_costs declaration before editing.  */
349 struct processor_costs pentium_cost
= {
351 /* Start of register allocator costs. integer->integer move cost is 2. */
352 6, /* cost for loading QImode using movzbl */
353 {2, 4, 2}, /* cost of loading integer registers
354 in QImode, HImode and SImode.
355 Relative to reg-reg move (2). */
356 {2, 4, 2}, /* cost of storing integer registers */
357 2, /* cost of reg,reg fld/fst */
358 {2, 2, 6}, /* cost of loading fp registers
359 in SFmode, DFmode and XFmode */
360 {4, 4, 6}, /* cost of storing fp registers
361 in SFmode, DFmode and XFmode */
362 8, /* cost of moving MMX register */
363 {8, 8}, /* cost of loading MMX registers
364 in SImode and DImode */
365 {8, 8}, /* cost of storing MMX registers
366 in SImode and DImode */
367 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
368 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
369 in 32,64,128,256 and 512-bit */
370 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
371 in 32,64,128,256 and 512-bit */
372 3, 3, /* SSE->integer and integer->SSE moves */
373 /* End of register allocator costs. */
376 COSTS_N_INSNS (1), /* cost of an add instruction */
377 COSTS_N_INSNS (1), /* cost of a lea instruction */
378 COSTS_N_INSNS (4), /* variable shift costs */
379 COSTS_N_INSNS (1), /* constant shift costs */
380 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
381 COSTS_N_INSNS (11), /* HI */
382 COSTS_N_INSNS (11), /* SI */
383 COSTS_N_INSNS (11), /* DI */
384 COSTS_N_INSNS (11)}, /* other */
385 0, /* cost of multiply per each bit set */
386 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
387 COSTS_N_INSNS (25), /* HI */
388 COSTS_N_INSNS (25), /* SI */
389 COSTS_N_INSNS (25), /* DI */
390 COSTS_N_INSNS (25)}, /* other */
391 COSTS_N_INSNS (3), /* cost of movsx */
392 COSTS_N_INSNS (2), /* cost of movzx */
393 8, /* "large" insn */
396 {2, 4, 2}, /* cost of loading integer registers
397 in QImode, HImode and SImode.
398 Relative to reg-reg move (2). */
399 {2, 4, 2}, /* cost of storing integer registers */
400 {4, 8, 16, 32, 64}, /* cost of loading SSE register
401 in 32bit, 64bit, 128bit, 256bit and 512bit */
402 {4, 8, 16, 32, 64}, /* cost of storing SSE register
403 in 32bit, 64bit, 128bit, 256bit and 512bit */
404 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
405 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
406 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
407 3, /* cost of moving SSE register to integer. */
408 4, 4, /* Gather load static, per_elt. */
409 4, 4, /* Gather store static, per_elt. */
410 8, /* size of l1 cache. */
411 8, /* size of l2 cache */
412 0, /* size of prefetch block */
413 0, /* number of parallel prefetches */
415 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
416 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
417 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
418 COSTS_N_INSNS (1), /* cost of FABS instruction. */
419 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
420 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
422 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
423 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
424 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
425 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
426 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
427 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
428 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
429 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
430 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
431 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
432 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
435 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
436 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
437 "16:8:8", /* Loop alignment. */
438 "16:8:8", /* Jump alignment. */
439 "0:0:8", /* Label alignment. */
440 "16", /* Func alignment. */
/* Cost table for Lakemont.  This is a positional initializer, so entry
   order matters; the trailing comment on each entry names what it fills.
   Instruction costs are written with COSTS_N_INSNS.
   NOTE(review): this copy shows no closing brace for the initializer and
   its embedded numbering skips values, so some entries may be missing --
   compare with the struct processor_costs declaration before editing.  */
444 struct processor_costs lakemont_cost
= {
446 /* Start of register allocator costs. integer->integer move cost is 2. */
447 6, /* cost for loading QImode using movzbl */
448 {2, 4, 2}, /* cost of loading integer registers
449 in QImode, HImode and SImode.
450 Relative to reg-reg move (2). */
451 {2, 4, 2}, /* cost of storing integer registers */
452 2, /* cost of reg,reg fld/fst */
453 {2, 2, 6}, /* cost of loading fp registers
454 in SFmode, DFmode and XFmode */
455 {4, 4, 6}, /* cost of storing fp registers
456 in SFmode, DFmode and XFmode */
457 8, /* cost of moving MMX register */
458 {8, 8}, /* cost of loading MMX registers
459 in SImode and DImode */
460 {8, 8}, /* cost of storing MMX registers
461 in SImode and DImode */
462 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
463 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
464 in 32,64,128,256 and 512-bit */
465 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
466 in 32,64,128,256 and 512-bit */
467 3, 3, /* SSE->integer and integer->SSE moves */
468 /* End of register allocator costs. */
471 COSTS_N_INSNS (1), /* cost of an add instruction */
472 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; one unit more
   than the add cost on the line above, so lea is costed as
   slightly more expensive than add.  */
473 COSTS_N_INSNS (1), /* variable shift costs */
474 COSTS_N_INSNS (1), /* constant shift costs */
475 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
476 COSTS_N_INSNS (11), /* HI */
477 COSTS_N_INSNS (11), /* SI */
478 COSTS_N_INSNS (11), /* DI */
479 COSTS_N_INSNS (11)}, /* other */
480 0, /* cost of multiply per each bit set */
481 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
482 COSTS_N_INSNS (25), /* HI */
483 COSTS_N_INSNS (25), /* SI */
484 COSTS_N_INSNS (25), /* DI */
485 COSTS_N_INSNS (25)}, /* other */
486 COSTS_N_INSNS (3), /* cost of movsx */
487 COSTS_N_INSNS (2), /* cost of movzx */
488 8, /* "large" insn */
491 {2, 4, 2}, /* cost of loading integer registers
492 in QImode, HImode and SImode.
493 Relative to reg-reg move (2). */
494 {2, 4, 2}, /* cost of storing integer registers */
495 {4, 8, 16, 32, 64}, /* cost of loading SSE register
496 in 32bit, 64bit, 128bit, 256bit and 512bit */
497 {4, 8, 16, 32, 64}, /* cost of storing SSE register
498 in 32bit, 64bit, 128bit, 256bit and 512bit */
499 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
500 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
501 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
502 3, /* cost of moving SSE register to integer. */
503 4, 4, /* Gather load static, per_elt. */
504 4, 4, /* Gather store static, per_elt. */
505 8, /* size of l1 cache. */
506 8, /* size of l2 cache */
507 0, /* size of prefetch block */
508 0, /* number of parallel prefetches */
510 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
511 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
512 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
513 COSTS_N_INSNS (1), /* cost of FABS instruction. */
514 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
515 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
517 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
518 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
519 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
520 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
521 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
522 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
523 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
524 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
525 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
526 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
527 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
530 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
531 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
532 "16:8:8", /* Loop alignment. */
533 "16:8:8", /* Jump alignment. */
534 "0:0:8", /* Label alignment. */
535 "16", /* Func alignment. */
538 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
539 (we ensure the alignment). For small blocks inline loop is still a
540 noticeable win, for bigger blocks either rep movsl or rep movsb is
541 way to go. Rep movsb has apparently more expensive startup time in CPU,
542 but after 4K the difference is down in the noise. */
543 static stringop_algs pentiumpro_memcpy
[2] = {
544 {rep_prefix_4_byte
, {{128, loop
, false}, {1024, unrolled_loop
, false},
545 {8192, rep_prefix_4_byte
, false},
546 {-1, rep_prefix_1_byte
, false}}},
547 DUMMY_STRINGOP_ALGS
};
548 static stringop_algs pentiumpro_memset
[2] = {
549 {rep_prefix_4_byte
, {{1024, unrolled_loop
, false},
550 {8192, rep_prefix_4_byte
, false},
551 {-1, libcall
, false}}},
552 DUMMY_STRINGOP_ALGS
};
/* Cost table for the PentiumPro.  This is a positional initializer, so
   entry order matters; the trailing comment on each entry names what it
   fills.  Instruction costs are written with COSTS_N_INSNS.
   NOTE(review): this copy shows no closing brace for the initializer and
   its embedded numbering skips values, so some entries may be missing --
   compare with the struct processor_costs declaration before editing.  */
554 struct processor_costs pentiumpro_cost
= {
556 /* Start of register allocator costs. integer->integer move cost is 2. */
557 2, /* cost for loading QImode using movzbl */
558 {4, 4, 4}, /* cost of loading integer registers
559 in QImode, HImode and SImode.
560 Relative to reg-reg move (2). */
561 {2, 2, 2}, /* cost of storing integer registers */
562 2, /* cost of reg,reg fld/fst */
563 {2, 2, 6}, /* cost of loading fp registers
564 in SFmode, DFmode and XFmode */
565 {4, 4, 6}, /* cost of storing fp registers
566 in SFmode, DFmode and XFmode */
567 2, /* cost of moving MMX register */
568 {2, 2}, /* cost of loading MMX registers
569 in SImode and DImode */
570 {2, 2}, /* cost of storing MMX registers
571 in SImode and DImode */
572 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
573 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
574 in 32,64,128,256 and 512-bit */
575 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
576 in 32,64,128,256 and 512-bit */
577 3, 3, /* SSE->integer and integer->SSE moves */
578 /* End of register allocator costs. */
581 COSTS_N_INSNS (1), /* cost of an add instruction */
582 COSTS_N_INSNS (1), /* cost of a lea instruction */
583 COSTS_N_INSNS (1), /* variable shift costs */
584 COSTS_N_INSNS (1), /* constant shift costs */
585 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
586 COSTS_N_INSNS (4), /* HI */
587 COSTS_N_INSNS (4), /* SI */
588 COSTS_N_INSNS (4), /* DI */
589 COSTS_N_INSNS (4)}, /* other */
590 0, /* cost of multiply per each bit set */
591 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
592 COSTS_N_INSNS (17), /* HI */
593 COSTS_N_INSNS (17), /* SI */
594 COSTS_N_INSNS (17), /* DI */
595 COSTS_N_INSNS (17)}, /* other */
596 COSTS_N_INSNS (1), /* cost of movsx */
597 COSTS_N_INSNS (1), /* cost of movzx */
598 8, /* "large" insn */
601 {4, 4, 4}, /* cost of loading integer registers
602 in QImode, HImode and SImode.
603 Relative to reg-reg move (2). */
604 {2, 2, 2}, /* cost of storing integer registers */
605 {4, 8, 16, 32, 64}, /* cost of loading SSE register
606 in 32bit, 64bit, 128bit, 256bit and 512bit */
607 {4, 8, 16, 32, 64}, /* cost of storing SSE register
608 in 32bit, 64bit, 128bit, 256bit and 512bit */
609 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
610 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
611 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
612 3, /* cost of moving SSE register to integer. */
613 4, 4, /* Gather load static, per_elt. */
614 4, 4, /* Gather store static, per_elt. */
615 8, /* size of l1 cache. */
616 256, /* size of l2 cache */
617 32, /* size of prefetch block */
618 6, /* number of parallel prefetches */
620 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
621 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
622 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
623 COSTS_N_INSNS (2), /* cost of FABS instruction. */
624 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
625 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
627 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
628 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
629 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
630 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
631 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
632 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
633 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
634 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
635 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
636 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
637 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
640 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
641 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
642 "16", /* Loop alignment. */
643 "16:11:8", /* Jump alignment. */
644 "0:0:8", /* Label alignment. */
645 "16", /* Func alignment. */
648 static stringop_algs geode_memcpy
[2] = {
649 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
650 DUMMY_STRINGOP_ALGS
};
651 static stringop_algs geode_memset
[2] = {
652 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
653 DUMMY_STRINGOP_ALGS
};
/* Cost table for Geode.  This is a positional initializer, so entry order
   matters; the trailing comment on each entry names what it fills.
   Instruction costs are written with COSTS_N_INSNS.
   NOTE(review): this copy shows no closing brace for the initializer and
   its embedded numbering skips values, so some entries may be missing --
   compare with the struct processor_costs declaration before editing.  */
655 struct processor_costs geode_cost
= {
657 /* Start of register allocator costs. integer->integer move cost is 2. */
658 2, /* cost for loading QImode using movzbl */
659 {2, 2, 2}, /* cost of loading integer registers
660 in QImode, HImode and SImode.
661 Relative to reg-reg move (2). */
662 {2, 2, 2}, /* cost of storing integer registers */
663 2, /* cost of reg,reg fld/fst */
664 {2, 2, 2}, /* cost of loading fp registers
665 in SFmode, DFmode and XFmode */
666 {4, 6, 6}, /* cost of storing fp registers
667 in SFmode, DFmode and XFmode */
668 2, /* cost of moving MMX register */
669 {2, 2}, /* cost of loading MMX registers
670 in SImode and DImode */
671 {2, 2}, /* cost of storing MMX registers
672 in SImode and DImode */
673 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
674 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
675 in 32,64,128,256 and 512-bit */
676 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
677 in 32,64,128,256 and 512-bit */
678 6, 6, /* SSE->integer and integer->SSE moves */
679 /* End of register allocator costs. */
682 COSTS_N_INSNS (1), /* cost of an add instruction */
683 COSTS_N_INSNS (1), /* cost of a lea instruction */
684 COSTS_N_INSNS (2), /* variable shift costs */
685 COSTS_N_INSNS (1), /* constant shift costs */
686 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
687 COSTS_N_INSNS (4), /* HI */
688 COSTS_N_INSNS (7), /* SI */
689 COSTS_N_INSNS (7), /* DI */
690 COSTS_N_INSNS (7)}, /* other */
691 0, /* cost of multiply per each bit set */
692 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
693 COSTS_N_INSNS (23), /* HI */
694 COSTS_N_INSNS (39), /* SI */
695 COSTS_N_INSNS (39), /* DI */
696 COSTS_N_INSNS (39)}, /* other */
697 COSTS_N_INSNS (1), /* cost of movsx */
698 COSTS_N_INSNS (1), /* cost of movzx */
699 8, /* "large" insn */
702 {2, 2, 2}, /* cost of loading integer registers
703 in QImode, HImode and SImode.
704 Relative to reg-reg move (2). */
705 {2, 2, 2}, /* cost of storing integer registers */
706 {2, 2, 8, 16, 32}, /* cost of loading SSE register
707 in 32bit, 64bit, 128bit, 256bit and 512bit */
708 {2, 2, 8, 16, 32}, /* cost of storing SSE register
709 in 32bit, 64bit, 128bit, 256bit and 512bit */
710 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
711 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
712 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
713 6, /* cost of moving SSE register to integer. */
714 2, 2, /* Gather load static, per_elt. */
715 2, 2, /* Gather store static, per_elt. */
716 64, /* size of l1 cache. */
717 128, /* size of l2 cache. */
718 32, /* size of prefetch block */
719 1, /* number of parallel prefetches */
721 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
722 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
723 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
724 COSTS_N_INSNS (1), /* cost of FABS instruction. */
725 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
726 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
728 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
729 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
730 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
731 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
732 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
733 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
734 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
735 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
736 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
737 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
738 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
741 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
742 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
743 NULL
, /* Loop alignment. */
744 NULL
, /* Jump alignment. */
745 NULL
, /* Label alignment. */
746 NULL
, /* Func alignment. */
749 static stringop_algs k6_memcpy
[2] = {
750 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
751 DUMMY_STRINGOP_ALGS
};
752 static stringop_algs k6_memset
[2] = {
753 {libcall
, {{256, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
754 DUMMY_STRINGOP_ALGS
};
/* Cost table for K6 (going by the name).  Entries are positional; the
   trailing comment on each line names the value it supplies.
   NOTE(review): unlike znver1_cost, no MOVE_RATIO, CLEAR_RATIO or
   "Branch cost" entry appears here, k6_memcpy/k6_memset are never
   referenced, and there is no closing "};" -- lines look to have been
   lost; confirm against the upstream table before relying on positions.  */
756 struct processor_costs k6_cost
= {
758 /* Start of register allocator costs. integer->integer move cost is 2. */
759 3, /* cost for loading QImode using movzbl */
760 {4, 5, 4}, /* cost of loading integer registers
761 in QImode, HImode and SImode.
762 Relative to reg-reg move (2). */
763 {2, 3, 2}, /* cost of storing integer registers */
764 4, /* cost of reg,reg fld/fst */
765 {6, 6, 6}, /* cost of loading fp registers
766 in SFmode, DFmode and XFmode */
767 {4, 4, 4}, /* cost of storing fp registers
768 in SFmode, DFmode and XFmode */
769 2, /* cost of moving MMX register */
770 {2, 2}, /* cost of loading MMX registers
771 in SImode and DImode */
772 {2, 2}, /* cost of storing MMX registers
773 in SImode and DImode */
774 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
775 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
776 in 32,64,128,256 and 512-bit */
777 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
778 in 32,64,128,256 and 512-bit */
779 6, 6, /* SSE->integer and integer->SSE moves */
780 /* End of register allocator costs. */
783 COSTS_N_INSNS (1), /* cost of an add instruction */
784 COSTS_N_INSNS (2), /* cost of a lea instruction */
785 COSTS_N_INSNS (1), /* variable shift costs */
786 COSTS_N_INSNS (1), /* constant shift costs */
787 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
788 COSTS_N_INSNS (3), /* HI */
789 COSTS_N_INSNS (3), /* SI */
790 COSTS_N_INSNS (3), /* DI */
791 COSTS_N_INSNS (3)}, /* other */
792 0, /* cost of multiply per each bit set */
793 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
794 COSTS_N_INSNS (18), /* HI */
795 COSTS_N_INSNS (18), /* SI */
796 COSTS_N_INSNS (18), /* DI */
797 COSTS_N_INSNS (18)}, /* other */
798 COSTS_N_INSNS (2), /* cost of movsx */
799 COSTS_N_INSNS (2), /* cost of movzx */
800 8, /* "large" insn */
803 {4, 5, 4}, /* cost of loading integer registers
804 in QImode, HImode and SImode.
805 Relative to reg-reg move (2). */
806 {2, 3, 2}, /* cost of storing integer registers */
807 {2, 2, 8, 16, 32}, /* cost of loading SSE register
808 in 32bit, 64bit, 128bit, 256bit and 512bit */
809 {2, 2, 8, 16, 32}, /* cost of storing SSE register
810 in 32bit, 64bit, 128bit, 256bit and 512bit */
811 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
812 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
813 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
814 6, /* cost of moving SSE register to integer. */
815 2, 2, /* Gather load static, per_elt. */
816 2, 2, /* Gather store static, per_elt. */
817 32, /* size of l1 cache. */
818 32, /* size of l2 cache. Some models
819 have integrated l2 cache, but
820 optimizing for k6 is not important
821 enough to worry about that. */
822 32, /* size of prefetch block */
823 1, /* number of parallel prefetches */
825 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
826 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
827 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
828 COSTS_N_INSNS (2), /* cost of FABS instruction. */
829 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
830 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
832 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
833 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
834 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
835 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
836 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
837 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
838 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
839 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
840 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
841 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
842 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
845 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
846 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
847 "32:8:8", /* Loop alignment. */
848 "32:8:8", /* Jump alignment. */
849 "0:0:8", /* Label alignment. */
850 "32", /* Func alignment. */
/* For some reason, Athlon deals better with the REP prefix (relative to
   loops) than K8 does.  Alignment becomes important after 8 bytes for
   memcpy and 128 bytes for memset.  */
856 static stringop_algs athlon_memcpy
[2] = {
857 {libcall
, {{2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
858 DUMMY_STRINGOP_ALGS
};
859 static stringop_algs athlon_memset
[2] = {
860 {libcall
, {{2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
861 DUMMY_STRINGOP_ALGS
};
/* Cost table for Athlon (going by the name).  Entries are positional; the
   trailing comment on each line names the value it supplies.
   NOTE(review): unlike znver1_cost, no MOVE_RATIO, CLEAR_RATIO or
   "Branch cost" entry appears here, athlon_memcpy/athlon_memset are never
   referenced, and there is no closing "};" -- lines look to have been
   lost; confirm against the upstream table before relying on positions.  */
863 struct processor_costs athlon_cost
= {
865 /* Start of register allocator costs. integer->integer move cost is 2. */
866 4, /* cost for loading QImode using movzbl */
867 {3, 4, 3}, /* cost of loading integer registers
868 in QImode, HImode and SImode.
869 Relative to reg-reg move (2). */
870 {3, 4, 3}, /* cost of storing integer registers */
871 4, /* cost of reg,reg fld/fst */
872 {4, 4, 12}, /* cost of loading fp registers
873 in SFmode, DFmode and XFmode */
874 {6, 6, 8}, /* cost of storing fp registers
875 in SFmode, DFmode and XFmode */
876 2, /* cost of moving MMX register */
877 {4, 4}, /* cost of loading MMX registers
878 in SImode and DImode */
879 {4, 4}, /* cost of storing MMX registers
880 in SImode and DImode */
881 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
882 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
883 in 32,64,128,256 and 512-bit */
884 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
885 in 32,64,128,256 and 512-bit */
886 5, 5, /* SSE->integer and integer->SSE moves */
887 /* End of register allocator costs. */
890 COSTS_N_INSNS (1), /* cost of an add instruction */
891 COSTS_N_INSNS (2), /* cost of a lea instruction */
892 COSTS_N_INSNS (1), /* variable shift costs */
893 COSTS_N_INSNS (1), /* constant shift costs */
894 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
895 COSTS_N_INSNS (5), /* HI */
896 COSTS_N_INSNS (5), /* SI */
897 COSTS_N_INSNS (5), /* DI */
898 COSTS_N_INSNS (5)}, /* other */
899 0, /* cost of multiply per each bit set */
900 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
901 COSTS_N_INSNS (26), /* HI */
902 COSTS_N_INSNS (42), /* SI */
903 COSTS_N_INSNS (74), /* DI */
904 COSTS_N_INSNS (74)}, /* other */
905 COSTS_N_INSNS (1), /* cost of movsx */
906 COSTS_N_INSNS (1), /* cost of movzx */
907 8, /* "large" insn */
910 {3, 4, 3}, /* cost of loading integer registers
911 in QImode, HImode and SImode.
912 Relative to reg-reg move (2). */
913 {3, 4, 3}, /* cost of storing integer registers */
914 {4, 4, 12, 12, 24}, /* cost of loading SSE register
915 in 32bit, 64bit, 128bit, 256bit and 512bit */
916 {4, 4, 10, 10, 20}, /* cost of storing SSE register
917 in 32bit, 64bit, 128bit, 256bit and 512bit */
918 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
919 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
920 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
921 5, /* cost of moving SSE register to integer. */
922 4, 4, /* Gather load static, per_elt. */
923 4, 4, /* Gather store static, per_elt. */
924 64, /* size of l1 cache. */
925 256, /* size of l2 cache. */
926 64, /* size of prefetch block */
927 6, /* number of parallel prefetches */
929 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
930 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
931 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
932 COSTS_N_INSNS (2), /* cost of FABS instruction. */
933 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
934 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
936 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
937 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
938 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
939 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
940 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
941 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
943 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
944 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
945 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
946 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
947 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
950 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
951 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
952 "16:8:8", /* Loop alignment. */
953 "16:8:8", /* Jump alignment. */
954 "0:0:8", /* Label alignment. */
955 "16", /* Func alignment. */
/* K8 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a
   libcall can use non-temporal accesses and beat inline code considerably.  */
961 static stringop_algs k8_memcpy
[2] = {
962 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
963 {-1, rep_prefix_4_byte
, false}}},
964 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
965 {-1, libcall
, false}}}};
966 static stringop_algs k8_memset
[2] = {
967 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
968 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
969 {libcall
, {{48, unrolled_loop
, false},
970 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* Cost table for K8 (going by the name).  Entries are positional; the
   trailing comment on each line names the value it supplies.
   NOTE(review): unlike znver1_cost, no MOVE_RATIO entry appears here,
   k8_memcpy/k8_memset are never referenced, and there is no closing "};".
   The "New AMD processors" comment is not closed before the "100,"
   prefetch entry, so that value currently sits inside the comment.
   Lines look to have been lost; confirm against the upstream table.  */
972 struct processor_costs k8_cost
= {
974 /* Start of register allocator costs. integer->integer move cost is 2. */
975 4, /* cost for loading QImode using movzbl */
976 {3, 4, 3}, /* cost of loading integer registers
977 in QImode, HImode and SImode.
978 Relative to reg-reg move (2). */
979 {3, 4, 3}, /* cost of storing integer registers */
980 4, /* cost of reg,reg fld/fst */
981 {4, 4, 12}, /* cost of loading fp registers
982 in SFmode, DFmode and XFmode */
983 {6, 6, 8}, /* cost of storing fp registers
984 in SFmode, DFmode and XFmode */
985 2, /* cost of moving MMX register */
986 {3, 3}, /* cost of loading MMX registers
987 in SImode and DImode */
988 {4, 4}, /* cost of storing MMX registers
989 in SImode and DImode */
990 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
991 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
992 in 32,64,128,256 and 512-bit */
993 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
994 in 32,64,128,256 and 512-bit */
995 5, 5, /* SSE->integer and integer->SSE moves */
996 /* End of register allocator costs. */
999 COSTS_N_INSNS (1), /* cost of an add instruction */
1000 COSTS_N_INSNS (2), /* cost of a lea instruction */
1001 COSTS_N_INSNS (1), /* variable shift costs */
1002 COSTS_N_INSNS (1), /* constant shift costs */
1003 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1004 COSTS_N_INSNS (4), /* HI */
1005 COSTS_N_INSNS (3), /* SI */
1006 COSTS_N_INSNS (4), /* DI */
1007 COSTS_N_INSNS (5)}, /* other */
1008 0, /* cost of multiply per each bit set */
1009 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1010 COSTS_N_INSNS (26), /* HI */
1011 COSTS_N_INSNS (42), /* SI */
1012 COSTS_N_INSNS (74), /* DI */
1013 COSTS_N_INSNS (74)}, /* other */
1014 COSTS_N_INSNS (1), /* cost of movsx */
1015 COSTS_N_INSNS (1), /* cost of movzx */
1016 8, /* "large" insn */
1018 6, /* CLEAR_RATIO */
1019 {3, 4, 3}, /* cost of loading integer registers
1020 in QImode, HImode and SImode.
1021 Relative to reg-reg move (2). */
1022 {3, 4, 3}, /* cost of storing integer registers */
1023 {4, 3, 12, 12, 24}, /* cost of loading SSE register
1024 in 32bit, 64bit, 128bit, 256bit and 512bit */
1025 {4, 4, 10, 10, 20}, /* cost of storing SSE register
1026 in 32bit, 64bit, 128bit, 256bit and 512bit */
1027 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
1028 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
1029 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1030 5, /* cost of moving SSE register to integer. */
1031 4, 4, /* Gather load static, per_elt. */
1032 4, 4, /* Gather store static, per_elt. */
1033 64, /* size of l1 cache. */
1034 512, /* size of l2 cache. */
1035 64, /* size of prefetch block */
1036 /* New AMD processors never drop prefetches; if they cannot be performed
1037 immediately, they are queued. We set number of simultaneous prefetches
1038 to a large constant to reflect this (it probably is not a good idea not
1039 to limit number of prefetches at all, as their execution also takes some
1041 100, /* number of parallel prefetches */
1042 3, /* Branch cost */
1043 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1044 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1045 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1046 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1047 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1048 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1050 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1051 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1052 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1053 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1054 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1055 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1057 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1058 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1059 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1060 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1061 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1064 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1065 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1066 "16:8:8", /* Loop alignment. */
1067 "16:8:8", /* Jump alignment. */
1068 "0:0:8", /* Label alignment. */
1069 "16", /* Func alignment. */
/* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
   for very small blocks it is better to use a loop.  For large blocks, a
   libcall can use non-temporal accesses and beat inline code considerably.  */
1075 static stringop_algs amdfam10_memcpy
[2] = {
1076 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1077 {-1, rep_prefix_4_byte
, false}}},
1078 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
1079 {-1, libcall
, false}}}};
1080 static stringop_algs amdfam10_memset
[2] = {
1081 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1082 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1083 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
1084 {-1, libcall
, false}}}};
/* Cost table for AMDFAM10 (going by the name).  Entries are positional;
   the trailing comment on each line names the value it supplies.
   NOTE(review): unlike znver1_cost, no MOVE_RATIO entry appears here,
   amdfam10_memcpy/amdfam10_memset are never referenced, and there is no
   closing "};".  The four "MOVD ..." lines after the SSE<->integer move
   costs are not enclosed in a comment, and the "New AMD processors"
   comment is not closed before the "100," prefetch entry.  Lines look to
   have been lost; confirm against the upstream table.  */
1085 struct processor_costs amdfam10_cost
= {
1087 /* Start of register allocator costs. integer->integer move cost is 2. */
1088 4, /* cost for loading QImode using movzbl */
1089 {3, 4, 3}, /* cost of loading integer registers
1090 in QImode, HImode and SImode.
1091 Relative to reg-reg move (2). */
1092 {3, 4, 3}, /* cost of storing integer registers */
1093 4, /* cost of reg,reg fld/fst */
1094 {4, 4, 12}, /* cost of loading fp registers
1095 in SFmode, DFmode and XFmode */
1096 {6, 6, 8}, /* cost of storing fp registers
1097 in SFmode, DFmode and XFmode */
1098 2, /* cost of moving MMX register */
1099 {3, 3}, /* cost of loading MMX registers
1100 in SImode and DImode */
1101 {4, 4}, /* cost of storing MMX registers
1102 in SImode and DImode */
1103 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1104 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
1105 in 32,64,128,256 and 512-bit */
1106 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
1107 in 32,64,128,256 and 512-bit */
1108 3, 3, /* SSE->integer and integer->SSE moves */
1111 MOVD reg64, xmmreg Double FSTORE 4
1112 MOVD reg32, xmmreg Double FSTORE 4
1114 MOVD reg64, xmmreg Double FADD 3
1116 MOVD reg32, xmmreg Double FADD 3
1118 /* End of register allocator costs. */
1121 COSTS_N_INSNS (1), /* cost of an add instruction */
1122 COSTS_N_INSNS (2), /* cost of a lea instruction */
1123 COSTS_N_INSNS (1), /* variable shift costs */
1124 COSTS_N_INSNS (1), /* constant shift costs */
1125 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1126 COSTS_N_INSNS (4), /* HI */
1127 COSTS_N_INSNS (3), /* SI */
1128 COSTS_N_INSNS (4), /* DI */
1129 COSTS_N_INSNS (5)}, /* other */
1130 0, /* cost of multiply per each bit set */
1131 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1132 COSTS_N_INSNS (35), /* HI */
1133 COSTS_N_INSNS (51), /* SI */
1134 COSTS_N_INSNS (83), /* DI */
1135 COSTS_N_INSNS (83)}, /* other */
1136 COSTS_N_INSNS (1), /* cost of movsx */
1137 COSTS_N_INSNS (1), /* cost of movzx */
1138 8, /* "large" insn */
1140 6, /* CLEAR_RATIO */
1141 {3, 4, 3}, /* cost of loading integer registers
1142 in QImode, HImode and SImode.
1143 Relative to reg-reg move (2). */
1144 {3, 4, 3}, /* cost of storing integer registers */
1145 {4, 4, 3, 6, 12}, /* cost of loading SSE register
1146 in 32bit, 64bit, 128bit, 256bit and 512bit */
1147 {4, 4, 5, 10, 20}, /* cost of storing SSE register
1148 in 32bit, 64bit, 128bit, 256bit and 512bit */
1149 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
1150 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1151 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1152 3, /* cost of moving SSE register to integer. */
1153 4, 4, /* Gather load static, per_elt. */
1154 4, 4, /* Gather store static, per_elt. */
1155 64, /* size of l1 cache. */
1156 512, /* size of l2 cache. */
1157 64, /* size of prefetch block */
1158 /* New AMD processors never drop prefetches; if they cannot be performed
1159 immediately, they are queued. We set number of simultaneous prefetches
1160 to a large constant to reflect this (it probably is not a good idea not
1161 to limit number of prefetches at all, as their execution also takes some
1163 100, /* number of parallel prefetches */
1164 2, /* Branch cost */
1165 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1166 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1167 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1168 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1169 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1170 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1172 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1173 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1174 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1175 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1176 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1177 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1179 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1180 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1181 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1182 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1183 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1186 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1187 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1188 "32:25:8", /* Loop alignment. */
1189 "32:8:8", /* Jump alignment. */
1190 "0:0:8", /* Label alignment. */
1191 "32", /* Func alignment. */
/* BDVER has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a
   libcall can use non-temporal accesses and beat inline code considerably.  */
1197 static stringop_algs bdver_memcpy
[2] = {
1198 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1199 {-1, rep_prefix_4_byte
, false}}},
1200 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
1201 {-1, libcall
, false}}}};
1202 static stringop_algs bdver_memset
[2] = {
1203 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1204 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1205 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
1206 {-1, libcall
, false}}}};
/* Cost table for BDVER (going by the name); the only table in this
   stretch of the file that is declared const.  Entries are positional;
   the trailing comment on each line names the value it supplies.
   NOTE(review): unlike znver1_cost, no MOVE_RATIO entry appears here,
   bdver_memcpy/bdver_memset are never referenced, and there is no closing
   "};".  The "New AMD processors" comment is not closed before the "100,"
   prefetch entry.  Lines look to have been lost; confirm against the
   upstream table.  */
1208 const struct processor_costs bdver_cost
= {
1210 /* Start of register allocator costs. integer->integer move cost is 2. */
1211 8, /* cost for loading QImode using movzbl */
1212 {8, 8, 8}, /* cost of loading integer registers
1213 in QImode, HImode and SImode.
1214 Relative to reg-reg move (2). */
1215 {8, 8, 8}, /* cost of storing integer registers */
1216 4, /* cost of reg,reg fld/fst */
1217 {12, 12, 28}, /* cost of loading fp registers
1218 in SFmode, DFmode and XFmode */
1219 {10, 10, 18}, /* cost of storing fp registers
1220 in SFmode, DFmode and XFmode */
1221 4, /* cost of moving MMX register */
1222 {12, 12}, /* cost of loading MMX registers
1223 in SImode and DImode */
1224 {10, 10}, /* cost of storing MMX registers
1225 in SImode and DImode */
1226 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1227 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1228 in 32,64,128,256 and 512-bit */
1229 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1230 in 32,64,128,256 and 512-bit */
1231 16, 20, /* SSE->integer and integer->SSE moves */
1232 /* End of register allocator costs. */
1235 COSTS_N_INSNS (1), /* cost of an add instruction */
1236 COSTS_N_INSNS (1), /* cost of a lea instruction */
1237 COSTS_N_INSNS (1), /* variable shift costs */
1238 COSTS_N_INSNS (1), /* constant shift costs */
1239 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1240 COSTS_N_INSNS (4), /* HI */
1241 COSTS_N_INSNS (4), /* SI */
1242 COSTS_N_INSNS (6), /* DI */
1243 COSTS_N_INSNS (6)}, /* other */
1244 0, /* cost of multiply per each bit set */
1245 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1246 COSTS_N_INSNS (35), /* HI */
1247 COSTS_N_INSNS (51), /* SI */
1248 COSTS_N_INSNS (83), /* DI */
1249 COSTS_N_INSNS (83)}, /* other */
1250 COSTS_N_INSNS (1), /* cost of movsx */
1251 COSTS_N_INSNS (1), /* cost of movzx */
1252 8, /* "large" insn */
1254 6, /* CLEAR_RATIO */
1255 {8, 8, 8}, /* cost of loading integer registers
1256 in QImode, HImode and SImode.
1257 Relative to reg-reg move (2). */
1258 {8, 8, 8}, /* cost of storing integer registers */
1259 {12, 12, 10, 40, 60}, /* cost of loading SSE register
1260 in 32bit, 64bit, 128bit, 256bit and 512bit */
1261 {10, 10, 10, 40, 60}, /* cost of storing SSE register
1262 in 32bit, 64bit, 128bit, 256bit and 512bit */
1263 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1264 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
1265 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1266 16, /* cost of moving SSE register to integer. */
1267 12, 12, /* Gather load static, per_elt. */
1268 10, 10, /* Gather store static, per_elt. */
1269 16, /* size of l1 cache. */
1270 2048, /* size of l2 cache. */
1271 64, /* size of prefetch block */
1272 /* New AMD processors never drop prefetches; if they cannot be performed
1273 immediately, they are queued. We set number of simultaneous prefetches
1274 to a large constant to reflect this (it probably is not a good idea not
1275 to limit number of prefetches at all, as their execution also takes some
1277 100, /* number of parallel prefetches */
1278 2, /* Branch cost */
1279 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1280 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1281 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1282 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1283 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1284 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1286 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1287 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1288 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1289 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1290 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1291 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1293 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1295 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1296 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1297 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1298 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1301 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1302 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1303 "16:11:8", /* Loop alignment. */
1304 "16:8:8", /* Jump alignment. */
1305 "0:0:8", /* Label alignment. */
1306 "11", /* Func alignment. */
/* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a
   libcall can use non-temporal accesses and beat inline code considerably.  */
1313 static stringop_algs znver1_memcpy
[2] = {
1314 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1315 {-1, rep_prefix_4_byte
, false}}},
1316 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
1317 {-1, libcall
, false}}}};
1318 static stringop_algs znver1_memset
[2] = {
1319 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1320 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1321 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
1322 {-1, libcall
, false}}}};
/* Cost table for ZNVER1 (going by the name).  Entries are positional; the
   trailing comment on each line names the value it supplies.
   NOTE(review): several comments below have lost their closing line
   ("cost for loading QImode using", "cost of storing integer", "cost of
   multiply per each bit", the idiv, VGATHERDPD and "New AMD processors"
   notes), so the initializer values that follow them currently sit
   inside comments.  znver1_memcpy/znver1_memset are never referenced and
   there is no closing "};".  Confirm against the upstream table.  */
1323 struct processor_costs znver1_cost
= {
1325 /* Start of register allocator costs. integer->integer move cost is 2. */
1327 /* reg-reg moves are done by renaming and thus they are even cheaper than
1328 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1329 to doubles of latencies, we do not model this correctly. It does not
1330 seem to make practical difference to bump prices up even more. */
1331 6, /* cost for loading QImode using
1333 {6, 6, 6}, /* cost of loading integer registers
1334 in QImode, HImode and SImode.
1335 Relative to reg-reg move (2). */
1336 {8, 8, 8}, /* cost of storing integer
1338 2, /* cost of reg,reg fld/fst. */
1339 {6, 6, 16}, /* cost of loading fp registers
1340 in SFmode, DFmode and XFmode. */
1341 {8, 8, 16}, /* cost of storing fp registers
1342 in SFmode, DFmode and XFmode. */
1343 2, /* cost of moving MMX register. */
1344 {6, 6}, /* cost of loading MMX registers
1345 in SImode and DImode. */
1346 {8, 8}, /* cost of storing MMX registers
1347 in SImode and DImode. */
1348 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1349 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1350 in 32,64,128,256 and 512-bit. */
1351 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1352 in 32,64,128,256 and 512-bit. */
1353 6, 6, /* SSE->integer and integer->SSE moves. */
1354 /* End of register allocator costs. */
1357 COSTS_N_INSNS (1), /* cost of an add instruction. */
1358 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1359 COSTS_N_INSNS (1), /* variable shift costs. */
1360 COSTS_N_INSNS (1), /* constant shift costs. */
1361 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1362 COSTS_N_INSNS (3), /* HI. */
1363 COSTS_N_INSNS (3), /* SI. */
1364 COSTS_N_INSNS (3), /* DI. */
1365 COSTS_N_INSNS (3)}, /* other. */
1366 0, /* cost of multiply per each bit
1368 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1370 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1371 COSTS_N_INSNS (22), /* HI. */
1372 COSTS_N_INSNS (30), /* SI. */
1373 COSTS_N_INSNS (45), /* DI. */
1374 COSTS_N_INSNS (45)}, /* other. */
1375 COSTS_N_INSNS (1), /* cost of movsx. */
1376 COSTS_N_INSNS (1), /* cost of movzx. */
1377 8, /* "large" insn. */
1378 9, /* MOVE_RATIO. */
1379 6, /* CLEAR_RATIO */
1380 {6, 6, 6}, /* cost of loading integer registers
1381 in QImode, HImode and SImode.
1382 Relative to reg-reg move (2). */
1383 {8, 8, 8}, /* cost of storing integer
1385 {6, 6, 6, 12, 24}, /* cost of loading SSE register
1386 in 32bit, 64bit, 128bit, 256bit and 512bit */
1387 {8, 8, 8, 16, 32}, /* cost of storing SSE register
1388 in 32bit, 64bit, 128bit, 256bit and 512bit */
1389 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1390 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
1391 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1392 6, /* cost of moving SSE register to integer. */
1393 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1394 throughput 12. Approx 9 uops do not depend on vector size and every load
1396 18, 8, /* Gather load static, per_elt. */
1397 18, 10, /* Gather store static, per_elt. */
1398 32, /* size of l1 cache. */
1399 512, /* size of l2 cache. */
1400 64, /* size of prefetch block. */
1401 /* New AMD processors never drop prefetches; if they cannot be performed
1402 immediately, they are queued. We set number of simultaneous prefetches
1403 to a large constant to reflect this (it probably is not a good idea not
1404 to limit number of prefetches at all, as their execution also takes some
1406 100, /* number of parallel prefetches. */
1407 3, /* Branch cost. */
1408 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1409 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1410 /* Latency of fdiv is 8-15. */
1411 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1412 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1413 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1414 /* Latency of fsqrt is 4-10. */
1415 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1417 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1418 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1419 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1420 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1421 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1422 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1423 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1425 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1426 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1427 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1428 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1429 and it can execute 2 integer additions and 2 multiplications thus
1430 reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest
1431 that 4 works better than 6 probably due to register pressure.
1433 Integer vector operations are taken by FP unit and execute 3 vector
1434 plus/minus operations per cycle but only one multiply. This is adjusted
1435 in ix86_reassociation_width. */
1436 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1439 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1440 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1441 "16", /* Loop alignment. */
1442 "16", /* Jump alignment. */
1443 "0:0:8", /* Label alignment. */
1444 "16", /* Func alignment. */
/* ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
   very small blocks it is better to use a loop.  For large blocks, a
   libcall can use non-temporal accesses and beat inline code considerably.  */
1450 static stringop_algs znver2_memcpy
[2] = {
1451 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1452 {-1, rep_prefix_4_byte
, false}}},
1453 {libcall
, {{16, loop
, false}, {64, rep_prefix_4_byte
, false},
1454 {-1, libcall
, false}}}};
1455 static stringop_algs znver2_memset
[2] = {
1456 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1457 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1458 {libcall
, {{24, rep_prefix_4_byte
, false}, {128, rep_prefix_8_byte
, false},
1459 {-1, libcall
, false}}}};
1461 struct processor_costs znver2_cost
= {
1463 /* Start of register allocator costs. integer->integer move cost is 2. */
1465 /* reg-reg moves are done by renaming and thus they are even cheaper than
1466 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1467 to doubles of latencies, we do not model this correctly. It does not
1468 seem to make practical difference to bump prices up even more. */
1469 6, /* cost for loading QImode using
1471 {6, 6, 6}, /* cost of loading integer registers
1472 in QImode, HImode and SImode.
1473 Relative to reg-reg move (2). */
1474 {8, 8, 8}, /* cost of storing integer
1476 2, /* cost of reg,reg fld/fst. */
1477 {6, 6, 16}, /* cost of loading fp registers
1478 in SFmode, DFmode and XFmode. */
1479 {8, 8, 16}, /* cost of storing fp registers
1480 in SFmode, DFmode and XFmode. */
1481 2, /* cost of moving MMX register. */
1482 {6, 6}, /* cost of loading MMX registers
1483 in SImode and DImode. */
1484 {8, 8}, /* cost of storing MMX registers
1485 in SImode and DImode. */
1486 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1488 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1489 in 32,64,128,256 and 512-bit. */
1490 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1491 in 32,64,128,256 and 512-bit. */
1492 6, 6, /* SSE->integer and integer->SSE
1494 /* End of register allocator costs. */
1497 COSTS_N_INSNS (1), /* cost of an add instruction. */
1498 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1499 COSTS_N_INSNS (1), /* variable shift costs. */
1500 COSTS_N_INSNS (1), /* constant shift costs. */
1501 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1502 COSTS_N_INSNS (3), /* HI. */
1503 COSTS_N_INSNS (3), /* SI. */
1504 COSTS_N_INSNS (3), /* DI. */
1505 COSTS_N_INSNS (3)}, /* other. */
1506 0, /* cost of multiply per each bit
1508 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1510 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1511 COSTS_N_INSNS (22), /* HI. */
1512 COSTS_N_INSNS (30), /* SI. */
1513 COSTS_N_INSNS (45), /* DI. */
1514 COSTS_N_INSNS (45)}, /* other. */
1515 COSTS_N_INSNS (1), /* cost of movsx. */
1516 COSTS_N_INSNS (1), /* cost of movzx. */
1517 8, /* "large" insn. */
1518 9, /* MOVE_RATIO. */
1519 6, /* CLEAR_RATIO */
1520 {6, 6, 6}, /* cost of loading integer registers
1521 in QImode, HImode and SImode.
1522 Relative to reg-reg move (2). */
1523 {8, 8, 8}, /* cost of storing integer
1525 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1526 in 32bit, 64bit, 128bit, 256bit and 512bit */
1527 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1528 in 32bit, 64bit, 128bit, 256bit and 512bit */
1529 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1530 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1531 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1533 6, /* cost of moving SSE register to integer. */
1534 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1535 throughput 12. Approx 9 uops do not depend on vector size and every load
1537 18, 8, /* Gather load static, per_elt. */
1538 18, 10, /* Gather store static, per_elt. */
1539 32, /* size of l1 cache. */
1540 512, /* size of l2 cache. */
1541 64, /* size of prefetch block. */
1542 /* New AMD processors never drop prefetches; if they cannot be performed
1543 immediately, they are queued. We set number of simultaneous prefetches
1544 to a large constant to reflect this (it probably is not a good idea not
1545 to limit number of prefetches at all, as their execution also takes some
1547 100, /* number of parallel prefetches. */
1548 3, /* Branch cost. */
1549 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1550 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1551 /* Latency of fdiv is 8-15. */
1552 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1553 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1554 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1555 /* Latency of fsqrt is 4-10. */
1556 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1558 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1559 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1560 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1561 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1562 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1563 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1564 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1566 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1567 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1568 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1569 /* Zen can execute 4 integer operations per cycle. FP operations
1570 take 3 cycles and it can execute 2 integer additions and 2
1571 multiplications thus reassociation may make sense up to with of 6.
1572 SPEC2k6 bencharks suggests
1573 that 4 works better than 6 probably due to register pressure.
1575 Integer vector operations are taken by FP unit and execute 3 vector
1576 plus/minus operations per cycle but only one multiply. This is adjusted
1577 in ix86_reassociation_width. */
1578 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1581 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1582 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1583 "16", /* Loop alignment. */
1584 "16", /* Jump alignment. */
1585 "0:0:8", /* Label alignment. */
1586 "16", /* Func alignment. */
/* skylake_cost should produce code tuned for Skylake family of CPUs.  */
1590 static stringop_algs skylake_memcpy
[2] = {
1591 {libcall
, {{1024, rep_prefix_4_byte
, true}, {-1, libcall
, false}}},
1592 {libcall
, {{16, loop
, false}, {512, unrolled_loop
, false},
1593 {-1, libcall
, false}}}};
1595 static stringop_algs skylake_memset
[2] = {
1596 {libcall
, {{6, loop_1_byte
, true},
1598 {8192, rep_prefix_4_byte
, true},
1599 {-1, libcall
, false}}},
1600 {libcall
, {{24, loop
, true}, {512, unrolled_loop
, false},
1601 {-1, libcall
, false}}}};
1604 struct processor_costs skylake_cost
= {
1606 /* Start of register allocator costs. integer->integer move cost is 2. */
1607 6, /* cost for loading QImode using movzbl */
1608 {4, 4, 4}, /* cost of loading integer registers
1609 in QImode, HImode and SImode.
1610 Relative to reg-reg move (2). */
1611 {6, 6, 6}, /* cost of storing integer registers */
1612 2, /* cost of reg,reg fld/fst */
1613 {6, 6, 8}, /* cost of loading fp registers
1614 in SFmode, DFmode and XFmode */
1615 {6, 6, 10}, /* cost of storing fp registers
1616 in SFmode, DFmode and XFmode */
1617 2, /* cost of moving MMX register */
1618 {6, 6}, /* cost of loading MMX registers
1619 in SImode and DImode */
1620 {6, 6}, /* cost of storing MMX registers
1621 in SImode and DImode */
1622 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1623 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1624 in 32,64,128,256 and 512-bit */
1625 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
1626 in 32,64,128,256 and 512-bit */
1627 6, 6, /* SSE->integer and integer->SSE moves */
1628 /* End of register allocator costs. */
1631 COSTS_N_INSNS (1), /* cost of an add instruction */
1632 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
1633 COSTS_N_INSNS (1), /* variable shift costs */
1634 COSTS_N_INSNS (1), /* constant shift costs */
1635 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1636 COSTS_N_INSNS (4), /* HI */
1637 COSTS_N_INSNS (3), /* SI */
1638 COSTS_N_INSNS (3), /* DI */
1639 COSTS_N_INSNS (3)}, /* other */
1640 0, /* cost of multiply per each bit set */
1641 /* Expanding div/mod currently doesn't consider parallelism. So the cost
1642 model is not realistic. We compensate by increasing the latencies a bit. */
1643 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
1644 COSTS_N_INSNS (11), /* HI */
1645 COSTS_N_INSNS (14), /* SI */
1646 COSTS_N_INSNS (76), /* DI */
1647 COSTS_N_INSNS (76)}, /* other */
1648 COSTS_N_INSNS (1), /* cost of movsx */
1649 COSTS_N_INSNS (0), /* cost of movzx */
1650 8, /* "large" insn */
1651 17, /* MOVE_RATIO */
1652 6, /* CLEAR_RATIO */
1653 {4, 4, 4}, /* cost of loading integer registers
1654 in QImode, HImode and SImode.
1655 Relative to reg-reg move (2). */
1656 {6, 6, 6}, /* cost of storing integer registers */
1657 {6, 6, 6, 10, 20}, /* cost of loading SSE register
1658 in 32bit, 64bit, 128bit, 256bit and 512bit */
1659 {8, 8, 8, 12, 24}, /* cost of storing SSE register
1660 in 32bit, 64bit, 128bit, 256bit and 512bit */
1661 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1662 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1663 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1664 2, /* cost of moving SSE register to integer. */
1665 20, 8, /* Gather load static, per_elt. */
1666 22, 10, /* Gather store static, per_elt. */
1667 64, /* size of l1 cache. */
1668 512, /* size of l2 cache. */
1669 64, /* size of prefetch block */
1670 6, /* number of parallel prefetches */
1671 3, /* Branch cost */
1672 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1673 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1674 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1675 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1676 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1677 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1679 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1680 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1681 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1682 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1683 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1684 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1685 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1686 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1687 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1688 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1689 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1692 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1693 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1694 "16:11:8", /* Loop alignment. */
1695 "16:11:8", /* Jump alignment. */
1696 "0:0:8", /* Label alignment. */
1697 "16", /* Func alignment. */
/* BTVER1 has optimized REP instruction for medium sized blocks, but for
   very small blocks it is better to use loop.  For large blocks, libcall can
   do non-temporal accesses and beat inline considerably.  */
1702 static stringop_algs btver1_memcpy
[2] = {
1703 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1704 {-1, rep_prefix_4_byte
, false}}},
1705 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
1706 {-1, libcall
, false}}}};
1707 static stringop_algs btver1_memset
[2] = {
1708 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1709 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1710 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
1711 {-1, libcall
, false}}}};
1712 const struct processor_costs btver1_cost
= {
1714 /* Start of register allocator costs. integer->integer move cost is 2. */
1715 8, /* cost for loading QImode using movzbl */
1716 {6, 8, 6}, /* cost of loading integer registers
1717 in QImode, HImode and SImode.
1718 Relative to reg-reg move (2). */
1719 {6, 8, 6}, /* cost of storing integer registers */
1720 4, /* cost of reg,reg fld/fst */
1721 {12, 12, 28}, /* cost of loading fp registers
1722 in SFmode, DFmode and XFmode */
1723 {12, 12, 38}, /* cost of storing fp registers
1724 in SFmode, DFmode and XFmode */
1725 4, /* cost of moving MMX register */
1726 {10, 10}, /* cost of loading MMX registers
1727 in SImode and DImode */
1728 {12, 12}, /* cost of storing MMX registers
1729 in SImode and DImode */
1730 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1731 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
1732 in 32,64,128,256 and 512-bit */
1733 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
1734 in 32,64,128,256 and 512-bit */
1735 14, 14, /* SSE->integer and integer->SSE moves */
1736 /* End of register allocator costs. */
1739 COSTS_N_INSNS (1), /* cost of an add instruction */
1740 COSTS_N_INSNS (2), /* cost of a lea instruction */
1741 COSTS_N_INSNS (1), /* variable shift costs */
1742 COSTS_N_INSNS (1), /* constant shift costs */
1743 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1744 COSTS_N_INSNS (4), /* HI */
1745 COSTS_N_INSNS (3), /* SI */
1746 COSTS_N_INSNS (4), /* DI */
1747 COSTS_N_INSNS (5)}, /* other */
1748 0, /* cost of multiply per each bit set */
1749 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1750 COSTS_N_INSNS (35), /* HI */
1751 COSTS_N_INSNS (51), /* SI */
1752 COSTS_N_INSNS (83), /* DI */
1753 COSTS_N_INSNS (83)}, /* other */
1754 COSTS_N_INSNS (1), /* cost of movsx */
1755 COSTS_N_INSNS (1), /* cost of movzx */
1756 8, /* "large" insn */
1758 6, /* CLEAR_RATIO */
1759 {6, 8, 6}, /* cost of loading integer registers
1760 in QImode, HImode and SImode.
1761 Relative to reg-reg move (2). */
1762 {6, 8, 6}, /* cost of storing integer registers */
1763 {10, 10, 12, 48, 96}, /* cost of loading SSE register
1764 in 32bit, 64bit, 128bit, 256bit and 512bit */
1765 {10, 10, 12, 48, 96}, /* cost of storing SSE register
1766 in 32bit, 64bit, 128bit, 256bit and 512bit */
1767 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1768 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
1769 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1770 14, /* cost of moving SSE register to integer. */
1771 10, 10, /* Gather load static, per_elt. */
1772 10, 10, /* Gather store static, per_elt. */
1773 32, /* size of l1 cache. */
1774 512, /* size of l2 cache. */
1775 64, /* size of prefetch block */
1776 100, /* number of parallel prefetches */
1777 2, /* Branch cost */
1778 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1779 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1780 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1781 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1782 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1783 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1785 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1786 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1787 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1788 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1789 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1790 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1791 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1792 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1793 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1794 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1795 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1798 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1799 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1800 "16:11:8", /* Loop alignment. */
1801 "16:8:8", /* Jump alignment. */
1802 "0:0:8", /* Label alignment. */
1803 "11", /* Func alignment. */
1806 static stringop_algs btver2_memcpy
[2] = {
1807 {libcall
, {{6, loop
, false}, {14, unrolled_loop
, false},
1808 {-1, rep_prefix_4_byte
, false}}},
1809 {libcall
, {{16, loop
, false}, {8192, rep_prefix_8_byte
, false},
1810 {-1, libcall
, false}}}};
1811 static stringop_algs btver2_memset
[2] = {
1812 {libcall
, {{8, loop
, false}, {24, unrolled_loop
, false},
1813 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1814 {libcall
, {{48, unrolled_loop
, false}, {8192, rep_prefix_8_byte
, false},
1815 {-1, libcall
, false}}}};
1816 const struct processor_costs btver2_cost
= {
1818 /* Start of register allocator costs. integer->integer move cost is 2. */
1819 8, /* cost for loading QImode using movzbl */
1820 {8, 8, 6}, /* cost of loading integer registers
1821 in QImode, HImode and SImode.
1822 Relative to reg-reg move (2). */
1823 {8, 8, 6}, /* cost of storing integer registers */
1824 4, /* cost of reg,reg fld/fst */
1825 {12, 12, 28}, /* cost of loading fp registers
1826 in SFmode, DFmode and XFmode */
1827 {12, 12, 38}, /* cost of storing fp registers
1828 in SFmode, DFmode and XFmode */
1829 4, /* cost of moving MMX register */
1830 {10, 10}, /* cost of loading MMX registers
1831 in SImode and DImode */
1832 {12, 12}, /* cost of storing MMX registers
1833 in SImode and DImode */
1834 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1835 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
1836 in 32,64,128,256 and 512-bit */
1837 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
1838 in 32,64,128,256 and 512-bit */
1839 14, 14, /* SSE->integer and integer->SSE moves */
1840 /* End of register allocator costs. */
1843 COSTS_N_INSNS (1), /* cost of an add instruction */
1844 COSTS_N_INSNS (2), /* cost of a lea instruction */
1845 COSTS_N_INSNS (1), /* variable shift costs */
1846 COSTS_N_INSNS (1), /* constant shift costs */
1847 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1848 COSTS_N_INSNS (4), /* HI */
1849 COSTS_N_INSNS (3), /* SI */
1850 COSTS_N_INSNS (4), /* DI */
1851 COSTS_N_INSNS (5)}, /* other */
1852 0, /* cost of multiply per each bit set */
1853 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1854 COSTS_N_INSNS (35), /* HI */
1855 COSTS_N_INSNS (51), /* SI */
1856 COSTS_N_INSNS (83), /* DI */
1857 COSTS_N_INSNS (83)}, /* other */
1858 COSTS_N_INSNS (1), /* cost of movsx */
1859 COSTS_N_INSNS (1), /* cost of movzx */
1860 8, /* "large" insn */
1862 6, /* CLEAR_RATIO */
1863 {8, 8, 6}, /* cost of loading integer registers
1864 in QImode, HImode and SImode.
1865 Relative to reg-reg move (2). */
1866 {8, 8, 6}, /* cost of storing integer registers */
1867 {10, 10, 12, 48, 96}, /* cost of loading SSE register
1868 in 32bit, 64bit, 128bit, 256bit and 512bit */
1869 {10, 10, 12, 48, 96}, /* cost of storing SSE register
1870 in 32bit, 64bit, 128bit, 256bit and 512bit */
1871 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1872 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
1873 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1874 14, /* cost of moving SSE register to integer. */
1875 10, 10, /* Gather load static, per_elt. */
1876 10, 10, /* Gather store static, per_elt. */
1877 32, /* size of l1 cache. */
1878 2048, /* size of l2 cache. */
1879 64, /* size of prefetch block */
1880 100, /* number of parallel prefetches */
1881 2, /* Branch cost */
1882 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1883 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1884 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1885 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1886 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1887 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1889 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1890 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1891 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1892 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1893 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1894 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1895 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1896 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1897 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1898 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1899 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1902 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1903 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1904 "16:11:8", /* Loop alignment. */
1905 "16:8:8", /* Jump alignment. */
1906 "0:0:8", /* Label alignment. */
1907 "11", /* Func alignment. */
1910 static stringop_algs pentium4_memcpy
[2] = {
1911 {libcall
, {{12, loop_1_byte
, false}, {-1, rep_prefix_4_byte
, false}}},
1912 DUMMY_STRINGOP_ALGS
};
1913 static stringop_algs pentium4_memset
[2] = {
1914 {libcall
, {{6, loop_1_byte
, false}, {48, loop
, false},
1915 {20480, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
1916 DUMMY_STRINGOP_ALGS
};
1919 struct processor_costs pentium4_cost
= {
1921 /* Start of register allocator costs. integer->integer move cost is 2. */
1922 5, /* cost for loading QImode using movzbl */
1923 {4, 5, 4}, /* cost of loading integer registers
1924 in QImode, HImode and SImode.
1925 Relative to reg-reg move (2). */
1926 {2, 3, 2}, /* cost of storing integer registers */
1927 12, /* cost of reg,reg fld/fst */
1928 {14, 14, 14}, /* cost of loading fp registers
1929 in SFmode, DFmode and XFmode */
1930 {14, 14, 14}, /* cost of storing fp registers
1931 in SFmode, DFmode and XFmode */
1932 12, /* cost of moving MMX register */
1933 {16, 16}, /* cost of loading MMX registers
1934 in SImode and DImode */
1935 {16, 16}, /* cost of storing MMX registers
1936 in SImode and DImode */
1937 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1938 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1939 in 32,64,128,256 and 512-bit */
1940 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1941 in 32,64,128,256 and 512-bit */
1942 20, 12, /* SSE->integer and integer->SSE moves */
1943 /* End of register allocator costs. */
1946 COSTS_N_INSNS (1), /* cost of an add instruction */
1947 COSTS_N_INSNS (3), /* cost of a lea instruction */
1948 COSTS_N_INSNS (4), /* variable shift costs */
1949 COSTS_N_INSNS (4), /* constant shift costs */
1950 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1951 COSTS_N_INSNS (15), /* HI */
1952 COSTS_N_INSNS (15), /* SI */
1953 COSTS_N_INSNS (15), /* DI */
1954 COSTS_N_INSNS (15)}, /* other */
1955 0, /* cost of multiply per each bit set */
1956 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1957 COSTS_N_INSNS (56), /* HI */
1958 COSTS_N_INSNS (56), /* SI */
1959 COSTS_N_INSNS (56), /* DI */
1960 COSTS_N_INSNS (56)}, /* other */
1961 COSTS_N_INSNS (1), /* cost of movsx */
1962 COSTS_N_INSNS (1), /* cost of movzx */
1963 16, /* "large" insn */
1965 6, /* CLEAR_RATIO */
1966 {4, 5, 4}, /* cost of loading integer registers
1967 in QImode, HImode and SImode.
1968 Relative to reg-reg move (2). */
1969 {2, 3, 2}, /* cost of storing integer registers */
1970 {16, 16, 16, 32, 64}, /* cost of loading SSE register
1971 in 32bit, 64bit, 128bit, 256bit and 512bit */
1972 {16, 16, 16, 32, 64}, /* cost of storing SSE register
1973 in 32bit, 64bit, 128bit, 256bit and 512bit */
1974 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1975 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1976 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1977 20, /* cost of moving SSE register to integer. */
1978 16, 16, /* Gather load static, per_elt. */
1979 16, 16, /* Gather store static, per_elt. */
1980 8, /* size of l1 cache. */
1981 256, /* size of l2 cache. */
1982 64, /* size of prefetch block */
1983 6, /* number of parallel prefetches */
1984 2, /* Branch cost */
1985 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1986 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1987 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1988 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1989 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1990 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1992 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1993 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1994 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1995 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1996 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1997 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1998 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1999 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
2000 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
2001 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
2002 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2005 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2006 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2007 NULL
, /* Loop alignment. */
2008 NULL
, /* Jump alignment. */
2009 NULL
, /* Label alignment. */
2010 NULL
, /* Func alignment. */
2013 static stringop_algs nocona_memcpy
[2] = {
2014 {libcall
, {{12, loop_1_byte
, false}, {-1, rep_prefix_4_byte
, false}}},
2015 {libcall
, {{32, loop
, false}, {20000, rep_prefix_8_byte
, false},
2016 {100000, unrolled_loop
, false}, {-1, libcall
, false}}}};
2018 static stringop_algs nocona_memset
[2] = {
2019 {libcall
, {{6, loop_1_byte
, false}, {48, loop
, false},
2020 {20480, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2021 {libcall
, {{24, loop
, false}, {64, unrolled_loop
, false},
2022 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
2025 struct processor_costs nocona_cost
= {
2027 /* Start of register allocator costs. integer->integer move cost is 2. */
2028 4, /* cost for loading QImode using movzbl */
2029 {4, 4, 4}, /* cost of loading integer registers
2030 in QImode, HImode and SImode.
2031 Relative to reg-reg move (2). */
2032 {4, 4, 4}, /* cost of storing integer registers */
2033 12, /* cost of reg,reg fld/fst */
2034 {14, 14, 14}, /* cost of loading fp registers
2035 in SFmode, DFmode and XFmode */
2036 {14, 14, 14}, /* cost of storing fp registers
2037 in SFmode, DFmode and XFmode */
2038 14, /* cost of moving MMX register */
2039 {12, 12}, /* cost of loading MMX registers
2040 in SImode and DImode */
2041 {12, 12}, /* cost of storing MMX registers
2042 in SImode and DImode */
2043 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2044 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2045 in 32,64,128,256 and 512-bit */
2046 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2047 in 32,64,128,256 and 512-bit */
2048 20, 12, /* SSE->integer and integer->SSE moves */
2049 /* End of register allocator costs. */
2052 COSTS_N_INSNS (1), /* cost of an add instruction */
2053 COSTS_N_INSNS (1), /* cost of a lea instruction */
2054 COSTS_N_INSNS (1), /* variable shift costs */
2055 COSTS_N_INSNS (1), /* constant shift costs */
2056 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
2057 COSTS_N_INSNS (10), /* HI */
2058 COSTS_N_INSNS (10), /* SI */
2059 COSTS_N_INSNS (10), /* DI */
2060 COSTS_N_INSNS (10)}, /* other */
2061 0, /* cost of multiply per each bit set */
2062 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
2063 COSTS_N_INSNS (66), /* HI */
2064 COSTS_N_INSNS (66), /* SI */
2065 COSTS_N_INSNS (66), /* DI */
2066 COSTS_N_INSNS (66)}, /* other */
2067 COSTS_N_INSNS (1), /* cost of movsx */
2068 COSTS_N_INSNS (1), /* cost of movzx */
2069 16, /* "large" insn */
2070 17, /* MOVE_RATIO */
2071 6, /* CLEAR_RATIO */
2072 {4, 4, 4}, /* cost of loading integer registers
2073 in QImode, HImode and SImode.
2074 Relative to reg-reg move (2). */
2075 {4, 4, 4}, /* cost of storing integer registers */
2076 {12, 12, 12, 24, 48}, /* cost of loading SSE register
2077 in 32bit, 64bit, 128bit, 256bit and 512bit */
2078 {12, 12, 12, 24, 48}, /* cost of storing SSE register
2079 in 32bit, 64bit, 128bit, 256bit and 512bit */
2080 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
2081 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
2082 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2083 20, /* cost of moving SSE register to integer. */
2084 12, 12, /* Gather load static, per_elt. */
2085 12, 12, /* Gather store static, per_elt. */
2086 8, /* size of l1 cache. */
2087 1024, /* size of l2 cache. */
2088 64, /* size of prefetch block */
2089 8, /* number of parallel prefetches */
2090 1, /* Branch cost */
2091 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2092 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2093 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2094 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2095 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2096 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
2098 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2099 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2100 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2101 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
2102 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2103 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
2104 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2105 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2106 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2107 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
2108 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2111 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2112 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2113 NULL
, /* Loop alignment. */
2114 NULL
, /* Jump alignment. */
2115 NULL
, /* Label alignment. */
2116 NULL
, /* Func alignment. */
2119 static stringop_algs atom_memcpy
[2] = {
2120 {libcall
, {{11, loop
, false}, {-1, rep_prefix_4_byte
, false}}},
2121 {libcall
, {{32, loop
, false}, {64, rep_prefix_4_byte
, false},
2122 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
2123 static stringop_algs atom_memset
[2] = {
2124 {libcall
, {{8, loop
, false}, {15, unrolled_loop
, false},
2125 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2126 {libcall
, {{24, loop
, false}, {32, unrolled_loop
, false},
2127 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
2129 struct processor_costs atom_cost
= {
2131 /* Start of register allocator costs. integer->integer move cost is 2. */
2132 6, /* cost for loading QImode using movzbl */
2133 {6, 6, 6}, /* cost of loading integer registers
2134 in QImode, HImode and SImode.
2135 Relative to reg-reg move (2). */
2136 {6, 6, 6}, /* cost of storing integer registers */
2137 4, /* cost of reg,reg fld/fst */
2138 {6, 6, 18}, /* cost of loading fp registers
2139 in SFmode, DFmode and XFmode */
2140 {14, 14, 24}, /* cost of storing fp registers
2141 in SFmode, DFmode and XFmode */
2142 2, /* cost of moving MMX register */
2143 {8, 8}, /* cost of loading MMX registers
2144 in SImode and DImode */
2145 {10, 10}, /* cost of storing MMX registers
2146 in SImode and DImode */
2147 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2148 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2149 in 32,64,128,256 and 512-bit */
2150 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2151 in 32,64,128,256 and 512-bit */
2152 8, 6, /* SSE->integer and integer->SSE moves */
2153 /* End of register allocator costs. */
2156 COSTS_N_INSNS (1), /* cost of an add instruction */
2157 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2158 COSTS_N_INSNS (1), /* variable shift costs */
2159 COSTS_N_INSNS (1), /* constant shift costs */
2160 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2161 COSTS_N_INSNS (4), /* HI */
2162 COSTS_N_INSNS (3), /* SI */
2163 COSTS_N_INSNS (4), /* DI */
2164 COSTS_N_INSNS (2)}, /* other */
2165 0, /* cost of multiply per each bit set */
2166 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2167 COSTS_N_INSNS (26), /* HI */
2168 COSTS_N_INSNS (42), /* SI */
2169 COSTS_N_INSNS (74), /* DI */
2170 COSTS_N_INSNS (74)}, /* other */
2171 COSTS_N_INSNS (1), /* cost of movsx */
2172 COSTS_N_INSNS (1), /* cost of movzx */
2173 8, /* "large" insn */
2174 17, /* MOVE_RATIO */
2175 6, /* CLEAR_RATIO */
2176 {6, 6, 6}, /* cost of loading integer registers
2177 in QImode, HImode and SImode.
2178 Relative to reg-reg move (2). */
2179 {6, 6, 6}, /* cost of storing integer registers */
2180 {8, 8, 8, 16, 32}, /* cost of loading SSE register
2181 in 32bit, 64bit, 128bit, 256bit and 512bit */
2182 {8, 8, 8, 16, 32}, /* cost of storing SSE register
2183 in 32bit, 64bit, 128bit, 256bit and 512bit */
2184 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2185 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2186 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2187 8, /* cost of moving SSE register to integer. */
2188 8, 8, /* Gather load static, per_elt. */
2189 8, 8, /* Gather store static, per_elt. */
2190 32, /* size of l1 cache. */
2191 256, /* size of l2 cache. */
2192 64, /* size of prefetch block */
2193 6, /* number of parallel prefetches */
2194 3, /* Branch cost */
2195 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2196 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2197 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2198 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2199 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2200 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2202 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2203 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2204 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2205 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2206 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2207 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2208 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
2209 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
2210 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
2211 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
2212 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2215 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2216 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2217 "16", /* Loop alignment. */
2218 "16:8:8", /* Jump alignment. */
2219 "0:0:8", /* Label alignment. */
2220 "16", /* Func alignment. */
2223 static stringop_algs slm_memcpy
[2] = {
2224 {libcall
, {{11, loop
, false}, {-1, rep_prefix_4_byte
, false}}},
2225 {libcall
, {{32, loop
, false}, {64, rep_prefix_4_byte
, false},
2226 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
2227 static stringop_algs slm_memset
[2] = {
2228 {libcall
, {{8, loop
, false}, {15, unrolled_loop
, false},
2229 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2230 {libcall
, {{24, loop
, false}, {32, unrolled_loop
, false},
2231 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* slm_cost: processor_costs table for the slm tuning.  The initializer is
   positional; the trailing comment on each entry names the field it fills.
   NOTE(review): the embedded numbering skips two lines after the reassoc
   entry (2317-2318), no closing brace is visible, and slm_memcpy/slm_memset
   are not referenced anywhere in the initializer as shown -- confirm
   against the canonical file that no entries were dropped.  */
2233 struct processor_costs slm_cost
= {
2235 /* Start of register allocator costs. integer->integer move cost is 2. */
2236 8, /* cost for loading QImode using movzbl */
2237 {8, 8, 8}, /* cost of loading integer registers
2238 in QImode, HImode and SImode.
2239 Relative to reg-reg move (2). */
2240 {6, 6, 6}, /* cost of storing integer registers */
2241 2, /* cost of reg,reg fld/fst */
2242 {8, 8, 18}, /* cost of loading fp registers
2243 in SFmode, DFmode and XFmode */
2244 {6, 6, 18}, /* cost of storing fp registers
2245 in SFmode, DFmode and XFmode */
2246 2, /* cost of moving MMX register */
2247 {8, 8}, /* cost of loading MMX registers
2248 in SImode and DImode */
2249 {6, 6}, /* cost of storing MMX registers
2250 in SImode and DImode */
2251 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2252 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2253 in 32,64,128,256 and 512-bit */
2254 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2255 in 32,64,128,256 and 512-bit */
2256 8, 6, /* SSE->integer and integer->SSE moves */
2257 /* End of register allocator costs. */
2260 COSTS_N_INSNS (1), /* cost of an add instruction */
2261 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2262 COSTS_N_INSNS (1), /* variable shift costs */
2263 COSTS_N_INSNS (1), /* constant shift costs */
2264 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2265 COSTS_N_INSNS (3), /* HI */
2266 COSTS_N_INSNS (3), /* SI */
2267 COSTS_N_INSNS (4), /* DI */
2268 COSTS_N_INSNS (2)}, /* other */
2269 0, /* cost of multiply per each bit set */
2270 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2271 COSTS_N_INSNS (26), /* HI */
2272 COSTS_N_INSNS (42), /* SI */
2273 COSTS_N_INSNS (74), /* DI */
2274 COSTS_N_INSNS (74)}, /* other */
2275 COSTS_N_INSNS (1), /* cost of movsx */
2276 COSTS_N_INSNS (1), /* cost of movzx */
2277 8, /* "large" insn */
2278 17, /* MOVE_RATIO */
2279 6, /* CLEAR_RATIO */
2280 {8, 8, 8}, /* cost of loading integer registers
2281 in QImode, HImode and SImode.
2282 Relative to reg-reg move (2). */
2283 {6, 6, 6}, /* cost of storing integer registers */
2284 {8, 8, 8, 16, 32}, /* cost of loading SSE register
2285 in 32bit, 64bit, 128bit, 256bit and 512bit */
2286 {8, 8, 8, 16, 32}, /* cost of storing SSE register
2287 in 32bit, 64bit, 128bit, 256bit and 512bit */
2288 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2289 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2290 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2291 8, /* cost of moving SSE register to integer. */
2292 8, 8, /* Gather load static, per_elt. */
2293 8, 8, /* Gather store static, per_elt. */
2294 32, /* size of l1 cache. */
2295 256, /* size of l2 cache. */
2296 64, /* size of prefetch block */
2297 6, /* number of parallel prefetches */
2298 3, /* Branch cost */
2299 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2300 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2301 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2302 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2303 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2304 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2306 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2307 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2308 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2309 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2310 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2311 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2312 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2313 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2314 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2315 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2316 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2319 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2320 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2321 "16", /* Loop alignment. */
2322 "16:8:8", /* Jump alignment. */
2323 "0:0:8", /* Label alignment. */
2324 "16", /* Func alignment. */
/* intel_memcpy: two-element stringop_algs table, named for memcpy under the
   intel tuning.  Its values match slm_memcpy entry for entry.
   NOTE(review): each element appears to be {alg, {{max, alg, flag}, ...}}
   terminated by a max of -1, with one element per 32-bit/64-bit mode --
   confirm against the stringop_algs declaration.  */
2327 static stringop_algs intel_memcpy
[2] = {
2328 {libcall
, {{11, loop
, false}, {-1, rep_prefix_4_byte
, false}}},
2329 {libcall
, {{32, loop
, false}, {64, rep_prefix_4_byte
, false},
2330 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* intel_memset: two-element stringop_algs table, named for memset under the
   intel tuning.  Its values match slm_memset entry for entry.
   NOTE(review): the {max, alg, flag} reading of each triple and the
   32-bit/64-bit meaning of the two elements are assumptions -- confirm
   against the stringop_algs declaration.  */
2331 static stringop_algs intel_memset
[2] = {
2332 {libcall
, {{8, loop
, false}, {15, unrolled_loop
, false},
2333 {2048, rep_prefix_4_byte
, false}, {-1, libcall
, false}}},
2334 {libcall
, {{24, loop
, false}, {32, unrolled_loop
, false},
2335 {8192, rep_prefix_8_byte
, false}, {-1, libcall
, false}}}};
/* intel_cost: processor_costs table for the intel tuning.  The initializer
   is positional; the trailing comment on each entry names the field it
   fills.
   NOTE(review): the embedded numbering skips two lines after the reassoc
   entry (2421-2422), no closing brace is visible, and intel_memcpy and
   intel_memset are not referenced anywhere in the initializer as shown --
   confirm against the canonical file that no entries were dropped.  */
2337 struct processor_costs intel_cost
= {
2339 /* Start of register allocator costs. integer->integer move cost is 2. */
2340 6, /* cost for loading QImode using movzbl */
2341 {4, 4, 4}, /* cost of loading integer registers
2342 in QImode, HImode and SImode.
2343 Relative to reg-reg move (2). */
2344 {6, 6, 6}, /* cost of storing integer registers */
2345 2, /* cost of reg,reg fld/fst */
2346 {6, 6, 8}, /* cost of loading fp registers
2347 in SFmode, DFmode and XFmode */
2348 {6, 6, 10}, /* cost of storing fp registers
2349 in SFmode, DFmode and XFmode */
2350 2, /* cost of moving MMX register */
2351 {6, 6}, /* cost of loading MMX registers
2352 in SImode and DImode */
2353 {6, 6}, /* cost of storing MMX registers
2354 in SImode and DImode */
2355 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2356 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2357 in 32,64,128,256 and 512-bit */
2358 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2359 in 32,64,128,256 and 512-bit */
2360 4, 4, /* SSE->integer and integer->SSE moves */
2361 /* End of register allocator costs. */
2364 COSTS_N_INSNS (1), /* cost of an add instruction */
2365 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2366 COSTS_N_INSNS (1), /* variable shift costs */
2367 COSTS_N_INSNS (1), /* constant shift costs */
2368 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2369 COSTS_N_INSNS (3), /* HI */
2370 COSTS_N_INSNS (3), /* SI */
2371 COSTS_N_INSNS (4), /* DI */
2372 COSTS_N_INSNS (2)}, /* other */
2373 0, /* cost of multiply per each bit set */
2374 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2375 COSTS_N_INSNS (26), /* HI */
2376 COSTS_N_INSNS (42), /* SI */
2377 COSTS_N_INSNS (74), /* DI */
2378 COSTS_N_INSNS (74)}, /* other */
2379 COSTS_N_INSNS (1), /* cost of movsx */
2380 COSTS_N_INSNS (1), /* cost of movzx */
2381 8, /* "large" insn */
2382 17, /* MOVE_RATIO */
2383 6, /* CLEAR_RATIO */
2384 {4, 4, 4}, /* cost of loading integer registers
2385 in QImode, HImode and SImode.
2386 Relative to reg-reg move (2). */
2387 {6, 6, 6}, /* cost of storing integer registers */
2388 {6, 6, 6, 6, 6}, /* cost of loading SSE register
2389 in 32bit, 64bit, 128bit, 256bit and 512bit */
2390 {6, 6, 6, 6, 6}, /* cost of storing SSE register
2391 in 32bit, 64bit, 128bit, 256bit and 512bit */
2392 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2393 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2394 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2395 4, /* cost of moving SSE register to integer. */
2396 6, 6, /* Gather load static, per_elt. */
2397 6, 6, /* Gather store static, per_elt. */
2398 32, /* size of l1 cache. */
2399 256, /* size of l2 cache. */
2400 64, /* size of prefetch block */
2401 6, /* number of parallel prefetches */
2402 3, /* Branch cost */
2403 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2404 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2405 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2406 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2407 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2408 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2410 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2411 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2412 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2413 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2414 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2415 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2416 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2417 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2418 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2419 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2420 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2423 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2424 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2425 "16", /* Loop alignment. */
2426 "16:8:8", /* Jump alignment. */
2427 "0:0:8", /* Label alignment. */
2428 "16", /* Func alignment. */
2431 /* Generic should produce code tuned for Core-i7 (and newer chips)
2432 and btver1 (and newer chips). */
/* generic_memcpy: two-element stringop_algs table, named for memcpy under
   the generic tuning.  The two elements differ only in the rep prefix
   width used up to 8192 (4-byte in the first, 8-byte in the second).
   NOTE(review): each element appears to be {alg, {{max, alg, flag}, ...}}
   terminated by a max of -1, with one element per 32-bit/64-bit mode --
   confirm against the stringop_algs declaration.  */
2434 static stringop_algs generic_memcpy
[2] = {
2435 {libcall
, {{32, loop
, false}, {8192, rep_prefix_4_byte
, false},
2436 {-1, libcall
, false}}},
2437 {libcall
, {{32, loop
, false}, {8192, rep_prefix_8_byte
, false},
2438 {-1, libcall
, false}}}};
/* generic_memset: two-element stringop_algs table, named for memset under
   the generic tuning.  Its values match generic_memcpy entry for entry.
   NOTE(review): the {max, alg, flag} reading of each triple and the
   32-bit/64-bit meaning of the two elements are assumptions -- confirm
   against the stringop_algs declaration.  */
2439 static stringop_algs generic_memset
[2] = {
2440 {libcall
, {{32, loop
, false}, {8192, rep_prefix_4_byte
, false},
2441 {-1, libcall
, false}}},
2442 {libcall
, {{32, loop
, false}, {8192, rep_prefix_8_byte
, false},
2443 {-1, libcall
, false}}}};
/* generic_cost: processor_costs table for the generic tuning.  The
   initializer is positional; the trailing comment on each entry names the
   field it fills.
   NOTE(review): the embedded numbering skips two lines after the reassoc
   entry (2534-2535), no closing brace is visible, and generic_memcpy and
   generic_memset are not referenced anywhere in the initializer as shown --
   confirm against the canonical file that no entries were dropped.  */
2445 struct processor_costs generic_cost
= {
2447 /* Start of register allocator costs. integer->integer move cost is 2. */
2448 6, /* cost for loading QImode using movzbl */
2449 {6, 6, 6}, /* cost of loading integer registers
2450 in QImode, HImode and SImode.
2451 Relative to reg-reg move (2). */
2452 {6, 6, 6}, /* cost of storing integer registers */
2453 4, /* cost of reg,reg fld/fst */
2454 {6, 6, 12}, /* cost of loading fp registers
2455 in SFmode, DFmode and XFmode */
2456 {6, 6, 12}, /* cost of storing fp registers
2457 in SFmode, DFmode and XFmode */
2458 2, /* cost of moving MMX register */
2459 {6, 6}, /* cost of loading MMX registers
2460 in SImode and DImode */
2461 {6, 6}, /* cost of storing MMX registers
2462 in SImode and DImode */
2463 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2464 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2465 in 32,64,128,256 and 512-bit */
2466 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2467 in 32,64,128,256 and 512-bit */
2468 6, 6, /* SSE->integer and integer->SSE moves */
2469 /* End of register allocator costs. */
2472 COSTS_N_INSNS (1), /* cost of an add instruction */
2473 /* Setting cost to 2 makes our current implementation of synth_mult result in
2474 use of unnecessary temporary registers causing regression on several
2475 SPECfp benchmarks. */
2476 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2477 COSTS_N_INSNS (1), /* variable shift costs */
2478 COSTS_N_INSNS (1), /* constant shift costs */
2479 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2480 COSTS_N_INSNS (4), /* HI */
2481 COSTS_N_INSNS (3), /* SI */
2482 COSTS_N_INSNS (4), /* DI */
2483 COSTS_N_INSNS (4)}, /* other */
2484 0, /* cost of multiply per each bit set */
2485 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2486 COSTS_N_INSNS (22), /* HI */
2487 COSTS_N_INSNS (30), /* SI */
2488 COSTS_N_INSNS (74), /* DI */
2489 COSTS_N_INSNS (74)}, /* other */
2490 COSTS_N_INSNS (1), /* cost of movsx */
2491 COSTS_N_INSNS (1), /* cost of movzx */
2492 8, /* "large" insn */
2493 17, /* MOVE_RATIO */
2494 6, /* CLEAR_RATIO */
2495 {6, 6, 6}, /* cost of loading integer registers
2496 in QImode, HImode and SImode.
2497 Relative to reg-reg move (2). */
2498 {6, 6, 6}, /* cost of storing integer registers */
2499 {6, 6, 6, 10, 15}, /* cost of loading SSE register
2500 in 32bit, 64bit, 128bit, 256bit and 512bit */
2501 {6, 6, 6, 10, 15}, /* cost of storing SSE register
2502 in 32bit, 64bit, 128bit, 256bit and 512bit */
2503 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
2504 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
2505 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2506 6, /* cost of moving SSE register to integer. */
2507 18, 6, /* Gather load static, per_elt. */
2508 18, 6, /* Gather store static, per_elt. */
2509 32, /* size of l1 cache. */
2510 512, /* size of l2 cache. */
2511 64, /* size of prefetch block */
2512 6, /* number of parallel prefetches */
2513 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2514 value is increased to the perhaps more appropriate value of 5. */
2515 3, /* Branch cost */
2516 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2517 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2518 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2519 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2520 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2521 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2523 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2524 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2525 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2526 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2527 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2528 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2529 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2530 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2531 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2532 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2533 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2536 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2537 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2538 "16:11:8", /* Loop alignment. */
2539 "16:11:8", /* Jump alignment. */
2540 "0:0:8", /* Label alignment. */
2541 "16", /* Func alignment. */
2544 /* core_cost should produce code tuned for the Core family of CPUs. */
/* core_memcpy: two-element stringop_algs table, named for memcpy under the
   core tuning.  Unlike the slm/intel/generic tables in this file, the
   size-bounded entries here carry true as their third field.
   NOTE(review): each element appears to be {alg, {{max, alg, flag}, ...}}
   terminated by a max of -1, with one element per 32-bit/64-bit mode; the
   third field is presumably a "no alignment" flag -- confirm against the
   stringop_algs declaration.  */
2545 static stringop_algs core_memcpy
[2] = {
2546 {libcall
, {{1024, rep_prefix_4_byte
, true}, {-1, libcall
, false}}},
2547 {libcall
, {{24, loop
, true}, {128, rep_prefix_8_byte
, true},
2548 {-1, libcall
, false}}}};
/* core_memset: two-element stringop_algs table, named for memset under the
   core tuning.
   NOTE(review): the embedded numbering skips 2551 between the
   {6, loop_1_byte, true} entry and the {8192, rep_prefix_4_byte, true}
   entry of the first element, so an intermediate entry may have been
   dropped -- confirm against the canonical file.  The {max, alg, flag}
   reading of each triple and the 32-bit/64-bit meaning of the two elements
   are likewise assumptions.  */
2549 static stringop_algs core_memset
[2] = {
2550 {libcall
, {{6, loop_1_byte
, true},
2552 {8192, rep_prefix_4_byte
, true},
2553 {-1, libcall
, false}}},
2554 {libcall
, {{24, loop
, true}, {512, rep_prefix_8_byte
, true},
2555 {-1, libcall
, false}}}};
/* core_cost: processor_costs table for the core tuning.  The initializer is
   positional; the trailing comment on each entry names the field it fills.
   NOTE(review): the embedded numbering skips two lines after the reassoc
   entry (2654-2655), no closing brace is visible, and core_memcpy and
   core_memset are not referenced anywhere in the initializer as shown --
   confirm against the canonical file that no entries were dropped.  */
2558 struct processor_costs core_cost
= {
2560 /* Start of register allocator costs. integer->integer move cost is 2. */
2561 6, /* cost for loading QImode using movzbl */
2562 {4, 4, 4}, /* cost of loading integer registers
2563 in QImode, HImode and SImode.
2564 Relative to reg-reg move (2). */
2565 {6, 6, 6}, /* cost of storing integer registers */
2566 2, /* cost of reg,reg fld/fst */
2567 {6, 6, 8}, /* cost of loading fp registers
2568 in SFmode, DFmode and XFmode */
2569 {6, 6, 10}, /* cost of storing fp registers
2570 in SFmode, DFmode and XFmode */
2571 2, /* cost of moving MMX register */
2572 {6, 6}, /* cost of loading MMX registers
2573 in SImode and DImode */
2574 {6, 6}, /* cost of storing MMX registers
2575 in SImode and DImode */
2576 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2577 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2578 in 32,64,128,256 and 512-bit */
2579 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2580 in 32,64,128,256 and 512-bit */
2581 6, 6, /* SSE->integer and integer->SSE moves */
2582 /* End of register allocator costs. */
2585 COSTS_N_INSNS (1), /* cost of an add instruction */
2586 /* On all chips taken into consideration lea is 2 cycles or more. With
2587 this cost however our current implementation of synth_mult results in
2588 use of unnecessary temporary registers causing regression on several
2589 SPECfp benchmarks. */
2590 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2591 COSTS_N_INSNS (1), /* variable shift costs */
2592 COSTS_N_INSNS (1), /* constant shift costs */
2593 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2594 COSTS_N_INSNS (4), /* HI */
2595 COSTS_N_INSNS (3), /* SI */
2596 /* Here we tune for Sandybridge or newer. */
2597 COSTS_N_INSNS (3), /* DI */
2598 COSTS_N_INSNS (3)}, /* other */
2599 0, /* cost of multiply per each bit set */
2600 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2601 model is not realistic. We compensate by increasing the latencies a bit. */
2602 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2603 COSTS_N_INSNS (11), /* HI */
2604 COSTS_N_INSNS (14), /* SI */
2605 COSTS_N_INSNS (81), /* DI */
2606 COSTS_N_INSNS (81)}, /* other */
2607 COSTS_N_INSNS (1), /* cost of movsx */
2608 COSTS_N_INSNS (1), /* cost of movzx */
2609 8, /* "large" insn */
2610 17, /* MOVE_RATIO */
2611 6, /* CLEAR_RATIO */
2612 {4, 4, 4}, /* cost of loading integer registers
2613 in QImode, HImode and SImode.
2614 Relative to reg-reg move (2). */
2615 {6, 6, 6}, /* cost of storing integer registers */
2616 {6, 6, 6, 6, 12}, /* cost of loading SSE register
2617 in 32bit, 64bit, 128bit, 256bit and 512bit */
2618 {6, 6, 6, 6, 12}, /* cost of storing SSE register
2619 in 32bit, 64bit, 128bit, 256bit and 512bit */
2620 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2621 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2622 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2623 2, /* cost of moving SSE register to integer. */
2624 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops
2626 (NOTE(review): the mnemonic is repeated and the second throughput figure is missing; this comment looks truncated -- confirm against the canonical file). So 5 uops statically and one uop per load. */
2627 10, 6, /* Gather load static, per_elt. */
2628 10, 6, /* Gather store static, per_elt. */
2629 64, /* size of l1 cache. */
2630 512, /* size of l2 cache. */
2631 64, /* size of prefetch block */
2632 6, /* number of parallel prefetches */
2633 /* FIXME perhaps more appropriate value is 5. */
2634 3, /* Branch cost */
2635 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2636 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2638 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2639 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2640 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2641 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
2643 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2644 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2645 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2646 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2647 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2648 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2649 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2650 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2651 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2652 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2653 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2656 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2657 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2658 "16:11:8", /* Loop alignment. */
2659 "16:11:8", /* Jump alignment. */
2660 "0:0:8", /* Label alignment. */
2661 "16", /* Func alignment. */