]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/i386/x86-tune-costs.h
i386: Add clear_ratio to processor_costs
[thirdparty/gcc.git] / gcc / config / i386 / x86-tune-costs.h
1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2019 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
19
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes, so COSTS_N_BYTES (2) for a 2-byte add equals COSTS_N_INSNS (1). */
26 #define COSTS_N_BYTES(N) ((N) * 2)
27
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} /* Placeholder stringop_algs initializer that names only libcall; used below as the second entry of most per-CPU tables. */
29
30 static stringop_algs ix86_size_memcpy[2] = { /* memcpy strategy referenced by ix86_size_cost: both entries name rep_prefix_1_byte only.  Presumably entry 0 is the 32-bit and entry 1 the 64-bit variant -- TODO confirm. */
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
33 static stringop_algs ix86_size_memset[2] = { /* memset strategy referenced by ix86_size_cost: both entries name rep_prefix_1_byte only, mirroring ix86_size_memcpy. */
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36
37 const
38 struct processor_costs ix86_size_cost = {/* Costs for tuning for size: the instruction costs below are written with COSTS_N_BYTES rather than COSTS_N_INSNS. */
39 {
40 /* Start of register allocator costs. integer->integer move cost is 2. */
41 2, /* cost for loading QImode using movzbl */
42 {2, 2, 2}, /* cost of loading integer registers
43 in QImode, HImode and SImode.
44 Relative to reg-reg move (2). */
45 {2, 2, 2}, /* cost of storing integer registers */
46 2, /* cost of reg,reg fld/fst */
47 {2, 2, 2}, /* cost of loading fp registers
48 in SFmode, DFmode and XFmode */
49 {2, 2, 2}, /* cost of storing fp registers
50 in SFmode, DFmode and XFmode */
51 3, /* cost of moving MMX register */
52 {3, 3}, /* cost of loading MMX registers
53 in SImode and DImode */
54 {3, 3}, /* cost of storing MMX registers
55 in SImode and DImode */
56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
58 in 32,64,128,256 and 512-bit */
59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
60 in 32,64,128,256 and 512-bit */
61 3, 3, /* SSE->integer and integer->SSE moves */
62 /* End of register allocator costs. */
63 },
64
65 COSTS_N_BYTES (2), /* cost of an add instruction */
66 COSTS_N_BYTES (3), /* cost of a lea instruction */
67 COSTS_N_BYTES (2), /* variable shift costs */
68 COSTS_N_BYTES (3), /* constant shift costs */
69 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
70 COSTS_N_BYTES (3), /* HI */
71 COSTS_N_BYTES (3), /* SI */
72 COSTS_N_BYTES (3), /* DI */
73 COSTS_N_BYTES (5)}, /* other */
74 0, /* cost of multiply per each bit set */
75 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
76 COSTS_N_BYTES (3), /* HI */
77 COSTS_N_BYTES (3), /* SI */
78 COSTS_N_BYTES (3), /* DI */
79 COSTS_N_BYTES (5)}, /* other */
80 COSTS_N_BYTES (3), /* cost of movsx */
81 COSTS_N_BYTES (3), /* cost of movzx */
82 0, /* "large" insn */
83 2, /* MOVE_RATIO */
84 2, /* CLEAR_RATIO */
85 {2, 2, 2}, /* cost of loading integer registers
86 in QImode, HImode and SImode.
87 Relative to reg-reg move (2). */
88 {2, 2, 2}, /* cost of storing integer registers */
89 {3, 3, 3, 3, 3}, /* cost of loading SSE register
90 in 32bit, 64bit, 128bit, 256bit and 512bit */
91 {3, 3, 3, 3, 3}, /* cost of storing SSE register
92 in 32bit, 64bit, 128bit, 256bit and 512bit */
93 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load; five entries, presumably
94 32bit, 64bit, 128bit, 256bit and 512bit as for the aligned case (TODO confirm) */
95 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store; five entries, presumably
96 32bit, 64bit, 128bit, 256bit and 512bit as for the aligned case (TODO confirm) */
97 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
98 3, /* cost of moving SSE register to integer. */
99 5, 0, /* Gather load static, per_elt. */
100 5, 0, /* Gather store static, per_elt. */
101 0, /* size of l1 cache */
102 0, /* size of l2 cache */
103 0, /* size of prefetch block */
104 0, /* number of parallel prefetches */
105 2, /* Branch cost */
106 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
107 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
108 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
109 COSTS_N_BYTES (2), /* cost of FABS instruction. */
110 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
111 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
112
113 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
114 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
115 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
116 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
117 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
118 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
119 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
120 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
121 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
122 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
123 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
124 ix86_size_memcpy,
125 ix86_size_memset,
126 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
127 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
128 NULL, /* Loop alignment (none given). */
129 NULL, /* Jump alignment (none given). */
130 NULL, /* Label alignment (none given). */
131 NULL, /* Func alignment (none given). */
132 };
133
134 /* Processor costs (relative to an add) */
135 static stringop_algs i386_memcpy[2] = { /* 386 memcpy strategy: entry 0 names rep_prefix_1_byte only; entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
136 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
137 DUMMY_STRINGOP_ALGS};
138 static stringop_algs i386_memset[2] = { /* 386 memset strategy: entry 0 names rep_prefix_1_byte only; entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
139 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 DUMMY_STRINGOP_ALGS};
141
142 static const
143 struct processor_costs i386_cost = { /* 386 specific costs; instruction costs are written with COSTS_N_INSNS. */
144 {
145 /* Start of register allocator costs. integer->integer move cost is 2. */
146 4, /* cost for loading QImode using movzbl */
147 {2, 4, 2}, /* cost of loading integer registers
148 in QImode, HImode and SImode.
149 Relative to reg-reg move (2). */
150 {2, 4, 2}, /* cost of storing integer registers */
151 2, /* cost of reg,reg fld/fst */
152 {8, 8, 8}, /* cost of loading fp registers
153 in SFmode, DFmode and XFmode */
154 {8, 8, 8}, /* cost of storing fp registers
155 in SFmode, DFmode and XFmode */
156 2, /* cost of moving MMX register */
157 {4, 8}, /* cost of loading MMX registers
158 in SImode and DImode */
159 {4, 8}, /* cost of storing MMX registers
160 in SImode and DImode */
161 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
162 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
163 in 32,64,128,256 and 512-bit */
164 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
165 in 32,64,128,256 and 512-bit */
166 3, 3, /* SSE->integer and integer->SSE moves */
167 /* End of register allocator costs. */
168 },
169
170 COSTS_N_INSNS (1), /* cost of an add instruction */
171 COSTS_N_INSNS (1), /* cost of a lea instruction */
172 COSTS_N_INSNS (3), /* variable shift costs */
173 COSTS_N_INSNS (2), /* constant shift costs */
174 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
175 COSTS_N_INSNS (6), /* HI */
176 COSTS_N_INSNS (6), /* SI */
177 COSTS_N_INSNS (6), /* DI */
178 COSTS_N_INSNS (6)}, /* other */
179 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
180 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
181 COSTS_N_INSNS (23), /* HI */
182 COSTS_N_INSNS (23), /* SI */
183 COSTS_N_INSNS (23), /* DI */
184 COSTS_N_INSNS (23)}, /* other */
185 COSTS_N_INSNS (3), /* cost of movsx */
186 COSTS_N_INSNS (2), /* cost of movzx */
187 15, /* "large" insn */
188 3, /* MOVE_RATIO */
189 3, /* CLEAR_RATIO */
190 {2, 4, 2}, /* cost of loading integer registers
191 in QImode, HImode and SImode.
192 Relative to reg-reg move (2). Same values as the register allocator entry above. */
193 {2, 4, 2}, /* cost of storing integer registers */
194 {4, 8, 16, 32, 64}, /* cost of loading SSE register
195 in 32bit, 64bit, 128bit, 256bit and 512bit */
196 {4, 8, 16, 32, 64}, /* cost of storing SSE register
197 in 32bit, 64bit, 128bit, 256bit and 512bit */
198 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
199 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
200 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
201 3, /* cost of moving SSE register to integer. */
202 4, 4, /* Gather load static, per_elt. */
203 4, 4, /* Gather store static, per_elt. */
204 0, /* size of l1 cache */
205 0, /* size of l2 cache */
206 0, /* size of prefetch block */
207 0, /* number of parallel prefetches */
208 1, /* Branch cost */
209 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
210 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
211 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
212 COSTS_N_INSNS (22), /* cost of FABS instruction. */
213 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
214 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
215
216 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
217 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
218 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
219 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
220 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
221 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
222 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
223 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
224 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
225 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
226 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
227 i386_memcpy,
228 i386_memset,
229 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
230 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
231 "4", /* Loop alignment. */
232 "4", /* Jump alignment. */
233 NULL, /* Label alignment (none given). */
234 "4", /* Func alignment. */
235 };
236
237 static stringop_algs i486_memcpy[2] = { /* 486 memcpy strategy: entry 0 names rep_prefix_4_byte only; entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
238 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
239 DUMMY_STRINGOP_ALGS};
240 static stringop_algs i486_memset[2] = { /* 486 memset strategy: entry 0 names rep_prefix_4_byte only; entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
241 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
242 DUMMY_STRINGOP_ALGS};
243
244 static const
245 struct processor_costs i486_cost = { /* 486 specific costs; instruction costs are written with COSTS_N_INSNS. */
246 {
247 /* Start of register allocator costs. integer->integer move cost is 2. */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
264 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
265 in 32,64,128,256 and 512-bit */
266 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
267 in 32,64,128,256 and 512-bit */
268 3, 3, /* SSE->integer and integer->SSE moves */
269 /* End of register allocator costs. */
270 },
271
272 COSTS_N_INSNS (1), /* cost of an add instruction */
273 COSTS_N_INSNS (1), /* cost of a lea instruction */
274 COSTS_N_INSNS (3), /* variable shift costs */
275 COSTS_N_INSNS (2), /* constant shift costs */
276 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
277 COSTS_N_INSNS (12), /* HI */
278 COSTS_N_INSNS (12), /* SI */
279 COSTS_N_INSNS (12), /* DI */
280 COSTS_N_INSNS (12)}, /* other */
281 1, /* cost of multiply per each bit set.  NOTE(review): plain 1 here, whereas i386_cost writes COSTS_N_INSNS (1) for this field -- confirm the intended units. */
282 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
283 COSTS_N_INSNS (40), /* HI */
284 COSTS_N_INSNS (40), /* SI */
285 COSTS_N_INSNS (40), /* DI */
286 COSTS_N_INSNS (40)}, /* other */
287 COSTS_N_INSNS (3), /* cost of movsx */
288 COSTS_N_INSNS (2), /* cost of movzx */
289 15, /* "large" insn */
290 3, /* MOVE_RATIO */
291 3, /* CLEAR_RATIO */
292 {2, 4, 2}, /* cost of loading integer registers
293 in QImode, HImode and SImode.
294 Relative to reg-reg move (2). Same values as the register allocator entry above. */
295 {2, 4, 2}, /* cost of storing integer registers */
296 {4, 8, 16, 32, 64}, /* cost of loading SSE register
297 in 32bit, 64bit, 128bit, 256bit and 512bit */
298 {4, 8, 16, 32, 64}, /* cost of storing SSE register
299 in 32bit, 64bit, 128bit, 256bit and 512bit */
300 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
301 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
302 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
303 3, /* cost of moving SSE register to integer. */
304 4, 4, /* Gather load static, per_elt. */
305 4, 4, /* Gather store static, per_elt. */
306 4, /* size of l1 cache. 486 has 8kB cache
307 shared for code and data, so 4kB is
308 not really precise. */
309 4, /* size of l2 cache */
310 0, /* size of prefetch block */
311 0, /* number of parallel prefetches */
312 1, /* Branch cost */
313 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
314 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
315 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
316 COSTS_N_INSNS (3), /* cost of FABS instruction. */
317 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
318 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
319
320 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
321 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
322 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
323 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
324 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
325 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
326 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
327 COSTS_N_INSNS (74), /* cost of DIVSD instruction.  NOTE(review): one more than the FDIV/DIVSS value (73) in this table -- confirm intentional. */
328 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
329 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
330 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
331 i486_memcpy,
332 i486_memset,
333 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
334 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
335 "16", /* Loop alignment. */
336 "16", /* Jump alignment. */
337 "0:0:8", /* Label alignment. */
338 "16", /* Func alignment. */
339 };
340
341 static stringop_algs pentium_memcpy[2] = { /* Pentium memcpy strategy (also referenced by lakemont_cost): entry 0 pairs rep_prefix_4_byte with bound 256 and libcall with bound -1 -- presumably "up to 256 bytes" and "anything larger", TODO confirm; entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
342 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
343 DUMMY_STRINGOP_ALGS};
344 static stringop_algs pentium_memset[2] = { /* Pentium memset strategy (also referenced by lakemont_cost): entry 0 lists rep_prefix_4_byte with bound -1; entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
345 {libcall, {{-1, rep_prefix_4_byte, false}}},
346 DUMMY_STRINGOP_ALGS};
347
348 static const
349 struct processor_costs pentium_cost = { /* Pentium specific costs; instruction costs are written with COSTS_N_INSNS. */
350 {
351 /* Start of register allocator costs. integer->integer move cost is 2. */
352 6, /* cost for loading QImode using movzbl */
353 {2, 4, 2}, /* cost of loading integer registers
354 in QImode, HImode and SImode.
355 Relative to reg-reg move (2). */
356 {2, 4, 2}, /* cost of storing integer registers */
357 2, /* cost of reg,reg fld/fst */
358 {2, 2, 6}, /* cost of loading fp registers
359 in SFmode, DFmode and XFmode */
360 {4, 4, 6}, /* cost of storing fp registers
361 in SFmode, DFmode and XFmode */
362 8, /* cost of moving MMX register */
363 {8, 8}, /* cost of loading MMX registers
364 in SImode and DImode */
365 {8, 8}, /* cost of storing MMX registers
366 in SImode and DImode */
367 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
368 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
369 in 32,64,128,256 and 512-bit */
370 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
371 in 32,64,128,256 and 512-bit */
372 3, 3, /* SSE->integer and integer->SSE moves */
373 /* End of register allocator costs. */
374 },
375
376 COSTS_N_INSNS (1), /* cost of an add instruction */
377 COSTS_N_INSNS (1), /* cost of a lea instruction */
378 COSTS_N_INSNS (4), /* variable shift costs */
379 COSTS_N_INSNS (1), /* constant shift costs */
380 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
381 COSTS_N_INSNS (11), /* HI */
382 COSTS_N_INSNS (11), /* SI */
383 COSTS_N_INSNS (11), /* DI */
384 COSTS_N_INSNS (11)}, /* other */
385 0, /* cost of multiply per each bit set */
386 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
387 COSTS_N_INSNS (25), /* HI */
388 COSTS_N_INSNS (25), /* SI */
389 COSTS_N_INSNS (25), /* DI */
390 COSTS_N_INSNS (25)}, /* other */
391 COSTS_N_INSNS (3), /* cost of movsx */
392 COSTS_N_INSNS (2), /* cost of movzx */
393 8, /* "large" insn */
394 6, /* MOVE_RATIO */
395 6, /* CLEAR_RATIO */
396 {2, 4, 2}, /* cost of loading integer registers
397 in QImode, HImode and SImode.
398 Relative to reg-reg move (2). Same values as the register allocator entry above. */
399 {2, 4, 2}, /* cost of storing integer registers */
400 {4, 8, 16, 32, 64}, /* cost of loading SSE register
401 in 32bit, 64bit, 128bit, 256bit and 512bit */
402 {4, 8, 16, 32, 64}, /* cost of storing SSE register
403 in 32bit, 64bit, 128bit, 256bit and 512bit */
404 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
405 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
406 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
407 3, /* cost of moving SSE register to integer. */
408 4, 4, /* Gather load static, per_elt. */
409 4, 4, /* Gather store static, per_elt. */
410 8, /* size of l1 cache. */
411 8, /* size of l2 cache */
412 0, /* size of prefetch block */
413 0, /* number of parallel prefetches */
414 2, /* Branch cost */
415 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
416 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
417 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
418 COSTS_N_INSNS (1), /* cost of FABS instruction. */
419 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
420 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
421
422 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
423 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
424 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
425 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
426 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
427 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
428 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
429 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
430 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
431 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
432 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
433 pentium_memcpy,
434 pentium_memset,
435 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
436 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
437 "16:8:8", /* Loop alignment. */
438 "16:8:8", /* Jump alignment. */
439 "0:0:8", /* Label alignment. */
440 "16", /* Func alignment. */
441 };
442
443 static const
444 struct processor_costs lakemont_cost = { /* Lakemont specific costs; string operations reuse pentium_memcpy and pentium_memset. */
445 {
446 /* Start of register allocator costs. integer->integer move cost is 2. */
447 6, /* cost for loading QImode using movzbl */
448 {2, 4, 2}, /* cost of loading integer registers
449 in QImode, HImode and SImode.
450 Relative to reg-reg move (2). */
451 {2, 4, 2}, /* cost of storing integer registers */
452 2, /* cost of reg,reg fld/fst */
453 {2, 2, 6}, /* cost of loading fp registers
454 in SFmode, DFmode and XFmode */
455 {4, 4, 6}, /* cost of storing fp registers
456 in SFmode, DFmode and XFmode */
457 8, /* cost of moving MMX register */
458 {8, 8}, /* cost of loading MMX registers
459 in SImode and DImode */
460 {8, 8}, /* cost of storing MMX registers
461 in SImode and DImode */
462 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
463 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
464 in 32,64,128,256 and 512-bit */
465 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
466 in 32,64,128,256 and 512-bit */
467 3, 3, /* SSE->integer and integer->SSE moves */
468 /* End of register allocator costs. */
469 },
470
471 COSTS_N_INSNS (1), /* cost of an add instruction */
472 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; the + 1 makes it marginally more expensive than the add above */
473 COSTS_N_INSNS (1), /* variable shift costs */
474 COSTS_N_INSNS (1), /* constant shift costs */
475 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
476 COSTS_N_INSNS (11), /* HI */
477 COSTS_N_INSNS (11), /* SI */
478 COSTS_N_INSNS (11), /* DI */
479 COSTS_N_INSNS (11)}, /* other */
480 0, /* cost of multiply per each bit set */
481 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
482 COSTS_N_INSNS (25), /* HI */
483 COSTS_N_INSNS (25), /* SI */
484 COSTS_N_INSNS (25), /* DI */
485 COSTS_N_INSNS (25)}, /* other */
486 COSTS_N_INSNS (3), /* cost of movsx */
487 COSTS_N_INSNS (2), /* cost of movzx */
488 8, /* "large" insn */
489 17, /* MOVE_RATIO */
490 6, /* CLEAR_RATIO */
491 {2, 4, 2}, /* cost of loading integer registers
492 in QImode, HImode and SImode.
493 Relative to reg-reg move (2). Same values as the register allocator entry above. */
494 {2, 4, 2}, /* cost of storing integer registers */
495 {4, 8, 16, 32, 64}, /* cost of loading SSE register
496 in 32bit, 64bit, 128bit, 256bit and 512bit */
497 {4, 8, 16, 32, 64}, /* cost of storing SSE register
498 in 32bit, 64bit, 128bit, 256bit and 512bit */
499 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
500 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
501 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
502 3, /* cost of moving SSE register to integer. */
503 4, 4, /* Gather load static, per_elt. */
504 4, 4, /* Gather store static, per_elt. */
505 8, /* size of l1 cache. */
506 8, /* size of l2 cache */
507 0, /* size of prefetch block */
508 0, /* number of parallel prefetches */
509 2, /* Branch cost */
510 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
511 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
512 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
513 COSTS_N_INSNS (1), /* cost of FABS instruction. */
514 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
515 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
516
517 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
518 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
519 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
520 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
521 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
522 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
523 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
524 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
525 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
526 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
527 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
528 pentium_memcpy, /* shared with pentium_cost */
529 pentium_memset, /* shared with pentium_cost */
530 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
531 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
532 "16:8:8", /* Loop alignment. */
533 "16:8:8", /* Jump alignment. */
534 "0:0:8", /* Label alignment. */
535 "16", /* Func alignment. */
536 };
537
538 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
539 (we ensure the alignment). For small blocks an inline loop is still a
540 noticeable win; for bigger blocks either rep movsl or rep movsb is the
541 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
542 but after 4K the difference is down in the noise. */
543 static stringop_algs pentiumpro_memcpy[2] = { /* Entry 0 lists loop (128), unrolled_loop (1024), rep_prefix_4_byte (8192), then rep_prefix_1_byte (-1); the numbers are presumably upper block sizes in bytes with -1 as the catch-all -- TODO confirm.  Entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
544 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
545 {8192, rep_prefix_4_byte, false},
546 {-1, rep_prefix_1_byte, false}}},
547 DUMMY_STRINGOP_ALGS};
548 static stringop_algs pentiumpro_memset[2] = { /* PentiumPro memset strategy: entry 0 lists unrolled_loop (1024), rep_prefix_4_byte (8192), then libcall (-1); entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
549 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
550 {8192, rep_prefix_4_byte, false},
551 {-1, libcall, false}}},
552 DUMMY_STRINGOP_ALGS};
553 static const
554 struct processor_costs pentiumpro_cost = { /* PentiumPro specific costs; instruction costs are written with COSTS_N_INSNS. */
555 {
556 /* Start of register allocator costs. integer->integer move cost is 2. */
557 2, /* cost for loading QImode using movzbl */
558 {4, 4, 4}, /* cost of loading integer registers
559 in QImode, HImode and SImode.
560 Relative to reg-reg move (2). */
561 {2, 2, 2}, /* cost of storing integer registers */
562 2, /* cost of reg,reg fld/fst */
563 {2, 2, 6}, /* cost of loading fp registers
564 in SFmode, DFmode and XFmode */
565 {4, 4, 6}, /* cost of storing fp registers
566 in SFmode, DFmode and XFmode */
567 2, /* cost of moving MMX register */
568 {2, 2}, /* cost of loading MMX registers
569 in SImode and DImode */
570 {2, 2}, /* cost of storing MMX registers
571 in SImode and DImode */
572 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
573 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
574 in 32,64,128,256 and 512-bit */
575 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
576 in 32,64,128,256 and 512-bit */
577 3, 3, /* SSE->integer and integer->SSE moves */
578 /* End of register allocator costs. */
579 },
580
581 COSTS_N_INSNS (1), /* cost of an add instruction */
582 COSTS_N_INSNS (1), /* cost of a lea instruction */
583 COSTS_N_INSNS (1), /* variable shift costs */
584 COSTS_N_INSNS (1), /* constant shift costs */
585 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
586 COSTS_N_INSNS (4), /* HI */
587 COSTS_N_INSNS (4), /* SI */
588 COSTS_N_INSNS (4), /* DI */
589 COSTS_N_INSNS (4)}, /* other */
590 0, /* cost of multiply per each bit set */
591 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
592 COSTS_N_INSNS (17), /* HI */
593 COSTS_N_INSNS (17), /* SI */
594 COSTS_N_INSNS (17), /* DI */
595 COSTS_N_INSNS (17)}, /* other */
596 COSTS_N_INSNS (1), /* cost of movsx */
597 COSTS_N_INSNS (1), /* cost of movzx */
598 8, /* "large" insn */
599 6, /* MOVE_RATIO */
600 6, /* CLEAR_RATIO */
601 {4, 4, 4}, /* cost of loading integer registers
602 in QImode, HImode and SImode.
603 Relative to reg-reg move (2). Same values as the register allocator entry above. */
604 {2, 2, 2}, /* cost of storing integer registers */
605 {4, 8, 16, 32, 64}, /* cost of loading SSE register
606 in 32bit, 64bit, 128bit, 256bit and 512bit */
607 {4, 8, 16, 32, 64}, /* cost of storing SSE register
608 in 32bit, 64bit, 128bit, 256bit and 512bit */
609 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
610 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
611 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
612 3, /* cost of moving SSE register to integer. */
613 4, 4, /* Gather load static, per_elt. */
614 4, 4, /* Gather store static, per_elt. */
615 8, /* size of l1 cache. */
616 256, /* size of l2 cache */
617 32, /* size of prefetch block */
618 6, /* number of parallel prefetches */
619 2, /* Branch cost */
620 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
621 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
622 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
623 COSTS_N_INSNS (2), /* cost of FABS instruction. */
624 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
625 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
626
627 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
628 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
629 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
630 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
631 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
632 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
633 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
634 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
635 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
636 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
637 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
638 pentiumpro_memcpy,
639 pentiumpro_memset,
640 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
641 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
642 "16", /* Loop alignment. */
643 "16:11:8", /* Jump alignment. */
644 "0:0:8", /* Label alignment. */
645 "16", /* Func alignment. */
646 };
647
648 static stringop_algs geode_memcpy[2] = { /* Geode memcpy strategy: entry 0 pairs rep_prefix_4_byte with bound 256 and libcall with bound -1; entry 1 is the DUMMY_STRINGOP_ALGS placeholder. */
649 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
650 DUMMY_STRINGOP_ALGS};
651 static stringop_algs geode_memset[2] = { /* Geode memset strategy: same initializer as geode_memcpy (rep_prefix_4_byte with bound 256, libcall with bound -1, then the placeholder). */
652 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
653 DUMMY_STRINGOP_ALGS};
654 static const
655 struct processor_costs geode_cost = { /* Geode specific costs; instruction costs are written with COSTS_N_INSNS. */
656 {
657 /* Start of register allocator costs. integer->integer move cost is 2. */
658 2, /* cost for loading QImode using movzbl */
659 {2, 2, 2}, /* cost of loading integer registers
660 in QImode, HImode and SImode.
661 Relative to reg-reg move (2). */
662 {2, 2, 2}, /* cost of storing integer registers */
663 2, /* cost of reg,reg fld/fst */
664 {2, 2, 2}, /* cost of loading fp registers
665 in SFmode, DFmode and XFmode */
666 {4, 6, 6}, /* cost of storing fp registers
667 in SFmode, DFmode and XFmode */
668 2, /* cost of moving MMX register */
669 {2, 2}, /* cost of loading MMX registers
670 in SImode and DImode */
671 {2, 2}, /* cost of storing MMX registers
672 in SImode and DImode */
673 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
674 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
675 in 32,64,128,256 and 512-bit */
676 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
677 in 32,64,128,256 and 512-bit */
678 6, 6, /* SSE->integer and integer->SSE moves */
679 /* End of register allocator costs. */
680 },
681
682 COSTS_N_INSNS (1), /* cost of an add instruction */
683 COSTS_N_INSNS (1), /* cost of a lea instruction */
684 COSTS_N_INSNS (2), /* variable shift costs */
685 COSTS_N_INSNS (1), /* constant shift costs */
686 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
687 COSTS_N_INSNS (4), /* HI */
688 COSTS_N_INSNS (7), /* SI */
689 COSTS_N_INSNS (7), /* DI */
690 COSTS_N_INSNS (7)}, /* other */
691 0, /* cost of multiply per each bit set */
692 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
693 COSTS_N_INSNS (23), /* HI */
694 COSTS_N_INSNS (39), /* SI */
695 COSTS_N_INSNS (39), /* DI */
696 COSTS_N_INSNS (39)}, /* other */
697 COSTS_N_INSNS (1), /* cost of movsx */
698 COSTS_N_INSNS (1), /* cost of movzx */
699 8, /* "large" insn */
700 4, /* MOVE_RATIO */
701 4, /* CLEAR_RATIO */
702 {2, 2, 2}, /* cost of loading integer registers
703 in QImode, HImode and SImode.
704 Relative to reg-reg move (2). Same values as the register allocator entry above. */
705 {2, 2, 2}, /* cost of storing integer registers */
706 {2, 2, 8, 16, 32}, /* cost of loading SSE register
707 in 32bit, 64bit, 128bit, 256bit and 512bit */
708 {2, 2, 8, 16, 32}, /* cost of storing SSE register
709 in 32bit, 64bit, 128bit, 256bit and 512bit */
710 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
711 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
712 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
713 6, /* cost of moving SSE register to integer. */
714 2, 2, /* Gather load static, per_elt. */
715 2, 2, /* Gather store static, per_elt. */
716 64, /* size of l1 cache. */
717 128, /* size of l2 cache. */
718 32, /* size of prefetch block */
719 1, /* number of parallel prefetches */
720 1, /* Branch cost */
721 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
722 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
723 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
724 COSTS_N_INSNS (1), /* cost of FABS instruction. */
725 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
726 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
727
728 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
729 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
730 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
731 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
732 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
733 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
734 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
735 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
736 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
737 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
738 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
739 geode_memcpy,
740 geode_memset,
741 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
742 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
743 NULL, /* Loop alignment (none given). */
744 NULL, /* Jump alignment (none given). */
745 NULL, /* Label alignment (none given). */
746 NULL, /* Func alignment (none given). */
747 };
748
/* memcpy strategy table for k6.  Entry 0 names libcall first and then lists
   rep_prefix_4_byte with the number 256 and libcall with the number -1;
   entry 1 is the DUMMY_STRINGOP_ALGS (libcall-only) placeholder.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
749 static stringop_algs k6_memcpy[2] = {
750 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
751 DUMMY_STRINGOP_ALGS};
/* memset strategy table for k6.  Entry 0 names libcall first and then lists
   rep_prefix_4_byte with the number 256 and libcall with the number -1;
   entry 1 is the DUMMY_STRINGOP_ALGS (libcall-only) placeholder.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
752 static stringop_algs k6_memset[2] = {
753 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
754 DUMMY_STRINGOP_ALGS};
/* Cost table for k6 tuning.  The initializer is positional: a nested block
   of register allocator move costs comes first, followed by instruction
   costs (wrapped in COSTS_N_INSNS), MOVE_RATIO/CLEAR_RATIO, load/store and
   gather costs, cache and prefetch parameters, branch cost, x87 and SSE
   arithmetic costs, reassociation widths, the k6_memcpy/k6_memset tables,
   conditional branch costs and the alignment strings.  Values must stay in
   this order to line up with the members of struct processor_costs.  */
755 static const
756 struct processor_costs k6_cost = {
757 {
758 /* Start of register allocator costs. integer->integer move cost is 2. */
759 3, /* cost for loading QImode using movzbl */
760 {4, 5, 4}, /* cost of loading integer registers
761 in QImode, HImode and SImode.
762 Relative to reg-reg move (2). */
763 {2, 3, 2}, /* cost of storing integer registers */
764 4, /* cost of reg,reg fld/fst */
765 {6, 6, 6}, /* cost of loading fp registers
766 in SFmode, DFmode and XFmode */
767 {4, 4, 4}, /* cost of storing fp registers
768 in SFmode, DFmode and XFmode */
769 2, /* cost of moving MMX register */
770 {2, 2}, /* cost of loading MMX registers
771 in SImode and DImode */
772 {2, 2}, /* cost of storing MMX registers
773 in SImode and DImode */
774 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
775 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
776 in 32,64,128,256 and 512-bit */
777 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
778 in 32,64,128,256 and 512-bit */
779 6, 6, /* SSE->integer and integer->SSE moves */
780 /* End of register allocator costs. */
781 },
782
783 COSTS_N_INSNS (1), /* cost of an add instruction */
784 COSTS_N_INSNS (2), /* cost of a lea instruction */
785 COSTS_N_INSNS (1), /* variable shift costs */
786 COSTS_N_INSNS (1), /* constant shift costs */
787 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
788 COSTS_N_INSNS (3), /* HI */
789 COSTS_N_INSNS (3), /* SI */
790 COSTS_N_INSNS (3), /* DI */
791 COSTS_N_INSNS (3)}, /* other */
792 0, /* cost of multiply per each bit set */
793 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
794 COSTS_N_INSNS (18), /* HI */
795 COSTS_N_INSNS (18), /* SI */
796 COSTS_N_INSNS (18), /* DI */
797 COSTS_N_INSNS (18)}, /* other */
798 COSTS_N_INSNS (2), /* cost of movsx */
799 COSTS_N_INSNS (2), /* cost of movzx */
800 8, /* "large" insn */
801 4, /* MOVE_RATIO */
802 4, /* CLEAR_RATIO */
803 {4, 5, 4}, /* cost of loading integer registers
804 in QImode, HImode and SImode.
805 Relative to reg-reg move (2). */
806 {2, 3, 2}, /* cost of storing integer registers */
807 {2, 2, 8, 16, 32}, /* cost of loading SSE register
808 in 32bit, 64bit, 128bit, 256bit and 512bit */
809 {2, 2, 8, 16, 32}, /* cost of storing SSE register
810 in 32bit, 64bit, 128bit, 256bit and 512bit */
811 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
812 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
813 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
814 6, /* cost of moving SSE register to integer. */
815 2, 2, /* Gather load static, per_elt. */
816 2, 2, /* Gather store static, per_elt. */
817 32, /* size of l1 cache. */
818 32, /* size of l2 cache. Some models
819 have integrated l2 cache, but
820 optimizing for k6 is not important
821 enough to worry about that. */
822 32, /* size of prefetch block */
823 1, /* number of parallel prefetches */
824 1, /* Branch cost */
825 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
826 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
827 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
828 COSTS_N_INSNS (2), /* cost of FABS instruction. */
829 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
830 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
831
832 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
833 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
834 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
835 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
836 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
837 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
838 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
839 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
840 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
841 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
842 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
843 k6_memcpy,
844 k6_memset,
845 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
846 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
847 "32:8:8", /* Loop alignment. */
848 "32:8:8", /* Jump alignment. */
849 "0:0:8", /* Label alignment. */
850 "32", /* Func alignment. */
851 };
852
853 /* For some reason, Athlon deals better with REP prefix (relative to loops)
854 compared to K8. Alignment becomes important after 8 bytes for memcpy and
855 128 bytes for memset. */
/* memcpy strategy table for Athlon.  Entry 0 names libcall first and then
   lists rep_prefix_4_byte with the number 2048 and libcall with the number
   -1; entry 1 is the DUMMY_STRINGOP_ALGS (libcall-only) placeholder.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
856 static stringop_algs athlon_memcpy[2] = {
857 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
858 DUMMY_STRINGOP_ALGS};
/* memset strategy table for Athlon.  Entry 0 names libcall first and then
   lists rep_prefix_4_byte with the number 2048 and libcall with the number
   -1; entry 1 is the DUMMY_STRINGOP_ALGS (libcall-only) placeholder.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
859 static stringop_algs athlon_memset[2] = {
860 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
861 DUMMY_STRINGOP_ALGS};
/* Cost table for Athlon tuning.  The initializer is positional: a nested
   block of register allocator move costs comes first, followed by
   instruction costs (wrapped in COSTS_N_INSNS), MOVE_RATIO/CLEAR_RATIO,
   load/store and gather costs, cache and prefetch parameters, branch cost,
   x87 and SSE arithmetic costs, reassociation widths, the
   athlon_memcpy/athlon_memset tables, conditional branch costs and the
   alignment strings.  Values must stay in this order to line up with the
   members of struct processor_costs.  */
862 static const
863 struct processor_costs athlon_cost = {
864 {
865 /* Start of register allocator costs. integer->integer move cost is 2. */
866 4, /* cost for loading QImode using movzbl */
867 {3, 4, 3}, /* cost of loading integer registers
868 in QImode, HImode and SImode.
869 Relative to reg-reg move (2). */
870 {3, 4, 3}, /* cost of storing integer registers */
871 4, /* cost of reg,reg fld/fst */
872 {4, 4, 12}, /* cost of loading fp registers
873 in SFmode, DFmode and XFmode */
874 {6, 6, 8}, /* cost of storing fp registers
875 in SFmode, DFmode and XFmode */
876 2, /* cost of moving MMX register */
877 {4, 4}, /* cost of loading MMX registers
878 in SImode and DImode */
879 {4, 4}, /* cost of storing MMX registers
880 in SImode and DImode */
881 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
882 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
883 in 32,64,128,256 and 512-bit */
884 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
885 in 32,64,128,256 and 512-bit */
886 5, 5, /* SSE->integer and integer->SSE moves */
887 /* End of register allocator costs. */
888 },
889
890 COSTS_N_INSNS (1), /* cost of an add instruction */
891 COSTS_N_INSNS (2), /* cost of a lea instruction */
892 COSTS_N_INSNS (1), /* variable shift costs */
893 COSTS_N_INSNS (1), /* constant shift costs */
894 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
895 COSTS_N_INSNS (5), /* HI */
896 COSTS_N_INSNS (5), /* SI */
897 COSTS_N_INSNS (5), /* DI */
898 COSTS_N_INSNS (5)}, /* other */
899 0, /* cost of multiply per each bit set */
900 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
901 COSTS_N_INSNS (26), /* HI */
902 COSTS_N_INSNS (42), /* SI */
903 COSTS_N_INSNS (74), /* DI */
904 COSTS_N_INSNS (74)}, /* other */
905 COSTS_N_INSNS (1), /* cost of movsx */
906 COSTS_N_INSNS (1), /* cost of movzx */
907 8, /* "large" insn */
908 9, /* MOVE_RATIO */
909 6, /* CLEAR_RATIO */
910 {3, 4, 3}, /* cost of loading integer registers
911 in QImode, HImode and SImode.
912 Relative to reg-reg move (2). */
913 {3, 4, 3}, /* cost of storing integer registers */
914 {4, 4, 12, 12, 24}, /* cost of loading SSE register
915 in 32bit, 64bit, 128bit, 256bit and 512bit */
916 {4, 4, 10, 10, 20}, /* cost of storing SSE register
917 in 32bit, 64bit, 128bit, 256bit and 512bit */
918 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
919 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
920 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
921 5, /* cost of moving SSE register to integer. */
922 4, 4, /* Gather load static, per_elt. */
923 4, 4, /* Gather store static, per_elt. */
924 64, /* size of l1 cache. */
925 256, /* size of l2 cache. */
926 64, /* size of prefetch block */
927 6, /* number of parallel prefetches */
928 5, /* Branch cost */
929 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
930 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
931 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
932 COSTS_N_INSNS (2), /* cost of FABS instruction. */
933 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
934 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
935
936 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
937 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
938 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
939 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
940 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
941 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
942 /* 11-16 (NOTE(review): presumably the latency range of the divide on the
   next line, with the upper bound used as its cost -- confirm).  */
943 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
944 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
945 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
946 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
947 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
948 athlon_memcpy,
949 athlon_memset,
950 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
951 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
952 "16:8:8", /* Loop alignment. */
953 "16:8:8", /* Jump alignment. */
954 "0:0:8", /* Label alignment. */
955 "16", /* Func alignment. */
956 };
957
958 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
959 small blocks it is better to use a loop. For large blocks, libcall can
960 do non-temporal accesses and beat inline expansion considerably. */
/* memcpy strategy table for k8.  Entry 0 lists loop (6), unrolled_loop (14)
   and rep_prefix_4_byte (-1); entry 1 lists loop (16), rep_prefix_8_byte
   (8192) and libcall (-1).  Both entries name libcall first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
961 static stringop_algs k8_memcpy[2] = {
962 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
963 {-1, rep_prefix_4_byte, false}}},
964 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
965 {-1, libcall, false}}}};
/* memset strategy table for k8.  Entry 0 lists loop (8), unrolled_loop (24),
   rep_prefix_4_byte (2048) and libcall (-1); entry 1 lists unrolled_loop
   (48), rep_prefix_8_byte (8192) and libcall (-1).  Both entries name
   libcall first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
966 static stringop_algs k8_memset[2] = {
967 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
968 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
969 {libcall, {{48, unrolled_loop, false},
970 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for k8 tuning.  The initializer is positional: a nested block
   of register allocator move costs comes first, followed by instruction
   costs (wrapped in COSTS_N_INSNS), MOVE_RATIO/CLEAR_RATIO, load/store and
   gather costs, cache and prefetch parameters, branch cost, x87 and SSE
   arithmetic costs, reassociation widths, the k8_memcpy/k8_memset tables,
   conditional branch costs and the alignment strings.  Values must stay in
   this order to line up with the members of struct processor_costs.  */
971 static const
972 struct processor_costs k8_cost = {
973 {
974 /* Start of register allocator costs. integer->integer move cost is 2. */
975 4, /* cost for loading QImode using movzbl */
976 {3, 4, 3}, /* cost of loading integer registers
977 in QImode, HImode and SImode.
978 Relative to reg-reg move (2). */
979 {3, 4, 3}, /* cost of storing integer registers */
980 4, /* cost of reg,reg fld/fst */
981 {4, 4, 12}, /* cost of loading fp registers
982 in SFmode, DFmode and XFmode */
983 {6, 6, 8}, /* cost of storing fp registers
984 in SFmode, DFmode and XFmode */
985 2, /* cost of moving MMX register */
986 {3, 3}, /* cost of loading MMX registers
987 in SImode and DImode */
988 {4, 4}, /* cost of storing MMX registers
989 in SImode and DImode */
990 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
991 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
992 in 32,64,128,256 and 512-bit */
993 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
994 in 32,64,128,256 and 512-bit */
995 5, 5, /* SSE->integer and integer->SSE moves */
996 /* End of register allocator costs. */
997 },
998
999 COSTS_N_INSNS (1), /* cost of an add instruction */
1000 COSTS_N_INSNS (2), /* cost of a lea instruction */
1001 COSTS_N_INSNS (1), /* variable shift costs */
1002 COSTS_N_INSNS (1), /* constant shift costs */
1003 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1004 COSTS_N_INSNS (4), /* HI */
1005 COSTS_N_INSNS (3), /* SI */
1006 COSTS_N_INSNS (4), /* DI */
1007 COSTS_N_INSNS (5)}, /* other */
1008 0, /* cost of multiply per each bit set */
1009 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1010 COSTS_N_INSNS (26), /* HI */
1011 COSTS_N_INSNS (42), /* SI */
1012 COSTS_N_INSNS (74), /* DI */
1013 COSTS_N_INSNS (74)}, /* other */
1014 COSTS_N_INSNS (1), /* cost of movsx */
1015 COSTS_N_INSNS (1), /* cost of movzx */
1016 8, /* "large" insn */
1017 9, /* MOVE_RATIO */
1018 6, /* CLEAR_RATIO */
1019 {3, 4, 3}, /* cost of loading integer registers
1020 in QImode, HImode and SImode.
1021 Relative to reg-reg move (2). */
1022 {3, 4, 3}, /* cost of storing integer registers */
1023 {4, 3, 12, 12, 24}, /* cost of loading SSE register
1024 in 32bit, 64bit, 128bit, 256bit and 512bit */
1025 {4, 4, 10, 10, 20}, /* cost of storing SSE register
1026 in 32bit, 64bit, 128bit, 256bit and 512bit */
1027 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
1028 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
1029 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1030 5, /* cost of moving SSE register to integer. */
1031 4, 4, /* Gather load static, per_elt. */
1032 4, 4, /* Gather store static, per_elt. */
1033 64, /* size of l1 cache. */
1034 512, /* size of l2 cache. */
1035 64, /* size of prefetch block */
1036 /* New AMD processors never drop prefetches; if they cannot be performed
1037 immediately, they are queued. We set number of simultaneous prefetches
1038 to a large constant to reflect this (it probably is not a good idea not
1039 to limit number of prefetches at all, as their execution also takes some
1040 time). */
1041 100, /* number of parallel prefetches */
1042 3, /* Branch cost */
1043 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1044 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1045 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1046 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1047 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1048 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1049
1050 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1051 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1052 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1053 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1054 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1055 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1056 /* 11-16 (NOTE(review): presumably the latency range of the divide on the
   next line, with the upper bound used as its cost -- confirm).  */
1057 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1058 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1059 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1060 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1061 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1062 k8_memcpy,
1063 k8_memset,
1064 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1065 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1066 "16:8:8", /* Loop alignment. */
1067 "16:8:8", /* Jump alignment. */
1068 "0:0:8", /* Label alignment. */
1069 "16", /* Func alignment. */
1070 };
1071
1072 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1073 very small blocks it is better to use a loop. For large blocks, libcall can
1074 do non-temporal accesses and beat inline expansion considerably. */
/* memcpy strategy table for amdfam10.  Entry 0 lists loop (6),
   unrolled_loop (14) and rep_prefix_4_byte (-1); entry 1 lists loop (16),
   rep_prefix_8_byte (8192) and libcall (-1).  Both entries name libcall
   first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
1075 static stringop_algs amdfam10_memcpy[2] = {
1076 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1077 {-1, rep_prefix_4_byte, false}}},
1078 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1079 {-1, libcall, false}}}};
/* memset strategy table for amdfam10.  Entry 0 lists loop (8),
   unrolled_loop (24), rep_prefix_4_byte (2048) and libcall (-1); entry 1
   lists unrolled_loop (48), rep_prefix_8_byte (8192) and libcall (-1).
   Both entries name libcall first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
1080 static stringop_algs amdfam10_memset[2] = {
1081 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1082 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1083 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1084 {-1, libcall, false}}}};
/* Cost table for amdfam10 tuning.  The initializer is positional: a nested
   block of register allocator move costs comes first, followed by
   instruction costs (wrapped in COSTS_N_INSNS), MOVE_RATIO/CLEAR_RATIO,
   load/store and gather costs, cache and prefetch parameters, branch cost,
   x87 and SSE arithmetic costs, reassociation widths, the
   amdfam10_memcpy/amdfam10_memset tables, conditional branch costs and the
   alignment strings.  Values must stay in this order to line up with the
   members of struct processor_costs.  Unlike the k6, athlon and k8 tables
   in this file, this object is declared without static or const.  */
1085 struct processor_costs amdfam10_cost = {
1086 {
1087 /* Start of register allocator costs. integer->integer move cost is 2. */
1088 4, /* cost for loading QImode using movzbl */
1089 {3, 4, 3}, /* cost of loading integer registers
1090 in QImode, HImode and SImode.
1091 Relative to reg-reg move (2). */
1092 {3, 4, 3}, /* cost of storing integer registers */
1093 4, /* cost of reg,reg fld/fst */
1094 {4, 4, 12}, /* cost of loading fp registers
1095 in SFmode, DFmode and XFmode */
1096 {6, 6, 8}, /* cost of storing fp registers
1097 in SFmode, DFmode and XFmode */
1098 2, /* cost of moving MMX register */
1099 {3, 3}, /* cost of loading MMX registers
1100 in SImode and DImode */
1101 {4, 4}, /* cost of storing MMX registers
1102 in SImode and DImode */
1103 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1104 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
1105 in 32,64,128,256 and 512-bit */
1106 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
1107 in 32,64,128,256 and 512-bit */
1108 3, 3, /* SSE->integer and integer->SSE moves */
1109
1110 /* On K8:
1111 MOVD reg64, xmmreg Double FSTORE 4
1112 MOVD reg32, xmmreg Double FSTORE 4
1113 On AMDFAM10:
1114 MOVD reg64, xmmreg Double FADD 3
1115 1/1 1/1
1116 MOVD reg32, xmmreg Double FADD 3
1117 1/1 1/1 */
1118 /* End of register allocator costs. */
1119 },
1120
1121 COSTS_N_INSNS (1), /* cost of an add instruction */
1122 COSTS_N_INSNS (2), /* cost of a lea instruction */
1123 COSTS_N_INSNS (1), /* variable shift costs */
1124 COSTS_N_INSNS (1), /* constant shift costs */
1125 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1126 COSTS_N_INSNS (4), /* HI */
1127 COSTS_N_INSNS (3), /* SI */
1128 COSTS_N_INSNS (4), /* DI */
1129 COSTS_N_INSNS (5)}, /* other */
1130 0, /* cost of multiply per each bit set */
1131 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1132 COSTS_N_INSNS (35), /* HI */
1133 COSTS_N_INSNS (51), /* SI */
1134 COSTS_N_INSNS (83), /* DI */
1135 COSTS_N_INSNS (83)}, /* other */
1136 COSTS_N_INSNS (1), /* cost of movsx */
1137 COSTS_N_INSNS (1), /* cost of movzx */
1138 8, /* "large" insn */
1139 9, /* MOVE_RATIO */
1140 6, /* CLEAR_RATIO */
1141 {3, 4, 3}, /* cost of loading integer registers
1142 in QImode, HImode and SImode.
1143 Relative to reg-reg move (2). */
1144 {3, 4, 3}, /* cost of storing integer registers */
1145 {4, 4, 3, 6, 12}, /* cost of loading SSE register
1146 in 32bit, 64bit, 128bit, 256bit and 512bit */
1147 {4, 4, 5, 10, 20}, /* cost of storing SSE register
1148 in 32bit, 64bit, 128bit, 256bit and 512bit */
1149 {4, 4, 3, 7, 12}, /* cost of unaligned loads.
   NOTE(review): the 256bit entry (7) differs from the aligned load
   cost (6) listed above, while every other entry matches -- confirm
   this is intentional.  */
1150 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1151 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1152 3, /* cost of moving SSE register to integer. */
1153 4, 4, /* Gather load static, per_elt. */
1154 4, 4, /* Gather store static, per_elt. */
1155 64, /* size of l1 cache. */
1156 512, /* size of l2 cache. */
1157 64, /* size of prefetch block */
1158 /* New AMD processors never drop prefetches; if they cannot be performed
1159 immediately, they are queued. We set number of simultaneous prefetches
1160 to a large constant to reflect this (it probably is not a good idea not
1161 to limit number of prefetches at all, as their execution also takes some
1162 time). */
1163 100, /* number of parallel prefetches */
1164 2, /* Branch cost */
1165 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1166 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1167 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1168 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1169 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1170 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1171
1172 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1173 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1174 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1175 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1176 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1177 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1178 /* 11-16 (NOTE(review): presumably the latency range of the divide on the
   next line, with the upper bound used as its cost -- confirm).  */
1179 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1180 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1181 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1182 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1183 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1184 amdfam10_memcpy,
1185 amdfam10_memset,
1186 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1187 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1188 "32:25:8", /* Loop alignment. */
1189 "32:8:8", /* Jump alignment. */
1190 "0:0:8", /* Label alignment. */
1191 "32", /* Func alignment. */
1192 };
1193
1194 /* BDVER has an optimized REP instruction for medium-sized blocks, but for
1195 very small blocks it is better to use a loop. For large blocks, libcall
1196 can do non-temporal accesses and beat inline expansion considerably. */
/* memcpy strategy table for bdver.  Entry 0 lists loop (6), unrolled_loop
   (14) and rep_prefix_4_byte (-1); entry 1 lists loop (16),
   rep_prefix_8_byte (8192) and libcall (-1).  Both entries name libcall
   first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
1197 static stringop_algs bdver_memcpy[2] = {
1198 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1199 {-1, rep_prefix_4_byte, false}}},
1200 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1201 {-1, libcall, false}}}};
/* memset strategy table for bdver.  Entry 0 lists loop (8), unrolled_loop
   (24), rep_prefix_4_byte (2048) and libcall (-1); entry 1 lists
   unrolled_loop (48), rep_prefix_8_byte (8192) and libcall (-1).  Both
   entries name libcall first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
1202 static stringop_algs bdver_memset[2] = {
1203 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1204 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1205 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1206 {-1, libcall, false}}}};
1207
/* Cost table for bdver tuning.  The initializer is positional: a nested
   block of register allocator move costs comes first, followed by
   instruction costs (wrapped in COSTS_N_INSNS), MOVE_RATIO/CLEAR_RATIO,
   load/store and gather costs, cache and prefetch parameters, branch cost,
   x87 and SSE arithmetic costs, reassociation widths, the
   bdver_memcpy/bdver_memset tables, conditional branch costs and the
   alignment strings.  Values must stay in this order to line up with the
   members of struct processor_costs.  */
1208 const struct processor_costs bdver_cost = {
1209 {
1210 /* Start of register allocator costs. integer->integer move cost is 2. */
1211 8, /* cost for loading QImode using movzbl */
1212 {8, 8, 8}, /* cost of loading integer registers
1213 in QImode, HImode and SImode.
1214 Relative to reg-reg move (2). */
1215 {8, 8, 8}, /* cost of storing integer registers */
1216 4, /* cost of reg,reg fld/fst */
1217 {12, 12, 28}, /* cost of loading fp registers
1218 in SFmode, DFmode and XFmode */
1219 {10, 10, 18}, /* cost of storing fp registers
1220 in SFmode, DFmode and XFmode */
1221 4, /* cost of moving MMX register */
1222 {12, 12}, /* cost of loading MMX registers
1223 in SImode and DImode */
1224 {10, 10}, /* cost of storing MMX registers
1225 in SImode and DImode */
1226 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1227 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1228 in 32,64,128,256 and 512-bit */
1229 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1230 in 32,64,128,256 and 512-bit */
1231 16, 20, /* SSE->integer and integer->SSE moves */
1232 /* End of register allocator costs. */
1233 },
1234
1235 COSTS_N_INSNS (1), /* cost of an add instruction */
1236 COSTS_N_INSNS (1), /* cost of a lea instruction */
1237 COSTS_N_INSNS (1), /* variable shift costs */
1238 COSTS_N_INSNS (1), /* constant shift costs */
1239 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1240 COSTS_N_INSNS (4), /* HI */
1241 COSTS_N_INSNS (4), /* SI */
1242 COSTS_N_INSNS (6), /* DI */
1243 COSTS_N_INSNS (6)}, /* other */
1244 0, /* cost of multiply per each bit set */
1245 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1246 COSTS_N_INSNS (35), /* HI */
1247 COSTS_N_INSNS (51), /* SI */
1248 COSTS_N_INSNS (83), /* DI */
1249 COSTS_N_INSNS (83)}, /* other */
1250 COSTS_N_INSNS (1), /* cost of movsx */
1251 COSTS_N_INSNS (1), /* cost of movzx */
1252 8, /* "large" insn */
1253 9, /* MOVE_RATIO */
1254 6, /* CLEAR_RATIO */
1255 {8, 8, 8}, /* cost of loading integer registers
1256 in QImode, HImode and SImode.
1257 Relative to reg-reg move (2). */
1258 {8, 8, 8}, /* cost of storing integer registers */
1259 {12, 12, 10, 40, 60}, /* cost of loading SSE register
1260 in 32bit, 64bit, 128bit, 256bit and 512bit */
1261 {10, 10, 10, 40, 60}, /* cost of storing SSE register
1262 in 32bit, 64bit, 128bit, 256bit and 512bit */
1263 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1264 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
1265 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1266 16, /* cost of moving SSE register to integer. */
1267 12, 12, /* Gather load static, per_elt. */
1268 10, 10, /* Gather store static, per_elt. */
1269 16, /* size of l1 cache. */
1270 2048, /* size of l2 cache. */
1271 64, /* size of prefetch block */
1272 /* New AMD processors never drop prefetches; if they cannot be performed
1273 immediately, they are queued. We set number of simultaneous prefetches
1274 to a large constant to reflect this (it probably is not a good idea not
1275 to limit number of prefetches at all, as their execution also takes some
1276 time). */
1277 100, /* number of parallel prefetches */
1278 2, /* Branch cost */
1279 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1280 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1281 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1282 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1283 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1284 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1285
1286 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1287 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1288 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1289 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1290 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1291 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1292 /* 9-24 (NOTE(review): presumably the DIVSS latency range, with the upper
   bound used as the cost below -- confirm).  */
1293 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1294 /* 9-27 (NOTE(review): presumably the DIVSD latency range, with the upper
   bound used as the cost below -- confirm).  */
1295 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1296 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1297 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1298 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1299 bdver_memcpy,
1300 bdver_memset,
1301 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1302 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1303 "16:11:8", /* Loop alignment. */
1304 "16:8:8", /* Jump alignment. */
1305 "0:0:8", /* Label alignment. */
1306 "11", /* Func alignment. */
1307 };
1308
1309
1310 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1311 very small blocks it is better to use a loop. For large blocks, libcall
1312 can do non-temporal accesses and beat inline expansion considerably. */
/* memcpy strategy table for znver1.  Entry 0 lists loop (6), unrolled_loop
   (14) and rep_prefix_4_byte (-1); entry 1 lists loop (16),
   rep_prefix_8_byte (8192) and libcall (-1).  Both entries name libcall
   first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
1313 static stringop_algs znver1_memcpy[2] = {
1314 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1315 {-1, rep_prefix_4_byte, false}}},
1316 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1317 {-1, libcall, false}}}};
/* memset strategy table for znver1.  Entry 0 lists loop (8), unrolled_loop
   (24), rep_prefix_4_byte (2048) and libcall (-1); entry 1 lists
   unrolled_loop (48), rep_prefix_8_byte (8192) and libcall (-1).  Both
   entries name libcall first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
1318 static stringop_algs znver1_memset[2] = {
1319 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1320 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1321 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1322 {-1, libcall, false}}}};
/* Cost table for znver1 tuning.  The initializer is positional: a nested
   block of register allocator move costs comes first, followed by
   instruction costs (wrapped in COSTS_N_INSNS), MOVE_RATIO/CLEAR_RATIO,
   load/store and gather costs, cache and prefetch parameters, branch cost,
   x87 and SSE arithmetic costs, reassociation widths, the
   znver1_memcpy/znver1_memset tables, conditional branch costs and the
   alignment strings.  Values must stay in this order to line up with the
   members of struct processor_costs.  This object is declared without
   static or const.  */
1323 struct processor_costs znver1_cost = {
1324 {
1325 /* Start of register allocator costs. integer->integer move cost is 2. */
1326
1327 /* reg-reg moves are done by renaming and thus they are even cheaper than
1328 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1329 to doubles of latencies, we do not model this correctly. It does not
1330 seem to make a practical difference to bump prices up even more. */
1331 6, /* cost for loading QImode using
1332 movzbl. */
1333 {6, 6, 6}, /* cost of loading integer registers
1334 in QImode, HImode and SImode.
1335 Relative to reg-reg move (2). */
1336 {8, 8, 8}, /* cost of storing integer
1337 registers. */
1338 2, /* cost of reg,reg fld/fst. */
1339 {6, 6, 16}, /* cost of loading fp registers
1340 in SFmode, DFmode and XFmode. */
1341 {8, 8, 16}, /* cost of storing fp registers
1342 in SFmode, DFmode and XFmode. */
1343 2, /* cost of moving MMX register. */
1344 {6, 6}, /* cost of loading MMX registers
1345 in SImode and DImode. */
1346 {8, 8}, /* cost of storing MMX registers
1347 in SImode and DImode. */
1348 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1349 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1350 in 32,64,128,256 and 512-bit. */
1351 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1352 in 32,64,128,256 and 512-bit. */
1353 6, 6, /* SSE->integer and integer->SSE moves. */
1354 /* End of register allocator costs. */
1355 },
1356
1357 COSTS_N_INSNS (1), /* cost of an add instruction. */
1358 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1359 COSTS_N_INSNS (1), /* variable shift costs. */
1360 COSTS_N_INSNS (1), /* constant shift costs. */
1361 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1362 COSTS_N_INSNS (3), /* HI. */
1363 COSTS_N_INSNS (3), /* SI. */
1364 COSTS_N_INSNS (3), /* DI. */
1365 COSTS_N_INSNS (3)}, /* other. */
1366 0, /* cost of multiply per each bit
1367 set. */
1368 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1369 bound. */
1370 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1371 COSTS_N_INSNS (22), /* HI. */
1372 COSTS_N_INSNS (30), /* SI. */
1373 COSTS_N_INSNS (45), /* DI. */
1374 COSTS_N_INSNS (45)}, /* other. */
1375 COSTS_N_INSNS (1), /* cost of movsx. */
1376 COSTS_N_INSNS (1), /* cost of movzx. */
1377 8, /* "large" insn. */
1378 9, /* MOVE_RATIO. */
1379 6, /* CLEAR_RATIO. */
1380 {6, 6, 6}, /* cost of loading integer registers
1381 in QImode, HImode and SImode.
1382 Relative to reg-reg move (2). */
1383 {8, 8, 8}, /* cost of storing integer
1384 registers. */
1385 {6, 6, 6, 12, 24}, /* cost of loading SSE register
1386 in 32bit, 64bit, 128bit, 256bit and 512bit */
1387 {8, 8, 8, 16, 32}, /* cost of storing SSE register
1388 in 32bit, 64bit, 128bit, 256bit and 512bit */
1389 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1390 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
1391 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1392 6, /* cost of moving SSE register to integer. */
1393 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1394 throughput 12. Approx 9 uops do not depend on vector size and every load
1395 is 7 uops.
   NOTE(review): the same mnemonic is listed twice; presumably the two
   figures are for the 128-bit and 256-bit forms -- confirm.  */
1396 18, 8, /* Gather load static, per_elt. */
1397 18, 10, /* Gather store static, per_elt. */
1398 32, /* size of l1 cache. */
1399 512, /* size of l2 cache. */
1400 64, /* size of prefetch block. */
1401 /* New AMD processors never drop prefetches; if they cannot be performed
1402 immediately, they are queued. We set number of simultaneous prefetches
1403 to a large constant to reflect this (it probably is not a good idea not
1404 to limit number of prefetches at all, as their execution also takes some
1405 time). */
1406 100, /* number of parallel prefetches. */
1407 3, /* Branch cost. */
1408 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1409 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1410 /* Latency of fdiv is 8-15. */
1411 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1412 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1413 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1414 /* Latency of fsqrt is 4-10. */
1415 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1416
1417 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1418 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1419 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1420 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1421 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1422 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1423 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1424 /* 9-13 (NOTE(review): presumably the DIVSD latency range, with the upper
   bound used as the cost below -- confirm).  */
1425 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1426 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1427 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1428 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1429 and it can execute 2 integer additions and 2 multiplications thus
1430 reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest
1431 that 4 works better than 6 probably due to register pressure.
1432
1433 Integer vector operations are taken by FP unit and execute 3 vector
1434 plus/minus operations per cycle but only one multiply. This is adjusted
1435 in ix86_reassociation_width. */
1436 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1437 znver1_memcpy,
1438 znver1_memset,
1439 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1440 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1441 "16", /* Loop alignment. */
1442 "16", /* Jump alignment. */
1443 "0:0:8", /* Label alignment. */
1444 "16", /* Func alignment. */
1445 };
1446
1447 /* ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
1448 very small blocks it is better to use a loop. For large blocks, libcall
1449 can do non-temporal accesses and beat inline expansion considerably. */
/* memcpy strategy table for znver2.  Entry 0 lists loop (6), unrolled_loop
   (14) and rep_prefix_4_byte (-1); entry 1 lists loop (16),
   rep_prefix_4_byte (64) and libcall (-1).  Both entries name libcall
   first.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each number is the largest block size handled by the paired
   algorithm (-1 meaning no limit) -- confirm against the stringop_algs
   declaration.  */
1450 static stringop_algs znver2_memcpy[2] = {
1451 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1452 {-1, rep_prefix_4_byte, false}}},
1453 {libcall, {{16, loop, false}, {64, rep_prefix_4_byte, false},
1454 {-1, libcall, false}}}};
/* Block-set strategy table for znver2: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list and selects libcall in both entries.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
1455 static stringop_algs znver2_memset[2] = {
1456 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1457 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1458 {libcall, {{24, rep_prefix_4_byte, false}, {128, rep_prefix_8_byte, false},
1459 {-1, libcall, false}}}};
1460
/* Cost table for znver2 tuning. The initializers are positional, so their
   order must match the field order of struct processor_costs (declared
   outside this view); the trailing comment on each line names the field
   being set. */
1461 struct processor_costs znver2_cost = {
1462 {
1463 /* Start of register allocator costs. integer->integer move cost is 2. */
1464
1465 /* reg-reg moves are done by renaming and thus they are even cheaper than
1466 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1467 to doubles of latencies, we do not model this correctly. It does not
1468 seem to make practical difference to bump prices up even more. */
1469 6, /* cost for loading QImode using
1470 movzbl. */
1471 {6, 6, 6}, /* cost of loading integer registers
1472 in QImode, HImode and SImode.
1473 Relative to reg-reg move (2). */
1474 {8, 8, 8}, /* cost of storing integer
1475 registers. */
1476 2, /* cost of reg,reg fld/fst. */
1477 {6, 6, 16}, /* cost of loading fp registers
1478 in SFmode, DFmode and XFmode. */
1479 {8, 8, 16}, /* cost of storing fp registers
1480 in SFmode, DFmode and XFmode. */
1481 2, /* cost of moving MMX register. */
1482 {6, 6}, /* cost of loading MMX registers
1483 in SImode and DImode. */
1484 {8, 8}, /* cost of storing MMX registers
1485 in SImode and DImode. */
1486 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1487 register. */
1488 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1489 in 32,64,128,256 and 512-bit. */
1490 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1491 in 32,64,128,256 and 512-bit. */
1492 6, 6, /* SSE->integer and integer->SSE
1493 moves. */
1494 /* End of register allocator costs. */
1495 },
1496
1497 COSTS_N_INSNS (1), /* cost of an add instruction. */
1498 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1499 COSTS_N_INSNS (1), /* variable shift costs. */
1500 COSTS_N_INSNS (1), /* constant shift costs. */
1501 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1502 COSTS_N_INSNS (3), /* HI. */
1503 COSTS_N_INSNS (3), /* SI. */
1504 COSTS_N_INSNS (3), /* DI. */
1505 COSTS_N_INSNS (3)}, /* other. */
1506 0, /* cost of multiply per each bit
1507 set. */
1508 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1509 bound. */
1510 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1511 COSTS_N_INSNS (22), /* HI. */
1512 COSTS_N_INSNS (30), /* SI. */
1513 COSTS_N_INSNS (45), /* DI. */
1514 COSTS_N_INSNS (45)}, /* other. */
1515 COSTS_N_INSNS (1), /* cost of movsx. */
1516 COSTS_N_INSNS (1), /* cost of movzx. */
1517 8, /* "large" insn. */
1518 9, /* MOVE_RATIO. */
1519 6, /* CLEAR_RATIO */
1520 {6, 6, 6}, /* cost of loading integer registers
1521 in QImode, HImode and SImode.
1522 Relative to reg-reg move (2). */
1523 {8, 8, 8}, /* cost of storing integer
1524 registers. */
1525 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1526 in 32bit, 64bit, 128bit, 256bit and 512bit */
1527 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1528 in 32bit, 64bit, 128bit, 256bit and 512bit */
1529 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1530 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1531 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1532 register. */
1533 6, /* cost of moving SSE register to integer. */
1534 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1535 throughput 12. Approx 9 uops do not depend on vector size and every load
1536 is 7 uops.
   NOTE(review): the same mnemonic is given two different uop counts above;
   presumably the second one refers to a different gather variant --
   confirm. */
1537 18, 8, /* Gather load static, per_elt. */
1538 18, 10, /* Gather store static, per_elt. */
1539 32, /* size of l1 cache. */
1540 512, /* size of l2 cache. */
1541 64, /* size of prefetch block. */
1542 /* New AMD processors never drop prefetches; if they cannot be performed
1543 immediately, they are queued. We set number of simultaneous prefetches
1544 to a large constant to reflect this (it probably is not a good idea not
1545 to limit number of prefetches at all, as their execution also takes some
1546 time). */
1547 100, /* number of parallel prefetches. */
1548 3, /* Branch cost. */
1549 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1550 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1551 /* Latency of fdiv is 8-15. */
1552 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1553 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1554 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1555 /* Latency of fsqrt is 4-10. */
1556 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1557
1558 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1559 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1560 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1561 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1562 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1563 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1564 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1565 /* 9-13 (presumably the DIVSD latency range; the upper bound is used). */
1566 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1567 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1568 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1569 /* Zen can execute 4 integer operations per cycle. FP operations
1570 take 3 cycles and it can execute 2 integer additions and 2
1571 multiplications thus reassociation may make sense up to a width of 6.
1572 SPEC2k6 benchmarks suggest
1573 that 4 works better than 6 probably due to register pressure.
1574
1575 Integer vector operations are taken by FP unit and execute 3 vector
1576 plus/minus operations per cycle but only one multiply. This is adjusted
1577 in ix86_reassociation_width. */
1578 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1579 znver2_memcpy,
1580 znver2_memset,
1581 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1582 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1583 "16", /* Loop alignment. */
1584 "16", /* Jump alignment. */
1585 "0:0:8", /* Label alignment. */
1586 "16", /* Func alignment. */
1587 };
1588
1589 /* skylake_cost should produce code tuned for the Skylake family of CPUs.

   Block-copy strategy table for skylake: two entries, each a leading
   algorithm followed by {size bound, algorithm, flag} steps; the step with
   bound -1 closes the list and selects libcall in both entries. The first
   step of entry 0 sets the flag to true.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code, and the flag disables alignment prologues -- confirm against the
   stringop_algs declaration. */
1590 static stringop_algs skylake_memcpy[2] = {
1591 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1592 {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1593 {-1, libcall, false}}}};
1594
/* Block-set strategy table for skylake: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list and selects libcall in both entries.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
1595 static stringop_algs skylake_memset[2] = {
1596 {libcall, {{6, loop_1_byte, true},
1597 {24, loop, true},
1598 {8192, rep_prefix_4_byte, true},
1599 {-1, libcall, false}}},
1600 {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1601 {-1, libcall, false}}}};
1602
/* Cost table for skylake tuning. The initializers are positional, so their
   order must match the field order of struct processor_costs (declared
   outside this view); the trailing comment on each line names the field
   being set. */
1603 static const
1604 struct processor_costs skylake_cost = {
1605 {
1606 /* Start of register allocator costs. integer->integer move cost is 2. */
1607 6, /* cost for loading QImode using movzbl */
1608 {4, 4, 4}, /* cost of loading integer registers
1609 in QImode, HImode and SImode.
1610 Relative to reg-reg move (2). */
1611 {6, 6, 6}, /* cost of storing integer registers */
1612 2, /* cost of reg,reg fld/fst */
1613 {6, 6, 8}, /* cost of loading fp registers
1614 in SFmode, DFmode and XFmode */
1615 {6, 6, 10}, /* cost of storing fp registers
1616 in SFmode, DFmode and XFmode */
1617 2, /* cost of moving MMX register */
1618 {6, 6}, /* cost of loading MMX registers
1619 in SImode and DImode */
1620 {6, 6}, /* cost of storing MMX registers
1621 in SImode and DImode */
1622 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1623 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1624 in 32,64,128,256 and 512-bit */
1625 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
1626 in 32,64,128,256 and 512-bit */
1627 6, 6, /* SSE->integer and integer->SSE moves */
1628 /* End of register allocator costs. */
1629 },
1630
1631 COSTS_N_INSNS (1), /* cost of an add instruction */
1632 COSTS_N_INSNS (1)+1, /* cost of a lea instruction (one unit above an add) */
1633 COSTS_N_INSNS (1), /* variable shift costs */
1634 COSTS_N_INSNS (1), /* constant shift costs */
1635 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1636 COSTS_N_INSNS (4), /* HI */
1637 COSTS_N_INSNS (3), /* SI */
1638 COSTS_N_INSNS (3), /* DI */
1639 COSTS_N_INSNS (3)}, /* other */
1640 0, /* cost of multiply per each bit set */
1641 /* Expanding div/mod currently doesn't consider parallelism. So the cost
1642 model is not realistic. We compensate by increasing the latencies a bit. */
1643 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
1644 COSTS_N_INSNS (11), /* HI */
1645 COSTS_N_INSNS (14), /* SI */
1646 COSTS_N_INSNS (76), /* DI */
1647 COSTS_N_INSNS (76)}, /* other */
1648 COSTS_N_INSNS (1), /* cost of movsx */
1649 COSTS_N_INSNS (0), /* cost of movzx */
1650 8, /* "large" insn */
1651 17, /* MOVE_RATIO */
1652 6, /* CLEAR_RATIO */
1653 {4, 4, 4}, /* cost of loading integer registers
1654 in QImode, HImode and SImode.
1655 Relative to reg-reg move (2). */
1656 {6, 6, 6}, /* cost of storing integer registers */
1657 {6, 6, 6, 10, 20}, /* cost of loading SSE register
1658 in 32bit, 64bit, 128bit, 256bit and 512bit */
1659 {8, 8, 8, 12, 24}, /* cost of storing SSE register
1660 in 32bit, 64bit, 128bit, 256bit and 512bit */
1661 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1662 {8, 8, 8, 8, 16}, /* cost of unaligned stores.
   NOTE(review): the 256-bit and 512-bit values here (8, 16) are lower than
   the aligned store costs above (12, 24) -- confirm this is intended. */
1663 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1664 2, /* cost of moving SSE register to integer. */
1665 20, 8, /* Gather load static, per_elt. */
1666 22, 10, /* Gather store static, per_elt. */
1667 64, /* size of l1 cache. */
1668 512, /* size of l2 cache. */
1669 64, /* size of prefetch block */
1670 6, /* number of parallel prefetches */
1671 3, /* Branch cost */
1672 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1673 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1674 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1675 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1676 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1677 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1678
1679 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1680 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1681 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1682 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1683 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1684 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1685 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1686 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1687 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1688 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1689 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1690 skylake_memcpy,
1691 skylake_memset,
1692 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1693 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1694 "16:11:8", /* Loop alignment. */
1695 "16:11:8", /* Jump alignment. */
1696 "0:0:8", /* Label alignment. */
1697 "16", /* Func alignment. */
1698 };
1699 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1700 very small blocks it is better to use a loop. For large blocks, a libcall can
1701 do non-temporal accesses and beat inline code considerably.

   Block-copy strategy table for btver1: two entries, each a leading
   algorithm followed by {size bound, algorithm, flag} steps; the step with
   bound -1 closes the list.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
1702 static stringop_algs btver1_memcpy[2] = {
1703 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1704 {-1, rep_prefix_4_byte, false}}},
1705 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1706 {-1, libcall, false}}}};
/* Block-set strategy table for btver1: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list and selects libcall in both entries.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
1707 static stringop_algs btver1_memset[2] = {
1708 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1709 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1710 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1711 {-1, libcall, false}}}};
/* Cost table for btver1 tuning. The initializers are positional, so their
   order must match the field order of struct processor_costs (declared
   outside this view); the trailing comment on each line names the field
   being set. */
1712 const struct processor_costs btver1_cost = {
1713 {
1714 /* Start of register allocator costs. integer->integer move cost is 2. */
1715 8, /* cost for loading QImode using movzbl */
1716 {6, 8, 6}, /* cost of loading integer registers
1717 in QImode, HImode and SImode.
1718 Relative to reg-reg move (2). */
1719 {6, 8, 6}, /* cost of storing integer registers */
1720 4, /* cost of reg,reg fld/fst */
1721 {12, 12, 28}, /* cost of loading fp registers
1722 in SFmode, DFmode and XFmode */
1723 {12, 12, 38}, /* cost of storing fp registers
1724 in SFmode, DFmode and XFmode */
1725 4, /* cost of moving MMX register */
1726 {10, 10}, /* cost of loading MMX registers
1727 in SImode and DImode */
1728 {12, 12}, /* cost of storing MMX registers
1729 in SImode and DImode */
1730 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1731 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
1732 in 32,64,128,256 and 512-bit */
1733 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
1734 in 32,64,128,256 and 512-bit */
1735 14, 14, /* SSE->integer and integer->SSE moves */
1736 /* End of register allocator costs. */
1737 },
1738
1739 COSTS_N_INSNS (1), /* cost of an add instruction */
1740 COSTS_N_INSNS (2), /* cost of a lea instruction */
1741 COSTS_N_INSNS (1), /* variable shift costs */
1742 COSTS_N_INSNS (1), /* constant shift costs */
1743 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1744 COSTS_N_INSNS (4), /* HI */
1745 COSTS_N_INSNS (3), /* SI */
1746 COSTS_N_INSNS (4), /* DI */
1747 COSTS_N_INSNS (5)}, /* other */
1748 0, /* cost of multiply per each bit set */
1749 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1750 COSTS_N_INSNS (35), /* HI */
1751 COSTS_N_INSNS (51), /* SI */
1752 COSTS_N_INSNS (83), /* DI */
1753 COSTS_N_INSNS (83)}, /* other */
1754 COSTS_N_INSNS (1), /* cost of movsx */
1755 COSTS_N_INSNS (1), /* cost of movzx */
1756 8, /* "large" insn */
1757 9, /* MOVE_RATIO */
1758 6, /* CLEAR_RATIO */
1759 {6, 8, 6}, /* cost of loading integer registers
1760 in QImode, HImode and SImode.
1761 Relative to reg-reg move (2). */
1762 {6, 8, 6}, /* cost of storing integer registers */
1763 {10, 10, 12, 48, 96}, /* cost of loading SSE register
1764 in 32bit, 64bit, 128bit, 256bit and 512bit */
1765 {10, 10, 12, 48, 96}, /* cost of storing SSE register
1766 in 32bit, 64bit, 128bit, 256bit and 512bit */
1767 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1768 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
1769 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1770 14, /* cost of moving SSE register to integer. */
1771 10, 10, /* Gather load static, per_elt. */
1772 10, 10, /* Gather store static, per_elt. */
1773 32, /* size of l1 cache. */
1774 512, /* size of l2 cache. */
1775 64, /* size of prefetch block */
1776 100, /* number of parallel prefetches */
1777 2, /* Branch cost */
1778 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1779 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1780 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1781 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1782 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1783 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1784
1785 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1786 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1787 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1788 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1789 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1790 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1791 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1792 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1793 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1794 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1795 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1796 btver1_memcpy,
1797 btver1_memset,
1798 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1799 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1800 "16:11:8", /* Loop alignment. */
1801 "16:8:8", /* Jump alignment. */
1802 "0:0:8", /* Label alignment. */
1803 "11", /* Func alignment.
   NOTE(review): presumably parsed like the -falign-functions argument, where
   a value that is not a power of two is rounded up -- confirm. */
1804 };
1805
/* Block-copy strategy table for btver2: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
1806 static stringop_algs btver2_memcpy[2] = {
1807 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1808 {-1, rep_prefix_4_byte, false}}},
1809 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1810 {-1, libcall, false}}}};
/* Block-set strategy table for btver2: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list and selects libcall in both entries.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
1811 static stringop_algs btver2_memset[2] = {
1812 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1813 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1814 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1815 {-1, libcall, false}}}};
/* Cost table for btver2 tuning. The initializers are positional, so their
   order must match the field order of struct processor_costs (declared
   outside this view); the trailing comment on each line names the field
   being set. */
1816 const struct processor_costs btver2_cost = {
1817 {
1818 /* Start of register allocator costs. integer->integer move cost is 2. */
1819 8, /* cost for loading QImode using movzbl */
1820 {8, 8, 6}, /* cost of loading integer registers
1821 in QImode, HImode and SImode.
1822 Relative to reg-reg move (2). */
1823 {8, 8, 6}, /* cost of storing integer registers */
1824 4, /* cost of reg,reg fld/fst */
1825 {12, 12, 28}, /* cost of loading fp registers
1826 in SFmode, DFmode and XFmode */
1827 {12, 12, 38}, /* cost of storing fp registers
1828 in SFmode, DFmode and XFmode */
1829 4, /* cost of moving MMX register */
1830 {10, 10}, /* cost of loading MMX registers
1831 in SImode and DImode */
1832 {12, 12}, /* cost of storing MMX registers
1833 in SImode and DImode */
1834 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1835 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
1836 in 32,64,128,256 and 512-bit */
1837 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
1838 in 32,64,128,256 and 512-bit */
1839 14, 14, /* SSE->integer and integer->SSE moves */
1840 /* End of register allocator costs. */
1841 },
1842
1843 COSTS_N_INSNS (1), /* cost of an add instruction */
1844 COSTS_N_INSNS (2), /* cost of a lea instruction */
1845 COSTS_N_INSNS (1), /* variable shift costs */
1846 COSTS_N_INSNS (1), /* constant shift costs */
1847 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1848 COSTS_N_INSNS (4), /* HI */
1849 COSTS_N_INSNS (3), /* SI */
1850 COSTS_N_INSNS (4), /* DI */
1851 COSTS_N_INSNS (5)}, /* other */
1852 0, /* cost of multiply per each bit set */
1853 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1854 COSTS_N_INSNS (35), /* HI */
1855 COSTS_N_INSNS (51), /* SI */
1856 COSTS_N_INSNS (83), /* DI */
1857 COSTS_N_INSNS (83)}, /* other */
1858 COSTS_N_INSNS (1), /* cost of movsx */
1859 COSTS_N_INSNS (1), /* cost of movzx */
1860 8, /* "large" insn */
1861 9, /* MOVE_RATIO */
1862 6, /* CLEAR_RATIO */
1863 {8, 8, 6}, /* cost of loading integer registers
1864 in QImode, HImode and SImode.
1865 Relative to reg-reg move (2). */
1866 {8, 8, 6}, /* cost of storing integer registers */
1867 {10, 10, 12, 48, 96}, /* cost of loading SSE register
1868 in 32bit, 64bit, 128bit, 256bit and 512bit */
1869 {10, 10, 12, 48, 96}, /* cost of storing SSE register
1870 in 32bit, 64bit, 128bit, 256bit and 512bit */
1871 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1872 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
1873 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1874 14, /* cost of moving SSE register to integer. */
1875 10, 10, /* Gather load static, per_elt. */
1876 10, 10, /* Gather store static, per_elt. */
1877 32, /* size of l1 cache. */
1878 2048, /* size of l2 cache. */
1879 64, /* size of prefetch block */
1880 100, /* number of parallel prefetches */
1881 2, /* Branch cost */
1882 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1883 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1884 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1885 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1886 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1887 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1888
1889 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1890 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1891 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1892 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1893 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1894 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1895 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1896 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1897 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1898 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1899 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1900 btver2_memcpy,
1901 btver2_memset,
1902 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1903 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1904 "16:11:8", /* Loop alignment. */
1905 "16:8:8", /* Jump alignment. */
1906 "0:0:8", /* Label alignment. */
1907 "11", /* Func alignment.
   NOTE(review): presumably parsed like the -falign-functions argument, where
   a value that is not a power of two is rounded up -- confirm. */
1908 };
1909
/* Block-copy strategy table for pentium4. Entry 0 is a leading algorithm
   followed by {size bound, algorithm, flag} steps, closed by the step with
   bound -1. Entry 1 is DUMMY_STRINGOP_ALGS, i.e. libcall for every size.
   NOTE(review): presumably entry 1 is the 64-bit variant, which this table
   leaves untuned -- confirm against the stringop_algs declaration. */
1910 static stringop_algs pentium4_memcpy[2] = {
1911 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1912 DUMMY_STRINGOP_ALGS};
/* Block-set strategy table for pentium4. Entry 0 is a leading algorithm
   followed by {size bound, algorithm, flag} steps, closed by the step with
   bound -1. Entry 1 is DUMMY_STRINGOP_ALGS, i.e. libcall for every size.
   NOTE(review): presumably entry 1 is the 64-bit variant, which this table
   leaves untuned -- confirm against the stringop_algs declaration. */
1913 static stringop_algs pentium4_memset[2] = {
1914 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1915 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1916 DUMMY_STRINGOP_ALGS};
1917
/* Cost table for pentium4 tuning. The initializers are positional, so their
   order must match the field order of struct processor_costs (declared
   outside this view); the trailing comment on each line names the field
   being set. */
1918 static const
1919 struct processor_costs pentium4_cost = {
1920 {
1921 /* Start of register allocator costs. integer->integer move cost is 2. */
1922 5, /* cost for loading QImode using movzbl */
1923 {4, 5, 4}, /* cost of loading integer registers
1924 in QImode, HImode and SImode.
1925 Relative to reg-reg move (2). */
1926 {2, 3, 2}, /* cost of storing integer registers */
1927 12, /* cost of reg,reg fld/fst */
1928 {14, 14, 14}, /* cost of loading fp registers
1929 in SFmode, DFmode and XFmode */
1930 {14, 14, 14}, /* cost of storing fp registers
1931 in SFmode, DFmode and XFmode */
1932 12, /* cost of moving MMX register */
1933 {16, 16}, /* cost of loading MMX registers
1934 in SImode and DImode */
1935 {16, 16}, /* cost of storing MMX registers
1936 in SImode and DImode */
1937 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1938 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1939 in 32,64,128,256 and 512-bit */
1940 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1941 in 32,64,128,256 and 512-bit */
1942 20, 12, /* SSE->integer and integer->SSE moves */
1943 /* End of register allocator costs. */
1944 },
1945
1946 COSTS_N_INSNS (1), /* cost of an add instruction */
1947 COSTS_N_INSNS (3), /* cost of a lea instruction */
1948 COSTS_N_INSNS (4), /* variable shift costs */
1949 COSTS_N_INSNS (4), /* constant shift costs */
1950 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1951 COSTS_N_INSNS (15), /* HI */
1952 COSTS_N_INSNS (15), /* SI */
1953 COSTS_N_INSNS (15), /* DI */
1954 COSTS_N_INSNS (15)}, /* other */
1955 0, /* cost of multiply per each bit set */
1956 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1957 COSTS_N_INSNS (56), /* HI */
1958 COSTS_N_INSNS (56), /* SI */
1959 COSTS_N_INSNS (56), /* DI */
1960 COSTS_N_INSNS (56)}, /* other */
1961 COSTS_N_INSNS (1), /* cost of movsx */
1962 COSTS_N_INSNS (1), /* cost of movzx */
1963 16, /* "large" insn */
1964 6, /* MOVE_RATIO */
1965 6, /* CLEAR_RATIO */
1966 {4, 5, 4}, /* cost of loading integer registers
1967 in QImode, HImode and SImode.
1968 Relative to reg-reg move (2). */
1969 {2, 3, 2}, /* cost of storing integer registers */
1970 {16, 16, 16, 32, 64}, /* cost of loading SSE register
1971 in 32bit, 64bit, 128bit, 256bit and 512bit */
1972 {16, 16, 16, 32, 64}, /* cost of storing SSE register
1973 in 32bit, 64bit, 128bit, 256bit and 512bit */
1974 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1975 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1976 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1977 20, /* cost of moving SSE register to integer. */
1978 16, 16, /* Gather load static, per_elt. */
1979 16, 16, /* Gather store static, per_elt. */
1980 8, /* size of l1 cache. */
1981 256, /* size of l2 cache. */
1982 64, /* size of prefetch block */
1983 6, /* number of parallel prefetches */
1984 2, /* Branch cost */
1985 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1986 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1987 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1988 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1989 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1990 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1991
1992 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1993 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1994 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1995 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1996 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1997 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1998 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1999 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
2000 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
2001 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
2002 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2003 pentium4_memcpy,
2004 pentium4_memset,
2005 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2006 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
   /* The four alignment strings below are NULL: this table supplies no
      alignment values of its own. */
2007 NULL, /* Loop alignment. */
2008 NULL, /* Jump alignment. */
2009 NULL, /* Label alignment. */
2010 NULL, /* Func alignment. */
2011 };
2012
/* Block-copy strategy table for nocona: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
2013 static stringop_algs nocona_memcpy[2] = {
2014 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2015 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2016 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2017
/* Block-set strategy table for nocona: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list and selects libcall in both entries.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
2018 static stringop_algs nocona_memset[2] = {
2019 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2020 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2021 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2022 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2023
/* Cost table for nocona tuning. The initializers are positional, so their
   order must match the field order of struct processor_costs (declared
   outside this view); the trailing comment on each line names the field
   being set. */
2024 static const
2025 struct processor_costs nocona_cost = {
2026 {
2027 /* Start of register allocator costs. integer->integer move cost is 2. */
2028 4, /* cost for loading QImode using movzbl */
2029 {4, 4, 4}, /* cost of loading integer registers
2030 in QImode, HImode and SImode.
2031 Relative to reg-reg move (2). */
2032 {4, 4, 4}, /* cost of storing integer registers */
2033 12, /* cost of reg,reg fld/fst */
2034 {14, 14, 14}, /* cost of loading fp registers
2035 in SFmode, DFmode and XFmode */
2036 {14, 14, 14}, /* cost of storing fp registers
2037 in SFmode, DFmode and XFmode */
2038 14, /* cost of moving MMX register */
2039 {12, 12}, /* cost of loading MMX registers
2040 in SImode and DImode */
2041 {12, 12}, /* cost of storing MMX registers
2042 in SImode and DImode */
2043 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2044 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2045 in 32,64,128,256 and 512-bit */
2046 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2047 in 32,64,128,256 and 512-bit */
2048 20, 12, /* SSE->integer and integer->SSE moves */
2049 /* End of register allocator costs. */
2050 },
2051
2052 COSTS_N_INSNS (1), /* cost of an add instruction */
2053 COSTS_N_INSNS (1), /* cost of a lea instruction */
2054 COSTS_N_INSNS (1), /* variable shift costs */
2055 COSTS_N_INSNS (1), /* constant shift costs */
2056 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
2057 COSTS_N_INSNS (10), /* HI */
2058 COSTS_N_INSNS (10), /* SI */
2059 COSTS_N_INSNS (10), /* DI */
2060 COSTS_N_INSNS (10)}, /* other */
2061 0, /* cost of multiply per each bit set */
2062 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
2063 COSTS_N_INSNS (66), /* HI */
2064 COSTS_N_INSNS (66), /* SI */
2065 COSTS_N_INSNS (66), /* DI */
2066 COSTS_N_INSNS (66)}, /* other */
2067 COSTS_N_INSNS (1), /* cost of movsx */
2068 COSTS_N_INSNS (1), /* cost of movzx */
2069 16, /* "large" insn */
2070 17, /* MOVE_RATIO */
2071 6, /* CLEAR_RATIO */
2072 {4, 4, 4}, /* cost of loading integer registers
2073 in QImode, HImode and SImode.
2074 Relative to reg-reg move (2). */
2075 {4, 4, 4}, /* cost of storing integer registers */
2076 {12, 12, 12, 24, 48}, /* cost of loading SSE register
2077 in 32bit, 64bit, 128bit, 256bit and 512bit */
2078 {12, 12, 12, 24, 48}, /* cost of storing SSE register
2079 in 32bit, 64bit, 128bit, 256bit and 512bit */
2080 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
2081 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
2082 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2083 20, /* cost of moving SSE register to integer. */
2084 12, 12, /* Gather load static, per_elt. */
2085 12, 12, /* Gather store static, per_elt. */
2086 8, /* size of l1 cache. */
2087 1024, /* size of l2 cache. */
2088 64, /* size of prefetch block */
2089 8, /* number of parallel prefetches */
2090 1, /* Branch cost */
2091 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2092 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2093 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2094 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2095 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2096 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
2097
2098 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2099 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2100 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2101 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
2102 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2103 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
2104 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2105 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2106 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2107 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
2108 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2109 nocona_memcpy,
2110 nocona_memset,
2111 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2112 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
   /* The four alignment strings below are NULL: this table supplies no
      alignment values of its own. */
2113 NULL, /* Loop alignment. */
2114 NULL, /* Jump alignment. */
2115 NULL, /* Label alignment. */
2116 NULL, /* Func alignment. */
2117 };
2118
/* Block-copy strategy table for atom: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
2119 static stringop_algs atom_memcpy[2] = {
2120 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2121 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2122 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Block-set strategy table for atom: two entries, each a leading algorithm
   followed by {size bound, algorithm, flag} steps; the step with bound -1
   closes the list and selects libcall in both entries.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration. */
2123 static stringop_algs atom_memset[2] = {
2124 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2125 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2126 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2127 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Tuning cost table atom_cost. The initializer is positional, so the
   order of the entries must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each
   entry names the field it fills in. */
2128 static const
2129 struct processor_costs atom_cost = {
2130 {
2131 /* Start of register allocator costs. integer->integer move cost is 2. */
2132 6, /* cost for loading QImode using movzbl */
2133 {6, 6, 6}, /* cost of loading integer registers
2134 in QImode, HImode and SImode.
2135 Relative to reg-reg move (2). */
2136 {6, 6, 6}, /* cost of storing integer registers */
2137 4, /* cost of reg,reg fld/fst */
2138 {6, 6, 18}, /* cost of loading fp registers
2139 in SFmode, DFmode and XFmode */
2140 {14, 14, 24}, /* cost of storing fp registers
2141 in SFmode, DFmode and XFmode */
2142 2, /* cost of moving MMX register */
2143 {8, 8}, /* cost of loading MMX registers
2144 in SImode and DImode */
2145 {10, 10}, /* cost of storing MMX registers
2146 in SImode and DImode */
2147 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2148 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2149 in 32,64,128,256 and 512-bit */
2150 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2151 in 32,64,128,256 and 512-bit */
2152 8, 6, /* SSE->integer and integer->SSE moves */
2153 /* End of register allocator costs. */
2154 },
2155
2156 COSTS_N_INSNS (1), /* cost of an add instruction */
2157 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2158 COSTS_N_INSNS (1), /* variable shift costs */
2159 COSTS_N_INSNS (1), /* constant shift costs */
2160 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2161 COSTS_N_INSNS (4), /* HI */
2162 COSTS_N_INSNS (3), /* SI */
2163 COSTS_N_INSNS (4), /* DI */
2164 COSTS_N_INSNS (2)}, /* other */
2165 0, /* cost of multiply per each bit set */
2166 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2167 COSTS_N_INSNS (26), /* HI */
2168 COSTS_N_INSNS (42), /* SI */
2169 COSTS_N_INSNS (74), /* DI */
2170 COSTS_N_INSNS (74)}, /* other */
2171 COSTS_N_INSNS (1), /* cost of movsx */
2172 COSTS_N_INSNS (1), /* cost of movzx */
2173 8, /* "large" insn */
2174 17, /* MOVE_RATIO */
2175 6, /* CLEAR_RATIO */
2176 {6, 6, 6}, /* cost of loading integer registers
2177 in QImode, HImode and SImode.
2178 Relative to reg-reg move (2). */
2179 {6, 6, 6}, /* cost of storing integer registers */
2180 {8, 8, 8, 16, 32}, /* cost of loading SSE register
2181 in 32bit, 64bit, 128bit, 256bit and 512bit */
2182 {8, 8, 8, 16, 32}, /* cost of storing SSE register
2183 in 32bit, 64bit, 128bit, 256bit and 512bit */
2184 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2185 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2186 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2187 8, /* cost of moving SSE register to integer. */
2188 8, 8, /* Gather load static, per_elt. */
2189 8, 8, /* Gather store static, per_elt. */
2190 32, /* size of l1 cache. */
2191 256, /* size of l2 cache. */
2192 64, /* size of prefetch block */
2193 6, /* number of parallel prefetches */
2194 3, /* Branch cost */
2195 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2196 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2197 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2198 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2199 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2200 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2201
2202 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2203 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2204 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2205 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2206 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2207 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2208 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
2209 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
2210 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
2211 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
2212 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2213 atom_memcpy,
2214 atom_memset,
2215 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2216 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2217 "16", /* Loop alignment. */
2218 "16:8:8", /* Jump alignment. */
2219 "0:0:8", /* Label alignment. */
2220 "16", /* Func alignment. */
2221 };
2222
/* memcpy expansion table plugged into slm_cost below.
   NOTE(review): assuming the usual stringop_algs layout -- each row is
   {algorithm for unknown size, {{max size, algorithm, noalign}, ...}} with
   -1 as the catch-all bound, row 0 for 32-bit and row 1 for 64-bit code --
   confirm against the struct declaration, which is not in this chunk. */
2223 static stringop_algs slm_memcpy[2] = {
2224 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2225 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2226 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion table plugged into slm_cost below.
   NOTE(review): assuming the usual stringop_algs layout -- each row is
   {algorithm for unknown size, {{max size, algorithm, noalign}, ...}} with
   -1 as the catch-all bound, row 0 for 32-bit and row 1 for 64-bit code --
   confirm against the struct declaration, which is not in this chunk. */
2227 static stringop_algs slm_memset[2] = {
2228 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2229 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2230 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2231 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Tuning cost table slm_cost. The initializer is positional, so the
   order of the entries must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each
   entry names the field it fills in. */
2232 static const
2233 struct processor_costs slm_cost = {
2234 {
2235 /* Start of register allocator costs. integer->integer move cost is 2. */
2236 8, /* cost for loading QImode using movzbl */
2237 {8, 8, 8}, /* cost of loading integer registers
2238 in QImode, HImode and SImode.
2239 Relative to reg-reg move (2). */
2240 {6, 6, 6}, /* cost of storing integer registers */
2241 2, /* cost of reg,reg fld/fst */
2242 {8, 8, 18}, /* cost of loading fp registers
2243 in SFmode, DFmode and XFmode */
2244 {6, 6, 18}, /* cost of storing fp registers
2245 in SFmode, DFmode and XFmode */
2246 2, /* cost of moving MMX register */
2247 {8, 8}, /* cost of loading MMX registers
2248 in SImode and DImode */
2249 {6, 6}, /* cost of storing MMX registers
2250 in SImode and DImode */
2251 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2252 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2253 in 32,64,128,256 and 512-bit */
2254 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2255 in 32,64,128,256 and 512-bit */
2256 8, 6, /* SSE->integer and integer->SSE moves */
2257 /* End of register allocator costs. */
2258 },
2259
2260 COSTS_N_INSNS (1), /* cost of an add instruction */
2261 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2262 COSTS_N_INSNS (1), /* variable shift costs */
2263 COSTS_N_INSNS (1), /* constant shift costs */
2264 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2265 COSTS_N_INSNS (3), /* HI */
2266 COSTS_N_INSNS (3), /* SI */
2267 COSTS_N_INSNS (4), /* DI */
2268 COSTS_N_INSNS (2)}, /* other */
2269 0, /* cost of multiply per each bit set */
2270 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2271 COSTS_N_INSNS (26), /* HI */
2272 COSTS_N_INSNS (42), /* SI */
2273 COSTS_N_INSNS (74), /* DI */
2274 COSTS_N_INSNS (74)}, /* other */
2275 COSTS_N_INSNS (1), /* cost of movsx */
2276 COSTS_N_INSNS (1), /* cost of movzx */
2277 8, /* "large" insn */
2278 17, /* MOVE_RATIO */
2279 6, /* CLEAR_RATIO */
2280 {8, 8, 8}, /* cost of loading integer registers
2281 in QImode, HImode and SImode.
2282 Relative to reg-reg move (2). */
2283 {6, 6, 6}, /* cost of storing integer registers */
2284 {8, 8, 8, 16, 32}, /* cost of loading SSE register
2285 in 32bit, 64bit, 128bit, 256bit and 512bit */
2286 {8, 8, 8, 16, 32}, /* cost of storing SSE register
2287 in 32bit, 64bit, 128bit, 256bit and 512bit */
2288 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2289 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2290 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2291 8, /* cost of moving SSE register to integer. */
2292 8, 8, /* Gather load static, per_elt. */
2293 8, 8, /* Gather store static, per_elt. */
2294 32, /* size of l1 cache. */
2295 256, /* size of l2 cache. */
2296 64, /* size of prefetch block */
2297 6, /* number of parallel prefetches */
2298 3, /* Branch cost */
2299 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2300 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2301 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2302 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2303 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2304 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2305
2306 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2307 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2308 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2309 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2310 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2311 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2312 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2313 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2314 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2315 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2316 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2317 slm_memcpy,
2318 slm_memset,
2319 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2320 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2321 "16", /* Loop alignment. */
2322 "16:8:8", /* Jump alignment. */
2323 "0:0:8", /* Label alignment. */
2324 "16", /* Func alignment. */
2325 };
2326
/* memcpy expansion table plugged into intel_cost below.
   NOTE(review): assuming the usual stringop_algs layout -- each row is
   {algorithm for unknown size, {{max size, algorithm, noalign}, ...}} with
   -1 as the catch-all bound, row 0 for 32-bit and row 1 for 64-bit code --
   confirm against the struct declaration, which is not in this chunk. */
2327 static stringop_algs intel_memcpy[2] = {
2328 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2329 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2330 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion table plugged into intel_cost below.
   NOTE(review): assuming the usual stringop_algs layout -- each row is
   {algorithm for unknown size, {{max size, algorithm, noalign}, ...}} with
   -1 as the catch-all bound, row 0 for 32-bit and row 1 for 64-bit code --
   confirm against the struct declaration, which is not in this chunk. */
2331 static stringop_algs intel_memset[2] = {
2332 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2333 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2334 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2335 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Tuning cost table intel_cost. The initializer is positional, so the
   order of the entries must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each
   entry names the field it fills in. */
2336 static const
2337 struct processor_costs intel_cost = {
2338 {
2339 /* Start of register allocator costs. integer->integer move cost is 2. */
2340 6, /* cost for loading QImode using movzbl */
2341 {4, 4, 4}, /* cost of loading integer registers
2342 in QImode, HImode and SImode.
2343 Relative to reg-reg move (2). */
2344 {6, 6, 6}, /* cost of storing integer registers */
2345 2, /* cost of reg,reg fld/fst */
2346 {6, 6, 8}, /* cost of loading fp registers
2347 in SFmode, DFmode and XFmode */
2348 {6, 6, 10}, /* cost of storing fp registers
2349 in SFmode, DFmode and XFmode */
2350 2, /* cost of moving MMX register */
2351 {6, 6}, /* cost of loading MMX registers
2352 in SImode and DImode */
2353 {6, 6}, /* cost of storing MMX registers
2354 in SImode and DImode */
2355 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2356 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2357 in 32,64,128,256 and 512-bit */
2358 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2359 in 32,64,128,256 and 512-bit */
2360 4, 4, /* SSE->integer and integer->SSE moves */
2361 /* End of register allocator costs. */
2362 },
2363
2364 COSTS_N_INSNS (1), /* cost of an add instruction */
2365 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2366 COSTS_N_INSNS (1), /* variable shift costs */
2367 COSTS_N_INSNS (1), /* constant shift costs */
2368 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2369 COSTS_N_INSNS (3), /* HI */
2370 COSTS_N_INSNS (3), /* SI */
2371 COSTS_N_INSNS (4), /* DI */
2372 COSTS_N_INSNS (2)}, /* other */
2373 0, /* cost of multiply per each bit set */
2374 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2375 COSTS_N_INSNS (26), /* HI */
2376 COSTS_N_INSNS (42), /* SI */
2377 COSTS_N_INSNS (74), /* DI */
2378 COSTS_N_INSNS (74)}, /* other */
2379 COSTS_N_INSNS (1), /* cost of movsx */
2380 COSTS_N_INSNS (1), /* cost of movzx */
2381 8, /* "large" insn */
2382 17, /* MOVE_RATIO */
2383 6, /* CLEAR_RATIO */
2384 {4, 4, 4}, /* cost of loading integer registers
2385 in QImode, HImode and SImode.
2386 Relative to reg-reg move (2). */
2387 {6, 6, 6}, /* cost of storing integer registers */
2388 {6, 6, 6, 6, 6}, /* cost of loading SSE register
2389 in 32bit, 64bit, 128bit, 256bit and 512bit */
2390 {6, 6, 6, 6, 6}, /* cost of storing SSE register
2391 in 32bit, 64bit, 128bit, 256bit and 512bit */
2392 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2393 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2394 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2395 4, /* cost of moving SSE register to integer. */
2396 6, 6, /* Gather load static, per_elt. */
2397 6, 6, /* Gather store static, per_elt. */
2398 32, /* size of l1 cache. */
2399 256, /* size of l2 cache. */
2400 64, /* size of prefetch block */
2401 6, /* number of parallel prefetches */
2402 3, /* Branch cost */
2403 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2404 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2405 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2406 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2407 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2408 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2409
2410 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2411 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2412 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2413 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2414 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2415 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2416 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2417 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2418 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2419 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2420 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2421 intel_memcpy,
2422 intel_memset,
2423 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2424 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2425 "16", /* Loop alignment. */
2426 "16:8:8", /* Jump alignment. */
2427 "0:0:8", /* Label alignment. */
2428 "16", /* Func alignment. */
2429 };
2430
2431 /* Generic should produce code tuned for Core-i7 (and newer chips)
2432 and btver1 (and newer chips). */
2433
/* memcpy expansion table plugged into generic_cost below.
   NOTE(review): assuming the usual stringop_algs layout -- each row is
   {algorithm for unknown size, {{max size, algorithm, noalign}, ...}} with
   -1 as the catch-all bound, row 0 for 32-bit and row 1 for 64-bit code --
   confirm against the struct declaration, which is not in this chunk. */
2434 static stringop_algs generic_memcpy[2] = {
2435 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2436 {-1, libcall, false}}},
2437 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2438 {-1, libcall, false}}}};
/* memset expansion table plugged into generic_cost below.
   NOTE(review): assuming the usual stringop_algs layout -- each row is
   {algorithm for unknown size, {{max size, algorithm, noalign}, ...}} with
   -1 as the catch-all bound, row 0 for 32-bit and row 1 for 64-bit code --
   confirm against the struct declaration, which is not in this chunk. */
2439 static stringop_algs generic_memset[2] = {
2440 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2441 {-1, libcall, false}}},
2442 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2443 {-1, libcall, false}}}};
/* Tuning cost table generic_cost. The initializer is positional, so the
   order of the entries must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each
   entry names the field it fills in. */
2444 static const
2445 struct processor_costs generic_cost = {
2446 {
2447 /* Start of register allocator costs. integer->integer move cost is 2. */
2448 6, /* cost for loading QImode using movzbl */
2449 {6, 6, 6}, /* cost of loading integer registers
2450 in QImode, HImode and SImode.
2451 Relative to reg-reg move (2). */
2452 {6, 6, 6}, /* cost of storing integer registers */
2453 4, /* cost of reg,reg fld/fst */
2454 {6, 6, 12}, /* cost of loading fp registers
2455 in SFmode, DFmode and XFmode */
2456 {6, 6, 12}, /* cost of storing fp registers
2457 in SFmode, DFmode and XFmode */
2458 2, /* cost of moving MMX register */
2459 {6, 6}, /* cost of loading MMX registers
2460 in SImode and DImode */
2461 {6, 6}, /* cost of storing MMX registers
2462 in SImode and DImode */
2463 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2464 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2465 in 32,64,128,256 and 512-bit */
2466 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2467 in 32,64,128,256 and 512-bit */
2468 6, 6, /* SSE->integer and integer->SSE moves */
2469 /* End of register allocator costs. */
2470 },
2471
2472 COSTS_N_INSNS (1), /* cost of an add instruction */
2473 /* Setting cost to 2 makes our current implementation of synth_mult result in
2474 use of unnecessary temporary registers causing regression on several
2475 SPECfp benchmarks. */
2476 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2477 COSTS_N_INSNS (1), /* variable shift costs */
2478 COSTS_N_INSNS (1), /* constant shift costs */
2479 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2480 COSTS_N_INSNS (4), /* HI */
2481 COSTS_N_INSNS (3), /* SI */
2482 COSTS_N_INSNS (4), /* DI */
2483 COSTS_N_INSNS (4)}, /* other */
2484 0, /* cost of multiply per each bit set */
2485 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2486 COSTS_N_INSNS (22), /* HI */
2487 COSTS_N_INSNS (30), /* SI */
2488 COSTS_N_INSNS (74), /* DI */
2489 COSTS_N_INSNS (74)}, /* other */
2490 COSTS_N_INSNS (1), /* cost of movsx */
2491 COSTS_N_INSNS (1), /* cost of movzx */
2492 8, /* "large" insn */
2493 17, /* MOVE_RATIO */
2494 6, /* CLEAR_RATIO */
2495 {6, 6, 6}, /* cost of loading integer registers
2496 in QImode, HImode and SImode.
2497 Relative to reg-reg move (2). */
2498 {6, 6, 6}, /* cost of storing integer registers */
2499 {6, 6, 6, 10, 15}, /* cost of loading SSE register
2500 in 32bit, 64bit, 128bit, 256bit and 512bit */
2501 {6, 6, 6, 10, 15}, /* cost of storing SSE register
2502 in 32bit, 64bit, 128bit, 256bit and 512bit */
2503 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
2504 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
2505 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2506 6, /* cost of moving SSE register to integer. */
2507 18, 6, /* Gather load static, per_elt. */
2508 18, 6, /* Gather store static, per_elt. */
2509 32, /* size of l1 cache. */
2510 512, /* size of l2 cache. */
2511 64, /* size of prefetch block */
2512 6, /* number of parallel prefetches */
2513 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2514 value is increased to perhaps more appropriate value of 5. */
2515 3, /* Branch cost */
2516 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2517 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2518 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2519 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2520 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2521 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2522
2523 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2524 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2525 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2526 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2527 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2528 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2529 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2530 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2531 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2532 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2533 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2534 generic_memcpy,
2535 generic_memset,
2536 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2537 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2538 "16:11:8", /* Loop alignment. */
2539 "16:11:8", /* Jump alignment. */
2540 "0:0:8", /* Label alignment. */
2541 "16", /* Func alignment. */
2542 };
2543
2544 /* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion table plugged into core_cost below.
   NOTE(review): assuming the usual stringop_algs layout -- each row is
   {algorithm for unknown size, {{max size, algorithm, noalign}, ...}} with
   -1 as the catch-all bound, row 0 for 32-bit and row 1 for 64-bit code --
   confirm against the struct declaration, which is not in this chunk. */
2545 static stringop_algs core_memcpy[2] = {
2546 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2547 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2548 {-1, libcall, false}}}};
/* memset expansion table plugged into core_cost below.
   NOTE(review): assuming the usual stringop_algs layout -- each row is
   {algorithm for unknown size, {{max size, algorithm, noalign}, ...}} with
   -1 as the catch-all bound, row 0 for 32-bit and row 1 for 64-bit code --
   confirm against the struct declaration, which is not in this chunk. */
2549 static stringop_algs core_memset[2] = {
2550 {libcall, {{6, loop_1_byte, true},
2551 {24, loop, true},
2552 {8192, rep_prefix_4_byte, true},
2553 {-1, libcall, false}}},
2554 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2555 {-1, libcall, false}}}};
2556
/* Tuning cost table core_cost. The initializer is positional, so the
   order of the entries must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each
   entry names the field it fills in. */
2557 static const
2558 struct processor_costs core_cost = {
2559 {
2560 /* Start of register allocator costs. integer->integer move cost is 2. */
2561 6, /* cost for loading QImode using movzbl */
2562 {4, 4, 4}, /* cost of loading integer registers
2563 in QImode, HImode and SImode.
2564 Relative to reg-reg move (2). */
2565 {6, 6, 6}, /* cost of storing integer registers */
2566 2, /* cost of reg,reg fld/fst */
2567 {6, 6, 8}, /* cost of loading fp registers
2568 in SFmode, DFmode and XFmode */
2569 {6, 6, 10}, /* cost of storing fp registers
2570 in SFmode, DFmode and XFmode */
2571 2, /* cost of moving MMX register */
2572 {6, 6}, /* cost of loading MMX registers
2573 in SImode and DImode */
2574 {6, 6}, /* cost of storing MMX registers
2575 in SImode and DImode */
2576 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2577 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2578 in 32,64,128,256 and 512-bit */
2579 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2580 in 32,64,128,256 and 512-bit */
2581 6, 6, /* SSE->integer and integer->SSE moves */
2582 /* End of register allocator costs. */
2583 },
2584
2585 COSTS_N_INSNS (1), /* cost of an add instruction */
2586 /* On all chips taken into consideration lea is 2 cycles and more. With
2587 this cost however our current implementation of synth_mult results in
2588 use of unnecessary temporary registers causing regression on several
2589 SPECfp benchmarks. */
2590 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2591 COSTS_N_INSNS (1), /* variable shift costs */
2592 COSTS_N_INSNS (1), /* constant shift costs */
2593 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2594 COSTS_N_INSNS (4), /* HI */
2595 COSTS_N_INSNS (3), /* SI */
2596 /* Here we tune for Sandybridge or newer. */
2597 COSTS_N_INSNS (3), /* DI */
2598 COSTS_N_INSNS (3)}, /* other */
2599 0, /* cost of multiply per each bit set */
2600 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2601 model is not realistic. We compensate by increasing the latencies a bit. */
2602 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2603 COSTS_N_INSNS (11), /* HI */
2604 COSTS_N_INSNS (14), /* SI */
2605 COSTS_N_INSNS (81), /* DI */
2606 COSTS_N_INSNS (81)}, /* other */
2607 COSTS_N_INSNS (1), /* cost of movsx */
2608 COSTS_N_INSNS (1), /* cost of movzx */
2609 8, /* "large" insn */
2610 17, /* MOVE_RATIO */
2611 6, /* CLEAR_RATIO */
2612 {4, 4, 4}, /* cost of loading integer registers
2613 in QImode, HImode and SImode.
2614 Relative to reg-reg move (2). */
2615 {6, 6, 6}, /* cost of storing integer registers */
2616 {6, 6, 6, 6, 12}, /* cost of loading SSE register
2617 in 32bit, 64bit, 128bit, 256bit and 512bit */
2618 {6, 6, 6, 6, 12}, /* cost of storing SSE register
2619 in 32bit, 64bit, 128bit, 256bit and 512bit */
2620 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2621 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2622 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2623 2, /* cost of moving SSE register to integer. */
2624 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2625 rec. throughput 6.
2626 So 5 uops statically and one uop per load.
NOTE(review): VGATHERDPD is named twice above with different figures;
one of the two is presumably a different gather variant -- confirm. */
2627 10, 6, /* Gather load static, per_elt. */
2628 10, 6, /* Gather store static, per_elt. */
2629 64, /* size of l1 cache. */
2630 512, /* size of l2 cache. */
2631 64, /* size of prefetch block */
2632 6, /* number of parallel prefetches */
2633 /* FIXME perhaps more appropriate value is 5. */
2634 3, /* Branch cost */
2635 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2636 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2637 /* 10-24 */
2638 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2639 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2640 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2641 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
2642
2643 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2644 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2645 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2646 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2647 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2648 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2649 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2650 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2651 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2652 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2653 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2654 core_memcpy,
2655 core_memset,
2656 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2657 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2658 "16:11:8", /* Loop alignment. */
2659 "16:11:8", /* Jump alignment. */
2660 "0:0:8", /* Label alignment. */
2661 "16", /* Func alignment. */
2662 };
2663