]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/i386/x86-tune-costs.h
Update copyright years.
[thirdparty/gcc.git] / gcc / config / i386 / x86-tune-costs.h
1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2021 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
19
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   Under that assumption COSTS_N_BYTES (2), the size of one addition,
   equals COSTS_N_INSNS (1), so byte-based costs stay on the same scale
   as instruction-based ones.  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
27
/* Placeholder stringop_algs initializer that names only libcall.  The
   per-CPU memcpy/memset tables below use it as their second array
   element when only the first element is tuned.
   NOTE(review): presumably the second element is the 64-bit variant,
   which these 32-bit-only CPUs never use -- confirm against the
   stringop_algs declaration.  */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29
/* memcpy algorithm table referenced by ix86_size_cost.  Both array
   elements name only rep_prefix_1_byte.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants -- confirm against the stringop_algs declaration.  */
30 static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset algorithm table referenced by ix86_size_cost.  Both array
   elements name only rep_prefix_1_byte, mirroring ix86_size_memcpy.  */
33 static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36
/* Cost table used when tuning for size.  Instruction costs below are
   given with COSTS_N_BYTES (byte counts) instead of COSTS_N_INSNS, the
   cache and prefetch entries are 0 and all alignment entries are NULL.
   Unlike the per-CPU tables that follow, this object is not declared
   static.  The initializer is positional, so the order of entries must
   match the declaration of struct processor_costs.  */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39 {
40 /* Start of register allocator costs. integer->integer move cost is 2. */
41 2, /* cost for loading QImode using movzbl */
42 {2, 2, 2}, /* cost of loading integer registers
43 in QImode, HImode and SImode.
44 Relative to reg-reg move (2). */
45 {2, 2, 2}, /* cost of storing integer registers */
46 2, /* cost of reg,reg fld/fst */
47 {2, 2, 2}, /* cost of loading fp registers
48 in SFmode, DFmode and XFmode */
49 {2, 2, 2}, /* cost of storing fp registers
50 in SFmode, DFmode and XFmode */
51 3, /* cost of moving MMX register */
52 {3, 3}, /* cost of loading MMX registers
53 in SImode and DImode */
54 {3, 3}, /* cost of storing MMX registers
55 in SImode and DImode */
56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
58 in 32,64,128,256 and 512-bit */
59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
60 in 32,64,128,256 and 512-bit */
61 3, 3, /* SSE->integer and integer->SSE moves */
62 3, 3, /* mask->integer and integer->mask moves */
63 {2, 2, 2}, /* cost of loading mask register
64 in QImode, HImode, SImode. */
65 {2, 2, 2}, /* cost of storing mask register
66 in QImode, HImode, SImode. */
67 2, /* cost of moving mask register. */
68 /* End of register allocator costs. */
69 },
70
71 COSTS_N_BYTES (2), /* cost of an add instruction */
72 COSTS_N_BYTES (3), /* cost of a lea instruction */
73 COSTS_N_BYTES (2), /* variable shift costs */
74 COSTS_N_BYTES (3), /* constant shift costs */
75 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
76 COSTS_N_BYTES (3), /* HI */
77 COSTS_N_BYTES (3), /* SI */
78 COSTS_N_BYTES (3), /* DI */
79 COSTS_N_BYTES (5)}, /* other */
80 0, /* cost of multiply per each bit set */
81 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
82 COSTS_N_BYTES (3), /* HI */
83 COSTS_N_BYTES (3), /* SI */
84 COSTS_N_BYTES (3), /* DI */
85 COSTS_N_BYTES (5)}, /* other */
86 COSTS_N_BYTES (3), /* cost of movsx */
87 COSTS_N_BYTES (3), /* cost of movzx */
88 0, /* "large" insn */
89 2, /* MOVE_RATIO */
90 2, /* CLEAR_RATIO */
91 {2, 2, 2}, /* cost of loading integer registers
92 in QImode, HImode and SImode.
93 Relative to reg-reg move (2). */
94 {2, 2, 2}, /* cost of storing integer registers */
95 {3, 3, 3, 3, 3}, /* cost of loading SSE register
96 in 32bit, 64bit, 128bit, 256bit and 512bit */
97 {3, 3, 3, 3, 3}, /* cost of storing SSE register
98 in 32bit, 64bit, 128bit, 256bit and 512bit */
99 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load (five entries,
100 presumably 32bit to 512bit as for the aligned load) */
101 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store (five entries,
102 presumably 32bit to 512bit as for the aligned store) */
103 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
104 3, /* cost of moving SSE register to integer. */
105 5, 0, /* Gather load static, per_elt. */
106 5, 0, /* Gather store static, per_elt. */
107 0, /* size of l1 cache */
108 0, /* size of l2 cache */
109 0, /* size of prefetch block */
110 0, /* number of parallel prefetches */
111 2, /* Branch cost */
112 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
113 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
114 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
115 COSTS_N_BYTES (2), /* cost of FABS instruction. */
116 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
117 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
118
119 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
120 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
121 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
122 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
123 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
124 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
125 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
126 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
127 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
128 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
129 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
130 ix86_size_memcpy,
131 ix86_size_memset,
132 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
133 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
134 NULL, /* Loop alignment. */
135 NULL, /* Jump alignment. */
136 NULL, /* Label alignment. */
137 NULL, /* Func alignment. */
138 };
139
140 /* Processor costs (relative to an add) */
/* memcpy algorithm table referenced by i386_cost.  The first element
   names only rep_prefix_1_byte; the second is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.  */
141 static stringop_algs i386_memcpy[2] = {
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
143 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by i386_cost.  The first element
   names only rep_prefix_1_byte; the second is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.  */
144 static stringop_algs i386_memset[2] = {
145 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
146 DUMMY_STRINGOP_ALGS};
147
/* Cost table for the 386.  Instruction costs are given with
   COSTS_N_INSNS; the string operation tables are i386_memcpy and
   i386_memset.  The initializer is positional, so the order of entries
   must match the declaration of struct processor_costs.
   NOTE(review): the alignment strings at the end presumably use the
   same syntax as the -falign-* options -- confirm.  */
148 static const
149 struct processor_costs i386_cost = { /* 386 specific costs */
150 {
151 /* Start of register allocator costs. integer->integer move cost is 2. */
152 4, /* cost for loading QImode using movzbl */
153 {2, 4, 2}, /* cost of loading integer registers
154 in QImode, HImode and SImode.
155 Relative to reg-reg move (2). */
156 {2, 4, 2}, /* cost of storing integer registers */
157 2, /* cost of reg,reg fld/fst */
158 {8, 8, 8}, /* cost of loading fp registers
159 in SFmode, DFmode and XFmode */
160 {8, 8, 8}, /* cost of storing fp registers
161 in SFmode, DFmode and XFmode */
162 2, /* cost of moving MMX register */
163 {4, 8}, /* cost of loading MMX registers
164 in SImode and DImode */
165 {4, 8}, /* cost of storing MMX registers
166 in SImode and DImode */
167 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
168 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
169 in 32,64,128,256 and 512-bit */
170 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
171 in 32,64,128,256 and 512-bit */
172 3, 3, /* SSE->integer and integer->SSE moves */
173 3, 3, /* mask->integer and integer->mask moves */
174 {2, 4, 2}, /* cost of loading mask register
175 in QImode, HImode, SImode. */
176 {2, 4, 2}, /* cost of storing mask register
177 in QImode, HImode, SImode. */
178 2, /* cost of moving mask register. */
179 /* End of register allocator costs. */
180 },
181
182 COSTS_N_INSNS (1), /* cost of an add instruction */
183 COSTS_N_INSNS (1), /* cost of a lea instruction */
184 COSTS_N_INSNS (3), /* variable shift costs */
185 COSTS_N_INSNS (2), /* constant shift costs */
186 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
187 COSTS_N_INSNS (6), /* HI */
188 COSTS_N_INSNS (6), /* SI */
189 COSTS_N_INSNS (6), /* DI */
190 COSTS_N_INSNS (6)}, /* other */
191 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
192 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
193 COSTS_N_INSNS (23), /* HI */
194 COSTS_N_INSNS (23), /* SI */
195 COSTS_N_INSNS (23), /* DI */
196 COSTS_N_INSNS (23)}, /* other */
197 COSTS_N_INSNS (3), /* cost of movsx */
198 COSTS_N_INSNS (2), /* cost of movzx */
199 15, /* "large" insn */
200 3, /* MOVE_RATIO */
201 3, /* CLEAR_RATIO */
202 {2, 4, 2}, /* cost of loading integer registers
203 in QImode, HImode and SImode.
204 Relative to reg-reg move (2). */
205 {2, 4, 2}, /* cost of storing integer registers */
206 {4, 8, 16, 32, 64}, /* cost of loading SSE register
207 in 32bit, 64bit, 128bit, 256bit and 512bit */
208 {4, 8, 16, 32, 64}, /* cost of storing SSE register
209 in 32bit, 64bit, 128bit, 256bit and 512bit */
210 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
211 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
212 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
213 3, /* cost of moving SSE register to integer. */
214 4, 4, /* Gather load static, per_elt. */
215 4, 4, /* Gather store static, per_elt. */
216 0, /* size of l1 cache */
217 0, /* size of l2 cache */
218 0, /* size of prefetch block */
219 0, /* number of parallel prefetches */
220 1, /* Branch cost */
221 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
222 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
223 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
224 COSTS_N_INSNS (22), /* cost of FABS instruction. */
225 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
226 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
227
228 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
229 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
230 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
231 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
232 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
233 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
234 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
235 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
236 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
237 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
238 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
239 i386_memcpy,
240 i386_memset,
241 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
242 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
243 "4", /* Loop alignment. */
244 "4", /* Jump alignment. */
245 NULL, /* Label alignment. */
246 "4", /* Func alignment. */
247 };
248
/* memcpy algorithm table referenced by i486_cost.  The first element
   names only rep_prefix_4_byte; the second is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.  */
249 static stringop_algs i486_memcpy[2] = {
250 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
251 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by i486_cost.  The first element
   names only rep_prefix_4_byte; the second is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.  */
252 static stringop_algs i486_memset[2] = {
253 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
254 DUMMY_STRINGOP_ALGS};
255
/* Cost table for the 486.  Instruction costs are given with
   COSTS_N_INSNS; the string operation tables are i486_memcpy and
   i486_memset.  The initializer is positional, so the order of entries
   must match the declaration of struct processor_costs.  */
256 static const
257 struct processor_costs i486_cost = { /* 486 specific costs */
258 {
259 /* Start of register allocator costs. integer->integer move cost is 2. */
260 4, /* cost for loading QImode using movzbl */
261 {2, 4, 2}, /* cost of loading integer registers
262 in QImode, HImode and SImode.
263 Relative to reg-reg move (2). */
264 {2, 4, 2}, /* cost of storing integer registers */
265 2, /* cost of reg,reg fld/fst */
266 {8, 8, 8}, /* cost of loading fp registers
267 in SFmode, DFmode and XFmode */
268 {8, 8, 8}, /* cost of storing fp registers
269 in SFmode, DFmode and XFmode */
270 2, /* cost of moving MMX register */
271 {4, 8}, /* cost of loading MMX registers
272 in SImode and DImode */
273 {4, 8}, /* cost of storing MMX registers
274 in SImode and DImode */
275 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
276 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
277 in 32,64,128,256 and 512-bit */
278 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
279 in 32,64,128,256 and 512-bit */
280 3, 3, /* SSE->integer and integer->SSE moves */
281 3, 3, /* mask->integer and integer->mask moves */
282 {2, 4, 2}, /* cost of loading mask register
283 in QImode, HImode, SImode. */
284 {2, 4, 2}, /* cost of storing mask register
285 in QImode, HImode, SImode. */
286 2, /* cost of moving mask register. */
287 /* End of register allocator costs. */
288 },
289
290 COSTS_N_INSNS (1), /* cost of an add instruction */
291 COSTS_N_INSNS (1), /* cost of a lea instruction */
292 COSTS_N_INSNS (3), /* variable shift costs */
293 COSTS_N_INSNS (2), /* constant shift costs */
294 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
295 COSTS_N_INSNS (12), /* HI */
296 COSTS_N_INSNS (12), /* SI */
297 COSTS_N_INSNS (12), /* DI */
298 COSTS_N_INSNS (12)}, /* other */
299 1, /* cost of multiply per each bit set.  NOTE(review): a bare 1 here, while i386_cost uses COSTS_N_INSNS (1) in this slot; confirm this is intended. */
300 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
301 COSTS_N_INSNS (40), /* HI */
302 COSTS_N_INSNS (40), /* SI */
303 COSTS_N_INSNS (40), /* DI */
304 COSTS_N_INSNS (40)}, /* other */
305 COSTS_N_INSNS (3), /* cost of movsx */
306 COSTS_N_INSNS (2), /* cost of movzx */
307 15, /* "large" insn */
308 3, /* MOVE_RATIO */
309 3, /* CLEAR_RATIO */
310 {2, 4, 2}, /* cost of loading integer registers
311 in QImode, HImode and SImode.
312 Relative to reg-reg move (2). */
313 {2, 4, 2}, /* cost of storing integer registers */
314 {4, 8, 16, 32, 64}, /* cost of loading SSE register
315 in 32bit, 64bit, 128bit, 256bit and 512bit */
316 {4, 8, 16, 32, 64}, /* cost of storing SSE register
317 in 32bit, 64bit, 128bit, 256bit and 512bit */
318 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
319 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
320 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
321 3, /* cost of moving SSE register to integer. */
322 4, 4, /* Gather load static, per_elt. */
323 4, 4, /* Gather store static, per_elt. */
324 4, /* size of l1 cache. 486 has 8kB cache
325 shared for code and data, so 4kB is
326 not really precise. */
327 4, /* size of l2 cache */
328 0, /* size of prefetch block */
329 0, /* number of parallel prefetches */
330 1, /* Branch cost */
331 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
332 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
333 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
334 COSTS_N_INSNS (3), /* cost of FABS instruction. */
335 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
336 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
337
338 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
339 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
340 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
341 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
342 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
343 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
344 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
345 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
346 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
347 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
348 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
349 i486_memcpy,
350 i486_memset,
351 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
352 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
353 "16", /* Loop alignment. */
354 "16", /* Jump alignment. */
355 "0:0:8", /* Label alignment. */
356 "16", /* Func alignment. */
357 };
358
/* memcpy algorithm table referenced by both pentium_cost and
   lakemont_cost.  The first element names libcall, with a
   rep_prefix_4_byte entry tagged 256 and a libcall entry tagged -1;
   the second is the libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably 256 is a size limit in bytes and -1 means
   "any larger size" -- confirm against the stringop_algs declaration.  */
359 static stringop_algs pentium_memcpy[2] = {
360 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
361 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by both pentium_cost and
   lakemont_cost.  The first element names libcall with a single
   rep_prefix_4_byte entry tagged -1; the second is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.  */
362 static stringop_algs pentium_memset[2] = {
363 {libcall, {{-1, rep_prefix_4_byte, false}}},
364 DUMMY_STRINGOP_ALGS};
365
/* Cost table named for the Pentium.  Instruction costs are given with
   COSTS_N_INSNS; the string operation tables are pentium_memcpy and
   pentium_memset.  The initializer is positional, so the order of
   entries must match the declaration of struct processor_costs.  */
366 static const
367 struct processor_costs pentium_cost = {
368 {
369 /* Start of register allocator costs. integer->integer move cost is 2. */
370 6, /* cost for loading QImode using movzbl */
371 {2, 4, 2}, /* cost of loading integer registers
372 in QImode, HImode and SImode.
373 Relative to reg-reg move (2). */
374 {2, 4, 2}, /* cost of storing integer registers */
375 2, /* cost of reg,reg fld/fst */
376 {2, 2, 6}, /* cost of loading fp registers
377 in SFmode, DFmode and XFmode */
378 {4, 4, 6}, /* cost of storing fp registers
379 in SFmode, DFmode and XFmode */
380 8, /* cost of moving MMX register */
381 {8, 8}, /* cost of loading MMX registers
382 in SImode and DImode */
383 {8, 8}, /* cost of storing MMX registers
384 in SImode and DImode */
385 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
386 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
387 in 32,64,128,256 and 512-bit */
388 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
389 in 32,64,128,256 and 512-bit */
390 3, 3, /* SSE->integer and integer->SSE moves */
391 3, 3, /* mask->integer and integer->mask moves */
392 {2, 4, 2}, /* cost of loading mask register
393 in QImode, HImode, SImode. */
394 {2, 4, 2}, /* cost of storing mask register
395 in QImode, HImode, SImode. */
396 2, /* cost of moving mask register. */
397 /* End of register allocator costs. */
398 },
399
400 COSTS_N_INSNS (1), /* cost of an add instruction */
401 COSTS_N_INSNS (1), /* cost of a lea instruction */
402 COSTS_N_INSNS (4), /* variable shift costs */
403 COSTS_N_INSNS (1), /* constant shift costs */
404 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
405 COSTS_N_INSNS (11), /* HI */
406 COSTS_N_INSNS (11), /* SI */
407 COSTS_N_INSNS (11), /* DI */
408 COSTS_N_INSNS (11)}, /* other */
409 0, /* cost of multiply per each bit set */
410 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
411 COSTS_N_INSNS (25), /* HI */
412 COSTS_N_INSNS (25), /* SI */
413 COSTS_N_INSNS (25), /* DI */
414 COSTS_N_INSNS (25)}, /* other */
415 COSTS_N_INSNS (3), /* cost of movsx */
416 COSTS_N_INSNS (2), /* cost of movzx */
417 8, /* "large" insn */
418 6, /* MOVE_RATIO */
419 6, /* CLEAR_RATIO */
420 {2, 4, 2}, /* cost of loading integer registers
421 in QImode, HImode and SImode.
422 Relative to reg-reg move (2). */
423 {2, 4, 2}, /* cost of storing integer registers */
424 {4, 8, 16, 32, 64}, /* cost of loading SSE register
425 in 32bit, 64bit, 128bit, 256bit and 512bit */
426 {4, 8, 16, 32, 64}, /* cost of storing SSE register
427 in 32bit, 64bit, 128bit, 256bit and 512bit */
428 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
429 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
430 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
431 3, /* cost of moving SSE register to integer. */
432 4, 4, /* Gather load static, per_elt. */
433 4, 4, /* Gather store static, per_elt. */
434 8, /* size of l1 cache. */
435 8, /* size of l2 cache */
436 0, /* size of prefetch block */
437 0, /* number of parallel prefetches */
438 2, /* Branch cost */
439 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
440 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
441 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
442 COSTS_N_INSNS (1), /* cost of FABS instruction. */
443 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
444 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
445
446 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
447 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
448 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
449 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
450 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
451 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
452 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
453 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
454 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
455 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
456 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
457 pentium_memcpy,
458 pentium_memset,
459 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
460 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
461 "16:8:8", /* Loop alignment. */
462 "16:8:8", /* Jump alignment. */
463 "0:0:8", /* Label alignment. */
464 "16", /* Func alignment. */
465 };
466
/* Cost table named for Lakemont.  It has no string operation tables of
   its own and reuses pentium_memcpy and pentium_memset.  Note that the
   lea cost is COSTS_N_INSNS (1) + 1, i.e. slightly more than an add.
   The initializer is positional, so the order of entries must match the
   declaration of struct processor_costs.  */
467 static const
468 struct processor_costs lakemont_cost = {
469 {
470 /* Start of register allocator costs. integer->integer move cost is 2. */
471 6, /* cost for loading QImode using movzbl */
472 {2, 4, 2}, /* cost of loading integer registers
473 in QImode, HImode and SImode.
474 Relative to reg-reg move (2). */
475 {2, 4, 2}, /* cost of storing integer registers */
476 2, /* cost of reg,reg fld/fst */
477 {2, 2, 6}, /* cost of loading fp registers
478 in SFmode, DFmode and XFmode */
479 {4, 4, 6}, /* cost of storing fp registers
480 in SFmode, DFmode and XFmode */
481 8, /* cost of moving MMX register */
482 {8, 8}, /* cost of loading MMX registers
483 in SImode and DImode */
484 {8, 8}, /* cost of storing MMX registers
485 in SImode and DImode */
486 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
487 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
488 in 32,64,128,256 and 512-bit */
489 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
490 in 32,64,128,256 and 512-bit */
491 3, 3, /* SSE->integer and integer->SSE moves */
492 3, 3, /* mask->integer and integer->mask moves */
493 {2, 4, 2}, /* cost of loading mask register
494 in QImode, HImode, SImode. */
495 {2, 4, 2}, /* cost of storing mask register
496 in QImode, HImode, SImode. */
497 2, /* cost of moving mask register. */
498 /* End of register allocator costs. */
499 },
500
501 COSTS_N_INSNS (1), /* cost of an add instruction */
502 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
503 COSTS_N_INSNS (1), /* variable shift costs */
504 COSTS_N_INSNS (1), /* constant shift costs */
505 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
506 COSTS_N_INSNS (11), /* HI */
507 COSTS_N_INSNS (11), /* SI */
508 COSTS_N_INSNS (11), /* DI */
509 COSTS_N_INSNS (11)}, /* other */
510 0, /* cost of multiply per each bit set */
511 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
512 COSTS_N_INSNS (25), /* HI */
513 COSTS_N_INSNS (25), /* SI */
514 COSTS_N_INSNS (25), /* DI */
515 COSTS_N_INSNS (25)}, /* other */
516 COSTS_N_INSNS (3), /* cost of movsx */
517 COSTS_N_INSNS (2), /* cost of movzx */
518 8, /* "large" insn */
519 17, /* MOVE_RATIO */
520 6, /* CLEAR_RATIO */
521 {2, 4, 2}, /* cost of loading integer registers
522 in QImode, HImode and SImode.
523 Relative to reg-reg move (2). */
524 {2, 4, 2}, /* cost of storing integer registers */
525 {4, 8, 16, 32, 64}, /* cost of loading SSE register
526 in 32bit, 64bit, 128bit, 256bit and 512bit */
527 {4, 8, 16, 32, 64}, /* cost of storing SSE register
528 in 32bit, 64bit, 128bit, 256bit and 512bit */
529 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
530 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
531 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
532 3, /* cost of moving SSE register to integer. */
533 4, 4, /* Gather load static, per_elt. */
534 4, 4, /* Gather store static, per_elt. */
535 8, /* size of l1 cache. */
536 8, /* size of l2 cache */
537 0, /* size of prefetch block */
538 0, /* number of parallel prefetches */
539 2, /* Branch cost */
540 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
541 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
542 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
543 COSTS_N_INSNS (1), /* cost of FABS instruction. */
544 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
545 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
546
547 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
548 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
549 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
550 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
551 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
552 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
553 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
554 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
555 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
556 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
557 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
558 pentium_memcpy,
559 pentium_memset,
560 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
561 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
562 "16:8:8", /* Loop alignment. */
563 "16:8:8", /* Jump alignment. */
564 "0:0:8", /* Label alignment. */
565 "16", /* Func alignment. */
566 };
567
568 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
569 (we ensure the alignment). For small blocks an inline loop is still a
570 noticeable win; for bigger blocks either rep movsl or rep movsb is the
571 way to go. Rep movsb apparently has a more expensive startup time in the
572 CPU, but after 4K the difference is down in the noise.
   The first element below lists loop, unrolled_loop, rep_prefix_4_byte
   and rep_prefix_1_byte entries tagged 128, 1024, 8192 and -1; the
   second is the libcall-only DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the tags are presumably upper size limits in bytes,
   with -1 meaning "any larger size" -- confirm against the
   stringop_algs declaration.  */
573 static stringop_algs pentiumpro_memcpy[2] = {
574 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
575 {8192, rep_prefix_4_byte, false},
576 {-1, rep_prefix_1_byte, false}}},
577 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by pentiumpro_cost.  The first
   element lists unrolled_loop, rep_prefix_4_byte and libcall entries
   tagged 1024, 8192 and -1; the second is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.  */
578 static stringop_algs pentiumpro_memset[2] = {
579 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
580 {8192, rep_prefix_4_byte, false},
581 {-1, libcall, false}}},
582 DUMMY_STRINGOP_ALGS};
/* Cost table named for the PentiumPro.  Instruction costs are given
   with COSTS_N_INSNS; the string operation tables are pentiumpro_memcpy
   and pentiumpro_memset.  The initializer is positional, so the order
   of entries must match the declaration of struct processor_costs.  */
583 static const
584 struct processor_costs pentiumpro_cost = {
585 {
586 /* Start of register allocator costs. integer->integer move cost is 2. */
587 2, /* cost for loading QImode using movzbl */
588 {4, 4, 4}, /* cost of loading integer registers
589 in QImode, HImode and SImode.
590 Relative to reg-reg move (2). */
591 {2, 2, 2}, /* cost of storing integer registers */
592 2, /* cost of reg,reg fld/fst */
593 {2, 2, 6}, /* cost of loading fp registers
594 in SFmode, DFmode and XFmode */
595 {4, 4, 6}, /* cost of storing fp registers
596 in SFmode, DFmode and XFmode */
597 2, /* cost of moving MMX register */
598 {2, 2}, /* cost of loading MMX registers
599 in SImode and DImode */
600 {2, 2}, /* cost of storing MMX registers
601 in SImode and DImode */
602 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
603 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
604 in 32,64,128,256 and 512-bit */
605 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
606 in 32,64,128,256 and 512-bit */
607 3, 3, /* SSE->integer and integer->SSE moves */
608 3, 3, /* mask->integer and integer->mask moves */
609 {4, 4, 4}, /* cost of loading mask register
610 in QImode, HImode, SImode. */
611 {2, 2, 2}, /* cost of storing mask register
612 in QImode, HImode, SImode. */
613 2, /* cost of moving mask register. */
614 /* End of register allocator costs. */
615 },
616
617 COSTS_N_INSNS (1), /* cost of an add instruction */
618 COSTS_N_INSNS (1), /* cost of a lea instruction */
619 COSTS_N_INSNS (1), /* variable shift costs */
620 COSTS_N_INSNS (1), /* constant shift costs */
621 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
622 COSTS_N_INSNS (4), /* HI */
623 COSTS_N_INSNS (4), /* SI */
624 COSTS_N_INSNS (4), /* DI */
625 COSTS_N_INSNS (4)}, /* other */
626 0, /* cost of multiply per each bit set */
627 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
628 COSTS_N_INSNS (17), /* HI */
629 COSTS_N_INSNS (17), /* SI */
630 COSTS_N_INSNS (17), /* DI */
631 COSTS_N_INSNS (17)}, /* other */
632 COSTS_N_INSNS (1), /* cost of movsx */
633 COSTS_N_INSNS (1), /* cost of movzx */
634 8, /* "large" insn */
635 6, /* MOVE_RATIO */
636 6, /* CLEAR_RATIO */
637 {4, 4, 4}, /* cost of loading integer registers
638 in QImode, HImode and SImode.
639 Relative to reg-reg move (2). */
640 {2, 2, 2}, /* cost of storing integer registers */
641 {4, 8, 16, 32, 64}, /* cost of loading SSE register
642 in 32bit, 64bit, 128bit, 256bit and 512bit */
643 {4, 8, 16, 32, 64}, /* cost of storing SSE register
644 in 32bit, 64bit, 128bit, 256bit and 512bit */
645 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
646 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
647 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
648 3, /* cost of moving SSE register to integer. */
649 4, 4, /* Gather load static, per_elt. */
650 4, 4, /* Gather store static, per_elt. */
651 8, /* size of l1 cache. */
652 256, /* size of l2 cache */
653 32, /* size of prefetch block */
654 6, /* number of parallel prefetches */
655 2, /* Branch cost */
656 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
657 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
658 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
659 COSTS_N_INSNS (2), /* cost of FABS instruction. */
660 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
661 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
662
663 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
664 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
665 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
666 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
667 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
668 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
669 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
670 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
671 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
672 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
673 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
674 pentiumpro_memcpy,
675 pentiumpro_memset,
676 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
677 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
678 "16", /* Loop alignment. */
679 "16:11:8", /* Jump alignment. */
680 "0:0:8", /* Label alignment. */
681 "16", /* Func alignment. */
682 };
683
/* memcpy algorithm table referenced by geode_cost.  The first element
   names libcall, with a rep_prefix_4_byte entry tagged 256 and a
   libcall entry tagged -1; the second is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.  */
684 static stringop_algs geode_memcpy[2] = {
685 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
686 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by geode_cost.  Its contents are
   identical to geode_memcpy: libcall, with a rep_prefix_4_byte entry
   tagged 256 and a libcall entry tagged -1, followed by the
   DUMMY_STRINGOP_ALGS placeholder.  */
687 static stringop_algs geode_memset[2] = {
688 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
689 DUMMY_STRINGOP_ALGS};
690 static const
691 struct processor_costs geode_cost = {
692 {
693 /* Start of register allocator costs. integer->integer move cost is 2. */
694 2, /* cost for loading QImode using movzbl */
695 {2, 2, 2}, /* cost of loading integer registers
696 in QImode, HImode and SImode.
697 Relative to reg-reg move (2). */
698 {2, 2, 2}, /* cost of storing integer registers */
699 2, /* cost of reg,reg fld/fst */
700 {2, 2, 2}, /* cost of loading fp registers
701 in SFmode, DFmode and XFmode */
702 {4, 6, 6}, /* cost of storing fp registers
703 in SFmode, DFmode and XFmode */
704 2, /* cost of moving MMX register */
705 {2, 2}, /* cost of loading MMX registers
706 in SImode and DImode */
707 {2, 2}, /* cost of storing MMX registers
708 in SImode and DImode */
709 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
710 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
711 in 32,64,128,256 and 512-bit */
712 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
713 in 32,64,128,256 and 512-bit */
714 6, 6, /* SSE->integer and integer->SSE moves */
715 6, 6, /* mask->integer and integer->mask moves */
716 {2, 2, 2}, /* cost of loading mask register
717 in QImode, HImode, SImode. */
718 {2, 2, 2}, /* cost if storing mask register
719 in QImode, HImode, SImode. */
720 2, /* cost of moving mask register. */
721 /* End of register allocator costs. */
722 },
723
724 COSTS_N_INSNS (1), /* cost of an add instruction */
725 COSTS_N_INSNS (1), /* cost of a lea instruction */
726 COSTS_N_INSNS (2), /* variable shift costs */
727 COSTS_N_INSNS (1), /* constant shift costs */
728 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
729 COSTS_N_INSNS (4), /* HI */
730 COSTS_N_INSNS (7), /* SI */
731 COSTS_N_INSNS (7), /* DI */
732 COSTS_N_INSNS (7)}, /* other */
733 0, /* cost of multiply per each bit set */
734 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
735 COSTS_N_INSNS (23), /* HI */
736 COSTS_N_INSNS (39), /* SI */
737 COSTS_N_INSNS (39), /* DI */
738 COSTS_N_INSNS (39)}, /* other */
739 COSTS_N_INSNS (1), /* cost of movsx */
740 COSTS_N_INSNS (1), /* cost of movzx */
741 8, /* "large" insn */
742 4, /* MOVE_RATIO */
743 4, /* CLEAR_RATIO */
744 {2, 2, 2}, /* cost of loading integer registers
745 in QImode, HImode and SImode.
746 Relative to reg-reg move (2). */
747 {2, 2, 2}, /* cost of storing integer registers */
748 {2, 2, 8, 16, 32}, /* cost of loading SSE register
749 in 32bit, 64bit, 128bit, 256bit and 512bit */
750 {2, 2, 8, 16, 32}, /* cost of storing SSE register
751 in 32bit, 64bit, 128bit, 256bit and 512bit */
752 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
753 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
754 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
755 6, /* cost of moving SSE register to integer. */
756 2, 2, /* Gather load static, per_elt. */
757 2, 2, /* Gather store static, per_elt. */
758 64, /* size of l1 cache. */
759 128, /* size of l2 cache. */
760 32, /* size of prefetch block */
761 1, /* number of parallel prefetches */
762 1, /* Branch cost */
763 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
764 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
765 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
766 COSTS_N_INSNS (1), /* cost of FABS instruction. */
767 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
768 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
769
770 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
771 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
772 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
773 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
774 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
775 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
776 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
777 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
778 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
779 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
780 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
781 geode_memcpy,
782 geode_memset,
783 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
784 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
785 NULL, /* Loop alignment. */
786 NULL, /* Jump alignment. */
787 NULL, /* Label alignment. */
788 NULL, /* Func alignment. */
789 };
790
/* memcpy expansion strategies plugged into k6_cost.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning (znver1_memcpy labels its elements that way) -- confirm.
   NOTE(review): each inner {max, alg, flag} triple presumably means "use ALG
   for blocks of at most MAX bytes", with max == -1 as the catch-all entry
   -- confirm against the stringop_algs declaration.  */
791 static stringop_algs k6_memcpy[2] = {
792 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
793 DUMMY_STRINGOP_ALGS}; /* macro expands to a libcall-only entry */
/* memset expansion strategies plugged into k6_cost; the values are the same
   as in k6_memcpy.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
794 static stringop_algs k6_memset[2] = {
795 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
796 DUMMY_STRINGOP_ALGS}; /* macro expands to a libcall-only entry */
/* Operation costs used when tuning for K6.  The initializer is positional,
   so the entries must stay in the field order of struct processor_costs;
   the trailing comment on each entry names what it fills.  The first brace
   group holds the register allocator costs; string operations use
   k6_memcpy and k6_memset.  */
797 static const
798 struct processor_costs k6_cost = {
799 {
800 /* Start of register allocator costs. integer->integer move cost is 2. */
801 3, /* cost for loading QImode using movzbl */
802 {4, 5, 4}, /* cost of loading integer registers
803 in QImode, HImode and SImode.
804 Relative to reg-reg move (2). */
805 {2, 3, 2}, /* cost of storing integer registers */
806 4, /* cost of reg,reg fld/fst */
807 {6, 6, 6}, /* cost of loading fp registers
808 in SFmode, DFmode and XFmode */
809 {4, 4, 4}, /* cost of storing fp registers
810 in SFmode, DFmode and XFmode */
811 2, /* cost of moving MMX register */
812 {2, 2}, /* cost of loading MMX registers
813 in SImode and DImode */
814 {2, 2}, /* cost of storing MMX registers
815 in SImode and DImode */
816 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
817 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
818 in 32,64,128,256 and 512-bit */
819 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
820 in 32,64,128,256 and 512-bit */
821 6, 6, /* SSE->integer and integer->SSE moves */
822 6, 6, /* mask->integer and integer->mask moves */
823 {4, 5, 4}, /* cost of loading mask register
824 in QImode, HImode, SImode. */
825 {2, 3, 2}, /* cost of storing mask register
826 in QImode, HImode, SImode. */
827 2, /* cost of moving mask register. */
828 /* End of register allocator costs. */
829 },
830
831 COSTS_N_INSNS (1), /* cost of an add instruction */
832 COSTS_N_INSNS (2), /* cost of a lea instruction */
833 COSTS_N_INSNS (1), /* variable shift costs */
834 COSTS_N_INSNS (1), /* constant shift costs */
835 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
836 COSTS_N_INSNS (3), /* HI */
837 COSTS_N_INSNS (3), /* SI */
838 COSTS_N_INSNS (3), /* DI */
839 COSTS_N_INSNS (3)}, /* other */
840 0, /* cost of multiply per each bit set */
841 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
842 COSTS_N_INSNS (18), /* HI */
843 COSTS_N_INSNS (18), /* SI */
844 COSTS_N_INSNS (18), /* DI */
845 COSTS_N_INSNS (18)}, /* other */
846 COSTS_N_INSNS (2), /* cost of movsx */
847 COSTS_N_INSNS (2), /* cost of movzx */
848 8, /* "large" insn */
849 4, /* MOVE_RATIO */
850 4, /* CLEAR_RATIO */
/* The integer and SSE load/store costs below repeat the values given in
   the register allocator section above.  */
851 {4, 5, 4}, /* cost of loading integer registers
852 in QImode, HImode and SImode.
853 Relative to reg-reg move (2). */
854 {2, 3, 2}, /* cost of storing integer registers */
855 {2, 2, 8, 16, 32}, /* cost of loading SSE register
856 in 32bit, 64bit, 128bit, 256bit and 512bit */
857 {2, 2, 8, 16, 32}, /* cost of storing SSE register
858 in 32bit, 64bit, 128bit, 256bit and 512bit */
859 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
860 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
861 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
862 6, /* cost of moving SSE register to integer. */
863 2, 2, /* Gather load static, per_elt. */
864 2, 2, /* Gather store static, per_elt. */
865 32, /* size of l1 cache (presumably in KB -- confirm). */
866 32, /* size of l2 cache. Some models
867 have integrated l2 cache, but
868 optimizing for k6 is not important
869 enough to worry about that. */
870 32, /* size of prefetch block */
871 1, /* number of parallel prefetches */
872 1, /* Branch cost */
873 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
874 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
875 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
876 COSTS_N_INSNS (2), /* cost of FABS instruction. */
877 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
878 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
879
880 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
881 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
882 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
883 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
884 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
885 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
886 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
887 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
888 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
889 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
890 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
891 k6_memcpy,
892 k6_memset,
893 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
894 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below look like -falign-* style
   "n:m:n2" values -- confirm.  */
895 "32:8:8", /* Loop alignment. */
896 "32:8:8", /* Jump alignment. */
897 "0:0:8", /* Label alignment. */
898 "32", /* Func alignment. */
899 };
900
901 /* For some reason, Athlon deals better with REP prefix (relative to loops)
902 compared to K8. Alignment becomes important after 8 bytes for memcpy and
903 128 bytes for memset. */
/* memcpy expansion strategies plugged into athlon_cost.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
904 static stringop_algs athlon_memcpy[2] = {
905 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
906 DUMMY_STRINGOP_ALGS}; /* macro expands to a libcall-only entry */
/* memset expansion strategies plugged into athlon_cost; the values are the
   same as in athlon_memcpy.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
907 static stringop_algs athlon_memset[2] = {
908 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
909 DUMMY_STRINGOP_ALGS}; /* macro expands to a libcall-only entry */
/* Operation costs used when tuning for Athlon.  The initializer is
   positional, so the entries must stay in the field order of struct
   processor_costs; the trailing comment on each entry names what it fills.
   The first brace group holds the register allocator costs; string
   operations use athlon_memcpy and athlon_memset.  */
910 static const
911 struct processor_costs athlon_cost = {
912 {
913 /* Start of register allocator costs. integer->integer move cost is 2. */
914 4, /* cost for loading QImode using movzbl */
915 {3, 4, 3}, /* cost of loading integer registers
916 in QImode, HImode and SImode.
917 Relative to reg-reg move (2). */
918 {3, 4, 3}, /* cost of storing integer registers */
919 4, /* cost of reg,reg fld/fst */
920 {4, 4, 12}, /* cost of loading fp registers
921 in SFmode, DFmode and XFmode */
922 {6, 6, 8}, /* cost of storing fp registers
923 in SFmode, DFmode and XFmode */
924 2, /* cost of moving MMX register */
925 {4, 4}, /* cost of loading MMX registers
926 in SImode and DImode */
927 {4, 4}, /* cost of storing MMX registers
928 in SImode and DImode */
929 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
930 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
931 in 32,64,128,256 and 512-bit */
932 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
933 in 32,64,128,256 and 512-bit */
934 5, 5, /* SSE->integer and integer->SSE moves */
935 5, 5, /* mask->integer and integer->mask moves */
936 {3, 4, 3}, /* cost of loading mask register
937 in QImode, HImode, SImode. */
938 {3, 4, 3}, /* cost of storing mask register
939 in QImode, HImode, SImode. */
940 2, /* cost of moving mask register. */
941 /* End of register allocator costs. */
942 },
943
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (5), /* HI */
950 COSTS_N_INSNS (5), /* SI */
951 COSTS_N_INSNS (5), /* DI */
952 COSTS_N_INSNS (5)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (26), /* HI */
956 COSTS_N_INSNS (42), /* SI */
957 COSTS_N_INSNS (74), /* DI */
958 COSTS_N_INSNS (74)}, /* other */
959 COSTS_N_INSNS (1), /* cost of movsx */
960 COSTS_N_INSNS (1), /* cost of movzx */
961 8, /* "large" insn */
962 9, /* MOVE_RATIO */
963 6, /* CLEAR_RATIO */
/* The integer and SSE load/store costs below repeat the values given in
   the register allocator section above.  */
964 {3, 4, 3}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {3, 4, 3}, /* cost of storing integer registers */
968 {4, 4, 12, 12, 24}, /* cost of loading SSE register
969 in 32bit, 64bit, 128bit, 256bit and 512bit */
970 {4, 4, 10, 10, 20}, /* cost of storing SSE register
971 in 32bit, 64bit, 128bit, 256bit and 512bit */
972 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
973 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
974 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
975 5, /* cost of moving SSE register to integer. */
976 4, 4, /* Gather load static, per_elt. */
977 4, 4, /* Gather store static, per_elt. */
978 64, /* size of l1 cache (presumably in KB -- confirm). */
979 256, /* size of l2 cache. */
980 64, /* size of prefetch block */
981 6, /* number of parallel prefetches */
982 5, /* Branch cost */
983 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
984 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
985 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
986 COSTS_N_INSNS (2), /* cost of FABS instruction. */
987 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
988 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
989
990 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
991 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
992 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
993 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
994 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
995 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
996 /* 11-16 -- NOTE(review): presumably the DIVSS latency range, with the
   upper bound used as the cost below -- confirm.  */
997 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
998 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
999 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1000 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
1001 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1002 athlon_memcpy,
1003 athlon_memset,
1004 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1005 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below look like -falign-* style
   "n:m:n2" values -- confirm.  */
1006 "16:8:8", /* Loop alignment. */
1007 "16:8:8", /* Jump alignment. */
1008 "0:0:8", /* Label alignment. */
1009 "16", /* Func alignment. */
1010 };
1011
1012 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1013 small blocks it is better to use a loop. For large blocks, a libcall can
1014 use non-temporal accesses and beat inline code considerably. */
/* memcpy expansion strategies plugged into k8_cost.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
1015 static stringop_algs k8_memcpy[2] = {
1016 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1017 {-1, rep_prefix_4_byte, false}}},
1018 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1019 {-1, libcall, false}}}};
/* memset expansion strategies plugged into k8_cost.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
1020 static stringop_algs k8_memset[2] = {
1021 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1022 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1023 {libcall, {{48, unrolled_loop, false},
1024 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation costs used when tuning for K8.  The initializer is positional,
   so the entries must stay in the field order of struct processor_costs;
   the trailing comment on each entry names what it fills.  The first brace
   group holds the register allocator costs; string operations use
   k8_memcpy and k8_memset.  */
1025 static const
1026 struct processor_costs k8_cost = {
1027 {
1028 /* Start of register allocator costs. integer->integer move cost is 2. */
1029 4, /* cost for loading QImode using movzbl */
1030 {3, 4, 3}, /* cost of loading integer registers
1031 in QImode, HImode and SImode.
1032 Relative to reg-reg move (2). */
1033 {3, 4, 3}, /* cost of storing integer registers */
1034 4, /* cost of reg,reg fld/fst */
1035 {4, 4, 12}, /* cost of loading fp registers
1036 in SFmode, DFmode and XFmode */
1037 {6, 6, 8}, /* cost of storing fp registers
1038 in SFmode, DFmode and XFmode */
1039 2, /* cost of moving MMX register */
1040 {3, 3}, /* cost of loading MMX registers
1041 in SImode and DImode */
1042 {4, 4}, /* cost of storing MMX registers
1043 in SImode and DImode */
1044 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1045 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
1046 in 32,64,128,256 and 512-bit */
1047 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
1048 in 32,64,128,256 and 512-bit */
1049 5, 5, /* SSE->integer and integer->SSE moves */
1050 5, 5, /* mask->integer and integer->mask moves */
1051 {3, 4, 3}, /* cost of loading mask register
1052 in QImode, HImode, SImode. */
1053 {3, 4, 3}, /* cost of storing mask register
1054 in QImode, HImode, SImode. */
1055 2, /* cost of moving mask register. */
1056 /* End of register allocator costs. */
1057 },
1058
1059 COSTS_N_INSNS (1), /* cost of an add instruction */
1060 COSTS_N_INSNS (2), /* cost of a lea instruction */
1061 COSTS_N_INSNS (1), /* variable shift costs */
1062 COSTS_N_INSNS (1), /* constant shift costs */
1063 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1064 COSTS_N_INSNS (4), /* HI */
1065 COSTS_N_INSNS (3), /* SI */
1066 COSTS_N_INSNS (4), /* DI */
1067 COSTS_N_INSNS (5)}, /* other */
1068 0, /* cost of multiply per each bit set */
1069 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1070 COSTS_N_INSNS (26), /* HI */
1071 COSTS_N_INSNS (42), /* SI */
1072 COSTS_N_INSNS (74), /* DI */
1073 COSTS_N_INSNS (74)}, /* other */
1074 COSTS_N_INSNS (1), /* cost of movsx */
1075 COSTS_N_INSNS (1), /* cost of movzx */
1076 8, /* "large" insn */
1077 9, /* MOVE_RATIO */
1078 6, /* CLEAR_RATIO */
/* The integer and SSE load/store costs below repeat the values given in
   the register allocator section above.  */
1079 {3, 4, 3}, /* cost of loading integer registers
1080 in QImode, HImode and SImode.
1081 Relative to reg-reg move (2). */
1082 {3, 4, 3}, /* cost of storing integer registers */
1083 {4, 3, 12, 12, 24}, /* cost of loading SSE register
1084 in 32bit, 64bit, 128bit, 256bit and 512bit */
1085 {4, 4, 10, 10, 20}, /* cost of storing SSE register
1086 in 32bit, 64bit, 128bit, 256bit and 512bit */
1087 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
1088 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
1089 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1090 5, /* cost of moving SSE register to integer. */
1091 4, 4, /* Gather load static, per_elt. */
1092 4, 4, /* Gather store static, per_elt. */
1093 64, /* size of l1 cache (presumably in KB -- confirm). */
1094 512, /* size of l2 cache. */
1095 64, /* size of prefetch block */
1096 /* New AMD processors never drop prefetches; if they cannot be performed
1097 immediately, they are queued. We set number of simultaneous prefetches
1098 to a large constant to reflect this (it probably is not a good idea not
1099 to limit number of prefetches at all, as their execution also takes some
1100 time). */
1101 100, /* number of parallel prefetches */
1102 3, /* Branch cost */
1103 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1104 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1105 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1106 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1107 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1108 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1109
1110 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1111 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1112 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1113 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1114 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1115 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1116 /* 11-16 -- NOTE(review): presumably the DIVSS latency range, with the
   upper bound used as the cost below -- confirm.  */
1117 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1118 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1119 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1120 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1121 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1122 k8_memcpy,
1123 k8_memset,
1124 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1125 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below look like -falign-* style
   "n:m:n2" values -- confirm.  */
1126 "16:8:8", /* Loop alignment. */
1127 "16:8:8", /* Jump alignment. */
1128 "0:0:8", /* Label alignment. */
1129 "16", /* Func alignment. */
1130 };
1131
1132 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1133 very small blocks it is better to use a loop. For large blocks, a libcall can
1134 use non-temporal accesses and beat inline code considerably. */
/* memcpy expansion strategies plugged into amdfam10_cost.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
1135 static stringop_algs amdfam10_memcpy[2] = {
1136 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1137 {-1, rep_prefix_4_byte, false}}},
1138 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1139 {-1, libcall, false}}}};
/* memset expansion strategies plugged into amdfam10_cost.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
1140 static stringop_algs amdfam10_memset[2] = {
1141 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1142 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1143 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1144 {-1, libcall, false}}}};
/* Operation costs used when tuning for AMDFAM10.  The initializer is
   positional, so the entries must stay in the field order of struct
   processor_costs; the trailing comment on each entry names what it fills.
   The first brace group holds the register allocator costs; string
   operations use amdfam10_memcpy and amdfam10_memset.  Unlike the k6,
   athlon and k8 tables, this object is neither static nor const.  */
1145 struct processor_costs amdfam10_cost = {
1146 {
1147 /* Start of register allocator costs. integer->integer move cost is 2. */
1148 4, /* cost for loading QImode using movzbl */
1149 {3, 4, 3}, /* cost of loading integer registers
1150 in QImode, HImode and SImode.
1151 Relative to reg-reg move (2). */
1152 {3, 4, 3}, /* cost of storing integer registers */
1153 4, /* cost of reg,reg fld/fst */
1154 {4, 4, 12}, /* cost of loading fp registers
1155 in SFmode, DFmode and XFmode */
1156 {6, 6, 8}, /* cost of storing fp registers
1157 in SFmode, DFmode and XFmode */
1158 2, /* cost of moving MMX register */
1159 {3, 3}, /* cost of loading MMX registers
1160 in SImode and DImode */
1161 {4, 4}, /* cost of storing MMX registers
1162 in SImode and DImode */
1163 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1164 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
1165 in 32,64,128,256 and 512-bit */
1166 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
1167 in 32,64,128,256 and 512-bit */
1168 3, 3, /* SSE->integer and integer->SSE moves */
1169 3, 3, /* mask->integer and integer->mask moves */
1170 {3, 4, 3}, /* cost of loading mask register
1171 in QImode, HImode, SImode. */
1172 {3, 4, 3}, /* cost of storing mask register
1173 in QImode, HImode, SImode. */
1174 2, /* cost of moving mask register. */
1175
/* NOTE(review): the table below is presumably the rationale for the
   SSE<->integer move cost of 3 used above -- confirm.  */
1176 /* On K8:
1177 MOVD reg64, xmmreg Double FSTORE 4
1178 MOVD reg32, xmmreg Double FSTORE 4
1179 On AMDFAM10:
1180 MOVD reg64, xmmreg Double FADD 3
1181 1/1 1/1
1182 MOVD reg32, xmmreg Double FADD 3
1183 1/1 1/1 */
1184 /* End of register allocator costs. */
1185 },
1186
1187 COSTS_N_INSNS (1), /* cost of an add instruction */
1188 COSTS_N_INSNS (2), /* cost of a lea instruction */
1189 COSTS_N_INSNS (1), /* variable shift costs */
1190 COSTS_N_INSNS (1), /* constant shift costs */
1191 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1192 COSTS_N_INSNS (4), /* HI */
1193 COSTS_N_INSNS (3), /* SI */
1194 COSTS_N_INSNS (4), /* DI */
1195 COSTS_N_INSNS (5)}, /* other */
1196 0, /* cost of multiply per each bit set */
1197 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1198 COSTS_N_INSNS (35), /* HI */
1199 COSTS_N_INSNS (51), /* SI */
1200 COSTS_N_INSNS (83), /* DI */
1201 COSTS_N_INSNS (83)}, /* other */
1202 COSTS_N_INSNS (1), /* cost of movsx */
1203 COSTS_N_INSNS (1), /* cost of movzx */
1204 8, /* "large" insn */
1205 9, /* MOVE_RATIO */
1206 6, /* CLEAR_RATIO */
/* The integer and aligned SSE load/store costs below repeat the values
   given in the register allocator section above.  */
1207 {3, 4, 3}, /* cost of loading integer registers
1208 in QImode, HImode and SImode.
1209 Relative to reg-reg move (2). */
1210 {3, 4, 3}, /* cost of storing integer registers */
1211 {4, 4, 3, 6, 12}, /* cost of loading SSE register
1212 in 32bit, 64bit, 128bit, 256bit and 512bit */
1213 {4, 4, 5, 10, 20}, /* cost of storing SSE register
1214 in 32bit, 64bit, 128bit, 256bit and 512bit */
1215 {4, 4, 3, 7, 12}, /* cost of unaligned loads; only the 256-bit
   entry (7) differs from the aligned load cost (6).
   NOTE(review): confirm this is intentional. */
1216 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1217 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1218 3, /* cost of moving SSE register to integer. */
1219 4, 4, /* Gather load static, per_elt. */
1220 4, 4, /* Gather store static, per_elt. */
1221 64, /* size of l1 cache (presumably in KB -- confirm). */
1222 512, /* size of l2 cache. */
1223 64, /* size of prefetch block */
1224 /* New AMD processors never drop prefetches; if they cannot be performed
1225 immediately, they are queued. We set number of simultaneous prefetches
1226 to a large constant to reflect this (it probably is not a good idea not
1227 to limit number of prefetches at all, as their execution also takes some
1228 time). */
1229 100, /* number of parallel prefetches */
1230 2, /* Branch cost */
1231 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1232 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1233 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1234 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1235 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1236 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1237
1238 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1239 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1240 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1241 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1242 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1243 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1244 /* 11-16 -- NOTE(review): presumably the DIVSS latency range, with the
   upper bound used as the cost below -- confirm.  */
1245 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1246 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1247 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1248 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1249 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1250 amdfam10_memcpy,
1251 amdfam10_memset,
1252 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1253 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below look like -falign-* style
   "n:m:n2" values -- confirm.  */
1254 "32:25:8", /* Loop alignment. */
1255 "32:8:8", /* Jump alignment. */
1256 "0:0:8", /* Label alignment. */
1257 "32", /* Func alignment. */
1258 };
1259
1260 /* BDVER has an optimized REP instruction for medium-sized blocks, but for
1261 very small blocks it is better to use a loop. For large blocks, a libcall
1262 can use non-temporal accesses and beat inline code considerably. */
/* memcpy expansion strategies plugged into bdver_cost.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
1263 static stringop_algs bdver_memcpy[2] = {
1264 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1265 {-1, rep_prefix_4_byte, false}}},
1266 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1267 {-1, libcall, false}}}};
/* memset expansion strategies plugged into bdver_cost.
   NOTE(review): element 0 is presumably the 32-bit tuning and element 1 the
   64-bit tuning; each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
1268 static stringop_algs bdver_memset[2] = {
1269 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1270 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1271 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1272 {-1, libcall, false}}}};
1273
/* Operation costs used when tuning for BDVER.  The initializer is
   positional, so the entries must stay in the field order of struct
   processor_costs; the trailing comment on each entry names what it fills.
   The first brace group holds the register allocator costs; string
   operations use bdver_memcpy and bdver_memset.  This object is const but
   not static.  */
1274 const struct processor_costs bdver_cost = {
1275 {
1276 /* Start of register allocator costs. integer->integer move cost is 2. */
1277 8, /* cost for loading QImode using movzbl */
1278 {8, 8, 8}, /* cost of loading integer registers
1279 in QImode, HImode and SImode.
1280 Relative to reg-reg move (2). */
1281 {8, 8, 8}, /* cost of storing integer registers */
1282 4, /* cost of reg,reg fld/fst */
1283 {12, 12, 28}, /* cost of loading fp registers
1284 in SFmode, DFmode and XFmode */
1285 {10, 10, 18}, /* cost of storing fp registers
1286 in SFmode, DFmode and XFmode */
1287 4, /* cost of moving MMX register */
1288 {12, 12}, /* cost of loading MMX registers
1289 in SImode and DImode */
1290 {10, 10}, /* cost of storing MMX registers
1291 in SImode and DImode */
1292 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1293 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1294 in 32,64,128,256 and 512-bit */
1295 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1296 in 32,64,128,256 and 512-bit */
1297 16, 20, /* SSE->integer and integer->SSE moves */
1298 16, 20, /* mask->integer and integer->mask moves */
1299 {8, 8, 8}, /* cost of loading mask register
1300 in QImode, HImode, SImode. */
1301 {8, 8, 8}, /* cost of storing mask register
1302 in QImode, HImode, SImode. */
1303 2, /* cost of moving mask register. */
1304 /* End of register allocator costs. */
1305 },
1306
1307 COSTS_N_INSNS (1), /* cost of an add instruction */
1308 COSTS_N_INSNS (1), /* cost of a lea instruction */
1309 COSTS_N_INSNS (1), /* variable shift costs */
1310 COSTS_N_INSNS (1), /* constant shift costs */
1311 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1312 COSTS_N_INSNS (4), /* HI */
1313 COSTS_N_INSNS (4), /* SI */
1314 COSTS_N_INSNS (6), /* DI */
1315 COSTS_N_INSNS (6)}, /* other */
1316 0, /* cost of multiply per each bit set */
1317 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1318 COSTS_N_INSNS (35), /* HI */
1319 COSTS_N_INSNS (51), /* SI */
1320 COSTS_N_INSNS (83), /* DI */
1321 COSTS_N_INSNS (83)}, /* other */
1322 COSTS_N_INSNS (1), /* cost of movsx */
1323 COSTS_N_INSNS (1), /* cost of movzx */
1324 8, /* "large" insn */
1325 9, /* MOVE_RATIO */
1326 6, /* CLEAR_RATIO */
/* The integer and SSE load/store costs below repeat the values given in
   the register allocator section above.  */
1327 {8, 8, 8}, /* cost of loading integer registers
1328 in QImode, HImode and SImode.
1329 Relative to reg-reg move (2). */
1330 {8, 8, 8}, /* cost of storing integer registers */
1331 {12, 12, 10, 40, 60}, /* cost of loading SSE register
1332 in 32bit, 64bit, 128bit, 256bit and 512bit */
1333 {10, 10, 10, 40, 60}, /* cost of storing SSE register
1334 in 32bit, 64bit, 128bit, 256bit and 512bit */
1335 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1336 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
1337 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1338 16, /* cost of moving SSE register to integer. */
1339 12, 12, /* Gather load static, per_elt. */
1340 10, 10, /* Gather store static, per_elt. */
1341 16, /* size of l1 cache (presumably in KB -- confirm). */
1342 2048, /* size of l2 cache. */
1343 64, /* size of prefetch block */
1344 /* New AMD processors never drop prefetches; if they cannot be performed
1345 immediately, they are queued. We set number of simultaneous prefetches
1346 to a large constant to reflect this (it probably is not a good idea not
1347 to limit number of prefetches at all, as their execution also takes some
1348 time). */
1349 100, /* number of parallel prefetches */
1350 2, /* Branch cost */
1351 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1352 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1353 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1354 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1355 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1356 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1357
1358 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1359 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1360 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1361 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1362 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1363 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1364 /* 9-24 -- NOTE(review): presumably the DIVSS latency range, with the
   upper bound used as the cost below -- confirm.  */
1365 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1366 /* 9-27 -- NOTE(review): presumably the DIVSD latency range, with the
   upper bound used as the cost below -- confirm.  */
1367 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1368 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1369 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1370 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1371 bdver_memcpy,
1372 bdver_memset,
1373 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1374 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below look like -falign-* style
   "n:m:n2" values -- confirm.  */
1375 "16:11:8", /* Loop alignment. */
1376 "16:8:8", /* Jump alignment. */
1377 "0:0:8", /* Label alignment. */
1378 "11", /* Func alignment. */
1379 };
1380
1381
1382 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1383 very small blocks it is better to use a loop. For large blocks, a libcall
1384 can use non-temporal accesses and beat inline code considerably. */
/* memcpy expansion strategies plugged into znver1_cost, one element per
   tuning as labelled below.
   NOTE(review): each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
1385 static stringop_algs znver1_memcpy[2] = {
1386 /* 32-bit tuning. */
1387 {libcall, {{6, loop, false},
1388 {14, unrolled_loop, false},
1389 {-1, libcall, false}}},
1390 /* 64-bit tuning. */
1391 {libcall, {{16, loop, false},
1392 {128, rep_prefix_8_byte, false},
1393 {-1, libcall, false}}}};
/* memset expansion strategies plugged into znver1_cost, one element per
   tuning as labelled below.
   NOTE(review): each inner {max, alg, flag} triple presumably selects ALG
   for blocks of at most MAX bytes, -1 being the catch-all -- confirm against
   the stringop_algs declaration.  */
1394 static stringop_algs znver1_memset[2] = {
1395 /* 32-bit tuning. */
1396 {libcall, {{8, loop, false},
1397 {24, unrolled_loop, false},
1398 {128, rep_prefix_4_byte, false},
1399 {-1, libcall, false}}},
1400 /* 64-bit tuning. */
1401 {libcall, {{48, unrolled_loop, false},
1402 {128, rep_prefix_8_byte, false},
1403 {-1, libcall, false}}}};
1404 struct processor_costs znver1_cost = {
1405 {
1406 /* Start of register allocator costs. integer->integer move cost is 2. */
1407
1408 /* reg-reg moves are done by renaming and thus they are even cheaper than
1409 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1410 to doubles of latencies, we do not model this correctly. It does not
1411 seem to make a practical difference to bump prices up even more. */
1412 6, /* cost for loading QImode using
1413 movzbl. */
1414 {6, 6, 6}, /* cost of loading integer registers
1415 in QImode, HImode and SImode.
1416 Relative to reg-reg move (2). */
1417 {8, 8, 8}, /* cost of storing integer
1418 registers. */
1419 2, /* cost of reg,reg fld/fst. */
1420 {6, 6, 16}, /* cost of loading fp registers
1421 in SFmode, DFmode and XFmode. */
1422 {8, 8, 16}, /* cost of storing fp registers
1423 in SFmode, DFmode and XFmode. */
1424 2, /* cost of moving MMX register. */
1425 {6, 6}, /* cost of loading MMX registers
1426 in SImode and DImode. */
1427 {8, 8}, /* cost of storing MMX registers
1428 in SImode and DImode. */
1429 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1430 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1431 in 32,64,128,256 and 512-bit. */
1432 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1433 in 32,64,128,256 and 512-bit. */
1434 6, 6, /* SSE->integer and integer->SSE moves. */
1435 8, 8, /* mask->integer and integer->mask moves */
1436 {6, 6, 6}, /* cost of loading mask register
1437 in QImode, HImode, SImode. */
1438 {8, 8, 8}, /* cost if storing mask register
1439 in QImode, HImode, SImode. */
1440 2, /* cost of moving mask register. */
1441 /* End of register allocator costs. */
1442 },
1443
1444 COSTS_N_INSNS (1), /* cost of an add instruction. */
1445 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1446 COSTS_N_INSNS (1), /* variable shift costs. */
1447 COSTS_N_INSNS (1), /* constant shift costs. */
1448 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1449 COSTS_N_INSNS (3), /* HI. */
1450 COSTS_N_INSNS (3), /* SI. */
1451 COSTS_N_INSNS (3), /* DI. */
1452 COSTS_N_INSNS (3)}, /* other. */
1453 0, /* cost of multiply per each bit
1454 set. */
1455 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1456 bound. */
1457 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1458 COSTS_N_INSNS (22), /* HI. */
1459 COSTS_N_INSNS (30), /* SI. */
1460 COSTS_N_INSNS (45), /* DI. */
1461 COSTS_N_INSNS (45)}, /* other. */
1462 COSTS_N_INSNS (1), /* cost of movsx. */
1463 COSTS_N_INSNS (1), /* cost of movzx. */
1464 8, /* "large" insn. */
1465 9, /* MOVE_RATIO. */
1466 6, /* CLEAR_RATIO */
1467 {6, 6, 6}, /* cost of loading integer registers
1468 in QImode, HImode and SImode.
1469 Relative to reg-reg move (2). */
1470 {8, 8, 8}, /* cost of storing integer
1471 registers. */
1472 {6, 6, 6, 12, 24}, /* cost of loading SSE register
1473 in 32bit, 64bit, 128bit, 256bit and 512bit */
1474 {8, 8, 8, 16, 32}, /* cost of storing SSE register
1475 in 32bit, 64bit, 128bit, 256bit and 512bit */
1476 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1477 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
1478 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1479 6, /* cost of moving SSE register to integer. */
1480 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1481 throughput 12. Approx 9 uops do not depend on vector size and every load
1482 is 7 uops. */
1483 18, 8, /* Gather load static, per_elt. */
1484 18, 10, /* Gather store static, per_elt. */
1485 32, /* size of l1 cache. */
1486 512, /* size of l2 cache. */
1487 64, /* size of prefetch block. */
1488 /* New AMD processors never drop prefetches; if they cannot be performed
1489 immediately, they are queued. We set number of simultaneous prefetches
1490 to a large constant to reflect this (it probably is not a good idea not
1491 to limit number of prefetches at all, as their execution also takes some
1492 time). */
1493 100, /* number of parallel prefetches. */
1494 3, /* Branch cost. */
1495 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1496 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1497 /* Latency of fdiv is 8-15. */
1498 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1501 /* Latency of fsqrt is 4-10. */
1502 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1503
1504 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1505 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1506 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1507 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1508 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1509 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1510 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1511 /* 9-13 */
1512 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1513 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1514 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1515 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1516 and it can execute 2 integer additions and 2 multiplications thus
1517 reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
1518 that 4 works better than 6 probably due to register pressure.
1519
1520 Integer vector operations are taken by FP unit and execute 3 vector
1521 plus/minus operations per cycle but only one multiply. This is adjusted
1522 in ix86_reassociation_width. */
1523 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1524 znver1_memcpy,
1525 znver1_memset,
1526 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1527 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1528 "16", /* Loop alignment. */
1529 "16", /* Jump alignment. */
1530 "0:0:8", /* Label alignment. */
1531 "16", /* Func alignment. */
1532 };
1533
1534 /* ZNVER2 has an optimized REP instruction for medium-sized blocks, but for
1535 very small blocks it is better to use a loop. For large blocks, a libcall
1536 can do non-temporal accesses and beat inline code considerably. */
/* Inline memcpy strategy table that znver2_cost points at; it holds one
   entry for 32-bit and one for 64-bit tuning.
   NOTE(review): each entry presumably reads {algorithm when the size is
   unknown, {{max block size, algorithm, noalign}, ...}}, with -1 meaning
   "no upper bound" -- confirm against the stringop_algs declaration. */
1537 static stringop_algs znver2_memcpy[2] = {
1538 /* 32-bit tuning. */
1539 {libcall, {{6, loop, false},
1540 {14, unrolled_loop, false},
1541 {-1, libcall, false}}},
1542 /* 64-bit tuning. */
1543 {libcall, {{16, loop, false},
1544 {64, rep_prefix_4_byte, false},
1545 {-1, libcall, false}}}};
/* Inline memset strategy table that znver2_cost points at; it holds one
   entry for 32-bit and one for 64-bit tuning.
   NOTE(review): each entry presumably reads {algorithm when the size is
   unknown, {{max block size, algorithm, noalign}, ...}}, with -1 meaning
   "no upper bound" -- confirm against the stringop_algs declaration. */
1546 static stringop_algs znver2_memset[2] = {
1547 /* 32-bit tuning. */
1548 {libcall, {{8, loop, false},
1549 {24, unrolled_loop, false},
1550 {128, rep_prefix_4_byte, false},
1551 {-1, libcall, false}}},
1552 /* 64-bit tuning. */
1553 {libcall, {{24, rep_prefix_4_byte, false},
1554 {128, rep_prefix_8_byte, false},
1555 {-1, libcall, false}}}};
1556
/* Cost table for znver2 tuning. This is a positional initializer of
   struct processor_costs: each value is identified only by its position,
   so entries must not be reordered, added or removed without matching the
   struct declaration. The leading brace group holds the register allocator
   costs; the values after it are instruction, memory, string-operation and
   alignment parameters, as labelled by the per-line comments. */
1557 struct processor_costs znver2_cost = {
1558 {
1559 /* Start of register allocator costs. integer->integer move cost is 2. */
1560
1561 /* reg-reg moves are done by renaming and thus they are even cheaper than
1562 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1563 to doubles of latencies, we do not model this correctly. It does not
1564 seem to make practical difference to bump prices up even more. */
1565 6, /* cost for loading QImode using
1566 movzbl. */
1567 {6, 6, 6}, /* cost of loading integer registers
1568 in QImode, HImode and SImode.
1569 Relative to reg-reg move (2). */
1570 {8, 8, 8}, /* cost of storing integer
1571 registers. */
1572 2, /* cost of reg,reg fld/fst. */
1573 {6, 6, 16}, /* cost of loading fp registers
1574 in SFmode, DFmode and XFmode. */
1575 {8, 8, 16}, /* cost of storing fp registers
1576 in SFmode, DFmode and XFmode. */
1577 2, /* cost of moving MMX register. */
1578 {6, 6}, /* cost of loading MMX registers
1579 in SImode and DImode. */
1580 {8, 8}, /* cost of storing MMX registers
1581 in SImode and DImode. */
1582 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1583 register. */
1584 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1585 in 32,64,128,256 and 512-bit. */
1586 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1587 in 32,64,128,256 and 512-bit. */
1588 6, 6, /* SSE->integer and integer->SSE
1589 moves. */
1590 8, 8, /* mask->integer and integer->mask moves */
1591 {6, 6, 6}, /* cost of loading mask register
1592 in QImode, HImode, SImode. */
1593 {8, 8, 8}, /* cost of storing mask register
1594 in QImode, HImode, SImode. */
1595 2, /* cost of moving mask register. */
1596 /* End of register allocator costs. */
1597 },
1598
1599 COSTS_N_INSNS (1), /* cost of an add instruction. */
1600 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1601 COSTS_N_INSNS (1), /* variable shift costs. */
1602 COSTS_N_INSNS (1), /* constant shift costs. */
1603 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1604 COSTS_N_INSNS (3), /* HI. */
1605 COSTS_N_INSNS (3), /* SI. */
1606 COSTS_N_INSNS (3), /* DI. */
1607 COSTS_N_INSNS (3)}, /* other. */
1608 0, /* cost of multiply per each bit
1609 set. */
1610 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1611 bound. */
1612 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1613 COSTS_N_INSNS (22), /* HI. */
1614 COSTS_N_INSNS (30), /* SI. */
1615 COSTS_N_INSNS (45), /* DI. */
1616 COSTS_N_INSNS (45)}, /* other. */
1617 COSTS_N_INSNS (1), /* cost of movsx. */
1618 COSTS_N_INSNS (1), /* cost of movzx. */
1619 8, /* "large" insn. */
1620 9, /* MOVE_RATIO. */
1621 6, /* CLEAR_RATIO */
1622 {6, 6, 6}, /* cost of loading integer registers
1623 in QImode, HImode and SImode.
1624 Relative to reg-reg move (2). */
1625 {8, 8, 8}, /* cost of storing integer
1626 registers. */
1627 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1628 in 32bit, 64bit, 128bit, 256bit and 512bit */
1629 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1630 in 32bit, 64bit, 128bit, 256bit and 512bit */
1631 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1632 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1633 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1634 register. */
1635 6, /* cost of moving SSE register to integer. */
1636 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1637 throughput 12. Approx 9 uops do not depend on vector size and every load
1638 is 7 uops. */
/* NOTE(review): the mnemonic VGATHERDPD appears twice above with different
   figures; one of them presumably refers to a different gather variant or
   vector width -- confirm. */
1639 18, 8, /* Gather load static, per_elt. */
1640 18, 10, /* Gather store static, per_elt. */
1641 32, /* size of l1 cache. */
1642 512, /* size of l2 cache. */
1643 64, /* size of prefetch block. */
1644 /* New AMD processors never drop prefetches; if they cannot be performed
1645 immediately, they are queued. We set number of simultaneous prefetches
1646 to a large constant to reflect this (it probably is not a good idea not
1647 to limit number of prefetches at all, as their execution also takes some
1648 time). */
1649 100, /* number of parallel prefetches. */
1650 3, /* Branch cost. */
1651 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1652 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1653 /* Latency of fdiv is 8-15. */
1654 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1655 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1656 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1657 /* Latency of fsqrt is 4-10. */
1658 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1659
1660 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1661 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1662 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1663 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1664 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1665 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1666 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1667 /* 9-13. */
1668 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1669 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1670 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1671 /* Zen can execute 4 integer operations per cycle. FP operations
1672 take 3 cycles and it can execute 2 integer additions and 2
1673 multiplications thus reassociation may make sense up to width of 6.
1674 SPEC2k6 benchmarks suggest
1675 that 4 works better than 6 probably due to register pressure.
1676
1677 Integer vector operations are taken by FP unit and execute 3 vector
1678 plus/minus operations per cycle but only one multiply. This is adjusted
1679 in ix86_reassociation_width. */
1680 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1681 znver2_memcpy,
1682 znver2_memset,
1683 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1684 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1685 "16", /* Loop alignment. */
1686 "16", /* Jump alignment. */
1687 "0:0:8", /* Label alignment. */
1688 "16", /* Func alignment. */
1689 };
1690
1691 /* skylake_cost should produce code tuned for the Skylake family of CPUs. */
/* Inline memcpy strategy table that skylake_cost points at.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
1692 static stringop_algs skylake_memcpy[2] = {
1693 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1694 {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1695 {-1, libcall, false}}}};
1696
/* Inline memset strategy table that skylake_cost points at.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
1697 static stringop_algs skylake_memset[2] = {
1698 {libcall, {{6, loop_1_byte, true},
1699 {24, loop, true},
1700 {8192, rep_prefix_4_byte, true},
1701 {-1, libcall, false}}},
1702 {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1703 {-1, libcall, false}}}};
1704
/* Cost table for Skylake tuning. This is a positional initializer of
   struct processor_costs: each value is identified only by its position,
   so entries must not be reordered, added or removed without matching the
   struct declaration. The leading brace group holds the register allocator
   costs; the values after it are instruction, memory, string-operation and
   alignment parameters, as labelled by the per-line comments. */
1705 static const
1706 struct processor_costs skylake_cost = {
1707 {
1708 /* Start of register allocator costs. integer->integer move cost is 2. */
1709 6, /* cost for loading QImode using movzbl */
1710 {4, 4, 4}, /* cost of loading integer registers
1711 in QImode, HImode and SImode.
1712 Relative to reg-reg move (2). */
1713 {6, 6, 6}, /* cost of storing integer registers */
1714 2, /* cost of reg,reg fld/fst */
1715 {6, 6, 8}, /* cost of loading fp registers
1716 in SFmode, DFmode and XFmode */
1717 {6, 6, 10}, /* cost of storing fp registers
1718 in SFmode, DFmode and XFmode */
1719 2, /* cost of moving MMX register */
1720 {6, 6}, /* cost of loading MMX registers
1721 in SImode and DImode */
1722 {6, 6}, /* cost of storing MMX registers
1723 in SImode and DImode */
1724 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1725 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1726 in 32,64,128,256 and 512-bit */
1727 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
1728 in 32,64,128,256 and 512-bit */
1729 6, 6, /* SSE->integer and integer->SSE moves */
1730 5, 5, /* mask->integer and integer->mask moves */
1731 {8, 8, 8}, /* cost of loading mask register
1732 in QImode, HImode, SImode. */
1733 {6, 6, 6}, /* cost of storing mask register
1734 in QImode, HImode, SImode. */
1735 3, /* cost of moving mask register. */
1736 /* End of register allocator costs. */
1737 },
1738
1739 COSTS_N_INSNS (1), /* cost of an add instruction */
1740 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
1741 COSTS_N_INSNS (1), /* variable shift costs */
1742 COSTS_N_INSNS (1), /* constant shift costs */
1743 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1744 COSTS_N_INSNS (4), /* HI */
1745 COSTS_N_INSNS (3), /* SI */
1746 COSTS_N_INSNS (3), /* DI */
1747 COSTS_N_INSNS (3)}, /* other */
1748 0, /* cost of multiply per each bit set */
1749 /* Expanding div/mod currently doesn't consider parallelism. So the cost
1750 model is not realistic. We compensate by increasing the latencies a bit. */
1751 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
1752 COSTS_N_INSNS (11), /* HI */
1753 COSTS_N_INSNS (14), /* SI */
1754 COSTS_N_INSNS (76), /* DI */
1755 COSTS_N_INSNS (76)}, /* other */
1756 COSTS_N_INSNS (1), /* cost of movsx */
1757 COSTS_N_INSNS (0), /* cost of movzx */
1758 8, /* "large" insn */
1759 17, /* MOVE_RATIO */
1760 6, /* CLEAR_RATIO */
1761 {4, 4, 4}, /* cost of loading integer registers
1762 in QImode, HImode and SImode.
1763 Relative to reg-reg move (2). */
1764 {6, 6, 6}, /* cost of storing integer registers */
1765 {6, 6, 6, 10, 20}, /* cost of loading SSE register
1766 in 32bit, 64bit, 128bit, 256bit and 512bit */
1767 {8, 8, 8, 12, 24}, /* cost of storing SSE register
1768 in 32bit, 64bit, 128bit, 256bit and 512bit */
1769 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
/* NOTE(review): the 256-bit and 512-bit unaligned store costs below (8, 16)
   are lower than the corresponding aligned store costs above (12, 24) --
   confirm this is intended. */
1770 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1771 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1772 6, /* cost of moving SSE register to integer. */
1773 20, 8, /* Gather load static, per_elt. */
1774 22, 10, /* Gather store static, per_elt. */
1775 64, /* size of l1 cache. */
1776 512, /* size of l2 cache. */
1777 64, /* size of prefetch block */
1778 6, /* number of parallel prefetches */
1779 3, /* Branch cost */
1780 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1781 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1782 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1783 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1784 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1785 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1786
1787 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1788 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1789 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1790 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1791 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1792 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1793 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1794 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1795 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1796 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1797 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1798 skylake_memcpy,
1799 skylake_memset,
1800 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1801 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1802 "16:11:8", /* Loop alignment. */
1803 "16:11:8", /* Jump alignment. */
1804 "0:0:8", /* Label alignment. */
1805 "16", /* Func alignment. */
1806 };
1807 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1808 very small blocks it is better to use a loop. For large blocks, a libcall can
1809 do non-temporal accesses and beat inline code considerably. */
/* Inline memcpy strategy table that btver1_cost points at.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
1810 static stringop_algs btver1_memcpy[2] = {
1811 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1812 {-1, rep_prefix_4_byte, false}}},
1813 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1814 {-1, libcall, false}}}};
/* Inline memset strategy table that btver1_cost points at.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
1815 static stringop_algs btver1_memset[2] = {
1816 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1817 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1818 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1819 {-1, libcall, false}}}};
/* Cost table for btver1 tuning. This is a positional initializer of
   struct processor_costs: each value is identified only by its position,
   so entries must not be reordered, added or removed without matching the
   struct declaration. The leading brace group holds the register allocator
   costs; the values after it are instruction, memory, string-operation and
   alignment parameters, as labelled by the per-line comments. */
1820 const struct processor_costs btver1_cost = {
1821 {
1822 /* Start of register allocator costs. integer->integer move cost is 2. */
1823 8, /* cost for loading QImode using movzbl */
1824 {6, 8, 6}, /* cost of loading integer registers
1825 in QImode, HImode and SImode.
1826 Relative to reg-reg move (2). */
1827 {6, 8, 6}, /* cost of storing integer registers */
1828 4, /* cost of reg,reg fld/fst */
1829 {12, 12, 28}, /* cost of loading fp registers
1830 in SFmode, DFmode and XFmode */
1831 {12, 12, 38}, /* cost of storing fp registers
1832 in SFmode, DFmode and XFmode */
1833 4, /* cost of moving MMX register */
1834 {10, 10}, /* cost of loading MMX registers
1835 in SImode and DImode */
1836 {12, 12}, /* cost of storing MMX registers
1837 in SImode and DImode */
1838 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1839 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
1840 in 32,64,128,256 and 512-bit */
1841 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
1842 in 32,64,128,256 and 512-bit */
1843 14, 14, /* SSE->integer and integer->SSE moves */
1844 14, 14, /* mask->integer and integer->mask moves */
1845 {6, 8, 6}, /* cost of loading mask register
1846 in QImode, HImode, SImode. */
1847 {6, 8, 6}, /* cost of storing mask register
1848 in QImode, HImode, SImode. */
1849 2, /* cost of moving mask register. */
1850 /* End of register allocator costs. */
1851 },
1852
1853 COSTS_N_INSNS (1), /* cost of an add instruction */
1854 COSTS_N_INSNS (2), /* cost of a lea instruction */
1855 COSTS_N_INSNS (1), /* variable shift costs */
1856 COSTS_N_INSNS (1), /* constant shift costs */
1857 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1858 COSTS_N_INSNS (4), /* HI */
1859 COSTS_N_INSNS (3), /* SI */
1860 COSTS_N_INSNS (4), /* DI */
1861 COSTS_N_INSNS (5)}, /* other */
1862 0, /* cost of multiply per each bit set */
1863 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1864 COSTS_N_INSNS (35), /* HI */
1865 COSTS_N_INSNS (51), /* SI */
1866 COSTS_N_INSNS (83), /* DI */
1867 COSTS_N_INSNS (83)}, /* other */
1868 COSTS_N_INSNS (1), /* cost of movsx */
1869 COSTS_N_INSNS (1), /* cost of movzx */
1870 8, /* "large" insn */
1871 9, /* MOVE_RATIO */
1872 6, /* CLEAR_RATIO */
1873 {6, 8, 6}, /* cost of loading integer registers
1874 in QImode, HImode and SImode.
1875 Relative to reg-reg move (2). */
1876 {6, 8, 6}, /* cost of storing integer registers */
1877 {10, 10, 12, 48, 96}, /* cost of loading SSE register
1878 in 32bit, 64bit, 128bit, 256bit and 512bit */
1879 {10, 10, 12, 48, 96}, /* cost of storing SSE register
1880 in 32bit, 64bit, 128bit, 256bit and 512bit */
1881 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1882 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
1883 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1884 14, /* cost of moving SSE register to integer. */
1885 10, 10, /* Gather load static, per_elt. */
1886 10, 10, /* Gather store static, per_elt. */
1887 32, /* size of l1 cache. */
1888 512, /* size of l2 cache. */
1889 64, /* size of prefetch block */
1890 100, /* number of parallel prefetches */
1891 2, /* Branch cost */
1892 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1893 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1894 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1895 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1896 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1897 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1898
1899 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1900 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1901 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1902 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1903 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1904 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1905 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1906 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1907 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1908 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1909 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1910 btver1_memcpy,
1911 btver1_memset,
1912 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1913 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1914 "16:11:8", /* Loop alignment. */
1915 "16:8:8", /* Jump alignment. */
1916 "0:0:8", /* Label alignment. */
1917 "11", /* Func alignment. */
1918 };
1919
/* Inline memcpy strategy table that btver2_cost points at.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
1920 static stringop_algs btver2_memcpy[2] = {
1921 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1922 {-1, rep_prefix_4_byte, false}}},
1923 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1924 {-1, libcall, false}}}};
/* Inline memset strategy table that btver2_cost points at.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
1925 static stringop_algs btver2_memset[2] = {
1926 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1927 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1928 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1929 {-1, libcall, false}}}};
/* Cost table for btver2 tuning. This is a positional initializer of
   struct processor_costs: each value is identified only by its position,
   so entries must not be reordered, added or removed without matching the
   struct declaration. The leading brace group holds the register allocator
   costs; the values after it are instruction, memory, string-operation and
   alignment parameters, as labelled by the per-line comments. */
1930 const struct processor_costs btver2_cost = {
1931 {
1932 /* Start of register allocator costs. integer->integer move cost is 2. */
1933 8, /* cost for loading QImode using movzbl */
1934 {8, 8, 6}, /* cost of loading integer registers
1935 in QImode, HImode and SImode.
1936 Relative to reg-reg move (2). */
1937 {8, 8, 6}, /* cost of storing integer registers */
1938 4, /* cost of reg,reg fld/fst */
1939 {12, 12, 28}, /* cost of loading fp registers
1940 in SFmode, DFmode and XFmode */
1941 {12, 12, 38}, /* cost of storing fp registers
1942 in SFmode, DFmode and XFmode */
1943 4, /* cost of moving MMX register */
1944 {10, 10}, /* cost of loading MMX registers
1945 in SImode and DImode */
1946 {12, 12}, /* cost of storing MMX registers
1947 in SImode and DImode */
1948 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1949 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
1950 in 32,64,128,256 and 512-bit */
1951 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
1952 in 32,64,128,256 and 512-bit */
1953 14, 14, /* SSE->integer and integer->SSE moves */
1954 14, 14, /* mask->integer and integer->mask moves */
1955 {8, 8, 6}, /* cost of loading mask register
1956 in QImode, HImode, SImode. */
1957 {8, 8, 6}, /* cost of storing mask register
1958 in QImode, HImode, SImode. */
1959 2, /* cost of moving mask register. */
1960 /* End of register allocator costs. */
1961 },
1962
1963 COSTS_N_INSNS (1), /* cost of an add instruction */
1964 COSTS_N_INSNS (2), /* cost of a lea instruction */
1965 COSTS_N_INSNS (1), /* variable shift costs */
1966 COSTS_N_INSNS (1), /* constant shift costs */
1967 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1968 COSTS_N_INSNS (4), /* HI */
1969 COSTS_N_INSNS (3), /* SI */
1970 COSTS_N_INSNS (4), /* DI */
1971 COSTS_N_INSNS (5)}, /* other */
1972 0, /* cost of multiply per each bit set */
1973 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1974 COSTS_N_INSNS (35), /* HI */
1975 COSTS_N_INSNS (51), /* SI */
1976 COSTS_N_INSNS (83), /* DI */
1977 COSTS_N_INSNS (83)}, /* other */
1978 COSTS_N_INSNS (1), /* cost of movsx */
1979 COSTS_N_INSNS (1), /* cost of movzx */
1980 8, /* "large" insn */
1981 9, /* MOVE_RATIO */
1982 6, /* CLEAR_RATIO */
1983 {8, 8, 6}, /* cost of loading integer registers
1984 in QImode, HImode and SImode.
1985 Relative to reg-reg move (2). */
1986 {8, 8, 6}, /* cost of storing integer registers */
1987 {10, 10, 12, 48, 96}, /* cost of loading SSE register
1988 in 32bit, 64bit, 128bit, 256bit and 512bit */
1989 {10, 10, 12, 48, 96}, /* cost of storing SSE register
1990 in 32bit, 64bit, 128bit, 256bit and 512bit */
1991 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1992 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
1993 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1994 14, /* cost of moving SSE register to integer. */
1995 10, 10, /* Gather load static, per_elt. */
1996 10, 10, /* Gather store static, per_elt. */
1997 32, /* size of l1 cache. */
1998 2048, /* size of l2 cache. */
1999 64, /* size of prefetch block */
2000 100, /* number of parallel prefetches */
2001 2, /* Branch cost */
2002 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2003 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2004 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2005 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2006 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2007 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2008
2009 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2010 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2011 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2012 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2013 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2014 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2015 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2016 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
2017 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
2018 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
2019 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2020 btver2_memcpy,
2021 btver2_memset,
2022 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2023 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2024 "16:11:8", /* Loop alignment. */
2025 "16:8:8", /* Jump alignment. */
2026 "0:0:8", /* Label alignment. */
2027 "11", /* Func alignment. */
2028 };
2029
/* Inline memcpy strategy table that pentium4_cost points at. The second
   entry is the DUMMY_STRINGOP_ALGS placeholder, which always selects libcall.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
2030 static stringop_algs pentium4_memcpy[2] = {
2031 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2032 DUMMY_STRINGOP_ALGS};
/* Inline memset strategy table that pentium4_cost points at. The second
   entry is the DUMMY_STRINGOP_ALGS placeholder, which always selects libcall.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
2033 static stringop_algs pentium4_memset[2] = {
2034 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2035 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2036 DUMMY_STRINGOP_ALGS};
2037
/* Cost table for Pentium 4 tuning. This is a positional initializer of
   struct processor_costs: each value is identified only by its position,
   so entries must not be reordered, added or removed without matching the
   struct declaration. The leading brace group holds the register allocator
   costs; the values after it are instruction, memory, string-operation and
   alignment parameters, as labelled by the per-line comments. All four
   alignment entries are NULL for this table. */
2038 static const
2039 struct processor_costs pentium4_cost = {
2040 {
2041 /* Start of register allocator costs. integer->integer move cost is 2. */
2042 5, /* cost for loading QImode using movzbl */
2043 {4, 5, 4}, /* cost of loading integer registers
2044 in QImode, HImode and SImode.
2045 Relative to reg-reg move (2). */
2046 {2, 3, 2}, /* cost of storing integer registers */
2047 12, /* cost of reg,reg fld/fst */
2048 {14, 14, 14}, /* cost of loading fp registers
2049 in SFmode, DFmode and XFmode */
2050 {14, 14, 14}, /* cost of storing fp registers
2051 in SFmode, DFmode and XFmode */
2052 12, /* cost of moving MMX register */
2053 {16, 16}, /* cost of loading MMX registers
2054 in SImode and DImode */
2055 {16, 16}, /* cost of storing MMX registers
2056 in SImode and DImode */
2057 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2058 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
2059 in 32,64,128,256 and 512-bit */
2060 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
2061 in 32,64,128,256 and 512-bit */
2062 20, 12, /* SSE->integer and integer->SSE moves */
2063 20, 12, /* mask->integer and integer->mask moves */
2064 {4, 5, 4}, /* cost of loading mask register
2065 in QImode, HImode, SImode. */
2066 {2, 3, 2}, /* cost of storing mask register
2067 in QImode, HImode, SImode. */
2068 2, /* cost of moving mask register. */
2069 /* End of register allocator costs. */
2070 },
2071
2072 COSTS_N_INSNS (1), /* cost of an add instruction */
2073 COSTS_N_INSNS (3), /* cost of a lea instruction */
2074 COSTS_N_INSNS (4), /* variable shift costs */
2075 COSTS_N_INSNS (4), /* constant shift costs */
2076 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
2077 COSTS_N_INSNS (15), /* HI */
2078 COSTS_N_INSNS (15), /* SI */
2079 COSTS_N_INSNS (15), /* DI */
2080 COSTS_N_INSNS (15)}, /* other */
2081 0, /* cost of multiply per each bit set */
2082 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
2083 COSTS_N_INSNS (56), /* HI */
2084 COSTS_N_INSNS (56), /* SI */
2085 COSTS_N_INSNS (56), /* DI */
2086 COSTS_N_INSNS (56)}, /* other */
2087 COSTS_N_INSNS (1), /* cost of movsx */
2088 COSTS_N_INSNS (1), /* cost of movzx */
2089 16, /* "large" insn */
2090 6, /* MOVE_RATIO */
2091 6, /* CLEAR_RATIO */
2092 {4, 5, 4}, /* cost of loading integer registers
2093 in QImode, HImode and SImode.
2094 Relative to reg-reg move (2). */
2095 {2, 3, 2}, /* cost of storing integer registers */
2096 {16, 16, 16, 32, 64}, /* cost of loading SSE register
2097 in 32bit, 64bit, 128bit, 256bit and 512bit */
2098 {16, 16, 16, 32, 64}, /* cost of storing SSE register
2099 in 32bit, 64bit, 128bit, 256bit and 512bit */
2100 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
2101 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
2102 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2103 20, /* cost of moving SSE register to integer. */
2104 16, 16, /* Gather load static, per_elt. */
2105 16, 16, /* Gather store static, per_elt. */
2106 8, /* size of l1 cache. */
2107 256, /* size of l2 cache. */
2108 64, /* size of prefetch block */
2109 6, /* number of parallel prefetches */
2110 2, /* Branch cost */
2111 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
2112 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2113 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
2114 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2115 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2116 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
2117
2118 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2119 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2120 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
2121 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
2122 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2123 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2124 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
2125 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
2126 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
2127 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
2128 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2129 pentium4_memcpy,
2130 pentium4_memset,
2131 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2132 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2133 NULL, /* Loop alignment. */
2134 NULL, /* Jump alignment. */
2135 NULL, /* Label alignment. */
2136 NULL, /* Func alignment. */
2137 };
2138
/* Inline memcpy strategy table for nocona tuning.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
2139 static stringop_algs nocona_memcpy[2] = {
2140 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2141 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2142 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
2143
/* Inline memset strategy table for nocona tuning.
   NOTE(review): presumably entry [0] is the 32-bit and entry [1] the 64-bit
   tuning, and each entry reads {algorithm when the size is unknown,
   {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
   bound" -- confirm against the stringop_algs declaration. */
2144 static stringop_algs nocona_memset[2] = {
2145 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2146 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2147 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2148 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
2149
/* Cost table nocona_cost.  The initializer is positional, so the trailing
   comment on each line is the only local record of which field a value
   sets.  Per the header of this file, costs are relative to an add.
   Lists nocona_memcpy and nocona_memset as its string-operation tables
   and leaves all four alignment entries NULL.
   NOTE(review): presumably selected when tuning for Nocona -- confirm
   against the code that references nocona_cost.  */
2150 static const
2151 struct processor_costs nocona_cost = {
2152 {
2153 /* Start of register allocator costs. integer->integer move cost is 2. */
2154 4, /* cost for loading QImode using movzbl */
2155 {4, 4, 4}, /* cost of loading integer registers
2156 in QImode, HImode and SImode.
2157 Relative to reg-reg move (2). */
2158 {4, 4, 4}, /* cost of storing integer registers */
2159 12, /* cost of reg,reg fld/fst */
2160 {14, 14, 14}, /* cost of loading fp registers
2161 in SFmode, DFmode and XFmode */
2162 {14, 14, 14}, /* cost of storing fp registers
2163 in SFmode, DFmode and XFmode */
2164 14, /* cost of moving MMX register */
2165 {12, 12}, /* cost of loading MMX registers
2166 in SImode and DImode */
2167 {12, 12}, /* cost of storing MMX registers
2168 in SImode and DImode */
2169 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2170 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2171 in 32,64,128,256 and 512-bit */
2172 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2173 in 32,64,128,256 and 512-bit */
2174 20, 12, /* SSE->integer and integer->SSE moves */
2175 20, 12, /* mask->integer and integer->mask moves */
2176 {4, 4, 4}, /* cost of loading mask register
2177 in QImode, HImode, SImode. */
2178 {4, 4, 4}, /* cost of storing mask register
2179 in QImode, HImode, SImode. */
2180 2, /* cost of moving mask register. */
2181 /* End of register allocator costs. */
2182 },
2183
2184 COSTS_N_INSNS (1), /* cost of an add instruction */
2185 COSTS_N_INSNS (1), /* cost of a lea instruction */
2186 COSTS_N_INSNS (1), /* variable shift costs */
2187 COSTS_N_INSNS (1), /* constant shift costs */
2188 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
2189 COSTS_N_INSNS (10), /* HI */
2190 COSTS_N_INSNS (10), /* SI */
2191 COSTS_N_INSNS (10), /* DI */
2192 COSTS_N_INSNS (10)}, /* other */
2193 0, /* cost of multiply per each bit set */
2194 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
2195 COSTS_N_INSNS (66), /* HI */
2196 COSTS_N_INSNS (66), /* SI */
2197 COSTS_N_INSNS (66), /* DI */
2198 COSTS_N_INSNS (66)}, /* other */
2199 COSTS_N_INSNS (1), /* cost of movsx */
2200 COSTS_N_INSNS (1), /* cost of movzx */
2201 16, /* "large" insn */
2202 17, /* MOVE_RATIO */
2203 6, /* CLEAR_RATIO */
2204 {4, 4, 4}, /* cost of loading integer registers
2205 in QImode, HImode and SImode.
2206 Relative to reg-reg move (2). */
2207 {4, 4, 4}, /* cost of storing integer registers */
2208 {12, 12, 12, 24, 48}, /* cost of loading SSE register
2209 in 32bit, 64bit, 128bit, 256bit and 512bit */
2210 {12, 12, 12, 24, 48}, /* cost of storing SSE register
2211 in 32bit, 64bit, 128bit, 256bit and 512bit */
2212 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
2213 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
2214 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2215 20, /* cost of moving SSE register to integer. */
2216 12, 12, /* Gather load static, per_elt. */
2217 12, 12, /* Gather store static, per_elt. */
2218 8, /* size of l1 cache. */
2219 1024, /* size of l2 cache. */
2220 64, /* size of prefetch block */
2221 8, /* number of parallel prefetches */
2222 1, /* Branch cost */
2223 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2224 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2225 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2226 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2227 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2228 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
2229
2230 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2231 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2232 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2233 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
2234 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2235 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
2236 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2237 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2238 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2239 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
2240 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2241 nocona_memcpy,
2242 nocona_memset,
2243 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2244 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2245 NULL, /* Loop alignment. */
2246 NULL, /* Jump alignment. */
2247 NULL, /* Label alignment. */
2248 NULL, /* Func alignment. */
2249 };
2250
/* memcpy expansion strategy table listed in atom_cost.
   NOTE(review): the two array elements are presumably the 32-bit and
   64-bit variants (only the second one uses rep_prefix_8_byte), and each
   inner triple presumably reads {max size, algorithm, noalign flag} with
   -1 marking the final entry -- confirm against the stringop_algs
   declaration.  */
2251 static stringop_algs atom_memcpy[2] = {
2252 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2253 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2254 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table listed in atom_cost.
   NOTE(review): entries presumably read {max size, algorithm, noalign
   flag} with -1 marking the final entry; [0] presumably 32-bit and [1]
   64-bit -- confirm against the stringop_algs declaration.  */
2255 static stringop_algs atom_memset[2] = {
2256 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2257 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2258 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2259 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table atom_cost.  The initializer is positional, so the trailing
   comment on each line is the only local record of which field a value
   sets.  Per the header of this file, costs are relative to an add.
   Lists atom_memcpy and atom_memset as its string-operation tables and
   gives the four alignment entries as strings.
   NOTE(review): presumably selected when tuning for Atom -- confirm
   against the code that references atom_cost.  */
2260 static const
2261 struct processor_costs atom_cost = {
2262 {
2263 /* Start of register allocator costs. integer->integer move cost is 2. */
2264 6, /* cost for loading QImode using movzbl */
2265 {6, 6, 6}, /* cost of loading integer registers
2266 in QImode, HImode and SImode.
2267 Relative to reg-reg move (2). */
2268 {6, 6, 6}, /* cost of storing integer registers */
2269 4, /* cost of reg,reg fld/fst */
2270 {6, 6, 18}, /* cost of loading fp registers
2271 in SFmode, DFmode and XFmode */
2272 {14, 14, 24}, /* cost of storing fp registers
2273 in SFmode, DFmode and XFmode */
2274 2, /* cost of moving MMX register */
2275 {8, 8}, /* cost of loading MMX registers
2276 in SImode and DImode */
2277 {10, 10}, /* cost of storing MMX registers
2278 in SImode and DImode */
2279 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2280 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2281 in 32,64,128,256 and 512-bit */
2282 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2283 in 32,64,128,256 and 512-bit */
2284 8, 6, /* SSE->integer and integer->SSE moves */
2285 8, 6, /* mask->integer and integer->mask moves */
2286 {6, 6, 6}, /* cost of loading mask register
2287 in QImode, HImode, SImode. */
2288 {6, 6, 6}, /* cost of storing mask register
2289 in QImode, HImode, SImode. */
2290 2, /* cost of moving mask register. */
2291 /* End of register allocator costs. */
2292 },
2293
2294 COSTS_N_INSNS (1), /* cost of an add instruction */
2295 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2296 COSTS_N_INSNS (1), /* variable shift costs */
2297 COSTS_N_INSNS (1), /* constant shift costs */
2298 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2299 COSTS_N_INSNS (4), /* HI */
2300 COSTS_N_INSNS (3), /* SI */
2301 COSTS_N_INSNS (4), /* DI */
2302 COSTS_N_INSNS (2)}, /* other */
2303 0, /* cost of multiply per each bit set */
2304 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2305 COSTS_N_INSNS (26), /* HI */
2306 COSTS_N_INSNS (42), /* SI */
2307 COSTS_N_INSNS (74), /* DI */
2308 COSTS_N_INSNS (74)}, /* other */
2309 COSTS_N_INSNS (1), /* cost of movsx */
2310 COSTS_N_INSNS (1), /* cost of movzx */
2311 8, /* "large" insn */
2312 17, /* MOVE_RATIO */
2313 6, /* CLEAR_RATIO */
2314 {6, 6, 6}, /* cost of loading integer registers
2315 in QImode, HImode and SImode.
2316 Relative to reg-reg move (2). */
2317 {6, 6, 6}, /* cost of storing integer registers */
2318 {8, 8, 8, 16, 32}, /* cost of loading SSE register
2319 in 32bit, 64bit, 128bit, 256bit and 512bit */
2320 {8, 8, 8, 16, 32}, /* cost of storing SSE register
2321 in 32bit, 64bit, 128bit, 256bit and 512bit */
2322 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2323 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2324 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2325 8, /* cost of moving SSE register to integer. */
2326 8, 8, /* Gather load static, per_elt. */
2327 8, 8, /* Gather store static, per_elt. */
2328 32, /* size of l1 cache. */
2329 256, /* size of l2 cache. */
2330 64, /* size of prefetch block */
2331 6, /* number of parallel prefetches */
2332 3, /* Branch cost */
2333 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2334 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2335 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2336 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2337 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2338 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2339
2340 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2341 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2342 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2343 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2344 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2345 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2346 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
2347 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
2348 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
2349 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
2350 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2351 atom_memcpy,
2352 atom_memset,
2353 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2354 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2355 "16", /* Loop alignment. */
2356 "16:8:8", /* Jump alignment. */
2357 "0:0:8", /* Label alignment. */
2358 "16", /* Func alignment. */
2359 };
2360
/* memcpy expansion strategy table listed in slm_cost.
   NOTE(review): the two array elements are presumably the 32-bit and
   64-bit variants (only the second one uses rep_prefix_8_byte), and each
   inner triple presumably reads {max size, algorithm, noalign flag} with
   -1 marking the final entry -- confirm against the stringop_algs
   declaration.  */
2361 static stringop_algs slm_memcpy[2] = {
2362 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2363 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2364 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table listed in slm_cost.
   NOTE(review): entries presumably read {max size, algorithm, noalign
   flag} with -1 marking the final entry; [0] presumably 32-bit and [1]
   64-bit -- confirm against the stringop_algs declaration.  */
2365 static stringop_algs slm_memset[2] = {
2366 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2367 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2368 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2369 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table slm_cost.  The initializer is positional, so the trailing
   comment on each line is the only local record of which field a value
   sets.  Per the header of this file, costs are relative to an add.
   Lists slm_memcpy and slm_memset as its string-operation tables and
   gives the four alignment entries as strings.
   NOTE(review): presumably selected when tuning for Silvermont -- confirm
   against the code that references slm_cost.  */
2370 static const
2371 struct processor_costs slm_cost = {
2372 {
2373 /* Start of register allocator costs. integer->integer move cost is 2. */
2374 8, /* cost for loading QImode using movzbl */
2375 {8, 8, 8}, /* cost of loading integer registers
2376 in QImode, HImode and SImode.
2377 Relative to reg-reg move (2). */
2378 {6, 6, 6}, /* cost of storing integer registers */
2379 2, /* cost of reg,reg fld/fst */
2380 {8, 8, 18}, /* cost of loading fp registers
2381 in SFmode, DFmode and XFmode */
2382 {6, 6, 18}, /* cost of storing fp registers
2383 in SFmode, DFmode and XFmode */
2384 2, /* cost of moving MMX register */
2385 {8, 8}, /* cost of loading MMX registers
2386 in SImode and DImode */
2387 {6, 6}, /* cost of storing MMX registers
2388 in SImode and DImode */
2389 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2390 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2391 in 32,64,128,256 and 512-bit */
2392 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2393 in 32,64,128,256 and 512-bit */
2394 8, 6, /* SSE->integer and integer->SSE moves */
2395 8, 6, /* mask->integer and integer->mask moves */
2396 {8, 8, 8}, /* cost of loading mask register
2397 in QImode, HImode, SImode. */
2398 {6, 6, 6}, /* cost of storing mask register
2399 in QImode, HImode, SImode. */
2400 2, /* cost of moving mask register. */
2401 /* End of register allocator costs. */
2402 },
2403
2404 COSTS_N_INSNS (1), /* cost of an add instruction */
2405 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2406 COSTS_N_INSNS (1), /* variable shift costs */
2407 COSTS_N_INSNS (1), /* constant shift costs */
2408 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2409 COSTS_N_INSNS (3), /* HI */
2410 COSTS_N_INSNS (3), /* SI */
2411 COSTS_N_INSNS (4), /* DI */
2412 COSTS_N_INSNS (2)}, /* other */
2413 0, /* cost of multiply per each bit set */
2414 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2415 COSTS_N_INSNS (26), /* HI */
2416 COSTS_N_INSNS (42), /* SI */
2417 COSTS_N_INSNS (74), /* DI */
2418 COSTS_N_INSNS (74)}, /* other */
2419 COSTS_N_INSNS (1), /* cost of movsx */
2420 COSTS_N_INSNS (1), /* cost of movzx */
2421 8, /* "large" insn */
2422 17, /* MOVE_RATIO */
2423 6, /* CLEAR_RATIO */
2424 {8, 8, 8}, /* cost of loading integer registers
2425 in QImode, HImode and SImode.
2426 Relative to reg-reg move (2). */
2427 {6, 6, 6}, /* cost of storing integer registers */
2428 {8, 8, 8, 16, 32}, /* cost of loading SSE register
2429 in 32bit, 64bit, 128bit, 256bit and 512bit */
2430 {8, 8, 8, 16, 32}, /* cost of storing SSE register
2431 in 32bit, 64bit, 128bit, 256bit and 512bit */
2432 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2433 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2434 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2435 8, /* cost of moving SSE register to integer. */
2436 8, 8, /* Gather load static, per_elt. */
2437 8, 8, /* Gather store static, per_elt. */
2438 32, /* size of l1 cache. */
2439 256, /* size of l2 cache. */
2440 64, /* size of prefetch block */
2441 6, /* number of parallel prefetches */
2442 3, /* Branch cost */
2443 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2444 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2445 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2446 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2447 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2448 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2449
2450 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2451 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2452 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2453 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2454 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2455 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2456 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2457 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2458 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2459 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2460 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2461 slm_memcpy,
2462 slm_memset,
2463 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2464 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2465 "16", /* Loop alignment. */
2466 "16:8:8", /* Jump alignment. */
2467 "0:0:8", /* Label alignment. */
2468 "16", /* Func alignment. */
2469 };
2470
/* memcpy expansion strategy table listed in intel_cost.
   NOTE(review): the two array elements are presumably the 32-bit and
   64-bit variants (only the second one uses rep_prefix_8_byte), and each
   inner triple presumably reads {max size, algorithm, noalign flag} with
   -1 marking the final entry -- confirm against the stringop_algs
   declaration.  */
2471 static stringop_algs intel_memcpy[2] = {
2472 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2473 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2474 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table listed in intel_cost.
   NOTE(review): entries presumably read {max size, algorithm, noalign
   flag} with -1 marking the final entry; [0] presumably 32-bit and [1]
   64-bit -- confirm against the stringop_algs declaration.  */
2475 static stringop_algs intel_memset[2] = {
2476 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2477 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2478 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2479 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost.  The initializer is positional, so the trailing
   comment on each line is the only local record of which field a value
   sets.  Per the header of this file, costs are relative to an add.
   Lists intel_memcpy and intel_memset as its string-operation tables and
   gives the four alignment entries as strings.
   NOTE(review): presumably the table behind a generic Intel tuning
   option -- confirm against the code that references intel_cost.  */
2480 static const
2481 struct processor_costs intel_cost = {
2482 {
2483 /* Start of register allocator costs. integer->integer move cost is 2. */
2484 6, /* cost for loading QImode using movzbl */
2485 {4, 4, 4}, /* cost of loading integer registers
2486 in QImode, HImode and SImode.
2487 Relative to reg-reg move (2). */
2488 {6, 6, 6}, /* cost of storing integer registers */
2489 2, /* cost of reg,reg fld/fst */
2490 {6, 6, 8}, /* cost of loading fp registers
2491 in SFmode, DFmode and XFmode */
2492 {6, 6, 10}, /* cost of storing fp registers
2493 in SFmode, DFmode and XFmode */
2494 2, /* cost of moving MMX register */
2495 {6, 6}, /* cost of loading MMX registers
2496 in SImode and DImode */
2497 {6, 6}, /* cost of storing MMX registers
2498 in SImode and DImode */
2499 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2500 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2501 in 32,64,128,256 and 512-bit */
2502 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2503 in 32,64,128,256 and 512-bit */
2504 4, 4, /* SSE->integer and integer->SSE moves */
2505 4, 4, /* mask->integer and integer->mask moves */
2506 {4, 4, 4}, /* cost of loading mask register
2507 in QImode, HImode, SImode. */
2508 {6, 6, 6}, /* cost of storing mask register
2509 in QImode, HImode, SImode. */
2510 2, /* cost of moving mask register. */
2511 /* End of register allocator costs. */
2512 },
2513
2514 COSTS_N_INSNS (1), /* cost of an add instruction */
2515 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2516 COSTS_N_INSNS (1), /* variable shift costs */
2517 COSTS_N_INSNS (1), /* constant shift costs */
2518 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2519 COSTS_N_INSNS (3), /* HI */
2520 COSTS_N_INSNS (3), /* SI */
2521 COSTS_N_INSNS (4), /* DI */
2522 COSTS_N_INSNS (2)}, /* other */
2523 0, /* cost of multiply per each bit set */
2524 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2525 COSTS_N_INSNS (26), /* HI */
2526 COSTS_N_INSNS (42), /* SI */
2527 COSTS_N_INSNS (74), /* DI */
2528 COSTS_N_INSNS (74)}, /* other */
2529 COSTS_N_INSNS (1), /* cost of movsx */
2530 COSTS_N_INSNS (1), /* cost of movzx */
2531 8, /* "large" insn */
2532 17, /* MOVE_RATIO */
2533 6, /* CLEAR_RATIO */
2534 {4, 4, 4}, /* cost of loading integer registers
2535 in QImode, HImode and SImode.
2536 Relative to reg-reg move (2). */
2537 {6, 6, 6}, /* cost of storing integer registers */
2538 {6, 6, 6, 6, 6}, /* cost of loading SSE register
2539 in 32bit, 64bit, 128bit, 256bit and 512bit */
2540 {6, 6, 6, 6, 6}, /* cost of storing SSE register
2541 in 32bit, 64bit, 128bit, 256bit and 512bit */
2542 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2543 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2544 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2545 4, /* cost of moving SSE register to integer. */
2546 6, 6, /* Gather load static, per_elt. */
2547 6, 6, /* Gather store static, per_elt. */
2548 32, /* size of l1 cache. */
2549 256, /* size of l2 cache. */
2550 64, /* size of prefetch block */
2551 6, /* number of parallel prefetches */
2552 3, /* Branch cost */
2553 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2554 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2555 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2556 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2557 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2558 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2559
2560 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2561 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2562 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2563 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2564 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2565 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2566 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2567 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2568 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2569 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2570 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2571 intel_memcpy,
2572 intel_memset,
2573 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2574 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2575 "16", /* Loop alignment. */
2576 "16:8:8", /* Jump alignment. */
2577 "0:0:8", /* Label alignment. */
2578 "16", /* Func alignment. */
2579 };
2580
2581 /* Generic should produce code tuned for Core-i7 (and newer chips)
2582 and btver1 (and newer chips). */
2583
/* memcpy expansion strategy table listed in generic_cost.  Both elements
   have the same shape and differ only in rep_prefix_4_byte versus
   rep_prefix_8_byte.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants, and each inner triple presumably reads {max size, algorithm,
   noalign flag} with -1 marking the final entry -- confirm against the
   stringop_algs declaration.  */
2584 static stringop_algs generic_memcpy[2] = {
2585 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2586 {-1, libcall, false}}},
2587 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2588 {-1, libcall, false}}}};
/* memset expansion strategy table listed in generic_cost.  Both elements
   have the same shape and differ only in rep_prefix_4_byte versus
   rep_prefix_8_byte.
   NOTE(review): entries presumably read {max size, algorithm, noalign
   flag} with -1 marking the final entry; [0] presumably 32-bit and [1]
   64-bit -- confirm against the stringop_algs declaration.  */
2589 static stringop_algs generic_memset[2] = {
2590 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2591 {-1, libcall, false}}},
2592 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2593 {-1, libcall, false}}}};
/* Cost table generic_cost.  The initializer is positional, so the
   trailing comment on each line is the only local record of which field
   a value sets.  Per the header of this file, costs are relative to an
   add.  Lists generic_memcpy and generic_memset as its string-operation
   tables and gives the four alignment entries as strings.  */
2594 static const
2595 struct processor_costs generic_cost = {
2596 {
2597 /* Start of register allocator costs. integer->integer move cost is 2. */
2598 6, /* cost for loading QImode using movzbl */
2599 {6, 6, 6}, /* cost of loading integer registers
2600 in QImode, HImode and SImode.
2601 Relative to reg-reg move (2). */
2602 {6, 6, 6}, /* cost of storing integer registers */
2603 4, /* cost of reg,reg fld/fst */
2604 {6, 6, 12}, /* cost of loading fp registers
2605 in SFmode, DFmode and XFmode */
2606 {6, 6, 12}, /* cost of storing fp registers
2607 in SFmode, DFmode and XFmode */
2608 2, /* cost of moving MMX register */
2609 {6, 6}, /* cost of loading MMX registers
2610 in SImode and DImode */
2611 {6, 6}, /* cost of storing MMX registers
2612 in SImode and DImode */
2613 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2614 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2615 in 32,64,128,256 and 512-bit */
2616 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2617 in 32,64,128,256 and 512-bit */
2618 6, 6, /* SSE->integer and integer->SSE moves */
2619 6, 6, /* mask->integer and integer->mask moves */
2620 {6, 6, 6}, /* cost of loading mask register
2621 in QImode, HImode, SImode. */
2622 {6, 6, 6}, /* cost of storing mask register
2623 in QImode, HImode, SImode. */
2624 2, /* cost of moving mask register. */
2625 /* End of register allocator costs. */
2626 },
2627
2628 COSTS_N_INSNS (1), /* cost of an add instruction */
2629 /* Setting cost to 2 makes our current implementation of synth_mult result in
2630 use of unnecessary temporary registers causing regression on several
2631 SPECfp benchmarks. */
2632 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2633 COSTS_N_INSNS (1), /* variable shift costs */
2634 COSTS_N_INSNS (1), /* constant shift costs */
2635 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2636 COSTS_N_INSNS (4), /* HI */
2637 COSTS_N_INSNS (3), /* SI */
2638 COSTS_N_INSNS (4), /* DI */
2639 COSTS_N_INSNS (4)}, /* other */
2640 0, /* cost of multiply per each bit set */
2641 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2642 COSTS_N_INSNS (22), /* HI */
2643 COSTS_N_INSNS (30), /* SI */
2644 COSTS_N_INSNS (74), /* DI */
2645 COSTS_N_INSNS (74)}, /* other */
2646 COSTS_N_INSNS (1), /* cost of movsx */
2647 COSTS_N_INSNS (1), /* cost of movzx */
2648 8, /* "large" insn */
2649 17, /* MOVE_RATIO */
2650 6, /* CLEAR_RATIO */
2651 {6, 6, 6}, /* cost of loading integer registers
2652 in QImode, HImode and SImode.
2653 Relative to reg-reg move (2). */
2654 {6, 6, 6}, /* cost of storing integer registers */
2655 {6, 6, 6, 10, 15}, /* cost of loading SSE register
2656 in 32bit, 64bit, 128bit, 256bit and 512bit */
2657 {6, 6, 6, 10, 15}, /* cost of storing SSE register
2658 in 32bit, 64bit, 128bit, 256bit and 512bit */
2659 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
2660 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
2661 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2662 6, /* cost of moving SSE register to integer. */
2663 18, 6, /* Gather load static, per_elt. */
2664 18, 6, /* Gather store static, per_elt. */
2665 32, /* size of l1 cache. */
2666 512, /* size of l2 cache. */
2667 64, /* size of prefetch block */
2668 6, /* number of parallel prefetches */
2669 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2670 value is increased to perhaps more appropriate value of 5. */
2671 3, /* Branch cost */
2672 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2673 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2674 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2675 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2676 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2677 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2678
2679 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2680 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2681 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2682 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2683 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2684 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2685 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2686 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2687 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2688 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2689 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2690 generic_memcpy,
2691 generic_memset,
2692 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2693 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2694 "16:11:8", /* Loop alignment. */
2695 "16:11:8", /* Jump alignment. */
2696 "0:0:8", /* Label alignment. */
2697 "16", /* Func alignment. */
2698 };
2699
2700 /* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion strategy table listed in core_cost.  Unlike the other
   tables in this part of the file, the non-final entries here pass true
   as their third member.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants, and each inner triple presumably reads {max size, algorithm,
   noalign flag} with -1 marking the final entry -- confirm against the
   stringop_algs declaration.  */
2701 static stringop_algs core_memcpy[2] = {
2702 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2703 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2704 {-1, libcall, false}}}};
/* memset expansion strategy table listed in core_cost.  The non-final
   entries pass true as their third member.
   NOTE(review): entries presumably read {max size, algorithm, noalign
   flag} with -1 marking the final entry; [0] presumably 32-bit and [1]
   64-bit -- confirm against the stringop_algs declaration.  */
2705 static stringop_algs core_memset[2] = {
2706 {libcall, {{6, loop_1_byte, true},
2707 {24, loop, true},
2708 {8192, rep_prefix_4_byte, true},
2709 {-1, libcall, false}}},
2710 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2711 {-1, libcall, false}}}};
2712
/* Cost table core_cost.  The initializer is positional, so the trailing
   comment on each line is the only local record of which field a value
   sets.  Per the header of this file, costs are relative to an add.
   Lists core_memcpy and core_memset as its string-operation tables and
   gives the four alignment entries as strings.  */
2713 static const
2714 struct processor_costs core_cost = {
2715 {
2716 /* Start of register allocator costs. integer->integer move cost is 2. */
2717 6, /* cost for loading QImode using movzbl */
2718 {4, 4, 4}, /* cost of loading integer registers
2719 in QImode, HImode and SImode.
2720 Relative to reg-reg move (2). */
2721 {6, 6, 6}, /* cost of storing integer registers */
2722 2, /* cost of reg,reg fld/fst */
2723 {6, 6, 8}, /* cost of loading fp registers
2724 in SFmode, DFmode and XFmode */
2725 {6, 6, 10}, /* cost of storing fp registers
2726 in SFmode, DFmode and XFmode */
2727 2, /* cost of moving MMX register */
2728 {6, 6}, /* cost of loading MMX registers
2729 in SImode and DImode */
2730 {6, 6}, /* cost of storing MMX registers
2731 in SImode and DImode */
2732 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2733 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2734 in 32,64,128,256 and 512-bit */
2735 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2736 in 32,64,128,256 and 512-bit */
2737 6, 6, /* SSE->integer and integer->SSE moves */
2738 6, 6, /* mask->integer and integer->mask moves */
2739 {4, 4, 4}, /* cost of loading mask register
2740 in QImode, HImode, SImode. */
2741 {6, 6, 6}, /* cost of storing mask register
2742 in QImode, HImode, SImode. */
2743 2, /* cost of moving mask register. */
2744 /* End of register allocator costs. */
2745 },
2746
2747 COSTS_N_INSNS (1), /* cost of an add instruction */
2748 /* On all chips taken into consideration lea is 2 cycles and more. With
2749 this cost however our current implementation of synth_mult results in
2750 use of unnecessary temporary registers causing regression on several
2751 SPECfp benchmarks. */
2752 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2753 COSTS_N_INSNS (1), /* variable shift costs */
2754 COSTS_N_INSNS (1), /* constant shift costs */
2755 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2756 COSTS_N_INSNS (4), /* HI */
2757 COSTS_N_INSNS (3), /* SI */
2758 /* Here we tune for Sandybridge or newer. */
2759 COSTS_N_INSNS (3), /* DI */
2760 COSTS_N_INSNS (3)}, /* other */
2761 0, /* cost of multiply per each bit set */
2762 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2763 model is not realistic. We compensate by increasing the latencies a bit. */
2764 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2765 COSTS_N_INSNS (11), /* HI */
2766 COSTS_N_INSNS (14), /* SI */
2767 COSTS_N_INSNS (81), /* DI */
2768 COSTS_N_INSNS (81)}, /* other */
2769 COSTS_N_INSNS (1), /* cost of movsx */
2770 COSTS_N_INSNS (1), /* cost of movzx */
2771 8, /* "large" insn */
2772 17, /* MOVE_RATIO */
2773 6, /* CLEAR_RATIO */
2774 {4, 4, 4}, /* cost of loading integer registers
2775 in QImode, HImode and SImode.
2776 Relative to reg-reg move (2). */
2777 {6, 6, 6}, /* cost of storing integer registers */
2778 {6, 6, 6, 6, 12}, /* cost of loading SSE register
2779 in 32bit, 64bit, 128bit, 256bit and 512bit */
2780 {6, 6, 6, 6, 12}, /* cost of storing SSE register
2781 in 32bit, 64bit, 128bit, 256bit and 512bit */
2782 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2783 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2784 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2785 2, /* cost of moving SSE register to integer. */
2786 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2787 rec. throughput 6.
2788 So 5 uops statically and one uops per load.
   NOTE(review): VGATHERDPD is named twice above with different figures;
   one of the two is presumably a different gather variant (for example
   VGATHERDPS) -- confirm against the instruction tables.  */
2789 10, 6, /* Gather load static, per_elt. */
2790 10, 6, /* Gather store static, per_elt. */
2791 64, /* size of l1 cache. */
2792 512, /* size of l2 cache. */
2793 64, /* size of prefetch block */
2794 6, /* number of parallel prefetches */
2795 /* FIXME perhaps more appropriate value is 5. */
2796 3, /* Branch cost */
2797 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2798 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2799 /* 10-24 */
2800 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2801 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2802 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2803 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
2804
2805 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2806 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2807 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2808 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2809 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2810 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2811 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2812 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2813 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2814 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2815 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2816 core_memcpy,
2817 core_memset,
2818 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2819 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2820 "16:11:8", /* Loop alignment. */
2821 "16:11:8", /* Jump alignment. */
2822 "0:0:8", /* Label alignment. */
2823 "16", /* Func alignment. */
2824 };
2825
2825