]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/config/i386/x86-tune-costs.h
Update copyright years.
[thirdparty/gcc.git] / gcc / config / i386 / x86-tune-costs.h
1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2019 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
19
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   Under that assumption COSTS_N_BYTES (2), the size of one add, evaluates
   to the same value as COSTS_N_INSNS (1). */
26 #define COSTS_N_BYTES(N) ((N) * 2)
27
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} /* placeholder stringop_algs entry naming only libcall; the per-CPU memcpy/memset tables below use it as their second array element */
29
30 static stringop_algs ix86_size_memcpy[2] = { /* memcpy strategy referenced by ix86_size_cost: both entries name only rep_prefix_1_byte */
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
33 static stringop_algs ix86_size_memset[2] = { /* memset strategy referenced by ix86_size_cost: both entries name only rep_prefix_1_byte */
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size: instruction costs are byte counts given via COSTS_N_BYTES; string operations use ix86_size_memcpy and ix86_size_memset */
39 COSTS_N_BYTES (2), /* cost of an add instruction */
40 COSTS_N_BYTES (3), /* cost of a lea instruction */
41 COSTS_N_BYTES (2), /* variable shift costs */
42 COSTS_N_BYTES (3), /* constant shift costs */
43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
44 COSTS_N_BYTES (3), /* HI */
45 COSTS_N_BYTES (3), /* SI */
46 COSTS_N_BYTES (3), /* DI */
47 COSTS_N_BYTES (5)}, /* other */
48 0, /* cost of multiply per each bit set */
49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
50 COSTS_N_BYTES (3), /* HI */
51 COSTS_N_BYTES (3), /* SI */
52 COSTS_N_BYTES (3), /* DI */
53 COSTS_N_BYTES (5)}, /* other */
54 COSTS_N_BYTES (3), /* cost of movsx */
55 COSTS_N_BYTES (3), /* cost of movzx */
56 0, /* "large" insn */
57 2, /* MOVE_RATIO */
58
59 /* All move costs are relative to integer->integer move times 2. */
60 2, /* cost for loading QImode using movzbl */
61 {2, 2, 2}, /* cost of loading integer registers
62 in QImode, HImode and SImode.
63 Relative to reg-reg move (2). */
64 {2, 2, 2}, /* cost of storing integer registers */
65 2, /* cost of reg,reg fld/fst */
66 {2, 2, 2}, /* cost of loading fp registers
67 in SFmode, DFmode and XFmode */
68 {2, 2, 2}, /* cost of storing fp registers
69 in SFmode, DFmode and XFmode */
70 3, /* cost of moving MMX register */
71 {3, 3}, /* cost of loading MMX registers
72 in SImode and DImode */
73 {3, 3}, /* cost of storing MMX registers
74 in SImode and DImode */
75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
77 in 32,64,128,256 and 512-bit */
78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load
79 in 32,64,128,256 and 512-bit (five entries, like the aligned load) */
80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
81 in 32,64,128,256 and 512-bit */
82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
83 in 32,64,128,256 and 512-bit (five entries, like the aligned store) */
84 3, 3, /* SSE->integer and integer->SSE moves */
85 5, 0, /* Gather load static, per_elt. */
86 5, 0, /* Gather store static, per_elt. */
87 0, /* size of l1 cache */
88 0, /* size of l2 cache */
89 0, /* size of prefetch block */
90 0, /* number of parallel prefetches */
91 2, /* Branch cost */
92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
95 COSTS_N_BYTES (2), /* cost of FABS instruction. */
96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
98
99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
110 ix86_size_memcpy,
111 ix86_size_memset,
112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
114 NULL, /* Loop alignment. */
115 NULL, /* Jump alignment. */
116 NULL, /* Label alignment. */
117 NULL, /* Func alignment. */
118 };
119
120 /* Processor costs (relative to an add) */
121 static stringop_algs i386_memcpy[2] = { /* memcpy strategy referenced by i386_cost: rep_prefix_1_byte in the first entry, DUMMY_STRINGOP_ALGS placeholder in the second */
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
123 DUMMY_STRINGOP_ALGS};
124 static stringop_algs i386_memset[2] = { /* memset strategy referenced by i386_cost: rep_prefix_1_byte in the first entry, DUMMY_STRINGOP_ALGS placeholder in the second */
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
126 DUMMY_STRINGOP_ALGS};
127
128 static const
129 struct processor_costs i386_cost = { /* 386 specific costs; instruction costs are given via COSTS_N_INSNS, string operations use i386_memcpy and i386_memset */
130 COSTS_N_INSNS (1), /* cost of an add instruction */
131 COSTS_N_INSNS (1), /* cost of a lea instruction */
132 COSTS_N_INSNS (3), /* variable shift costs */
133 COSTS_N_INSNS (2), /* constant shift costs */
134 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
135 COSTS_N_INSNS (6), /* HI */
136 COSTS_N_INSNS (6), /* SI */
137 COSTS_N_INSNS (6), /* DI */
138 COSTS_N_INSNS (6)}, /* other */
139 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
140 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
141 COSTS_N_INSNS (23), /* HI */
142 COSTS_N_INSNS (23), /* SI */
143 COSTS_N_INSNS (23), /* DI */
144 COSTS_N_INSNS (23)}, /* other */
145 COSTS_N_INSNS (3), /* cost of movsx */
146 COSTS_N_INSNS (2), /* cost of movzx */
147 15, /* "large" insn */
148 3, /* MOVE_RATIO */
149
150 /* All move costs are relative to integer->integer move times 2 and thus
151 they are latency*2. */
152 4, /* cost for loading QImode using movzbl */
153 {2, 4, 2}, /* cost of loading integer registers
154 in QImode, HImode and SImode.
155 Relative to reg-reg move (2). */
156 {2, 4, 2}, /* cost of storing integer registers */
157 2, /* cost of reg,reg fld/fst */
158 {8, 8, 8}, /* cost of loading fp registers
159 in SFmode, DFmode and XFmode */
160 {8, 8, 8}, /* cost of storing fp registers
161 in SFmode, DFmode and XFmode */
162 2, /* cost of moving MMX register */
163 {4, 8}, /* cost of loading MMX registers
164 in SImode and DImode */
165 {4, 8}, /* cost of storing MMX registers
166 in SImode and DImode */
167 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
168 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
169 in 32,64,128,256 and 512-bit */
170 {4, 8, 16, 32, 64}, /* cost of unaligned loads; same values as the aligned loads above. */
171 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
172 in 32,64,128,256 and 512-bit */
173 {4, 8, 16, 32, 64}, /* cost of unaligned stores; same values as the aligned stores above. */
174 3, 3, /* SSE->integer and integer->SSE moves */
175 4, 4, /* Gather load static, per_elt. */
176 4, 4, /* Gather store static, per_elt. */
177 0, /* size of l1 cache */
178 0, /* size of l2 cache */
179 0, /* size of prefetch block */
180 0, /* number of parallel prefetches */
181 1, /* Branch cost */
182 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
183 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
184 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
185 COSTS_N_INSNS (22), /* cost of FABS instruction. */
186 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
187 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
188
189 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
190 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
191 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
192 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
193 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
194 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
195 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
196 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
197 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
198 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
199 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
200 i386_memcpy,
201 i386_memset,
202 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
203 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
204 "4", /* Loop alignment. */
205 "4", /* Jump alignment. */
206 NULL, /* Label alignment. */
207 "4", /* Func alignment. */
208 };
209
210 static stringop_algs i486_memcpy[2] = { /* memcpy strategy referenced by i486_cost: rep_prefix_4_byte in the first entry, DUMMY_STRINGOP_ALGS placeholder in the second */
211 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
212 DUMMY_STRINGOP_ALGS};
213 static stringop_algs i486_memset[2] = { /* memset strategy referenced by i486_cost: rep_prefix_4_byte in the first entry, DUMMY_STRINGOP_ALGS placeholder in the second */
214 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
215 DUMMY_STRINGOP_ALGS};
216
217 static const
218 struct processor_costs i486_cost = { /* 486 specific costs; instruction costs are given via COSTS_N_INSNS, string operations use i486_memcpy and i486_memset */
219 COSTS_N_INSNS (1), /* cost of an add instruction */
220 COSTS_N_INSNS (1), /* cost of a lea instruction */
221 COSTS_N_INSNS (3), /* variable shift costs */
222 COSTS_N_INSNS (2), /* constant shift costs */
223 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
224 COSTS_N_INSNS (12), /* HI */
225 COSTS_N_INSNS (12), /* SI */
226 COSTS_N_INSNS (12), /* DI */
227 COSTS_N_INSNS (12)}, /* other */
228 1, /* cost of multiply per each bit set. NOTE(review): a bare 1 here, whereas i386_cost writes COSTS_N_INSNS (1) in this slot -- confirm the intended units. */
229 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
230 COSTS_N_INSNS (40), /* HI */
231 COSTS_N_INSNS (40), /* SI */
232 COSTS_N_INSNS (40), /* DI */
233 COSTS_N_INSNS (40)}, /* other */
234 COSTS_N_INSNS (3), /* cost of movsx */
235 COSTS_N_INSNS (2), /* cost of movzx */
236 15, /* "large" insn */
237 3, /* MOVE_RATIO */
238
239 /* All move costs are relative to integer->integer move times 2 and thus
240 they are latency*2. */
241 4, /* cost for loading QImode using movzbl */
242 {2, 4, 2}, /* cost of loading integer registers
243 in QImode, HImode and SImode.
244 Relative to reg-reg move (2). */
245 {2, 4, 2}, /* cost of storing integer registers */
246 2, /* cost of reg,reg fld/fst */
247 {8, 8, 8}, /* cost of loading fp registers
248 in SFmode, DFmode and XFmode */
249 {8, 8, 8}, /* cost of storing fp registers
250 in SFmode, DFmode and XFmode */
251 2, /* cost of moving MMX register */
252 {4, 8}, /* cost of loading MMX registers
253 in SImode and DImode */
254 {4, 8}, /* cost of storing MMX registers
255 in SImode and DImode */
256 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
257 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
258 in 32,64,128,256 and 512-bit */
259 {4, 8, 16, 32, 64}, /* cost of unaligned loads; same values as the aligned loads above. */
260 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
261 in 32,64,128,256 and 512-bit */
262 {4, 8, 16, 32, 64}, /* cost of unaligned stores; same values as the aligned stores above. */
263 3, 3, /* SSE->integer and integer->SSE moves */
264 4, 4, /* Gather load static, per_elt. */
265 4, 4, /* Gather store static, per_elt. */
266 4, /* size of l1 cache. 486 has 8kB cache
267 shared for code and data, so 4kB is
268 not really precise. */
269 4, /* size of l2 cache; repeats the 4 (kB) used for l1 above */
270 0, /* size of prefetch block */
271 0, /* number of parallel prefetches */
272 1, /* Branch cost */
273 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
274 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
275 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
276 COSTS_N_INSNS (3), /* cost of FABS instruction. */
277 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
278 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
279
280 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
281 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
282 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
283 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
284 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
285 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
286 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
287 COSTS_N_INSNS (74), /* cost of DIVSD instruction. NOTE(review): one more than the 73 used for DIVSS and FDIV above -- confirm this is intended. */
288 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
289 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
290 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
291 i486_memcpy,
292 i486_memset,
293 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
294 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
295 "16", /* Loop alignment. */
296 "16", /* Jump alignment. */
297 "0:0:8", /* Label alignment. */
298 "16", /* Func alignment. */
299 };
300
301 static stringop_algs pentium_memcpy[2] = { /* memcpy strategy referenced by pentium_cost and lakemont_cost; presumably the 256 is a block-size bound for rep_prefix_4_byte with libcall beyond it -- confirm against the stringop_algs definition */
302 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
303 DUMMY_STRINGOP_ALGS};
304 static stringop_algs pentium_memset[2] = { /* memset strategy referenced by pentium_cost and lakemont_cost; second entry is the DUMMY_STRINGOP_ALGS placeholder */
305 {libcall, {{-1, rep_prefix_4_byte, false}}},
306 DUMMY_STRINGOP_ALGS};
307
308 static const
309 struct processor_costs pentium_cost = { /* Pentium cost table; instruction costs are given via COSTS_N_INSNS, string operations use pentium_memcpy and pentium_memset */
310 COSTS_N_INSNS (1), /* cost of an add instruction */
311 COSTS_N_INSNS (1), /* cost of a lea instruction */
312 COSTS_N_INSNS (4), /* variable shift costs */
313 COSTS_N_INSNS (1), /* constant shift costs */
314 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
315 COSTS_N_INSNS (11), /* HI */
316 COSTS_N_INSNS (11), /* SI */
317 COSTS_N_INSNS (11), /* DI */
318 COSTS_N_INSNS (11)}, /* other */
319 0, /* cost of multiply per each bit set */
320 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
321 COSTS_N_INSNS (25), /* HI */
322 COSTS_N_INSNS (25), /* SI */
323 COSTS_N_INSNS (25), /* DI */
324 COSTS_N_INSNS (25)}, /* other */
325 COSTS_N_INSNS (3), /* cost of movsx */
326 COSTS_N_INSNS (2), /* cost of movzx */
327 8, /* "large" insn */
328 6, /* MOVE_RATIO */
329
330 /* All move costs are relative to integer->integer move times 2 and thus
331 they are latency*2. */
332 6, /* cost for loading QImode using movzbl */
333 {2, 4, 2}, /* cost of loading integer registers
334 in QImode, HImode and SImode.
335 Relative to reg-reg move (2). */
336 {2, 4, 2}, /* cost of storing integer registers */
337 2, /* cost of reg,reg fld/fst */
338 {2, 2, 6}, /* cost of loading fp registers
339 in SFmode, DFmode and XFmode */
340 {4, 4, 6}, /* cost of storing fp registers
341 in SFmode, DFmode and XFmode */
342 8, /* cost of moving MMX register */
343 {8, 8}, /* cost of loading MMX registers
344 in SImode and DImode */
345 {8, 8}, /* cost of storing MMX registers
346 in SImode and DImode */
347 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
348 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
349 in 32,64,128,256 and 512-bit */
350 {4, 8, 16, 32, 64}, /* cost of unaligned loads; same values as the aligned loads above. */
351 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
352 in 32,64,128,256 and 512-bit */
353 {4, 8, 16, 32, 64}, /* cost of unaligned stores; same values as the aligned stores above. */
354 3, 3, /* SSE->integer and integer->SSE moves */
355 4, 4, /* Gather load static, per_elt. */
356 4, 4, /* Gather store static, per_elt. */
357 8, /* size of l1 cache. */
358 8, /* size of l2 cache; same figure as l1 above */
359 0, /* size of prefetch block */
360 0, /* number of parallel prefetches */
361 2, /* Branch cost */
362 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
363 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
364 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
365 COSTS_N_INSNS (1), /* cost of FABS instruction. */
366 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
367 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
368
369 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
370 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
371 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
372 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
373 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
374 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
375 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
376 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
377 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
378 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
379 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
380 pentium_memcpy,
381 pentium_memset,
382 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
383 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
384 "16:8:8", /* Loop alignment. */
385 "16:8:8", /* Jump alignment. */
386 "0:0:8", /* Label alignment. */
387 "16", /* Func alignment. */
388 };
389
390 static const
391 struct processor_costs lakemont_cost = { /* Lakemont cost table; it has no string-op tables of its own and reuses pentium_memcpy and pentium_memset */
392 COSTS_N_INSNS (1), /* cost of an add instruction */
393 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; one unit above the add cost */
394 COSTS_N_INSNS (1), /* variable shift costs */
395 COSTS_N_INSNS (1), /* constant shift costs */
396 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
397 COSTS_N_INSNS (11), /* HI */
398 COSTS_N_INSNS (11), /* SI */
399 COSTS_N_INSNS (11), /* DI */
400 COSTS_N_INSNS (11)}, /* other */
401 0, /* cost of multiply per each bit set */
402 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
403 COSTS_N_INSNS (25), /* HI */
404 COSTS_N_INSNS (25), /* SI */
405 COSTS_N_INSNS (25), /* DI */
406 COSTS_N_INSNS (25)}, /* other */
407 COSTS_N_INSNS (3), /* cost of movsx */
408 COSTS_N_INSNS (2), /* cost of movzx */
409 8, /* "large" insn */
410 17, /* MOVE_RATIO */
411
412 /* All move costs are relative to integer->integer move times 2 and thus
413 they are latency*2. */
414 6, /* cost for loading QImode using movzbl */
415 {2, 4, 2}, /* cost of loading integer registers
416 in QImode, HImode and SImode.
417 Relative to reg-reg move (2). */
418 {2, 4, 2}, /* cost of storing integer registers */
419 2, /* cost of reg,reg fld/fst */
420 {2, 2, 6}, /* cost of loading fp registers
421 in SFmode, DFmode and XFmode */
422 {4, 4, 6}, /* cost of storing fp registers
423 in SFmode, DFmode and XFmode */
424 8, /* cost of moving MMX register */
425 {8, 8}, /* cost of loading MMX registers
426 in SImode and DImode */
427 {8, 8}, /* cost of storing MMX registers
428 in SImode and DImode */
429 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
430 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
431 in 32,64,128,256 and 512-bit */
432 {4, 8, 16, 32, 64}, /* cost of unaligned loads; same values as the aligned loads above. */
433 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
434 in 32,64,128,256 and 512-bit */
435 {4, 8, 16, 32, 64}, /* cost of unaligned stores; same values as the aligned stores above. */
436 3, 3, /* SSE->integer and integer->SSE moves */
437 4, 4, /* Gather load static, per_elt. */
438 4, 4, /* Gather store static, per_elt. */
439 8, /* size of l1 cache. */
440 8, /* size of l2 cache; same figure as l1 above */
441 0, /* size of prefetch block */
442 0, /* number of parallel prefetches */
443 2, /* Branch cost */
444 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
445 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
446 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
447 COSTS_N_INSNS (1), /* cost of FABS instruction. */
448 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
449 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
450
451 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
452 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
453 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
454 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
455 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
456 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
457 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
458 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
459 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
460 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
461 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
462 pentium_memcpy, /* shared with pentium_cost */
463 pentium_memset, /* shared with pentium_cost */
464 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
465 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
466 "16:8:8", /* Loop alignment. */
467 "16:8:8", /* Jump alignment. */
468 "0:0:8", /* Label alignment. */
469 "16", /* Func alignment. */
470 };
471
472 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
473 (we ensure the alignment). For small blocks inline loop is still a
474 noticeable win, for bigger blocks either rep movsl or rep movsb is
475 the way to go. Rep movsb has apparently more expensive startup time in CPU,
476 but after 4K the difference is down in the noise. */
477 static stringop_algs pentiumpro_memcpy[2] = { /* first entry steps through loop, unrolled_loop, rep_prefix_4_byte and rep_prefix_1_byte; second entry is the DUMMY_STRINGOP_ALGS placeholder */
478 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
479 {8192, rep_prefix_4_byte, false},
480 {-1, rep_prefix_1_byte, false}}},
481 DUMMY_STRINGOP_ALGS};
482 static stringop_algs pentiumpro_memset[2] = { /* memset strategy referenced by pentiumpro_cost: unrolled_loop, then rep_prefix_4_byte, then libcall; second entry is the DUMMY_STRINGOP_ALGS placeholder */
483 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
484 {8192, rep_prefix_4_byte, false},
485 {-1, libcall, false}}},
486 DUMMY_STRINGOP_ALGS};
487 static const
488 struct processor_costs pentiumpro_cost = { /* PentiumPro cost table; instruction costs are given via COSTS_N_INSNS, string operations use pentiumpro_memcpy and pentiumpro_memset */
489 COSTS_N_INSNS (1), /* cost of an add instruction */
490 COSTS_N_INSNS (1), /* cost of a lea instruction */
491 COSTS_N_INSNS (1), /* variable shift costs */
492 COSTS_N_INSNS (1), /* constant shift costs */
493 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
494 COSTS_N_INSNS (4), /* HI */
495 COSTS_N_INSNS (4), /* SI */
496 COSTS_N_INSNS (4), /* DI */
497 COSTS_N_INSNS (4)}, /* other */
498 0, /* cost of multiply per each bit set */
499 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
500 COSTS_N_INSNS (17), /* HI */
501 COSTS_N_INSNS (17), /* SI */
502 COSTS_N_INSNS (17), /* DI */
503 COSTS_N_INSNS (17)}, /* other */
504 COSTS_N_INSNS (1), /* cost of movsx */
505 COSTS_N_INSNS (1), /* cost of movzx */
506 8, /* "large" insn */
507 6, /* MOVE_RATIO */
508
509 /* All move costs are relative to integer->integer move times 2 and thus
510 they are latency*2. */
511 2, /* cost for loading QImode using movzbl */
512 {4, 4, 4}, /* cost of loading integer registers
513 in QImode, HImode and SImode.
514 Relative to reg-reg move (2). */
515 {2, 2, 2}, /* cost of storing integer registers */
516 2, /* cost of reg,reg fld/fst */
517 {2, 2, 6}, /* cost of loading fp registers
518 in SFmode, DFmode and XFmode */
519 {4, 4, 6}, /* cost of storing fp registers
520 in SFmode, DFmode and XFmode */
521 2, /* cost of moving MMX register */
522 {2, 2}, /* cost of loading MMX registers
523 in SImode and DImode */
524 {2, 2}, /* cost of storing MMX registers
525 in SImode and DImode */
526 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
527 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
528 in 32,64,128,256 and 512-bit */
529 {4, 8, 16, 32, 64}, /* cost of unaligned loads; same values as the aligned loads above. */
530 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
531 in 32,64,128,256 and 512-bit */
532 {4, 8, 16, 32, 64}, /* cost of unaligned stores; same values as the aligned stores above. */
533 3, 3, /* SSE->integer and integer->SSE moves */
534 4, 4, /* Gather load static, per_elt. */
535 4, 4, /* Gather store static, per_elt. */
536 8, /* size of l1 cache. */
537 256, /* size of l2 cache */
538 32, /* size of prefetch block */
539 6, /* number of parallel prefetches */
540 2, /* Branch cost */
541 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
542 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
543 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
544 COSTS_N_INSNS (2), /* cost of FABS instruction. */
545 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
546 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
547
548 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
549 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
550 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
551 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
552 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
553 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
554 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
555 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
556 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
557 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
558 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
559 pentiumpro_memcpy,
560 pentiumpro_memset,
561 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
562 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
563 "16", /* Loop alignment. */
564 "16:11:8", /* Jump alignment. */
565 "0:0:8", /* Label alignment. */
566 "16", /* Func alignment. */
567 };
568
569 static stringop_algs geode_memcpy[2] = { /* memcpy strategy referenced by geode_cost: rep_prefix_4_byte with a 256 bound, then libcall; second entry is the DUMMY_STRINGOP_ALGS placeholder */
570 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
571 DUMMY_STRINGOP_ALGS};
572 static stringop_algs geode_memset[2] = { /* memset strategy referenced by geode_cost; same initializer as geode_memcpy */
573 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
574 DUMMY_STRINGOP_ALGS};
575 static const
576 struct processor_costs geode_cost = { /* Geode cost table; instruction costs are given via COSTS_N_INSNS, string operations use geode_memcpy and geode_memset */
577 COSTS_N_INSNS (1), /* cost of an add instruction */
578 COSTS_N_INSNS (1), /* cost of a lea instruction */
579 COSTS_N_INSNS (2), /* variable shift costs */
580 COSTS_N_INSNS (1), /* constant shift costs */
581 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
582 COSTS_N_INSNS (4), /* HI */
583 COSTS_N_INSNS (7), /* SI */
584 COSTS_N_INSNS (7), /* DI */
585 COSTS_N_INSNS (7)}, /* other */
586 0, /* cost of multiply per each bit set */
587 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
588 COSTS_N_INSNS (23), /* HI */
589 COSTS_N_INSNS (39), /* SI */
590 COSTS_N_INSNS (39), /* DI */
591 COSTS_N_INSNS (39)}, /* other */
592 COSTS_N_INSNS (1), /* cost of movsx */
593 COSTS_N_INSNS (1), /* cost of movzx */
594 8, /* "large" insn */
595 4, /* MOVE_RATIO */
596
597 /* All move costs are relative to integer->integer move times 2 and thus
598 they are latency*2. */
599 2, /* cost for loading QImode using movzbl */
600 {2, 2, 2}, /* cost of loading integer registers
601 in QImode, HImode and SImode.
602 Relative to reg-reg move (2). */
603 {2, 2, 2}, /* cost of storing integer registers */
604 2, /* cost of reg,reg fld/fst */
605 {2, 2, 2}, /* cost of loading fp registers
606 in SFmode, DFmode and XFmode */
607 {4, 6, 6}, /* cost of storing fp registers
608 in SFmode, DFmode and XFmode */
609
610 2, /* cost of moving MMX register */
611 {2, 2}, /* cost of loading MMX registers
612 in SImode and DImode */
613 {2, 2}, /* cost of storing MMX registers
614 in SImode and DImode */
615 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
616 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
617 in 32,64,128,256 and 512-bit */
618 {2, 2, 8, 16, 32}, /* cost of unaligned loads; same values as the aligned loads above. */
619 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
620 in 32,64,128,256 and 512-bit */
621 {2, 2, 8, 16, 32}, /* cost of unaligned stores; same values as the aligned stores above. */
622 6, 6, /* SSE->integer and integer->SSE moves */
623 2, 2, /* Gather load static, per_elt. */
624 2, 2, /* Gather store static, per_elt. */
625 64, /* size of l1 cache. */
626 128, /* size of l2 cache. */
627 32, /* size of prefetch block */
628 1, /* number of parallel prefetches */
629 1, /* Branch cost */
630 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (1), /* cost of FABS instruction. */
634 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
636
637 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
638 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
639 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
640 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
641 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
642 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
643 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
644 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
645 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
646 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
647 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
648 geode_memcpy,
649 geode_memset,
650 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
651 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
652 NULL, /* Loop alignment. */
653 NULL, /* Jump alignment. */
654 NULL, /* Label alignment. */
655 NULL, /* Func alignment. */
656 };
657
658 static stringop_algs k6_memcpy[2] = { /* memcpy strategy referenced by k6_cost: rep_prefix_4_byte with a 256 bound, then libcall; second entry is the DUMMY_STRINGOP_ALGS placeholder */
659 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
660 DUMMY_STRINGOP_ALGS};
661 static stringop_algs k6_memset[2] = { /* memset strategy referenced by k6_cost; same initializer as k6_memcpy */
662 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
663 DUMMY_STRINGOP_ALGS};
664 static const
665 struct processor_costs k6_cost = { /* K6 cost table; instruction costs are given via COSTS_N_INSNS, string operations use k6_memcpy and k6_memset */
666 COSTS_N_INSNS (1), /* cost of an add instruction */
667 COSTS_N_INSNS (2), /* cost of a lea instruction */
668 COSTS_N_INSNS (1), /* variable shift costs */
669 COSTS_N_INSNS (1), /* constant shift costs */
670 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
671 COSTS_N_INSNS (3), /* HI */
672 COSTS_N_INSNS (3), /* SI */
673 COSTS_N_INSNS (3), /* DI */
674 COSTS_N_INSNS (3)}, /* other */
675 0, /* cost of multiply per each bit set */
676 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
677 COSTS_N_INSNS (18), /* HI */
678 COSTS_N_INSNS (18), /* SI */
679 COSTS_N_INSNS (18), /* DI */
680 COSTS_N_INSNS (18)}, /* other */
681 COSTS_N_INSNS (2), /* cost of movsx */
682 COSTS_N_INSNS (2), /* cost of movzx */
683 8, /* "large" insn */
684 4, /* MOVE_RATIO */
685
686 /* All move costs are relative to integer->integer move times 2 and thus
687 they are latency*2. */
688 3, /* cost for loading QImode using movzbl */
689 {4, 5, 4}, /* cost of loading integer registers
690 in QImode, HImode and SImode.
691 Relative to reg-reg move (2). */
692 {2, 3, 2}, /* cost of storing integer registers */
693 4, /* cost of reg,reg fld/fst */
694 {6, 6, 6}, /* cost of loading fp registers
695 in SFmode, DFmode and XFmode */
696 {4, 4, 4}, /* cost of storing fp registers
697 in SFmode, DFmode and XFmode */
698 2, /* cost of moving MMX register */
699 {2, 2}, /* cost of loading MMX registers
700 in SImode and DImode */
701 {2, 2}, /* cost of storing MMX registers
702 in SImode and DImode */
703 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
704 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
705 in 32,64,128,256 and 512-bit */
706 {2, 2, 8, 16, 32}, /* cost of unaligned loads; same values as the aligned loads above. */
707 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
708 in 32,64,128,256 and 512-bit */
709 {2, 2, 8, 16, 32}, /* cost of unaligned stores; same values as the aligned stores above. */
710 6, 6, /* SSE->integer and integer->SSE moves */
711 2, 2, /* Gather load static, per_elt. */
712 2, 2, /* Gather store static, per_elt. */
713 32, /* size of l1 cache. */
714 32, /* size of l2 cache. Some models
715 have integrated l2 cache, but
716 optimizing for k6 is not important
717 enough to worry about that. */
718 32, /* size of prefetch block */
719 1, /* number of parallel prefetches */
720 1, /* Branch cost */
721 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
722 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
723 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
724 COSTS_N_INSNS (2), /* cost of FABS instruction. */
725 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
726 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
727
728 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
729 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
730 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
731 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
732 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
733 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
734 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
735 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
736 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
737 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
738 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
739 k6_memcpy,
740 k6_memset,
741 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
742 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
743 "32:8:8", /* Loop alignment. */
744 "32:8:8", /* Jump alignment. */
745 "0:0:8", /* Label alignment. */
746 "32", /* Func alignment. */
747 };
748
749 /* For some reason, Athlon deals better with REP prefix (relative to loops)
750 compared to K8. Alignment becomes important after 8 bytes for memcpy and
751 128 bytes for memset. */
/* memcpy algorithm table for Athlon tuning.  Entry 0 has the form
   {alg, {{max, alg, flag}, ...}} with the list ending at max == -1;
   entry 1 is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
752 static stringop_algs athlon_memcpy[2] = {
753 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
754 DUMMY_STRINGOP_ALGS};
/* memset algorithm table for Athlon tuning.  Entry 0 has the form
   {alg, {{max, alg, flag}, ...}} with the list ending at max == -1;
   entry 1 is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
755 static stringop_algs athlon_memset[2] = {
756 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
757 DUMMY_STRINGOP_ALGS};
/* Cost table for Athlon tuning.  The initializer is positional: the trailing
   comment on each entry names the field it fills, so entries must not be
   reordered.  COSTS_N_INSNS (N) entries are instruction costs relative to
   an add (the first entry); bare integers are move costs, cache parameters
   and other tuning knobs as labelled.  */
758 static const
759 struct processor_costs athlon_cost = {
760 COSTS_N_INSNS (1), /* cost of an add instruction */
761 COSTS_N_INSNS (2), /* cost of a lea instruction */
762 COSTS_N_INSNS (1), /* variable shift costs */
763 COSTS_N_INSNS (1), /* constant shift costs */
764 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
765 COSTS_N_INSNS (5), /* HI */
766 COSTS_N_INSNS (5), /* SI */
767 COSTS_N_INSNS (5), /* DI */
768 COSTS_N_INSNS (5)}, /* other */
769 0, /* cost of multiply per each bit set */
770 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
771 COSTS_N_INSNS (26), /* HI */
772 COSTS_N_INSNS (42), /* SI */
773 COSTS_N_INSNS (74), /* DI */
774 COSTS_N_INSNS (74)}, /* other */
775 COSTS_N_INSNS (1), /* cost of movsx */
776 COSTS_N_INSNS (1), /* cost of movzx */
777 8, /* "large" insn */
778 9, /* MOVE_RATIO */
779
780 /* All move costs are relative to integer->integer move times 2 and thus
781 they are latency*2. */
782 4, /* cost for loading QImode using movzbl */
783 {3, 4, 3}, /* cost of loading integer registers
784 in QImode, HImode and SImode.
785 Relative to reg-reg move (2). */
786 {3, 4, 3}, /* cost of storing integer registers */
787 4, /* cost of reg,reg fld/fst */
788 {4, 4, 12}, /* cost of loading fp registers
789 in SFmode, DFmode and XFmode */
790 {6, 6, 8}, /* cost of storing fp registers
791 in SFmode, DFmode and XFmode */
792 2, /* cost of moving MMX register */
793 {4, 4}, /* cost of loading MMX registers
794 in SImode and DImode */
795 {4, 4}, /* cost of storing MMX registers
796 in SImode and DImode */
797 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
798 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
799 in 32,64,128,256 and 512-bit */
800 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
801 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
802 in 32,64,128,256 and 512-bit */
803 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
804 5, 5, /* SSE->integer and integer->SSE moves */
805 4, 4, /* Gather load static, per_elt. */
806 4, 4, /* Gather store static, per_elt. */
807 64, /* size of l1 cache. */
808 256, /* size of l2 cache. */
809 64, /* size of prefetch block */
810 6, /* number of parallel prefetches */
811 5, /* Branch cost */
812 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
813 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
814 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
815 COSTS_N_INSNS (2), /* cost of FABS instruction. */
816 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
817 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
818
819 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
820 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
821 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
822 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
823 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
824 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
825 /* 11-16 (NOTE(review): presumably the DIVSS latency range; confirm) */
826 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
827 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
828 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
829 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
830 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
831 athlon_memcpy,
832 athlon_memset,
833 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
834 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
835 "16:8:8", /* Loop alignment. */
836 "16:8:8", /* Jump alignment. */
837 "0:0:8", /* Label alignment. */
838 "16", /* Func alignment. */
839 };
840
841 /* K8 has optimized REP instruction for medium sized blocks, but for very
842 small blocks it is better to use a loop. For large blocks, libcall can
843 do non-temporal accesses and beat inline considerably. */
/* memcpy algorithm table for K8 tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
844 static stringop_algs k8_memcpy[2] = {
845 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
846 {-1, rep_prefix_4_byte, false}}},
847 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
848 {-1, libcall, false}}}};
/* memset algorithm table for K8 tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
849 static stringop_algs k8_memset[2] = {
850 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
851 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
852 {libcall, {{48, unrolled_loop, false},
853 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for K8 tuning.  The initializer is positional: the trailing
   comment on each entry names the field it fills, so entries must not be
   reordered.  COSTS_N_INSNS (N) entries are instruction costs relative to
   an add (the first entry); bare integers are move costs, cache parameters
   and other tuning knobs as labelled.  */
854 static const
855 struct processor_costs k8_cost = {
856 COSTS_N_INSNS (1), /* cost of an add instruction */
857 COSTS_N_INSNS (2), /* cost of a lea instruction */
858 COSTS_N_INSNS (1), /* variable shift costs */
859 COSTS_N_INSNS (1), /* constant shift costs */
860 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
861 COSTS_N_INSNS (4), /* HI */
862 COSTS_N_INSNS (3), /* SI */
863 COSTS_N_INSNS (4), /* DI */
864 COSTS_N_INSNS (5)}, /* other */
865 0, /* cost of multiply per each bit set */
866 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
867 COSTS_N_INSNS (26), /* HI */
868 COSTS_N_INSNS (42), /* SI */
869 COSTS_N_INSNS (74), /* DI */
870 COSTS_N_INSNS (74)}, /* other */
871 COSTS_N_INSNS (1), /* cost of movsx */
872 COSTS_N_INSNS (1), /* cost of movzx */
873 8, /* "large" insn */
874 9, /* MOVE_RATIO */
875
876 /* All move costs are relative to integer->integer move times 2 and thus
877 they are latency*2. */
878 4, /* cost for loading QImode using movzbl */
879 {3, 4, 3}, /* cost of loading integer registers
880 in QImode, HImode and SImode.
881 Relative to reg-reg move (2). */
882 {3, 4, 3}, /* cost of storing integer registers */
883 4, /* cost of reg,reg fld/fst */
884 {4, 4, 12}, /* cost of loading fp registers
885 in SFmode, DFmode and XFmode */
886 {6, 6, 8}, /* cost of storing fp registers
887 in SFmode, DFmode and XFmode */
888 2, /* cost of moving MMX register */
889 {3, 3}, /* cost of loading MMX registers
890 in SImode and DImode */
891 {4, 4}, /* cost of storing MMX registers
892 in SImode and DImode */
893 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
894 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
895 in 32,64,128,256 and 512-bit */
896 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
897 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
898 in 32,64,128,256 and 512-bit */
899 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
900 5, 5, /* SSE->integer and integer->SSE moves */
901 4, 4, /* Gather load static, per_elt. */
902 4, 4, /* Gather store static, per_elt. */
903 64, /* size of l1 cache. */
904 512, /* size of l2 cache. */
905 64, /* size of prefetch block */
906 /* New AMD processors never drop prefetches; if they cannot be performed
907 immediately, they are queued. We set number of simultaneous prefetches
908 to a large constant to reflect this (it probably is not a good idea not
909 to limit number of prefetches at all, as their execution also takes some
910 time). */
911 100, /* number of parallel prefetches */
912 3, /* Branch cost */
913 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
914 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
915 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
916 COSTS_N_INSNS (2), /* cost of FABS instruction. */
917 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
918 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
919
920 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
921 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
922 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
923 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
924 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
925 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
926 /* 11-16 (NOTE(review): presumably the DIVSS latency range; confirm) */
927 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
928 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
929 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
930 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
931 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
932 k8_memcpy,
933 k8_memset,
934 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
935 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
936 "16:8:8", /* Loop alignment. */
937 "16:8:8", /* Jump alignment. */
938 "0:0:8", /* Label alignment. */
939 "16", /* Func alignment. */
940 };
941
942 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
943 very small blocks it is better to use a loop. For large blocks, libcall can
944 do non-temporal accesses and beat inline considerably. */
/* memcpy algorithm table for AMDFAM10 tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
945 static stringop_algs amdfam10_memcpy[2] = {
946 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
947 {-1, rep_prefix_4_byte, false}}},
948 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
949 {-1, libcall, false}}}};
/* memset algorithm table for AMDFAM10 tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
950 static stringop_algs amdfam10_memset[2] = {
951 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
952 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
953 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
954 {-1, libcall, false}}}};
/* Cost table for AMDFAM10 tuning.  The initializer is positional: the
   trailing comment on each entry names the field it fills, so entries must
   not be reordered.  COSTS_N_INSNS (N) entries are instruction costs
   relative to an add (the first entry); bare integers are move costs, cache
   parameters and other tuning knobs as labelled.
   NOTE(review): this table is declared without `static' or `const' --
   confirm that a writable, externally visible object is intended.  */
955 struct processor_costs amdfam10_cost = {
956 COSTS_N_INSNS (1), /* cost of an add instruction */
957 COSTS_N_INSNS (2), /* cost of a lea instruction */
958 COSTS_N_INSNS (1), /* variable shift costs */
959 COSTS_N_INSNS (1), /* constant shift costs */
960 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
961 COSTS_N_INSNS (4), /* HI */
962 COSTS_N_INSNS (3), /* SI */
963 COSTS_N_INSNS (4), /* DI */
964 COSTS_N_INSNS (5)}, /* other */
965 0, /* cost of multiply per each bit set */
966 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
967 COSTS_N_INSNS (35), /* HI */
968 COSTS_N_INSNS (51), /* SI */
969 COSTS_N_INSNS (83), /* DI */
970 COSTS_N_INSNS (83)}, /* other */
971 COSTS_N_INSNS (1), /* cost of movsx */
972 COSTS_N_INSNS (1), /* cost of movzx */
973 8, /* "large" insn */
974 9, /* MOVE_RATIO */
975
976 /* All move costs are relative to integer->integer move times 2 and thus
977 they are latency*2. */
978 4, /* cost for loading QImode using movzbl */
979 {3, 4, 3}, /* cost of loading integer registers
980 in QImode, HImode and SImode.
981 Relative to reg-reg move (2). */
982 {3, 4, 3}, /* cost of storing integer registers */
983 4, /* cost of reg,reg fld/fst */
984 {4, 4, 12}, /* cost of loading fp registers
985 in SFmode, DFmode and XFmode */
986 {6, 6, 8}, /* cost of storing fp registers
987 in SFmode, DFmode and XFmode */
988 2, /* cost of moving MMX register */
989 {3, 3}, /* cost of loading MMX registers
990 in SImode and DImode */
991 {4, 4}, /* cost of storing MMX registers
992 in SImode and DImode */
993 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
994 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
995 in 32,64,128,256 and 512-bit */
996 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
997 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
998 in 32,64,128,256 and 512-bit */
999 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1000 3, 3, /* SSE->integer and integer->SSE moves */
1001 /* Rationale for the SSE<->integer move cost of 3 above.  On K8:
1002 MOVD reg64, xmmreg Double FSTORE 4
1003 MOVD reg32, xmmreg Double FSTORE 4
1004 On AMDFAM10:
1005 MOVD reg64, xmmreg Double FADD 3
1006 1/1 1/1
1007 MOVD reg32, xmmreg Double FADD 3
1008 1/1 1/1 */
1009 4, 4, /* Gather load static, per_elt. */
1010 4, 4, /* Gather store static, per_elt. */
1011 64, /* size of l1 cache. */
1012 512, /* size of l2 cache. */
1013 64, /* size of prefetch block */
1014 /* New AMD processors never drop prefetches; if they cannot be performed
1015 immediately, they are queued. We set number of simultaneous prefetches
1016 to a large constant to reflect this (it probably is not a good idea not
1017 to limit number of prefetches at all, as their execution also takes some
1018 time). */
1019 100, /* number of parallel prefetches */
1020 2, /* Branch cost */
1021 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1022 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1023 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1024 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1025 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1026 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1027
1028 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1029 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1030 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1031 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1032 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1033 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1034 /* 11-16 (NOTE(review): presumably the DIVSS latency range; confirm) */
1035 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1036 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1037 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1038 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1039 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1040 amdfam10_memcpy,
1041 amdfam10_memset,
1042 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1043 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1044 "32:25:8", /* Loop alignment. */
1045 "32:8:8", /* Jump alignment. */
1046 "0:0:8", /* Label alignment. */
1047 "32", /* Func alignment. */
1048 };
1049
1050 /* BDVER has optimized REP instruction for medium sized blocks, but for
1051 very small blocks it is better to use a loop. For large blocks, libcall
1052 can do non-temporal accesses and beat inline considerably. */
/* memcpy algorithm table for BDVER tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
1053 static stringop_algs bdver_memcpy[2] = {
1054 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1055 {-1, rep_prefix_4_byte, false}}},
1056 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1057 {-1, libcall, false}}}};
/* memset algorithm table for BDVER tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
1058 static stringop_algs bdver_memset[2] = {
1059 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1060 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1061 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1062 {-1, libcall, false}}}};
1063
/* Cost table for BDVER tuning.  The initializer is positional: the trailing
   comment on each entry names the field it fills, so entries must not be
   reordered.  COSTS_N_INSNS (N) entries are instruction costs relative to
   an add (the first entry); bare integers are move costs, cache parameters
   and other tuning knobs as labelled.  */
1064 const struct processor_costs bdver_cost = {
1065 COSTS_N_INSNS (1), /* cost of an add instruction */
1066 COSTS_N_INSNS (1), /* cost of a lea instruction */
1067 COSTS_N_INSNS (1), /* variable shift costs */
1068 COSTS_N_INSNS (1), /* constant shift costs */
1069 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1070 COSTS_N_INSNS (4), /* HI */
1071 COSTS_N_INSNS (4), /* SI */
1072 COSTS_N_INSNS (6), /* DI */
1073 COSTS_N_INSNS (6)}, /* other */
1074 0, /* cost of multiply per each bit set */
1075 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1076 COSTS_N_INSNS (35), /* HI */
1077 COSTS_N_INSNS (51), /* SI */
1078 COSTS_N_INSNS (83), /* DI */
1079 COSTS_N_INSNS (83)}, /* other */
1080 COSTS_N_INSNS (1), /* cost of movsx */
1081 COSTS_N_INSNS (1), /* cost of movzx */
1082 8, /* "large" insn */
1083 9, /* MOVE_RATIO */
1084
1085 /* All move costs are relative to integer->integer move times 2 and thus
1086 they are latency*2. */
1087 8, /* cost for loading QImode using movzbl */
1088 {8, 8, 8}, /* cost of loading integer registers
1089 in QImode, HImode and SImode.
1090 Relative to reg-reg move (2). */
1091 {8, 8, 8}, /* cost of storing integer registers */
1092 4, /* cost of reg,reg fld/fst */
1093 {12, 12, 28}, /* cost of loading fp registers
1094 in SFmode, DFmode and XFmode */
1095 {10, 10, 18}, /* cost of storing fp registers
1096 in SFmode, DFmode and XFmode */
1097 4, /* cost of moving MMX register */
1098 {12, 12}, /* cost of loading MMX registers
1099 in SImode and DImode */
1100 {10, 10}, /* cost of storing MMX registers
1101 in SImode and DImode */
1102 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1103 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1104 in 32,64,128,256 and 512-bit */
1105 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1106 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1107 in 32,64,128,256 and 512-bit */
1108 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
1109 16, 20, /* SSE->integer and integer->SSE moves */
1110 12, 12, /* Gather load static, per_elt. */
1111 10, 10, /* Gather store static, per_elt. */
1112 16, /* size of l1 cache. */
1113 2048, /* size of l2 cache. */
1114 64, /* size of prefetch block */
1115 /* New AMD processors never drop prefetches; if they cannot be performed
1116 immediately, they are queued. We set number of simultaneous prefetches
1117 to a large constant to reflect this (it probably is not a good idea not
1118 to limit number of prefetches at all, as their execution also takes some
1119 time). */
1120 100, /* number of parallel prefetches */
1121 2, /* Branch cost */
1122 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1123 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1124 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1125 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1126 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1127 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1128
1129 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1130 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1131 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1132 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1133 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1134 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1135 /* 9-24 (NOTE(review): presumably the DIVSS latency range; confirm) */
1136 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1137 /* 9-27 (NOTE(review): presumably the DIVSD latency range; confirm) */
1138 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1139 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1140 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1141 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1142 bdver_memcpy,
1143 bdver_memset,
1144 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1145 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1146 "16:11:8", /* Loop alignment. */
1147 "16:8:8", /* Jump alignment. */
1148 "0:0:8", /* Label alignment. */
1149 "11", /* Func alignment. */
1150 };
1151
1152
1153 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for
1154 very small blocks it is better to use a loop. For large blocks, libcall
1155 can do non-temporal accesses and beat inline considerably. */
/* memcpy algorithm table for ZNVER1 tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
1156 static stringop_algs znver1_memcpy[2] = {
1157 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1158 {-1, rep_prefix_4_byte, false}}},
1159 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1160 {-1, libcall, false}}}};
/* memset algorithm table for ZNVER1 tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
1161 static stringop_algs znver1_memset[2] = {
1162 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1163 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1164 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1165 {-1, libcall, false}}}};
/* Cost table for ZNVER1 tuning.  The initializer is positional: the trailing
   comment on each entry names the field it fills, so entries must not be
   reordered.  COSTS_N_INSNS (N) entries are instruction costs relative to
   an add (the first entry); bare integers are move costs, cache parameters
   and other tuning knobs as labelled.
   NOTE(review): this table is declared without `static' or `const' --
   confirm that a writable, externally visible object is intended.  */
1166 struct processor_costs znver1_cost = {
1167 COSTS_N_INSNS (1), /* cost of an add instruction. */
1168 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1169 COSTS_N_INSNS (1), /* variable shift costs. */
1170 COSTS_N_INSNS (1), /* constant shift costs. */
1171 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1172 COSTS_N_INSNS (3), /* HI. */
1173 COSTS_N_INSNS (3), /* SI. */
1174 COSTS_N_INSNS (3), /* DI. */
1175 COSTS_N_INSNS (3)}, /* other. */
1176 0, /* cost of multiply per each bit
1177 set. */
1178 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1179 bound. */
1180 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1181 COSTS_N_INSNS (22), /* HI. */
1182 COSTS_N_INSNS (30), /* SI. */
1183 COSTS_N_INSNS (45), /* DI. */
1184 COSTS_N_INSNS (45)}, /* other. */
1185 COSTS_N_INSNS (1), /* cost of movsx. */
1186 COSTS_N_INSNS (1), /* cost of movzx. */
1187 8, /* "large" insn. */
1188 9, /* MOVE_RATIO. */
1189
1190 /* All move costs are relative to integer->integer move times 2 and thus
1191 they are latency*2. */
1192
1193 /* reg-reg moves are done by renaming and thus they are even cheaper than
1194 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1195 to doubles of latencies, we do not model this correctly. It does not
1196 seem to make practical difference to bump prices up even more. */
1197 6, /* cost for loading QImode using
1198 movzbl. */
1199 {6, 6, 6}, /* cost of loading integer registers
1200 in QImode, HImode and SImode.
1201 Relative to reg-reg move (2). */
1202 {8, 8, 8}, /* cost of storing integer
1203 registers. */
1204 2, /* cost of reg,reg fld/fst. */
1205 {6, 6, 16}, /* cost of loading fp registers
1206 in SFmode, DFmode and XFmode. */
1207 {8, 8, 16}, /* cost of storing fp registers
1208 in SFmode, DFmode and XFmode. */
1209 2, /* cost of moving MMX register. */
1210 {6, 6}, /* cost of loading MMX registers
1211 in SImode and DImode. */
1212 {8, 8}, /* cost of storing MMX registers
1213 in SImode and DImode. */
1214 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1215 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1216 in 32,64,128,256 and 512-bit. */
1217 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1218 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1219 in 32,64,128,256 and 512-bit. */
1220 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
1221 6, 6, /* SSE->integer and integer->SSE moves. */
1222 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1223 throughput 12. Approx 9 uops do not depend on vector size and every load
1224 is 7 uops.
   NOTE(review): the same mnemonic is listed twice with different figures;
   presumably the second refers to another gather form -- confirm. */
1225 18, 8, /* Gather load static, per_elt. */
1226 18, 10, /* Gather store static, per_elt. */
1227 32, /* size of l1 cache. */
1228 512, /* size of l2 cache. */
1229 64, /* size of prefetch block. */
1230 /* New AMD processors never drop prefetches; if they cannot be performed
1231 immediately, they are queued. We set number of simultaneous prefetches
1232 to a large constant to reflect this (it probably is not a good idea not
1233 to limit number of prefetches at all, as their execution also takes some
1234 time). */
1235 100, /* number of parallel prefetches. */
1236 3, /* Branch cost. */
1237 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1238 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1239 /* Latency of fdiv is 8-15. */
1240 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1243 /* Latency of fsqrt is 4-10. */
1244 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1245
1246 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1247 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1248 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1249 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1250 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1251 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1252 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1253 /* 9-13 (NOTE(review): presumably the DIVSD latency range; confirm) */
1254 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1255 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1256 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1257 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1258 and it can execute 2 integer additions and 2 multiplications thus
1259 reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest
1260 that 4 works better than 6 probably due to register pressure.
1261
1262 Integer vector operations are taken by FP unit and execute 3 vector
1263 plus/minus operations per cycle but only one multiply. This is adjusted
1264 in ix86_reassociation_width. */
1265 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1266 znver1_memcpy,
1267 znver1_memset,
1268 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1269 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1270 "16", /* Loop alignment. */
1271 "16", /* Jump alignment. */
1272 "0:0:8", /* Label alignment. */
1273 "16", /* Func alignment. */
1274 };
1275
1276 /* ZNVER2 has optimized REP instruction for medium sized blocks, but for
1277 very small blocks it is better to use a loop. For large blocks, libcall
1278 can do non-temporal accesses and beat inline considerably. */
/* memcpy algorithm table for ZNVER2 tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
1279 static stringop_algs znver2_memcpy[2] = {
1280 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1281 {-1, rep_prefix_4_byte, false}}},
1282 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1283 {-1, libcall, false}}}};
/* memset algorithm table for ZNVER2 tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
1284 static stringop_algs znver2_memset[2] = {
1285 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1286 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1287 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1288 {-1, libcall, false}}}};
1289
/* Cost table for ZNVER2 tuning.  The initializer is positional: the trailing
   comment on each entry names the field it fills, so entries must not be
   reordered.  COSTS_N_INSNS (N) entries are instruction costs relative to
   an add (the first entry); bare integers are move costs, cache parameters
   and other tuning knobs as labelled.
   NOTE(review): this table is declared without `static' or `const' --
   confirm that a writable, externally visible object is intended.  */
1290 struct processor_costs znver2_cost = {
1291 COSTS_N_INSNS (1), /* cost of an add instruction. */
1292 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1293 COSTS_N_INSNS (1), /* variable shift costs. */
1294 COSTS_N_INSNS (1), /* constant shift costs. */
1295 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1296 COSTS_N_INSNS (3), /* HI. */
1297 COSTS_N_INSNS (3), /* SI. */
1298 COSTS_N_INSNS (3), /* DI. */
1299 COSTS_N_INSNS (3)}, /* other. */
1300 0, /* cost of multiply per each bit
1301 set. */
1302 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1303 bound. */
1304 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1305 COSTS_N_INSNS (22), /* HI. */
1306 COSTS_N_INSNS (30), /* SI. */
1307 COSTS_N_INSNS (45), /* DI. */
1308 COSTS_N_INSNS (45)}, /* other. */
1309 COSTS_N_INSNS (1), /* cost of movsx. */
1310 COSTS_N_INSNS (1), /* cost of movzx. */
1311 8, /* "large" insn. */
1312 9, /* MOVE_RATIO. */
1313
1314 /* All move costs are relative to integer->integer move times 2 and thus
1315 they are latency*2. */
1316
1317 /* reg-reg moves are done by renaming and thus they are even cheaper than
1318 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1319 to doubles of latencies, we do not model this correctly. It does not
1320 seem to make practical difference to bump prices up even more. */
1321 6, /* cost for loading QImode using
1322 movzbl. */
1323 {6, 6, 6}, /* cost of loading integer registers
1324 in QImode, HImode and SImode.
1325 Relative to reg-reg move (2). */
1326 {8, 8, 8}, /* cost of storing integer
1327 registers. */
1328 2, /* cost of reg,reg fld/fst. */
1329 {6, 6, 16}, /* cost of loading fp registers
1330 in SFmode, DFmode and XFmode. */
1331 {8, 8, 16}, /* cost of storing fp registers
1332 in SFmode, DFmode and XFmode. */
1333 2, /* cost of moving MMX register. */
1334 {6, 6}, /* cost of loading MMX registers
1335 in SImode and DImode. */
1336 {8, 8}, /* cost of storing MMX registers
1337 in SImode and DImode. */
1338 2, 3, 6, /* cost of moving XMM,YMM,ZMM
1339 register. */
1340 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1341 in 32,64,128,256 and 512-bit. */
1342 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1343 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1344 in 32,64,128,256 and 512-bit. */
1345 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1346 6, 6, /* SSE->integer and integer->SSE
1347 moves. */
1348 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1349 throughput 12. Approx 9 uops do not depend on vector size and every load
1350 is 7 uops.
   NOTE(review): the same mnemonic is listed twice with different figures;
   presumably the second refers to another gather form -- confirm. */
1351 18, 8, /* Gather load static, per_elt. */
1352 18, 10, /* Gather store static, per_elt. */
1353 32, /* size of l1 cache. */
1354 512, /* size of l2 cache. */
1355 64, /* size of prefetch block. */
1356 /* New AMD processors never drop prefetches; if they cannot be performed
1357 immediately, they are queued. We set number of simultaneous prefetches
1358 to a large constant to reflect this (it probably is not a good idea not
1359 to limit number of prefetches at all, as their execution also takes some
1360 time). */
1361 100, /* number of parallel prefetches. */
1362 3, /* Branch cost. */
1363 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1364 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1365 /* Latency of fdiv is 8-15. */
1366 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1367 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1368 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1369 /* Latency of fsqrt is 4-10. */
1370 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1371
1372 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1373 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1374 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1375 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1376 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1377 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1378 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1379 /* 9-13 (NOTE(review): presumably the DIVSD latency range; confirm). */
1380 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1381 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1382 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1383 /* Zen can execute 4 integer operations per cycle. FP operations
1384 take 3 cycles and it can execute 2 integer additions and 2
1385 multiplications thus reassociation may make sense up to a width of 6.
1386 SPEC2k6 benchmarks suggest
1387 that 4 works better than 6 probably due to register pressure.
1388
1389 Integer vector operations are taken by FP unit and execute 3 vector
1390 plus/minus operations per cycle but only one multiply. This is adjusted
1391 in ix86_reassociation_width. */
1392 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1393 znver2_memcpy,
1394 znver2_memset,
1395 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1396 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1397 "16", /* Loop alignment. */
1398 "16", /* Jump alignment. */
1399 "0:0:8", /* Label alignment. */
1400 "16", /* Func alignment. */
1401 };
1402
1403 /* skylake_cost should produce code tuned for Skylake family of CPUs. */
/* memcpy algorithm table for Skylake tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
1404 static stringop_algs skylake_memcpy[2] = {
1405 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1406 {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1407 {-1, libcall, false}}}};
1408
/* memset algorithm table for Skylake tuning.  Both entries have the form
   {alg, {{max, alg, flag}, ...}}, each list ending with max == -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, and max is a block-size limit -- confirm against the
   declaration of stringop_algs.  */
1409 static stringop_algs skylake_memset[2] = {
1410 {libcall, {{6, loop_1_byte, true},
1411 {24, loop, true},
1412 {8192, rep_prefix_4_byte, true},
1413 {-1, libcall, false}}},
1414 {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1415 {-1, libcall, false}}}};
1416
/* Cost table for the Skylake family.  This is a positional initializer of
   struct processor_costs: the order of the values is significant and the
   trailing comment on each line names the slot it fills.  COSTS_N_INSNS
   values are relative to an add (see the note at the top of the file).
   Here lea is costed one unit above an add (COSTS_N_INSNS (1)+1) and movzx
   is costed as COSTS_N_INSNS (0).  */
1417 static const
1418 struct processor_costs skylake_cost = {
1419 COSTS_N_INSNS (1), /* cost of an add instruction */
1420 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
1421 COSTS_N_INSNS (1), /* variable shift costs */
1422 COSTS_N_INSNS (1), /* constant shift costs */
1423 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1424 COSTS_N_INSNS (4), /* HI */
1425 COSTS_N_INSNS (3), /* SI */
1426 COSTS_N_INSNS (3), /* DI */
1427 COSTS_N_INSNS (3)}, /* other */
1428 0, /* cost of multiply per each bit set */
1429 /* Expanding div/mod currently doesn't consider parallelism. So the cost
1430 model is not realistic. We compensate by increasing the latencies a bit. */
1431 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
1432 COSTS_N_INSNS (11), /* HI */
1433 COSTS_N_INSNS (14), /* SI */
1434 COSTS_N_INSNS (76), /* DI */
1435 COSTS_N_INSNS (76)}, /* other */
1436 COSTS_N_INSNS (1), /* cost of movsx */
1437 COSTS_N_INSNS (0), /* cost of movzx */
1438 8, /* "large" insn */
1439 17, /* MOVE_RATIO */
1440
1441 6, /* cost for loading QImode using movzbl */
1442 {4, 4, 4}, /* cost of loading integer registers
1443 in QImode, HImode and SImode.
1444 Relative to reg-reg move (2). */
1445 {6, 6, 3}, /* cost of storing integer registers */
1446 2, /* cost of reg,reg fld/fst */
1447 {6, 6, 8}, /* cost of loading fp registers
1448 in SFmode, DFmode and XFmode */
1449 {6, 6, 10}, /* cost of storing fp registers
1450 in SFmode, DFmode and XFmode */
1451 2, /* cost of moving MMX register */
1452 {6, 6}, /* cost of loading MMX registers
1453 in SImode and DImode */
1454 {6, 6}, /* cost of storing MMX registers
1455 in SImode and DImode */
1456 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1457 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1458 in 32,64,128,256 and 512-bit */
1459 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1460 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
1461 in 32,64,128,256 and 512-bit */
1462 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1463 2, 2, /* SSE->integer and integer->SSE moves */
1464 20, 8, /* Gather load static, per_elt. */
1465 22, 10, /* Gather store static, per_elt. */
1466 64, /* size of l1 cache. */
1467 512, /* size of l2 cache. */
1468 64, /* size of prefetch block */
1469 6, /* number of parallel prefetches */
1470 3, /* Branch cost */
1471 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1472 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1473 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1474 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1475 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1476 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1477
1478 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1479 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1480 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1481 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1482 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1483 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1484 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1485 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1486 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1487 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1488 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1489 skylake_memcpy,
1490 skylake_memset,
1491 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1492 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1493 "16:11:8", /* Loop alignment. */
1494 "16:11:8", /* Jump alignment. */
1495 "0:0:8", /* Label alignment. */
1496 "16", /* Func alignment. */
1497 };
1498 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1499 very small blocks it is better to use a loop. For large blocks, a libcall can
1500 use non-temporal accesses and beat inline code considerably. */
/* Inline memcpy strategy table referenced by btver1_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1501 static stringop_algs btver1_memcpy[2] = {
1502 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1503 {-1, rep_prefix_4_byte, false}}},
1504 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1505 {-1, libcall, false}}}};
/* Inline memset strategy table referenced by btver1_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1506 static stringop_algs btver1_memset[2] = {
1507 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1508 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1509 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1510 {-1, libcall, false}}}};
/* Cost table btver1_cost.  This is a positional initializer of struct
   processor_costs: the order of the values is significant and the trailing
   comment on each line names the slot it fills.  COSTS_N_INSNS values are
   relative to an add (see the note at the top of the file).
   NOTE(review): this table is declared without `static' -- presumably it is
   referenced from another translation unit; confirm before changing.  */
1511 const struct processor_costs btver1_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (2), /* cost of a lea instruction */
1514 COSTS_N_INSNS (1), /* variable shift costs */
1515 COSTS_N_INSNS (1), /* constant shift costs */
1516 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (4), /* HI */
1518 COSTS_N_INSNS (3), /* SI */
1519 COSTS_N_INSNS (4), /* DI */
1520 COSTS_N_INSNS (5)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (35), /* HI */
1524 COSTS_N_INSNS (51), /* SI */
1525 COSTS_N_INSNS (83), /* DI */
1526 COSTS_N_INSNS (83)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 8, /* "large" insn */
1530 9, /* MOVE_RATIO */
1531
1532 /* All move costs are relative to integer->integer move times 2 and thus
1533 they are latency*2. */
1534 8, /* cost for loading QImode using movzbl */
1535 {6, 8, 6}, /* cost of loading integer registers
1536 in QImode, HImode and SImode.
1537 Relative to reg-reg move (2). */
1538 {6, 8, 6}, /* cost of storing integer registers */
1539 4, /* cost of reg,reg fld/fst */
1540 {12, 12, 28}, /* cost of loading fp registers
1541 in SFmode, DFmode and XFmode */
1542 {12, 12, 38}, /* cost of storing fp registers
1543 in SFmode, DFmode and XFmode */
1544 4, /* cost of moving MMX register */
1545 {10, 10}, /* cost of loading MMX registers
1546 in SImode and DImode */
1547 {12, 12}, /* cost of storing MMX registers
1548 in SImode and DImode */
1549 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1550 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
1551 in 32,64,128,256 and 512-bit */
1552 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1553 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
1554 in 32,64,128,256 and 512-bit */
1555 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
1556 14, 14, /* SSE->integer and integer->SSE moves */
1557 10, 10, /* Gather load static, per_elt. */
1558 10, 10, /* Gather store static, per_elt. */
1559 32, /* size of l1 cache. */
1560 512, /* size of l2 cache. */
1561 64, /* size of prefetch block */
1562 100, /* number of parallel prefetches */
1563 2, /* Branch cost */
1564 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1565 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1566 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1567 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1568 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1569 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1570
1571 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1572 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1573 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1574 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1575 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1576 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1577 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1578 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1579 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1580 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1581 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1582 btver1_memcpy,
1583 btver1_memset,
1584 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1585 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1586 "16:11:8", /* Loop alignment. */
1587 "16:8:8", /* Jump alignment. */
1588 "0:0:8", /* Label alignment. */
1589 "11", /* Func alignment. */
1590 };
1591
/* Inline memcpy strategy table referenced by btver2_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1592 static stringop_algs btver2_memcpy[2] = {
1593 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1594 {-1, rep_prefix_4_byte, false}}},
1595 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1596 {-1, libcall, false}}}};
/* Inline memset strategy table referenced by btver2_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1597 static stringop_algs btver2_memset[2] = {
1598 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1599 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1600 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1601 {-1, libcall, false}}}};
/* Cost table btver2_cost.  This is a positional initializer of struct
   processor_costs: the order of the values is significant and the trailing
   comment on each line names the slot it fills.  COSTS_N_INSNS values are
   relative to an add (see the note at the top of the file).
   NOTE(review): this table is declared without `static' -- presumably it is
   referenced from another translation unit; confirm before changing.  */
1602 const struct processor_costs btver2_cost = {
1603 COSTS_N_INSNS (1), /* cost of an add instruction */
1604 COSTS_N_INSNS (2), /* cost of a lea instruction */
1605 COSTS_N_INSNS (1), /* variable shift costs */
1606 COSTS_N_INSNS (1), /* constant shift costs */
1607 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1608 COSTS_N_INSNS (4), /* HI */
1609 COSTS_N_INSNS (3), /* SI */
1610 COSTS_N_INSNS (4), /* DI */
1611 COSTS_N_INSNS (5)}, /* other */
1612 0, /* cost of multiply per each bit set */
1613 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1614 COSTS_N_INSNS (35), /* HI */
1615 COSTS_N_INSNS (51), /* SI */
1616 COSTS_N_INSNS (83), /* DI */
1617 COSTS_N_INSNS (83)}, /* other */
1618 COSTS_N_INSNS (1), /* cost of movsx */
1619 COSTS_N_INSNS (1), /* cost of movzx */
1620 8, /* "large" insn */
1621 9, /* MOVE_RATIO */
1622
1623 /* All move costs are relative to integer->integer move times 2 and thus
1624 they are latency*2. */
1625 8, /* cost for loading QImode using movzbl */
1626 {8, 8, 6}, /* cost of loading integer registers
1627 in QImode, HImode and SImode.
1628 Relative to reg-reg move (2). */
1629 {8, 8, 6}, /* cost of storing integer registers */
1630 4, /* cost of reg,reg fld/fst */
1631 {12, 12, 28}, /* cost of loading fp registers
1632 in SFmode, DFmode and XFmode */
1633 {12, 12, 38}, /* cost of storing fp registers
1634 in SFmode, DFmode and XFmode */
1635 4, /* cost of moving MMX register */
1636 {10, 10}, /* cost of loading MMX registers
1637 in SImode and DImode */
1638 {12, 12}, /* cost of storing MMX registers
1639 in SImode and DImode */
1640 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1641 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
1642 in 32,64,128,256 and 512-bit */
1643 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1644 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
1645 in 32,64,128,256 and 512-bit */
1646 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
1647 14, 14, /* SSE->integer and integer->SSE moves */
1648 10, 10, /* Gather load static, per_elt. */
1649 10, 10, /* Gather store static, per_elt. */
1650 32, /* size of l1 cache. */
1651 2048, /* size of l2 cache. */
1652 64, /* size of prefetch block */
1653 100, /* number of parallel prefetches */
1654 2, /* Branch cost */
1655 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1656 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1657 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1658 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1659 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1660 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1661
1662 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1663 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1664 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1665 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1666 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1667 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1668 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1669 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1670 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1671 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1672 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1673 btver2_memcpy,
1674 btver2_memset,
1675 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1676 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1677 "16:11:8", /* Loop alignment. */
1678 "16:8:8", /* Jump alignment. */
1679 "0:0:8", /* Label alignment. */
1680 "11", /* Func alignment. */
1681 };
1682
/* Inline memcpy strategy table referenced by pentium4_cost.  The first entry
   has the shape {alg, {{size, alg, flag}, ...}} ending with a size of -1; the
   second entry is DUMMY_STRINGOP_ALGS, which the top of the file defines as
   {libcall, {{-1, libcall, false}}}.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant --
   confirm against the stringop_algs declaration.  */
1683 static stringop_algs pentium4_memcpy[2] = {
1684 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1685 DUMMY_STRINGOP_ALGS};
/* Inline memset strategy table referenced by pentium4_cost.  The first entry
   has the shape {alg, {{size, alg, flag}, ...}} ending with a size of -1; the
   second entry is DUMMY_STRINGOP_ALGS, which the top of the file defines as
   {libcall, {{-1, libcall, false}}}.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant --
   confirm against the stringop_algs declaration.  */
1686 static stringop_algs pentium4_memset[2] = {
1687 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1688 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1689 DUMMY_STRINGOP_ALGS};
1690
/* Cost table pentium4_cost.  This is a positional initializer of struct
   processor_costs: the order of the values is significant and the trailing
   comment on each line names the slot it fills.  COSTS_N_INSNS values are
   relative to an add (see the note at the top of the file).
   All four alignment strings at the end are NULL.
   NOTE(review): presumably NULL means no tuning-specific alignment is
   requested -- confirm against the code that reads these fields.  */
1691 static const
1692 struct processor_costs pentium4_cost = {
1693 COSTS_N_INSNS (1), /* cost of an add instruction */
1694 COSTS_N_INSNS (3), /* cost of a lea instruction */
1695 COSTS_N_INSNS (4), /* variable shift costs */
1696 COSTS_N_INSNS (4), /* constant shift costs */
1697 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1698 COSTS_N_INSNS (15), /* HI */
1699 COSTS_N_INSNS (15), /* SI */
1700 COSTS_N_INSNS (15), /* DI */
1701 COSTS_N_INSNS (15)}, /* other */
1702 0, /* cost of multiply per each bit set */
1703 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1704 COSTS_N_INSNS (56), /* HI */
1705 COSTS_N_INSNS (56), /* SI */
1706 COSTS_N_INSNS (56), /* DI */
1707 COSTS_N_INSNS (56)}, /* other */
1708 COSTS_N_INSNS (1), /* cost of movsx */
1709 COSTS_N_INSNS (1), /* cost of movzx */
1710 16, /* "large" insn */
1711 6, /* MOVE_RATIO */
1712
1713 /* All move costs are relative to integer->integer move times 2 and thus
1714 they are latency*2. */
1715 5, /* cost for loading QImode using movzbl */
1716 {4, 5, 4}, /* cost of loading integer registers
1717 in QImode, HImode and SImode.
1718 Relative to reg-reg move (2). */
1719 {2, 3, 2}, /* cost of storing integer registers */
1720 12, /* cost of reg,reg fld/fst */
1721 {14, 14, 14}, /* cost of loading fp registers
1722 in SFmode, DFmode and XFmode */
1723 {14, 14, 14}, /* cost of storing fp registers
1724 in SFmode, DFmode and XFmode */
1725 12, /* cost of moving MMX register */
1726 {16, 16}, /* cost of loading MMX registers
1727 in SImode and DImode */
1728 {16, 16}, /* cost of storing MMX registers
1729 in SImode and DImode */
1730 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1731 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1732 in 32,64,128,256 and 512-bit */
1733 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1734 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1735 in 32,64,128,256 and 512-bit */
1736 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1737 20, 12, /* SSE->integer and integer->SSE moves */
1738 16, 16, /* Gather load static, per_elt. */
1739 16, 16, /* Gather store static, per_elt. */
1740 8, /* size of l1 cache. */
1741 256, /* size of l2 cache. */
1742 64, /* size of prefetch block */
1743 6, /* number of parallel prefetches */
1744 2, /* Branch cost */
1745 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1746 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1747 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1748 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1749 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1750 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1751
1752 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1753 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1754 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1755 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1756 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1757 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1758 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1759 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1760 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1761 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
1762 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1763 pentium4_memcpy,
1764 pentium4_memset,
1765 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1766 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1767 NULL, /* Loop alignment. */
1768 NULL, /* Jump alignment. */
1769 NULL, /* Label alignment. */
1770 NULL, /* Func alignment. */
1771 };
1772
/* Inline memcpy strategy table referenced by nocona_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1773 static stringop_algs nocona_memcpy[2] = {
1774 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1775 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1776 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1777
/* Inline memset strategy table referenced by nocona_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1778 static stringop_algs nocona_memset[2] = {
1779 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1780 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1781 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1782 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1783
/* Cost table nocona_cost.  This is a positional initializer of struct
   processor_costs: the order of the values is significant and the trailing
   comment on each line names the slot it fills.  COSTS_N_INSNS values are
   relative to an add (see the note at the top of the file).
   All four alignment strings at the end are NULL.
   NOTE(review): presumably NULL means no tuning-specific alignment is
   requested -- confirm against the code that reads these fields.  */
1784 static const
1785 struct processor_costs nocona_cost = {
1786 COSTS_N_INSNS (1), /* cost of an add instruction */
1787 COSTS_N_INSNS (1), /* cost of a lea instruction */
1788 COSTS_N_INSNS (1), /* variable shift costs */
1789 COSTS_N_INSNS (1), /* constant shift costs */
1790 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1791 COSTS_N_INSNS (10), /* HI */
1792 COSTS_N_INSNS (10), /* SI */
1793 COSTS_N_INSNS (10), /* DI */
1794 COSTS_N_INSNS (10)}, /* other */
1795 0, /* cost of multiply per each bit set */
1796 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1797 COSTS_N_INSNS (66), /* HI */
1798 COSTS_N_INSNS (66), /* SI */
1799 COSTS_N_INSNS (66), /* DI */
1800 COSTS_N_INSNS (66)}, /* other */
1801 COSTS_N_INSNS (1), /* cost of movsx */
1802 COSTS_N_INSNS (1), /* cost of movzx */
1803 16, /* "large" insn */
1804 17, /* MOVE_RATIO */
1805
1806 /* All move costs are relative to integer->integer move times 2 and thus
1807 they are latency*2. */
1808 4, /* cost for loading QImode using movzbl */
1809 {4, 4, 4}, /* cost of loading integer registers
1810 in QImode, HImode and SImode.
1811 Relative to reg-reg move (2). */
1812 {4, 4, 4}, /* cost of storing integer registers */
1813 12, /* cost of reg,reg fld/fst */
1814 {14, 14, 14}, /* cost of loading fp registers
1815 in SFmode, DFmode and XFmode */
1816 {14, 14, 14}, /* cost of storing fp registers
1817 in SFmode, DFmode and XFmode */
1818 14, /* cost of moving MMX register */
1819 {12, 12}, /* cost of loading MMX registers
1820 in SImode and DImode */
1821 {12, 12}, /* cost of storing MMX registers
1822 in SImode and DImode */
1823 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
1824 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
1825 in 32,64,128,256 and 512-bit */
1826 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
1827 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
1828 in 32,64,128,256 and 512-bit */
1829 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
1830 20, 12, /* SSE->integer and integer->SSE moves */
1831 12, 12, /* Gather load static, per_elt. */
1832 12, 12, /* Gather store static, per_elt. */
1833 8, /* size of l1 cache. */
1834 1024, /* size of l2 cache. */
1835 64, /* size of prefetch block */
1836 8, /* number of parallel prefetches */
1837 1, /* Branch cost */
1838 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1839 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1840 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1841 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1842 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1843 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1844
1845 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1846 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1847 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
1848 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
1849 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
1850 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
1851 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
1852 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
1853 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
1854 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
1855 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1856 nocona_memcpy,
1857 nocona_memset,
1858 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1859 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1860 NULL, /* Loop alignment. */
1861 NULL, /* Jump alignment. */
1862 NULL, /* Label alignment. */
1863 NULL, /* Func alignment. */
1864 };
1865
/* Inline memcpy strategy table referenced by atom_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1866 static stringop_algs atom_memcpy[2] = {
1867 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1868 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1869 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Inline memset strategy table referenced by atom_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1870 static stringop_algs atom_memset[2] = {
1871 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1872 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1873 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1874 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table atom_cost.  This is a positional initializer of struct
   processor_costs: the order of the values is significant and the trailing
   comment on each line names the slot it fills.  COSTS_N_INSNS values are
   relative to an add (see the note at the top of the file).  Here lea is
   costed one unit above an add (COSTS_N_INSNS (1) + 1).  */
1875 static const
1876 struct processor_costs atom_cost = {
1877 COSTS_N_INSNS (1), /* cost of an add instruction */
1878 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1879 COSTS_N_INSNS (1), /* variable shift costs */
1880 COSTS_N_INSNS (1), /* constant shift costs */
1881 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1882 COSTS_N_INSNS (4), /* HI */
1883 COSTS_N_INSNS (3), /* SI */
1884 COSTS_N_INSNS (4), /* DI */
1885 COSTS_N_INSNS (2)}, /* other */
1886 0, /* cost of multiply per each bit set */
1887 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1888 COSTS_N_INSNS (26), /* HI */
1889 COSTS_N_INSNS (42), /* SI */
1890 COSTS_N_INSNS (74), /* DI */
1891 COSTS_N_INSNS (74)}, /* other */
1892 COSTS_N_INSNS (1), /* cost of movsx */
1893 COSTS_N_INSNS (1), /* cost of movzx */
1894 8, /* "large" insn */
1895 17, /* MOVE_RATIO */
1896
1897 /* All move costs are relative to integer->integer move times 2 and thus
1898 they are latency*2. */
1899 6, /* cost for loading QImode using movzbl */
1900 {6, 6, 6}, /* cost of loading integer registers
1901 in QImode, HImode and SImode.
1902 Relative to reg-reg move (2). */
1903 {6, 6, 6}, /* cost of storing integer registers */
1904 4, /* cost of reg,reg fld/fst */
1905 {6, 6, 18}, /* cost of loading fp registers
1906 in SFmode, DFmode and XFmode */
1907 {14, 14, 24}, /* cost of storing fp registers
1908 in SFmode, DFmode and XFmode */
1909 2, /* cost of moving MMX register */
1910 {8, 8}, /* cost of loading MMX registers
1911 in SImode and DImode */
1912 {10, 10}, /* cost of storing MMX registers
1913 in SImode and DImode */
1914 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1915 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
1916 in 32,64,128,256 and 512-bit */
1917 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
1918 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1919 in 32,64,128,256 and 512-bit */
1920 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
1921 8, 6, /* SSE->integer and integer->SSE moves */
1922 8, 8, /* Gather load static, per_elt. */
1923 8, 8, /* Gather store static, per_elt. */
1924 32, /* size of l1 cache. */
1925 256, /* size of l2 cache. */
1926 64, /* size of prefetch block */
1927 6, /* number of parallel prefetches */
1928 3, /* Branch cost */
1929 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1930 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1931 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1932 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1933 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1934 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1935
1936 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1937 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1938 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1939 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
1940 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1941 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1942 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
1943 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
1944 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
1945 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
1946 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1947 atom_memcpy,
1948 atom_memset,
1949 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1950 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1951 "16", /* Loop alignment. */
1952 "16:8:8", /* Jump alignment. */
1953 "0:0:8", /* Label alignment. */
1954 "16", /* Func alignment. */
1955 };
1956
/* Inline memcpy strategy table referenced by slm_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1957 static stringop_algs slm_memcpy[2] = {
1958 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1959 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1960 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Inline memset strategy table referenced by slm_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
1961 static stringop_algs slm_memset[2] = {
1962 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1963 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1964 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1965 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table slm_cost.  This is a positional initializer of struct
   processor_costs: the order of the values is significant and the trailing
   comment on each line names the slot it fills.  COSTS_N_INSNS values are
   relative to an add (see the note at the top of the file).  Here lea is
   costed one unit above an add (COSTS_N_INSNS (1) + 1).  */
1966 static const
1967 struct processor_costs slm_cost = {
1968 COSTS_N_INSNS (1), /* cost of an add instruction */
1969 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1970 COSTS_N_INSNS (1), /* variable shift costs */
1971 COSTS_N_INSNS (1), /* constant shift costs */
1972 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1973 COSTS_N_INSNS (3), /* HI */
1974 COSTS_N_INSNS (3), /* SI */
1975 COSTS_N_INSNS (4), /* DI */
1976 COSTS_N_INSNS (2)}, /* other */
1977 0, /* cost of multiply per each bit set */
1978 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1979 COSTS_N_INSNS (26), /* HI */
1980 COSTS_N_INSNS (42), /* SI */
1981 COSTS_N_INSNS (74), /* DI */
1982 COSTS_N_INSNS (74)}, /* other */
1983 COSTS_N_INSNS (1), /* cost of movsx */
1984 COSTS_N_INSNS (1), /* cost of movzx */
1985 8, /* "large" insn */
1986 17, /* MOVE_RATIO */
1987
1988 /* All move costs are relative to integer->integer move times 2 and thus
1989 they are latency*2. */
1990 8, /* cost for loading QImode using movzbl */
1991 {8, 8, 8}, /* cost of loading integer registers
1992 in QImode, HImode and SImode.
1993 Relative to reg-reg move (2). */
1994 {6, 6, 6}, /* cost of storing integer registers */
1995 2, /* cost of reg,reg fld/fst */
1996 {8, 8, 18}, /* cost of loading fp registers
1997 in SFmode, DFmode and XFmode */
1998 {6, 6, 18}, /* cost of storing fp registers
1999 in SFmode, DFmode and XFmode */
2000 2, /* cost of moving MMX register */
2001 {8, 8}, /* cost of loading MMX registers
2002 in SImode and DImode */
2003 {6, 6}, /* cost of storing MMX registers
2004 in SImode and DImode */
2005 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2006 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2007 in 32,64,128,256 and 512-bit */
2008 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2009 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2010 in 32,64,128,256 and 512-bit */
2011 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2012 8, 6, /* SSE->integer and integer->SSE moves */
2013 8, 8, /* Gather load static, per_elt. */
2014 8, 8, /* Gather store static, per_elt. */
2015 32, /* size of l1 cache. */
2016 256, /* size of l2 cache. */
2017 64, /* size of prefetch block */
2018 6, /* number of parallel prefetches */
2019 3, /* Branch cost */
2020 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2021 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2022 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2023 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2024 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2025 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2026
2027 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2028 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2029 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2030 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2031 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2032 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2033 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2034 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2035 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2036 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2037 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2038 slm_memcpy,
2039 slm_memset,
2040 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2041 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2042 "16", /* Loop alignment. */
2043 "16:8:8", /* Jump alignment. */
2044 "0:0:8", /* Label alignment. */
2045 "16", /* Func alignment. */
2046 };
2047
/* Inline memcpy strategy table referenced by intel_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
2048 static stringop_algs intel_memcpy[2] = {
2049 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2050 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2051 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Inline memset strategy table referenced by intel_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
2052 static stringop_algs intel_memset[2] = {
2053 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2054 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2055 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2056 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost.  This is a positional initializer of struct
   processor_costs: the order of the values is significant and the trailing
   comment on each line names the slot it fills.  COSTS_N_INSNS values are
   relative to an add (see the note at the top of the file).  Here lea is
   costed one unit above an add (COSTS_N_INSNS (1) + 1), and the SSE
   load/store cost arrays use one value for every register width.  */
2057 static const
2058 struct processor_costs intel_cost = {
2059 COSTS_N_INSNS (1), /* cost of an add instruction */
2060 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2061 COSTS_N_INSNS (1), /* variable shift costs */
2062 COSTS_N_INSNS (1), /* constant shift costs */
2063 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2064 COSTS_N_INSNS (3), /* HI */
2065 COSTS_N_INSNS (3), /* SI */
2066 COSTS_N_INSNS (4), /* DI */
2067 COSTS_N_INSNS (2)}, /* other */
2068 0, /* cost of multiply per each bit set */
2069 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2070 COSTS_N_INSNS (26), /* HI */
2071 COSTS_N_INSNS (42), /* SI */
2072 COSTS_N_INSNS (74), /* DI */
2073 COSTS_N_INSNS (74)}, /* other */
2074 COSTS_N_INSNS (1), /* cost of movsx */
2075 COSTS_N_INSNS (1), /* cost of movzx */
2076 8, /* "large" insn */
2077 17, /* MOVE_RATIO */
2078
2079 /* All move costs are relative to integer->integer move times 2 and thus
2080 they are latency*2. */
2081 6, /* cost for loading QImode using movzbl */
2082 {4, 4, 4}, /* cost of loading integer registers
2083 in QImode, HImode and SImode.
2084 Relative to reg-reg move (2). */
2085 {6, 6, 6}, /* cost of storing integer registers */
2086 2, /* cost of reg,reg fld/fst */
2087 {6, 6, 8}, /* cost of loading fp registers
2088 in SFmode, DFmode and XFmode */
2089 {6, 6, 10}, /* cost of storing fp registers
2090 in SFmode, DFmode and XFmode */
2091 2, /* cost of moving MMX register */
2092 {6, 6}, /* cost of loading MMX registers
2093 in SImode and DImode */
2094 {6, 6}, /* cost of storing MMX registers
2095 in SImode and DImode */
2096 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2097 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2098 in 32,64,128,256 and 512-bit */
2099 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2100 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2101 in 32,64,128,256 and 512-bit */
2102 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2103 4, 4, /* SSE->integer and integer->SSE moves */
2104 6, 6, /* Gather load static, per_elt. */
2105 6, 6, /* Gather store static, per_elt. */
2106 32, /* size of l1 cache. */
2107 256, /* size of l2 cache. */
2108 64, /* size of prefetch block */
2109 6, /* number of parallel prefetches */
2110 3, /* Branch cost */
2111 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2112 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2113 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2114 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2115 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2116 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2117
2118 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
2119 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2120 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2121 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2122 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2123 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2124 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2125 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2126 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2127 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2128 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2129 intel_memcpy,
2130 intel_memset,
2131 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2132 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2133 "16", /* Loop alignment. */
2134 "16:8:8", /* Jump alignment. */
2135 "0:0:8", /* Label alignment. */
2136 "16", /* Func alignment. */
2137 };
2138
2139 /* Generic should produce code tuned for Core-i7 (and newer chips)
2140 and btver1 (and newer chips). */
2141
/* Inline memcpy strategy table referenced by generic_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1; the entries differ only in using rep_prefix_4_byte
   versus rep_prefix_8_byte.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
2142 static stringop_algs generic_memcpy[2] = {
2143 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2144 {-1, libcall, false}}},
2145 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2146 {-1, libcall, false}}}};
/* Inline memset strategy table referenced by generic_cost.  Each of the two
   entries has the shape {alg, {{size, alg, flag}, ...}} and its list ends
   with a size of -1; the entries differ only in using rep_prefix_4_byte
   versus rep_prefix_8_byte.
   NOTE(review): presumably [0] is the 32-bit and [1] the 64-bit variant, the
   leading alg covers unknown sizes and each size is an upper bound --
   confirm against the stringop_algs declaration.  */
2147 static stringop_algs generic_memset[2] = {
2148 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2149 {-1, libcall, false}}},
2150 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2151 {-1, libcall, false}}}};
/* Cost table for generic tuning.  This is a purely positional initializer:
   every value is matched to a field of struct processor_costs by its
   position alone, so entries must never be reordered, added or removed
   without updating the struct declaration (which lives outside this file).
   The trailing comment on each entry names the field it is meant to fill.
   Note that lea is given COSTS_N_INSNS (1) + 1, i.e. marginally more than
   an add, for the reason explained in the comment next to it.  */
2152 static const
2153 struct processor_costs generic_cost = {
2154 COSTS_N_INSNS (1), /* cost of an add instruction */
2155 /* Setting cost to 2 makes our current implementation of synth_mult result in
2156 use of unnecessary temporary registers causing regression on several
2157 SPECfp benchmarks. */
2158 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2159 COSTS_N_INSNS (1), /* variable shift costs */
2160 COSTS_N_INSNS (1), /* constant shift costs */
2161 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2162 COSTS_N_INSNS (4), /* HI */
2163 COSTS_N_INSNS (3), /* SI */
2164 COSTS_N_INSNS (4), /* DI */
2165 COSTS_N_INSNS (4)}, /* other */
2166 0, /* cost of multiply per each bit set */
2167 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2168 COSTS_N_INSNS (22), /* HI */
2169 COSTS_N_INSNS (30), /* SI */
2170 COSTS_N_INSNS (74), /* DI */
2171 COSTS_N_INSNS (74)}, /* other */
2172 COSTS_N_INSNS (1), /* cost of movsx */
2173 COSTS_N_INSNS (1), /* cost of movzx */
2174 8, /* "large" insn */
2175 17, /* MOVE_RATIO */
2176
2177 /* All move costs are relative to integer->integer move times 2 and thus
2178 they are latency*2. */
2179 6, /* cost for loading QImode using movzbl */
2180 {6, 6, 6}, /* cost of loading integer registers
2181 in QImode, HImode and SImode.
2182 Relative to reg-reg move (2). */
2183 {6, 6, 6}, /* cost of storing integer registers */
2184 4, /* cost of reg,reg fld/fst */
2185 {6, 6, 12}, /* cost of loading fp registers
2186 in SFmode, DFmode and XFmode */
2187 {6, 6, 12}, /* cost of storing fp registers
2188 in SFmode, DFmode and XFmode */
2189 2, /* cost of moving MMX register */
2190 {6, 6}, /* cost of loading MMX registers
2191 in SImode and DImode */
2192 {6, 6}, /* cost of storing MMX registers
2193 in SImode and DImode */
2194 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2195 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2196 in 32,64,128,256 and 512-bit */
2197 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
2198 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2199 in 32,64,128,256 and 512-bit */
2200 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
2201 6, 6, /* SSE->integer and integer->SSE moves */
2202 18, 6, /* Gather load static, per_elt. */
2203 18, 6, /* Gather store static, per_elt. */
2204 32, /* size of l1 cache. */
2205 512, /* size of l2 cache. */
2206 64, /* size of prefetch block */
2207 6, /* number of parallel prefetches */
2208 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2209 value is increased to perhaps more appropriate value of 5. */
2210 3, /* Branch cost */
2211 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2212 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2213 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2214 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2215 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2216 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2217
2218 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2219 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2220 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2221 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2222 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2223 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2224 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2225 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2226 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2227 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2228 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2229 generic_memcpy,
2230 generic_memset,
2231 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2232 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2233 "16:11:8", /* Loop alignment. */
2234 "16:11:8", /* Jump alignment. */
2235 "0:0:8", /* Label alignment. */
2236 "16", /* Func alignment. */
2237 };
2238
2239 /* core_cost should produce code tuned for the Core family of CPUs. */
/* Strategy table referenced as the memcpy entry of core_cost.
   Element 0 goes straight to rep_prefix_4_byte up to 1024 and then to
   libcall; element 1 uses loop up to 24, rep_prefix_8_byte up to 128 and
   libcall beyond that.  Unlike the generic tables, the non-libcall entries
   here set the third field to true.
   NOTE(review): presumably element 0 is the 32-bit and element 1 the
   64-bit variant, the first field of each triple is a byte-size limit and
   the third is a "no alignment needed" flag -- confirm against the
   declaration of stringop_algs.  */
2240 static stringop_algs core_memcpy[2] = {
2241 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2242 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2243 {-1, libcall, false}}}};
/* Strategy table referenced as the memset entry of core_cost.
   Element 0 steps through loop_1_byte (up to 6), loop (up to 24) and
   rep_prefix_4_byte (up to 8192) before falling back to libcall; element 1
   uses loop (up to 24) and rep_prefix_8_byte (up to 512) before libcall.
   NOTE(review): presumably element 0 is the 32-bit and element 1 the
   64-bit variant and the limits are byte sizes -- confirm against the
   declaration of stringop_algs.  */
2244 static stringop_algs core_memset[2] = {
2245 {libcall, {{6, loop_1_byte, true},
2246 {24, loop, true},
2247 {8192, rep_prefix_4_byte, true},
2248 {-1, libcall, false}}},
2249 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2250 {-1, libcall, false}}}};
2251
/* Cost table intended for tuning for the Core family of CPUs.  This is a
   purely positional initializer: every value is matched to a field of
   struct processor_costs by its position alone, so entries must never be
   reordered, added or removed without updating the struct declaration
   (which lives outside this file).  The trailing comment on each entry
   names the field it is meant to fill.  */
2252 static const
2253 struct processor_costs core_cost = {
2254 COSTS_N_INSNS (1), /* cost of an add instruction */
2255 /* On all chips taken into consideration lea is 2 cycles and more. With
2256 this cost however our current implementation of synth_mult results in
2257 use of unnecessary temporary registers causing regression on several
2258 SPECfp benchmarks. */
2259 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2260 COSTS_N_INSNS (1), /* variable shift costs */
2261 COSTS_N_INSNS (1), /* constant shift costs */
2262 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2263 COSTS_N_INSNS (4), /* HI */
2264 COSTS_N_INSNS (3), /* SI */
2265 /* Here we tune for Sandybridge or newer. */
2266 COSTS_N_INSNS (3), /* DI */
2267 COSTS_N_INSNS (3)}, /* other */
2268 0, /* cost of multiply per each bit set */
2269 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2270 model is not realistic. We compensate by increasing the latencies a bit. */
2271 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2272 COSTS_N_INSNS (11), /* HI */
2273 COSTS_N_INSNS (14), /* SI */
2274 COSTS_N_INSNS (81), /* DI */
2275 COSTS_N_INSNS (81)}, /* other */
2276 COSTS_N_INSNS (1), /* cost of movsx */
2277 COSTS_N_INSNS (1), /* cost of movzx */
2278 8, /* "large" insn */
2279 17, /* MOVE_RATIO */
2280
2281 /* All move costs are relative to integer->integer move times 2 and thus
2282 they are latency*2. */
2283 6, /* cost for loading QImode using movzbl */
2284 {4, 4, 4}, /* cost of loading integer registers
2285 in QImode, HImode and SImode.
2286 Relative to reg-reg move (2). */
2287 {6, 6, 6}, /* cost of storing integer registers */
2288 2, /* cost of reg,reg fld/fst */
2289 {6, 6, 8}, /* cost of loading fp registers
2290 in SFmode, DFmode and XFmode */
2291 {6, 6, 10}, /* cost of storing fp registers
2292 in SFmode, DFmode and XFmode */
2293 2, /* cost of moving MMX register */
2294 {6, 6}, /* cost of loading MMX registers
2295 in SImode and DImode */
2296 {6, 6}, /* cost of storing MMX registers
2297 in SImode and DImode */
2298 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2299 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2300 in 32,64,128,256 and 512-bit */
2301 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2302 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2303 in 32,64,128,256 and 512-bit */
2304 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2305 2, 2, /* SSE->integer and integer->SSE moves */
2306 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2307 rec. throughput 6.
2308 So 5 uops statically and one uops per load. */
/* NOTE(review): the comment above names VGATHERDPD twice with different
   uop counts; one occurrence presumably refers to a different gather
   variant or operand width -- confirm against the instruction tables.  */
2309 10, 6, /* Gather load static, per_elt. */
2310 10, 6, /* Gather store static, per_elt. */
2311 64, /* size of l1 cache. */
2312 512, /* size of l2 cache. */
2313 64, /* size of prefetch block */
2314 6, /* number of parallel prefetches */
2315 /* FIXME perhaps more appropriate value is 5. */
2316 3, /* Branch cost */
2317 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2318 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2319 /* 10-24 */
2320 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2321 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2322 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2323 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
2324
2325 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2326 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2327 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2328 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2329 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2330 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2331 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2332 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2333 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2334 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2335 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2336 core_memcpy,
2337 core_memset,
2338 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2339 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2340 "16:11:8", /* Loop alignment. */
2341 "16:11:8", /* Jump alignment. */
2342 "0:0:8", /* Label alignment. */
2343 "16", /* Func alignment. */
2344 };
2345