]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/i386/x86-tune-costs.h
Update copyright years.
[thirdparty/gcc.git] / gcc / config / i386 / x86-tune-costs.h
CommitLineData
df41dbaf 1/* Costs of operations of individual x86 CPUs.
a5544970 2 Copyright (C) 1988-2019 Free Software Foundation, Inc.
64766e8d 3
df41dbaf
JH
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 3, or (at your option)
9any later version.
10
11GCC is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16Under Section 7 of GPL version 3, you are granted additional
17permissions described in the GCC Runtime Library Exception, version
183.1, as published by the Free Software Foundation.
19
20You should have received a copy of the GNU General Public License and
21a copy of the GCC Runtime Library Exception along with this program;
22see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23<http://www.gnu.org/licenses/>. */
64766e8d
JH
24/* Processor costs (relative to an add) */
25/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
/* COSTS_N_BYTES (N) expresses a size cost of N bytes as N * 2.  Under the
   assumption stated above (COSTS_N_INSNS (N) is N * 4 and an add is 2
   bytes), COSTS_N_BYTES (2) equals COSTS_N_INSNS (1).  */
26#define COSTS_N_BYTES(N) ((N) * 2)
27
/* Placeholder stringop_algs initializer that names libcall both as the
   leading algorithm and as the only size-table entry.  The per-CPU
   tables below use it for their second array element.  */
28#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29
/* memcpy strategy referenced by ix86_size_cost: both array elements name
   rep_prefix_1_byte as the only algorithm.
   NOTE(review): the -1 bound presumably means "no upper size limit" and
   the two elements presumably cover 32-bit and 64-bit code; confirm
   against the stringop_algs definition.  */
30static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy referenced by ix86_size_cost; identical in content to
   ix86_size_memcpy (rep_prefix_1_byte in both array elements).  */
33static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36
/* Cost table used when tuning for size.  Instruction costs are written
   with COSTS_N_BYTES rather than COSTS_N_INSNS, string operations use
   ix86_size_memcpy / ix86_size_memset, the cache and prefetch parameters
   are all zero and every alignment entry is NULL.  Unlike the per-CPU
   tables that follow, this object is not declared static.  */
37const
38struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39 COSTS_N_BYTES (2), /* cost of an add instruction */
40 COSTS_N_BYTES (3), /* cost of a lea instruction */
41 COSTS_N_BYTES (2), /* variable shift costs */
42 COSTS_N_BYTES (3), /* constant shift costs */
43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
44 COSTS_N_BYTES (3), /* HI */
45 COSTS_N_BYTES (3), /* SI */
46 COSTS_N_BYTES (3), /* DI */
47 COSTS_N_BYTES (5)}, /* other */
48 0, /* cost of multiply per each bit set */
49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
50 COSTS_N_BYTES (3), /* HI */
51 COSTS_N_BYTES (3), /* SI */
52 COSTS_N_BYTES (3), /* DI */
53 COSTS_N_BYTES (5)}, /* other */
54 COSTS_N_BYTES (3), /* cost of movsx */
55 COSTS_N_BYTES (3), /* cost of movzx */
56 0, /* "large" insn */
57 2, /* MOVE_RATIO */
df41dbaf
JH
58
59 /* All move costs are relative to integer->integer move times 2. */
64766e8d
JH
60 2, /* cost for loading QImode using movzbl */
61 {2, 2, 2}, /* cost of loading integer registers
62 in QImode, HImode and SImode.
63 Relative to reg-reg move (2). */
64 {2, 2, 2}, /* cost of storing integer registers */
65 2, /* cost of reg,reg fld/fst */
66 {2, 2, 2}, /* cost of loading fp registers
67 in SFmode, DFmode and XFmode */
68 {2, 2, 2}, /* cost of storing fp registers
69 in SFmode, DFmode and XFmode */
70 3, /* cost of moving MMX register */
71 {3, 3}, /* cost of loading MMX registers
72 in SImode and DImode */
73 {3, 3}, /* cost of storing MMX registers
74 in SImode and DImode */
df41dbaf
JH
75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
77 in 32,64,128,256 and 512-bit */
78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load; five entries,
79 presumably 32,64,128,256 and 512-bit as above (confirm) */
80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
81 in 32,64,128,256 and 512-bit */
82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store; five entries,
83 presumably 32,64,128,256 and 512-bit as above (confirm) */
84 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
85 5, 0, /* Gather load static, per_elt. */
86 5, 0, /* Gather store static, per_elt. */
64766e8d
JH
87 0, /* size of l1 cache */
88 0, /* size of l2 cache */
89 0, /* size of prefetch block */
90 0, /* number of parallel prefetches */
91 2, /* Branch cost */
92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
95 COSTS_N_BYTES (2), /* cost of FABS instruction. */
96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
6065f444 98
c53c148c 99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
6065f444
JH
100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
c53c148c
JH
103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
6065f444
JH
105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
64766e8d
JH
109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
110 ix86_size_memcpy,
111 ix86_size_memset,
f6fd8f2b
JH
112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
114 NULL, /* Loop alignment. */
115 NULL, /* Jump alignment. */
116 NULL, /* Label alignment. */
117 NULL, /* Func alignment. */
64766e8d
JH
118};
119
120/* Processor costs (relative to an add) */
/* memcpy strategy referenced by i386_cost: the first element names
   rep_prefix_1_byte as the only algorithm, the second is the
   DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the two elements presumably cover 32-bit and 64-bit
   code respectively; confirm against the users of stringop_algs.  */
121static stringop_algs i386_memcpy[2] = {
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
123 DUMMY_STRINGOP_ALGS};
/* memset strategy referenced by i386_cost; same content as i386_memcpy
   (rep_prefix_1_byte, then the DUMMY_STRINGOP_ALGS placeholder).  */
124static stringop_algs i386_memset[2] = {
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
126 DUMMY_STRINGOP_ALGS};
127
/* Cost table for the 386.  Instruction costs are given with
   COSTS_N_INSNS; string operations use i386_memcpy / i386_memset.  All
   cache and prefetch parameters are zero; loop, jump and function
   alignment are "4" and no label alignment is given (NULL).  */
128static const
129struct processor_costs i386_cost = { /* 386 specific costs */
130 COSTS_N_INSNS (1), /* cost of an add instruction */
131 COSTS_N_INSNS (1), /* cost of a lea instruction */
132 COSTS_N_INSNS (3), /* variable shift costs */
133 COSTS_N_INSNS (2), /* constant shift costs */
134 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
135 COSTS_N_INSNS (6), /* HI */
136 COSTS_N_INSNS (6), /* SI */
137 COSTS_N_INSNS (6), /* DI */
138 COSTS_N_INSNS (6)}, /* other */
139 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
140 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
141 COSTS_N_INSNS (23), /* HI */
142 COSTS_N_INSNS (23), /* SI */
143 COSTS_N_INSNS (23), /* DI */
144 COSTS_N_INSNS (23)}, /* other */
145 COSTS_N_INSNS (3), /* cost of movsx */
146 COSTS_N_INSNS (2), /* cost of movzx */
147 15, /* "large" insn */
148 3, /* MOVE_RATIO */
df41dbaf
JH
149
150 /* All move costs are relative to integer->integer move times 2 and thus
151 they are latency*2. */
64766e8d
JH
152 4, /* cost for loading QImode using movzbl */
153 {2, 4, 2}, /* cost of loading integer registers
154 in QImode, HImode and SImode.
155 Relative to reg-reg move (2). */
156 {2, 4, 2}, /* cost of storing integer registers */
157 2, /* cost of reg,reg fld/fst */
158 {8, 8, 8}, /* cost of loading fp registers
159 in SFmode, DFmode and XFmode */
160 {8, 8, 8}, /* cost of storing fp registers
161 in SFmode, DFmode and XFmode */
162 2, /* cost of moving MMX register */
163 {4, 8}, /* cost of loading MMX registers
164 in SImode and DImode */
165 {4, 8}, /* cost of storing MMX registers
166 in SImode and DImode */
df41dbaf
JH
167 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
168 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
169 in 32,64,128,256 and 512-bit */
170 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
171 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
172 in 32,64,128,256 and 512-bit */
173 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
174 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
175 4, 4, /* Gather load static, per_elt. */
176 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
177 0, /* size of l1 cache */
178 0, /* size of l2 cache */
179 0, /* size of prefetch block */
180 0, /* number of parallel prefetches */
181 1, /* Branch cost */
182 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
183 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
184 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
185 COSTS_N_INSNS (22), /* cost of FABS instruction. */
186 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
187 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
6065f444 188
c53c148c 189 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
190 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
191 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
192 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
c53c148c
JH
193 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
194 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
6065f444
JH
195 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
196 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
197 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
198 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
64766e8d
JH
199 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
200 i386_memcpy,
201 i386_memset,
f6fd8f2b
JH
202 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
203 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
204 "4", /* Loop alignment. */
205 "4", /* Jump alignment. */
206 NULL, /* Label alignment. */
207 "4", /* Func alignment. */
64766e8d
JH
208};
209
/* memcpy strategy referenced by i486_cost: the first element names
   rep_prefix_4_byte as the only algorithm, the second is the
   DUMMY_STRINGOP_ALGS placeholder.  */
210static stringop_algs i486_memcpy[2] = {
211 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
212 DUMMY_STRINGOP_ALGS};
/* memset strategy referenced by i486_cost; same content as i486_memcpy
   (rep_prefix_4_byte, then the DUMMY_STRINGOP_ALGS placeholder).  */
213static stringop_algs i486_memset[2] = {
214 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
215 DUMMY_STRINGOP_ALGS};
216
/* Cost table for the 486.  Instruction costs are given with
   COSTS_N_INSNS; string operations use i486_memcpy / i486_memset.
   NOTE(review): the per-bit multiply cost below is the bare value 1,
   whereas i386_cost writes COSTS_N_INSNS (1), and DIVSD (74) is one
   higher than DIVSS and FDIV (73).  Confirm both are intentional.  */
217static const
218struct processor_costs i486_cost = { /* 486 specific costs */
219 COSTS_N_INSNS (1), /* cost of an add instruction */
220 COSTS_N_INSNS (1), /* cost of a lea instruction */
221 COSTS_N_INSNS (3), /* variable shift costs */
222 COSTS_N_INSNS (2), /* constant shift costs */
223 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
224 COSTS_N_INSNS (12), /* HI */
225 COSTS_N_INSNS (12), /* SI */
226 COSTS_N_INSNS (12), /* DI */
227 COSTS_N_INSNS (12)}, /* other */
228 1, /* cost of multiply per each bit set */
229 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
230 COSTS_N_INSNS (40), /* HI */
231 COSTS_N_INSNS (40), /* SI */
232 COSTS_N_INSNS (40), /* DI */
233 COSTS_N_INSNS (40)}, /* other */
234 COSTS_N_INSNS (3), /* cost of movsx */
235 COSTS_N_INSNS (2), /* cost of movzx */
236 15, /* "large" insn */
237 3, /* MOVE_RATIO */
df41dbaf
JH
238
239 /* All move costs are relative to integer->integer move times 2 and thus
240 they are latency*2. */
64766e8d
JH
241 4, /* cost for loading QImode using movzbl */
242 {2, 4, 2}, /* cost of loading integer registers
243 in QImode, HImode and SImode.
244 Relative to reg-reg move (2). */
245 {2, 4, 2}, /* cost of storing integer registers */
246 2, /* cost of reg,reg fld/fst */
247 {8, 8, 8}, /* cost of loading fp registers
248 in SFmode, DFmode and XFmode */
249 {8, 8, 8}, /* cost of storing fp registers
250 in SFmode, DFmode and XFmode */
251 2, /* cost of moving MMX register */
252 {4, 8}, /* cost of loading MMX registers
253 in SImode and DImode */
254 {4, 8}, /* cost of storing MMX registers
255 in SImode and DImode */
df41dbaf
JH
256 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
257 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
258 in 32,64,128,256 and 512-bit */
259 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
260 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
261 in 32,64,128,256 and 512-bit */
262 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
263 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
264 4, 4, /* Gather load static, per_elt. */
265 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
266 4, /* size of l1 cache. 486 has 8kB cache
267 shared for code and data, so 4kB is
268 not really precise. */
269 4, /* size of l2 cache */
270 0, /* size of prefetch block */
271 0, /* number of parallel prefetches */
272 1, /* Branch cost */
273 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
274 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
275 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
276 COSTS_N_INSNS (3), /* cost of FABS instruction. */
277 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
278 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
6065f444 279
c53c148c 280 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
281 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
282 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
283 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
c53c148c
JH
284 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
285 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
6065f444
JH
286 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
287 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
288 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
289 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
64766e8d
JH
290 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
291 i486_memcpy,
292 i486_memset,
f6fd8f2b
JH
293 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
294 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
295 "16", /* Loop alignment. */
296 "16", /* Jump alignment. */
297 "0:0:8", /* Label alignment. */
298 "16", /* Func alignment. */
64766e8d
JH
299};
300
/* memcpy strategy referenced by both pentium_cost and lakemont_cost.
   The first element lists rep_prefix_4_byte with a bound of 256 and
   libcall with a bound of -1; the second is the DUMMY_STRINGOP_ALGS
   placeholder.
   NOTE(review): the bounds are presumably maximum block sizes in bytes,
   with -1 meaning "everything larger"; confirm against the
   stringop_algs definition.  */
301static stringop_algs pentium_memcpy[2] = {
302 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
303 DUMMY_STRINGOP_ALGS};
/* memset strategy referenced by both pentium_cost and lakemont_cost.
   The first element has libcall as its leading algorithm and a single
   size-table entry naming rep_prefix_4_byte with bound -1; the second
   is the DUMMY_STRINGOP_ALGS placeholder.  */
304static stringop_algs pentium_memset[2] = {
305 {libcall, {{-1, rep_prefix_4_byte, false}}},
306 DUMMY_STRINGOP_ALGS};
307
/* Cost table named for the Pentium.  Instruction costs are given with
   COSTS_N_INSNS; string operations use pentium_memcpy /
   pentium_memset.  */
308static const
309struct processor_costs pentium_cost = {
310 COSTS_N_INSNS (1), /* cost of an add instruction */
311 COSTS_N_INSNS (1), /* cost of a lea instruction */
312 COSTS_N_INSNS (4), /* variable shift costs */
313 COSTS_N_INSNS (1), /* constant shift costs */
314 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
315 COSTS_N_INSNS (11), /* HI */
316 COSTS_N_INSNS (11), /* SI */
317 COSTS_N_INSNS (11), /* DI */
318 COSTS_N_INSNS (11)}, /* other */
319 0, /* cost of multiply per each bit set */
320 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
321 COSTS_N_INSNS (25), /* HI */
322 COSTS_N_INSNS (25), /* SI */
323 COSTS_N_INSNS (25), /* DI */
324 COSTS_N_INSNS (25)}, /* other */
325 COSTS_N_INSNS (3), /* cost of movsx */
326 COSTS_N_INSNS (2), /* cost of movzx */
327 8, /* "large" insn */
328 6, /* MOVE_RATIO */
df41dbaf
JH
329
330 /* All move costs are relative to integer->integer move times 2 and thus
331 they are latency*2. */
64766e8d
JH
332 6, /* cost for loading QImode using movzbl */
333 {2, 4, 2}, /* cost of loading integer registers
334 in QImode, HImode and SImode.
335 Relative to reg-reg move (2). */
336 {2, 4, 2}, /* cost of storing integer registers */
337 2, /* cost of reg,reg fld/fst */
338 {2, 2, 6}, /* cost of loading fp registers
339 in SFmode, DFmode and XFmode */
340 {4, 4, 6}, /* cost of storing fp registers
341 in SFmode, DFmode and XFmode */
342 8, /* cost of moving MMX register */
343 {8, 8}, /* cost of loading MMX registers
344 in SImode and DImode */
345 {8, 8}, /* cost of storing MMX registers
346 in SImode and DImode */
df41dbaf
JH
347 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
348 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
349 in 32,64,128,256 and 512-bit */
350 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
351 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
352 in 32,64,128,256 and 512-bit */
353 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
354 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
355 4, 4, /* Gather load static, per_elt. */
356 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
357 8, /* size of l1 cache. */
358 8, /* size of l2 cache */
359 0, /* size of prefetch block */
360 0, /* number of parallel prefetches */
361 2, /* Branch cost */
362 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
363 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
364 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
365 COSTS_N_INSNS (1), /* cost of FABS instruction. */
366 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
367 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
6065f444 368
c53c148c 369 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
370 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
371 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
372 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
c53c148c
JH
373 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
374 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
375 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
376 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
377 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
378 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
64766e8d
JH
379 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
380 pentium_memcpy,
381 pentium_memset,
f6fd8f2b
JH
382 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
383 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
384 "16:8:8", /* Loop alignment. */
385 "16:8:8", /* Jump alignment. */
386 "0:0:8", /* Label alignment. */
387 "16", /* Func alignment. */
64766e8d
JH
388};
389
/* Cost table named for Lakemont.  It has no string-operation tables of
   its own and reuses pentium_memcpy / pentium_memset.  Compared with
   pentium_cost, the entries that differ are the lea cost
   (COSTS_N_INSNS (1) + 1), the variable shift cost, MOVE_RATIO (17) and
   the SSE arithmetic costs.  */
390static const
391struct processor_costs lakemont_cost = {
392 COSTS_N_INSNS (1), /* cost of an add instruction */
393 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
394 COSTS_N_INSNS (1), /* variable shift costs */
395 COSTS_N_INSNS (1), /* constant shift costs */
396 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
397 COSTS_N_INSNS (11), /* HI */
398 COSTS_N_INSNS (11), /* SI */
399 COSTS_N_INSNS (11), /* DI */
400 COSTS_N_INSNS (11)}, /* other */
401 0, /* cost of multiply per each bit set */
402 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
403 COSTS_N_INSNS (25), /* HI */
404 COSTS_N_INSNS (25), /* SI */
405 COSTS_N_INSNS (25), /* DI */
406 COSTS_N_INSNS (25)}, /* other */
407 COSTS_N_INSNS (3), /* cost of movsx */
408 COSTS_N_INSNS (2), /* cost of movzx */
409 8, /* "large" insn */
410 17, /* MOVE_RATIO */
df41dbaf
JH
411
412 /* All move costs are relative to integer->integer move times 2 and thus
413 they are latency*2. */
64766e8d
JH
414 6, /* cost for loading QImode using movzbl */
415 {2, 4, 2}, /* cost of loading integer registers
416 in QImode, HImode and SImode.
417 Relative to reg-reg move (2). */
418 {2, 4, 2}, /* cost of storing integer registers */
419 2, /* cost of reg,reg fld/fst */
420 {2, 2, 6}, /* cost of loading fp registers
421 in SFmode, DFmode and XFmode */
422 {4, 4, 6}, /* cost of storing fp registers
423 in SFmode, DFmode and XFmode */
424 8, /* cost of moving MMX register */
425 {8, 8}, /* cost of loading MMX registers
426 in SImode and DImode */
427 {8, 8}, /* cost of storing MMX registers
428 in SImode and DImode */
df41dbaf
JH
429 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
430 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
431 in 32,64,128,256 and 512-bit */
432 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
433 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
434 in 32,64,128,256 and 512-bit */
435 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
436 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
437 4, 4, /* Gather load static, per_elt. */
438 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
439 8, /* size of l1 cache. */
440 8, /* size of l2 cache */
441 0, /* size of prefetch block */
442 0, /* number of parallel prefetches */
443 2, /* Branch cost */
444 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
445 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
446 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
447 COSTS_N_INSNS (1), /* cost of FABS instruction. */
448 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
449 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
6065f444 450
c53c148c 451 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
452 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
453 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
454 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
455 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
456 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
6065f444
JH
457 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
458 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
459 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
460 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
64766e8d
JH
461 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
462 pentium_memcpy,
463 pentium_memset,
f6fd8f2b
JH
464 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
465 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
466 "16:8:8", /* Loop alignment. */
467 "16:8:8", /* Jump alignment. */
468 "0:0:8", /* Label alignment. */
469 "16", /* Func alignment. */
64766e8d
JH
470};
471
472/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
473 (we ensure the alignment). For small blocks inline loop is still a
474 noticeable win, for bigger blocks either rep movsl or rep movsb is
475 the way to go. Rep movsb apparently has a more expensive startup time in the CPU,
476 but after 4K the difference is down in the noise. */
/* memcpy strategy referenced by pentiumpro_cost.  The first element
   steps through loop (bound 128), unrolled_loop (bound 1024),
   rep_prefix_4_byte (bound 8192) and finally rep_prefix_1_byte
   (bound -1); the second is the DUMMY_STRINGOP_ALGS placeholder.  */
477static stringop_algs pentiumpro_memcpy[2] = {
478 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
479 {8192, rep_prefix_4_byte, false},
480 {-1, rep_prefix_1_byte, false}}},
481 DUMMY_STRINGOP_ALGS};
/* memset strategy referenced by pentiumpro_cost.  The first element
   steps through unrolled_loop (bound 1024), rep_prefix_4_byte
   (bound 8192) and finally libcall (bound -1); the second is the
   DUMMY_STRINGOP_ALGS placeholder.  */
482static stringop_algs pentiumpro_memset[2] = {
483 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
484 {8192, rep_prefix_4_byte, false},
485 {-1, libcall, false}}},
486 DUMMY_STRINGOP_ALGS};
/* Cost table named for the PentiumPro.  Instruction costs are given
   with COSTS_N_INSNS; string operations use pentiumpro_memcpy /
   pentiumpro_memset.  This is the first table in this file with
   non-zero prefetch parameters (block size 32, 6 parallel
   prefetches).  */
487static const
488struct processor_costs pentiumpro_cost = {
489 COSTS_N_INSNS (1), /* cost of an add instruction */
490 COSTS_N_INSNS (1), /* cost of a lea instruction */
491 COSTS_N_INSNS (1), /* variable shift costs */
492 COSTS_N_INSNS (1), /* constant shift costs */
493 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
494 COSTS_N_INSNS (4), /* HI */
495 COSTS_N_INSNS (4), /* SI */
496 COSTS_N_INSNS (4), /* DI */
497 COSTS_N_INSNS (4)}, /* other */
498 0, /* cost of multiply per each bit set */
499 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
500 COSTS_N_INSNS (17), /* HI */
501 COSTS_N_INSNS (17), /* SI */
502 COSTS_N_INSNS (17), /* DI */
503 COSTS_N_INSNS (17)}, /* other */
504 COSTS_N_INSNS (1), /* cost of movsx */
505 COSTS_N_INSNS (1), /* cost of movzx */
506 8, /* "large" insn */
507 6, /* MOVE_RATIO */
df41dbaf
JH
508
509 /* All move costs are relative to integer->integer move times 2 and thus
510 they are latency*2. */
64766e8d
JH
511 2, /* cost for loading QImode using movzbl */
512 {4, 4, 4}, /* cost of loading integer registers
513 in QImode, HImode and SImode.
514 Relative to reg-reg move (2). */
515 {2, 2, 2}, /* cost of storing integer registers */
516 2, /* cost of reg,reg fld/fst */
517 {2, 2, 6}, /* cost of loading fp registers
518 in SFmode, DFmode and XFmode */
519 {4, 4, 6}, /* cost of storing fp registers
520 in SFmode, DFmode and XFmode */
521 2, /* cost of moving MMX register */
522 {2, 2}, /* cost of loading MMX registers
523 in SImode and DImode */
524 {2, 2}, /* cost of storing MMX registers
525 in SImode and DImode */
df41dbaf
JH
526 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
527 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
528 in 32,64,128,256 and 512-bit */
529 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
530 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
531 in 32,64,128,256 and 512-bit */
532 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
533 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
534 4, 4, /* Gather load static, per_elt. */
535 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
536 8, /* size of l1 cache. */
537 256, /* size of l2 cache */
538 32, /* size of prefetch block */
539 6, /* number of parallel prefetches */
540 2, /* Branch cost */
541 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
542 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
543 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
544 COSTS_N_INSNS (2), /* cost of FABS instruction. */
545 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
546 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
6065f444 547
c53c148c 548 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
549 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
550 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
551 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
552 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
553 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
6065f444
JH
554 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
555 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
556 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
557 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
64766e8d
JH
558 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
559 pentiumpro_memcpy,
560 pentiumpro_memset,
f6fd8f2b
JH
561 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
562 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
563 "16", /* Loop alignment. */
564 "16:11:8", /* Jump alignment. */
565 "0:0:8", /* Label alignment. */
566 "16", /* Func alignment. */
64766e8d
JH
567};
568
/* memcpy strategy referenced by geode_cost: the first element lists
   rep_prefix_4_byte with a bound of 256 and libcall with a bound of -1;
   the second is the DUMMY_STRINGOP_ALGS placeholder.  */
569static stringop_algs geode_memcpy[2] = {
570 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
571 DUMMY_STRINGOP_ALGS};
/* memset strategy referenced by geode_cost; same content as
   geode_memcpy.  */
572static stringop_algs geode_memset[2] = {
573 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
574 DUMMY_STRINGOP_ALGS};
/* Cost table named for Geode.  Instruction costs are given with
   COSTS_N_INSNS; string operations use geode_memcpy / geode_memset.
   Multiply and divide costs here vary by operand mode, and every
   alignment entry is NULL.  */
575static const
576struct processor_costs geode_cost = {
577 COSTS_N_INSNS (1), /* cost of an add instruction */
578 COSTS_N_INSNS (1), /* cost of a lea instruction */
579 COSTS_N_INSNS (2), /* variable shift costs */
580 COSTS_N_INSNS (1), /* constant shift costs */
581 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
582 COSTS_N_INSNS (4), /* HI */
583 COSTS_N_INSNS (7), /* SI */
584 COSTS_N_INSNS (7), /* DI */
585 COSTS_N_INSNS (7)}, /* other */
586 0, /* cost of multiply per each bit set */
587 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
588 COSTS_N_INSNS (23), /* HI */
589 COSTS_N_INSNS (39), /* SI */
590 COSTS_N_INSNS (39), /* DI */
591 COSTS_N_INSNS (39)}, /* other */
592 COSTS_N_INSNS (1), /* cost of movsx */
593 COSTS_N_INSNS (1), /* cost of movzx */
594 8, /* "large" insn */
595 4, /* MOVE_RATIO */
df41dbaf
JH
596
597 /* All move costs are relative to integer->integer move times 2 and thus
598 they are latency*2. */
599 2, /* cost for loading QImode using movzbl */
600 {2, 2, 2}, /* cost of loading integer registers
64766e8d
JH
601 in QImode, HImode and SImode.
602 Relative to reg-reg move (2). */
df41dbaf
JH
603 {2, 2, 2}, /* cost of storing integer registers */
604 2, /* cost of reg,reg fld/fst */
605 {2, 2, 2}, /* cost of loading fp registers
64766e8d
JH
606 in SFmode, DFmode and XFmode */
607 {4, 6, 6}, /* cost of storing fp registers
608 in SFmode, DFmode and XFmode */
609
610 2, /* cost of moving MMX register */
611 {2, 2}, /* cost of loading MMX registers
612 in SImode and DImode */
613 {2, 2}, /* cost of storing MMX registers
614 in SImode and DImode */
df41dbaf
JH
615 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
616 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
617 in 32,64,128,256 and 512-bit */
618 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
619 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
620 in 32,64,128,256 and 512-bit */
621 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
622 6, 6, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
623 2, 2, /* Gather load static, per_elt. */
624 2, 2, /* Gather store static, per_elt. */
64766e8d
JH
625 64, /* size of l1 cache. */
626 128, /* size of l2 cache. */
627 32, /* size of prefetch block */
628 1, /* number of parallel prefetches */
629 1, /* Branch cost */
630 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (1), /* cost of FABS instruction. */
634 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
6065f444 636
c53c148c 637 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
638 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
639 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
640 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
c53c148c
JH
641 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
642 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
6065f444
JH
643 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
644 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
645 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
646 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
64766e8d
JH
647 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
648 geode_memcpy,
649 geode_memset,
f6fd8f2b
JH
650 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
651 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
652 NULL, /* Loop alignment. */
653 NULL, /* Jump alignment. */
654 NULL, /* Label alignment. */
655 NULL, /* Func alignment. */
64766e8d
JH
656};
657
/* memcpy strategy referenced by k6_cost: the first element lists
   rep_prefix_4_byte with a bound of 256 and libcall with a bound of -1;
   the second is the DUMMY_STRINGOP_ALGS placeholder.  */
658static stringop_algs k6_memcpy[2] = {
659 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
660 DUMMY_STRINGOP_ALGS};
/* memset strategy referenced by k6_cost; same content as k6_memcpy.  */
661static stringop_algs k6_memset[2] = {
662 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
663 DUMMY_STRINGOP_ALGS};
/* Cost table named for the K6.  Instruction costs are given with
   COSTS_N_INSNS; string operations use k6_memcpy / k6_memset.  The L2
   size is deliberately set equal to the L1 size (see the note on that
   entry).  */
664static const
665struct processor_costs k6_cost = {
666 COSTS_N_INSNS (1), /* cost of an add instruction */
667 COSTS_N_INSNS (2), /* cost of a lea instruction */
668 COSTS_N_INSNS (1), /* variable shift costs */
669 COSTS_N_INSNS (1), /* constant shift costs */
670 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
671 COSTS_N_INSNS (3), /* HI */
672 COSTS_N_INSNS (3), /* SI */
673 COSTS_N_INSNS (3), /* DI */
674 COSTS_N_INSNS (3)}, /* other */
675 0, /* cost of multiply per each bit set */
676 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
677 COSTS_N_INSNS (18), /* HI */
678 COSTS_N_INSNS (18), /* SI */
679 COSTS_N_INSNS (18), /* DI */
680 COSTS_N_INSNS (18)}, /* other */
681 COSTS_N_INSNS (2), /* cost of movsx */
682 COSTS_N_INSNS (2), /* cost of movzx */
683 8, /* "large" insn */
684 4, /* MOVE_RATIO */
df41dbaf
JH
685
686 /* All move costs are relative to integer->integer move times 2 and thus
687 they are latency*2. */
64766e8d
JH
688 3, /* cost for loading QImode using movzbl */
689 {4, 5, 4}, /* cost of loading integer registers
690 in QImode, HImode and SImode.
691 Relative to reg-reg move (2). */
692 {2, 3, 2}, /* cost of storing integer registers */
693 4, /* cost of reg,reg fld/fst */
694 {6, 6, 6}, /* cost of loading fp registers
695 in SFmode, DFmode and XFmode */
696 {4, 4, 4}, /* cost of storing fp registers
697 in SFmode, DFmode and XFmode */
698 2, /* cost of moving MMX register */
699 {2, 2}, /* cost of loading MMX registers
700 in SImode and DImode */
701 {2, 2}, /* cost of storing MMX registers
702 in SImode and DImode */
df41dbaf
JH
703 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
704 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
705 in 32,64,128,256 and 512-bit */
706 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
707 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
708 in 32,64,128,256 and 512-bit */
709 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
710 6, 6, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
711 2, 2, /* Gather load static, per_elt. */
712 2, 2, /* Gather store static, per_elt. */
64766e8d
JH
713 32, /* size of l1 cache. */
714 32, /* size of l2 cache. Some models
715 have integrated l2 cache, but
716 optimizing for k6 is not important
717 enough to worry about that. */
718 32, /* size of prefetch block */
719 1, /* number of parallel prefetches */
720 1, /* Branch cost */
721 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
722 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
723 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
724 COSTS_N_INSNS (2), /* cost of FABS instruction. */
725 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
726 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
6065f444 727
c53c148c 728 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
729 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
730 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
731 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
c53c148c
JH
732 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
733 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
6065f444
JH
734 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
735 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
736 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
737 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
64766e8d
JH
738 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
739 k6_memcpy,
740 k6_memset,
f6fd8f2b
JH
741 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
742 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
743 "32:8:8", /* Loop alignment. */
744 "32:8:8", /* Jump alignment. */
745 "0:0:8", /* Label alignment. */
746 "32", /* Func alignment. */
64766e8d
JH
747};
748
749/* For some reason, Athlon deals better with REP prefix (relative to loops)
750 compared to K8. Alignment becomes important after 8 bytes for memcpy and
751 128 bytes for memset.
 Both tables list {2048, rep_prefix_4_byte} followed by {-1, libcall}; the
 second element of each is DUMMY_STRINGOP_ALGS, which lists only libcall.
 NOTE(review): presumably the first number is an upper block-size bound
 (-1 meaning "no bound") and the two array elements are per-target-mode
 variants -- confirm against the stringop_algs definition. */
752static stringop_algs athlon_memcpy[2] = {
753 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
754 DUMMY_STRINGOP_ALGS};
755static stringop_algs athlon_memset[2] = {
756 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
757 DUMMY_STRINGOP_ALGS};
/* Operation costs for Athlon. The initializer is positional: each value is
 identified only by its position and its trailing comment, so keep the two
 in sync when editing. */
758static const
759struct processor_costs athlon_cost = {
760 COSTS_N_INSNS (1), /* cost of an add instruction */
761 COSTS_N_INSNS (2), /* cost of a lea instruction */
762 COSTS_N_INSNS (1), /* variable shift costs */
763 COSTS_N_INSNS (1), /* constant shift costs */
764 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
765 COSTS_N_INSNS (5), /* HI */
766 COSTS_N_INSNS (5), /* SI */
767 COSTS_N_INSNS (5), /* DI */
768 COSTS_N_INSNS (5)}, /* other */
769 0, /* cost of multiply per each bit set */
770 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
771 COSTS_N_INSNS (26), /* HI */
772 COSTS_N_INSNS (42), /* SI */
773 COSTS_N_INSNS (74), /* DI */
774 COSTS_N_INSNS (74)}, /* other */
775 COSTS_N_INSNS (1), /* cost of movsx */
776 COSTS_N_INSNS (1), /* cost of movzx */
777 8, /* "large" insn */
778 9, /* MOVE_RATIO */
df41dbaf
JH
779
780 /* All move costs are relative to integer->integer move times 2 and thus
781 they are latency*2. */
64766e8d
JH
782 4, /* cost for loading QImode using movzbl */
783 {3, 4, 3}, /* cost of loading integer registers
784 in QImode, HImode and SImode.
785 Relative to reg-reg move (2). */
786 {3, 4, 3}, /* cost of storing integer registers */
787 4, /* cost of reg,reg fld/fst */
788 {4, 4, 12}, /* cost of loading fp registers
789 in SFmode, DFmode and XFmode */
790 {6, 6, 8}, /* cost of storing fp registers
791 in SFmode, DFmode and XFmode */
792 2, /* cost of moving MMX register */
793 {4, 4}, /* cost of loading MMX registers
794 in SImode and DImode */
795 {4, 4}, /* cost of storing MMX registers
796 in SImode and DImode */
df41dbaf 797 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
b7167993 798 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
df41dbaf 799 in 32,64,128,256 and 512-bit */
b7167993
RB
800 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
801 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
df41dbaf 802 in 32,64,128,256 and 512-bit */
b7167993 803 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
df41dbaf 804 5, 5, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
805 4, 4, /* Gather load static, per_elt. */
806 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
807 64, /* size of l1 cache. */
808 256, /* size of l2 cache. */
809 64, /* size of prefetch block */
810 6, /* number of parallel prefetches */
811 5, /* Branch cost */
812 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
813 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
814 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
815 COSTS_N_INSNS (2), /* cost of FABS instruction. */
816 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
817 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 818
c53c148c 819 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
820 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
821 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
822 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
823 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
824 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
825 /* 11-16: range for the DIVSS entry below, whose cost is the upper bound
 (presumably a latency range in cycles -- confirm). */
826 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
827 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
828 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
829 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
64766e8d
JH
830 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
831 athlon_memcpy,
832 athlon_memset,
f6fd8f2b
JH
833 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
834 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
835 "16:8:8", /* Loop alignment. */
836 "16:8:8", /* Jump alignment. */
837 "0:0:8", /* Label alignment. */
838 "16", /* Func alignment. */
64766e8d
JH
839};
840
841/* K8 has optimized REP instruction for medium sized blocks, but for very
842 small blocks it is better to use loop. For large blocks, libcall can
843 do non-temporal accesses and beat inline considerably.
 NOTE(review): the first k8_memcpy element ends with
 {-1, rep_prefix_4_byte, false} instead of a libcall entry, unlike the
 other three table elements here -- presumably intentional; confirm. */
844static stringop_algs k8_memcpy[2] = {
845 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
846 {-1, rep_prefix_4_byte, false}}},
847 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
848 {-1, libcall, false}}}};
849static stringop_algs k8_memset[2] = {
850 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
851 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
852 {libcall, {{48, unrolled_loop, false},
853 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation costs for K8. The initializer is positional: each value is
 identified only by its position and its trailing comment, so keep the two
 in sync when editing. */
854static const
855struct processor_costs k8_cost = {
856 COSTS_N_INSNS (1), /* cost of an add instruction */
857 COSTS_N_INSNS (2), /* cost of a lea instruction */
858 COSTS_N_INSNS (1), /* variable shift costs */
859 COSTS_N_INSNS (1), /* constant shift costs */
860 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
861 COSTS_N_INSNS (4), /* HI */
862 COSTS_N_INSNS (3), /* SI */
863 COSTS_N_INSNS (4), /* DI */
864 COSTS_N_INSNS (5)}, /* other */
865 0, /* cost of multiply per each bit set */
866 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
867 COSTS_N_INSNS (26), /* HI */
868 COSTS_N_INSNS (42), /* SI */
869 COSTS_N_INSNS (74), /* DI */
870 COSTS_N_INSNS (74)}, /* other */
871 COSTS_N_INSNS (1), /* cost of movsx */
872 COSTS_N_INSNS (1), /* cost of movzx */
873 8, /* "large" insn */
874 9, /* MOVE_RATIO */
df41dbaf
JH
875
876 /* All move costs are relative to integer->integer move times 2 and thus
877 they are latency*2. */
64766e8d
JH
878 4, /* cost for loading QImode using movzbl */
879 {3, 4, 3}, /* cost of loading integer registers
880 in QImode, HImode and SImode.
881 Relative to reg-reg move (2). */
882 {3, 4, 3}, /* cost of storing integer registers */
883 4, /* cost of reg,reg fld/fst */
884 {4, 4, 12}, /* cost of loading fp registers
885 in SFmode, DFmode and XFmode */
886 {6, 6, 8}, /* cost of storing fp registers
887 in SFmode, DFmode and XFmode */
888 2, /* cost of moving MMX register */
889 {3, 3}, /* cost of loading MMX registers
890 in SImode and DImode */
891 {4, 4}, /* cost of storing MMX registers
892 in SImode and DImode */
df41dbaf 893 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
b7167993 894 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
df41dbaf 895 in 32,64,128,256 and 512-bit */
b7167993
RB
896 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
897 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
df41dbaf 898 in 32,64,128,256 and 512-bit */
b7167993 899 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
df41dbaf 900 5, 5, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
901 4, 4, /* Gather load static, per_elt. */
902 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
903 64, /* size of l1 cache. */
904 512, /* size of l2 cache. */
905 64, /* size of prefetch block */
906 /* New AMD processors never drop prefetches; if they cannot be performed
907 immediately, they are queued. We set the number of simultaneous prefetches
908 to a large constant to reflect this (leaving the number of prefetches
909 completely unlimited is probably not a good idea, as their execution also
910 takes some time). */
911 100, /* number of parallel prefetches */
912 3, /* Branch cost */
913 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
914 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
915 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
916 COSTS_N_INSNS (2), /* cost of FABS instruction. */
917 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
918 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 919
c53c148c 920 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
921 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
922 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
923 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
924 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
925 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
926 /* 11-16: range for the DIVSS entry below, whose cost is the upper bound
 (presumably a latency range in cycles -- confirm). */
927 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
928 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
929 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
930 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
64766e8d
JH
931 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
932 k8_memcpy,
933 k8_memset,
f6fd8f2b
JH
934 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
935 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
7dc58b50
ML
936 "16:8:8", /* Loop alignment. */
937 "16:8:8", /* Jump alignment. */
938 "0:0:8", /* Label alignment. */
939 "16", /* Func alignment. */
64766e8d
JH
940};
941
942/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
943 very small blocks it is better to use loop. For large blocks, libcall can
944 do non-temporal accesses and beat inline considerably.
 The entries below are identical to those of k8_memcpy and k8_memset. */
945static stringop_algs amdfam10_memcpy[2] = {
946 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
947 {-1, rep_prefix_4_byte, false}}},
948 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
949 {-1, libcall, false}}}};
950static stringop_algs amdfam10_memset[2] = {
951 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
952 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
953 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
954 {-1, libcall, false}}}};
/* Operation costs for AMDFAM10. The initializer is positional: each value
 is identified only by its position and its trailing comment, so keep the
 two in sync when editing.
 NOTE(review): unlike athlon_cost and k8_cost, this table is declared
 without "static const" -- confirm whether that is intentional. */
955struct processor_costs amdfam10_cost = {
956 COSTS_N_INSNS (1), /* cost of an add instruction */
957 COSTS_N_INSNS (2), /* cost of a lea instruction */
958 COSTS_N_INSNS (1), /* variable shift costs */
959 COSTS_N_INSNS (1), /* constant shift costs */
960 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
961 COSTS_N_INSNS (4), /* HI */
962 COSTS_N_INSNS (3), /* SI */
963 COSTS_N_INSNS (4), /* DI */
964 COSTS_N_INSNS (5)}, /* other */
965 0, /* cost of multiply per each bit set */
966 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
967 COSTS_N_INSNS (35), /* HI */
968 COSTS_N_INSNS (51), /* SI */
969 COSTS_N_INSNS (83), /* DI */
970 COSTS_N_INSNS (83)}, /* other */
971 COSTS_N_INSNS (1), /* cost of movsx */
972 COSTS_N_INSNS (1), /* cost of movzx */
973 8, /* "large" insn */
974 9, /* MOVE_RATIO */
df41dbaf
JH
975
976 /* All move costs are relative to integer->integer move times 2 and thus
977 they are latency*2. */
64766e8d
JH
978 4, /* cost for loading QImode using movzbl */
979 {3, 4, 3}, /* cost of loading integer registers
980 in QImode, HImode and SImode.
981 Relative to reg-reg move (2). */
982 {3, 4, 3}, /* cost of storing integer registers */
983 4, /* cost of reg,reg fld/fst */
984 {4, 4, 12}, /* cost of loading fp registers
985 in SFmode, DFmode and XFmode */
986 {6, 6, 8}, /* cost of storing fp registers
987 in SFmode, DFmode and XFmode */
988 2, /* cost of moving MMX register */
989 {3, 3}, /* cost of loading MMX registers
990 in SImode and DImode */
991 {4, 4}, /* cost of storing MMX registers
992 in SImode and DImode */
df41dbaf
JH
993 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
994 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
995 in 32,64,128,256 and 512-bit */
996 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
997 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
998 in 32,64,128,256 and 512-bit */
999 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1000 3, 3, /* SSE->integer and integer->SSE moves */
64766e8d
JH
1001 /* Reference figures for MOVD between integer and XMM registers
 (presumably the basis for the move cost of 3 above -- confirm).
 On K8:
1002 MOVD reg64, xmmreg Double FSTORE 4
1003 MOVD reg32, xmmreg Double FSTORE 4
1004 On AMDFAM10:
1005 MOVD reg64, xmmreg Double FADD 3
1006 1/1 1/1
1007 MOVD reg32, xmmreg Double FADD 3
1008 1/1 1/1 */
a4fe6139
JH
1009 4, 4, /* Gather load static, per_elt. */
1010 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
1011 64, /* size of l1 cache. */
1012 512, /* size of l2 cache. */
1013 64, /* size of prefetch block */
1014 /* New AMD processors never drop prefetches; if they cannot be performed
1015 immediately, they are queued. We set the number of simultaneous prefetches
1016 to a large constant to reflect this (leaving the number of prefetches
1017 completely unlimited is probably not a good idea, as their execution also
1018 takes some time). */
1019 100, /* number of parallel prefetches */
1020 2, /* Branch cost */
1021 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1022 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1023 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1024 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1025 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1026 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1027
c53c148c 1028 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1029 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1030 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1031 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1032 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1033 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
1034 /* 11-16: range for the DIVSS entry below, whose cost is the upper bound
 (presumably a latency range in cycles -- confirm). */
1035 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1036 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1037 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1038 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
64766e8d
JH
1039 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1040 amdfam10_memcpy,
1041 amdfam10_memset,
f6fd8f2b
JH
1042 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1043 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1044 "32:25:8", /* Loop alignment. */
1045 "32:8:8", /* Jump alignment. */
1046 "0:0:8", /* Label alignment. */
1047 "32", /* Func alignment. */
64766e8d
JH
1048};
1049
c727b835 1050/* BDVER has optimized REP instruction for medium sized blocks, but for
64766e8d
JH
1051 very small blocks it is better to use loop. For large blocks, libcall
1052 can do non-temporal accesses and beat inline considerably.
 The entries below are identical to those of k8_memcpy and k8_memset. */
c727b835 1053static stringop_algs bdver_memcpy[2] = {
64766e8d
JH
1054 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1055 {-1, rep_prefix_4_byte, false}}},
1056 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1057 {-1, libcall, false}}}};
c727b835 1058static stringop_algs bdver_memset[2] = {
64766e8d
JH
1059 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1060 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1061 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1062 {-1, libcall, false}}}};
1063
/* Operation costs for BDVER. The initializer is positional: each value is
 identified only by its position and its trailing comment, so keep the two
 in sync when editing.
 NOTE(review): declared "const" but not "static", unlike athlon_cost and
 k8_cost -- confirm whether that is intentional. */
c727b835 1064const struct processor_costs bdver_cost = {
64766e8d
JH
1065 COSTS_N_INSNS (1), /* cost of an add instruction */
1066 COSTS_N_INSNS (1), /* cost of a lea instruction */
1067 COSTS_N_INSNS (1), /* variable shift costs */
1068 COSTS_N_INSNS (1), /* constant shift costs */
1069 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1070 COSTS_N_INSNS (4), /* HI */
1071 COSTS_N_INSNS (4), /* SI */
1072 COSTS_N_INSNS (6), /* DI */
1073 COSTS_N_INSNS (6)}, /* other */
1074 0, /* cost of multiply per each bit set */
1075 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1076 COSTS_N_INSNS (35), /* HI */
1077 COSTS_N_INSNS (51), /* SI */
1078 COSTS_N_INSNS (83), /* DI */
1079 COSTS_N_INSNS (83)}, /* other */
1080 COSTS_N_INSNS (1), /* cost of movsx */
1081 COSTS_N_INSNS (1), /* cost of movzx */
1082 8, /* "large" insn */
1083 9, /* MOVE_RATIO */
df41dbaf
JH
1084
1085 /* All move costs are relative to integer->integer move times 2 and thus
1086 they are latency*2. */
1087 8, /* cost for loading QImode using movzbl */
1088 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
1089 in QImode, HImode and SImode.
1090 Relative to reg-reg move (2). */
df41dbaf
JH
1091 {8, 8, 8}, /* cost of storing integer registers */
1092 4, /* cost of reg,reg fld/fst */
1093 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1094 in SFmode, DFmode and XFmode */
df41dbaf 1095 {10, 10, 18}, /* cost of storing fp registers
64766e8d 1096 in SFmode, DFmode and XFmode */
df41dbaf
JH
1097 4, /* cost of moving MMX register */
1098 {12, 12}, /* cost of loading MMX registers
64766e8d 1099 in SImode and DImode */
df41dbaf 1100 {10, 10}, /* cost of storing MMX registers
64766e8d 1101 in SImode and DImode */
df41dbaf 1102 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
b7167993 1103 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
df41dbaf 1104 in 32,64,128,256 and 512-bit */
b7167993
RB
1105 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1106 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
df41dbaf 1107 in 32,64,128,256 and 512-bit */
b7167993 1108 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
df41dbaf 1109 16, 20, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1110 12, 12, /* Gather load static, per_elt. */
1111 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1112 16, /* size of l1 cache. */
1113 2048, /* size of l2 cache. */
1114 64, /* size of prefetch block */
1115 /* New AMD processors never drop prefetches; if they cannot be performed
1116 immediately, they are queued. We set the number of simultaneous prefetches
1117 to a large constant to reflect this (leaving the number of prefetches
1118 completely unlimited is probably not a good idea, as their execution also
1119 takes some time). */
1120 100, /* number of parallel prefetches */
1121 2, /* Branch cost */
1122 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1123 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1124 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1125 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1126 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1127 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1128
c53c148c 1129 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1130 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1131 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1132 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1133 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1134 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1135 /* 9-24: range for DIVSS; the upper bound is used as the cost
 (presumably a latency range in cycles -- confirm). */
1136 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1137 /* 9-27: range for DIVSD; the upper bound is used as the cost. */
1138 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1139 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1140 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d 1141 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
c727b835
RB
1142 bdver_memcpy,
1143 bdver_memset,
f6fd8f2b
JH
1144 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1145 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1146 "16:11:8", /* Loop alignment. */
1147 "16:8:8", /* Jump alignment. */
1148 "0:0:8", /* Label alignment. */
1149 "11", /* Func alignment. NOTE(review): the other tables
 here use "16" or "32" -- confirm "11" is intended. */
64766e8d
JH
1150};
1151
1152
1153/* ZNVER1 has optimized REP instruction for medium sized blocks, but for
1154 very small blocks it is better to use loop. For large blocks, libcall
1155 can do non-temporal accesses and beat inline considerably.
 The entries below are identical to those of k8_memcpy and k8_memset. */
1156static stringop_algs znver1_memcpy[2] = {
1157 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1158 {-1, rep_prefix_4_byte, false}}},
1159 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1160 {-1, libcall, false}}}};
1161static stringop_algs znver1_memset[2] = {
1162 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1163 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1164 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1165 {-1, libcall, false}}}};
/* Operation costs for ZNVER1. The initializer is positional: each value is
 identified only by its position and its trailing comment, so keep the two
 in sync when editing.
 NOTE(review): declared without "static const", unlike athlon_cost and
 k8_cost -- confirm whether that is intentional. */
1166struct processor_costs znver1_cost = {
1167 COSTS_N_INSNS (1), /* cost of an add instruction. */
1168 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1169 COSTS_N_INSNS (1), /* variable shift costs. */
1170 COSTS_N_INSNS (1), /* constant shift costs. */
1171 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1172 COSTS_N_INSNS (3), /* HI. */
1173 COSTS_N_INSNS (3), /* SI. */
6065f444
JH
1174 COSTS_N_INSNS (3), /* DI. */
1175 COSTS_N_INSNS (3)}, /* other. */
64766e8d
JH
1176 0, /* cost of multiply per each bit
1177 set. */
6065f444
JH
1178 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1179 bound. */
1180 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1181 COSTS_N_INSNS (22), /* HI. */
1182 COSTS_N_INSNS (30), /* SI. */
1183 COSTS_N_INSNS (45), /* DI. */
1184 COSTS_N_INSNS (45)}, /* other. */
64766e8d
JH
1185 COSTS_N_INSNS (1), /* cost of movsx. */
1186 COSTS_N_INSNS (1), /* cost of movzx. */
1187 8, /* "large" insn. */
1188 9, /* MOVE_RATIO. */
01118373 1189
df41dbaf
JH
1190 /* All move costs are relative to integer->integer move times 2 and thus
1191 they are latency*2. */
1192
01118373
JH
1193 /* reg-reg moves are done by renaming and thus they are even cheaper than
1194 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1195 to doubled latencies, we do not model this correctly. It does not
1196 seem to make a practical difference to bump prices up even more. */
1197 6, /* cost for loading QImode using
64766e8d 1198 movzbl. */
01118373 1199 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
1200 in QImode, HImode and SImode.
1201 Relative to reg-reg move (2). */
01118373 1202 {8, 8, 8}, /* cost of storing integer
64766e8d
JH
1203 registers. */
1204 2, /* cost of reg,reg fld/fst. */
01118373 1205 {6, 6, 16}, /* cost of loading fp registers
64766e8d 1206 in SFmode, DFmode and XFmode. */
01118373 1207 {8, 8, 16}, /* cost of storing fp registers
64766e8d
JH
1208 in SFmode, DFmode and XFmode. */
1209 2, /* cost of moving MMX register. */
01118373 1210 {6, 6}, /* cost of loading MMX registers
64766e8d 1211 in SImode and DImode. */
01118373 1212 {8, 8}, /* cost of storing MMX registers
64766e8d 1213 in SImode and DImode. */
df41dbaf 1214 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
b7167993 1215 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
df41dbaf 1216 in 32,64,128,256 and 512-bit. */
b7167993
RB
1217 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1218 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
df41dbaf 1219 in 32,64,128,256 and 512-bit. */
b7167993 1220 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
df41dbaf 1221 6, 6, /* SSE->integer and integer->SSE moves. */
a4fe6139
JH
1222 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1223 throughput 12. Approx 9 uops do not depend on vector size and every load
1224 is 7 uops.
 NOTE(review): VGATHERDPD is named twice with different figures;
 presumably these are the 2-element and 4-element forms (9 + 2*7 = 23)
 -- confirm. */
1225 18, 8, /* Gather load static, per_elt. */
1226 18, 10, /* Gather store static, per_elt. */
64766e8d
JH
1227 32, /* size of l1 cache. */
1228 512, /* size of l2 cache. */
1229 64, /* size of prefetch block. */
1230 /* New AMD processors never drop prefetches; if they cannot be performed
1231 immediately, they are queued. We set the number of simultaneous prefetches
1232 to a large constant to reflect this (leaving the number of prefetches
1233 completely unlimited is probably not a good idea, as their execution also
1234 takes some time). */
1235 100, /* number of parallel prefetches. */
1236 3, /* Branch cost. */
6065f444
JH
1237 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1238 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1239 /* Latency of fdiv is 8-15. */
1240 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1243 /* Latency of fsqrt is 4-10. */
1244 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1245
c53c148c 1246 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1247 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1248 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1249 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1250 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1251 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1252 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1253 /* 9-13: range for DIVSD; the upper bound is used as the cost. */
1254 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1255 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1256 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
64766e8d
JH
1257 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1258 and it can execute 2 integer additions and 2 multiplications thus
1259 reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest
1260 that 4 works better than 6 probably due to register pressure.
1261
1262 Integer vector operations are taken by FP unit and execute 3 vector
1263 plus/minus operations per cycle but only one multiply. This is adjusted
1264 in ix86_reassociation_width. */
1265 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1266 znver1_memcpy,
1267 znver1_memset,
f6fd8f2b
JH
1268 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1269 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1270 "16", /* Loop alignment. */
1271 "16", /* Jump alignment. */
1272 "0:0:8", /* Label alignment. */
1273 "16", /* Func alignment. */
64766e8d
JH
1274};
1275
2901f42f
VK
1276/* ZNVER2 has optimized REP instruction for medium sized blocks, but for
1277 very small blocks it is better to use loop. For large blocks, libcall
1278 can do non-temporal accesses and beat inline considerably.
 The entries below are identical to those of znver1_memcpy and
 znver1_memset. */
1279static stringop_algs znver2_memcpy[2] = {
1280 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1281 {-1, rep_prefix_4_byte, false}}},
1282 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1283 {-1, libcall, false}}}};
1284static stringop_algs znver2_memset[2] = {
1285 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1286 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1287 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1288 {-1, libcall, false}}}};
1289
/* Operation costs for ZNVER2. The initializer is positional: each value is
 identified only by its position and its trailing comment, so keep the two
 in sync when editing.
 NOTE(review): declared without "static const", unlike athlon_cost and
 k8_cost -- confirm whether that is intentional. */
1290struct processor_costs znver2_cost = {
1291 COSTS_N_INSNS (1), /* cost of an add instruction. */
1292 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1293 COSTS_N_INSNS (1), /* variable shift costs. */
1294 COSTS_N_INSNS (1), /* constant shift costs. */
1295 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1296 COSTS_N_INSNS (3), /* HI. */
1297 COSTS_N_INSNS (3), /* SI. */
1298 COSTS_N_INSNS (3), /* DI. */
1299 COSTS_N_INSNS (3)}, /* other. */
1300 0, /* cost of multiply per each bit
1301 set. */
1302 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1303 bound. */
1304 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1305 COSTS_N_INSNS (22), /* HI. */
1306 COSTS_N_INSNS (30), /* SI. */
1307 COSTS_N_INSNS (45), /* DI. */
1308 COSTS_N_INSNS (45)}, /* other. */
1309 COSTS_N_INSNS (1), /* cost of movsx. */
1310 COSTS_N_INSNS (1), /* cost of movzx. */
1311 8, /* "large" insn. */
1312 9, /* MOVE_RATIO. */
1313
1314 /* All move costs are relative to integer->integer move times 2 and thus
1315 they are latency*2. */
1316
1317 /* reg-reg moves are done by renaming and thus they are even cheaper than
1318 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1319 to doubled latencies, we do not model this correctly. It does not
1320 seem to make a practical difference to bump prices up even more. */
1321 6, /* cost for loading QImode using
1322 movzbl. */
1323 {6, 6, 6}, /* cost of loading integer registers
1324 in QImode, HImode and SImode.
1325 Relative to reg-reg move (2). */
1326 {8, 8, 8}, /* cost of storing integer
1327 registers. */
1328 2, /* cost of reg,reg fld/fst. */
1329 {6, 6, 16}, /* cost of loading fp registers
1330 in SFmode, DFmode and XFmode. */
1331 {8, 8, 16}, /* cost of storing fp registers
1332 in SFmode, DFmode and XFmode. */
1333 2, /* cost of moving MMX register. */
1334 {6, 6}, /* cost of loading MMX registers
1335 in SImode and DImode. */
1336 {8, 8}, /* cost of storing MMX registers
1337 in SImode and DImode. */
1338 2, 3, 6, /* cost of moving XMM,YMM,ZMM
1339 register. */
1340 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1341 in 32,64,128,256 and 512-bit. */
1342 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1343 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1344 in 32,64,128,256 and 512-bit. */
1345 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1346 6, 6, /* SSE->integer and integer->SSE
1347 moves. */
1348 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1349 throughput 12. Approx 9 uops do not depend on vector size and every load
1350 is 7 uops.
 NOTE(review): VGATHERDPD is named twice with different figures;
 presumably these are the 2-element and 4-element forms (9 + 2*7 = 23)
 -- confirm. */
1351 18, 8, /* Gather load static, per_elt. */
1352 18, 10, /* Gather store static, per_elt. */
1353 32, /* size of l1 cache. */
1354 512, /* size of l2 cache. */
1355 64, /* size of prefetch block. */
1356 /* New AMD processors never drop prefetches; if they cannot be performed
1357 immediately, they are queued. We set the number of simultaneous prefetches
1358 to a large constant to reflect this (leaving the number of prefetches
1359 completely unlimited is probably not a good idea, as their execution also
1360 takes some time). */
1361 100, /* number of parallel prefetches. */
1362 3, /* Branch cost. */
1363 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1364 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1365 /* Latency of fdiv is 8-15. */
1366 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1367 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1368 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1369 /* Latency of fsqrt is 4-10. */
1370 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1371
1372 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1373 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1374 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1375 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1376 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1377 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1378 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1379 /* 9-13: range for DIVSD; the upper bound is used as the cost. */
1380 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1381 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1382 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1383 /* Zen can execute 4 integer operations per cycle. FP operations
1384 take 3 cycles and it can execute 2 integer additions and 2
1385 multiplications thus reassociation may make sense up to width of 6.
1386 SPEC2k6 benchmarks suggest
1387 that 4 works better than 6 probably due to register pressure.
1388
1389 Integer vector operations are taken by FP unit and execute 3 vector
1390 plus/minus operations per cycle but only one multiply. This is adjusted
1391 in ix86_reassociation_width. */
1392 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1393 znver2_memcpy,
1394 znver2_memset,
1395 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1396 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1397 "16", /* Loop alignment. */
1398 "16", /* Jump alignment. */
1399 "0:0:8", /* Label alignment. */
1400 "16", /* Func alignment. */
1401};
1402
c234d831
UB
1403/* skylake_cost should produce code tuned for the Skylake family of CPUs. */
/* memcpy expansion strategies used with the Skylake cost table.
   The array holds two stringop_algs entries (NOTE(review): presumably one
   for 32-bit and one for 64-bit code -- confirm against the users of the
   table).  Each inner triple reads {size bound, algorithm, flag}; the
   triple with bound -1 is always the last one in a list.  */
1404static stringop_algs skylake_memcpy[2] = {
1405 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
6e559c70 1406 {libcall, {{16, loop, false}, {512, unrolled_loop, false},
c234d831
UB
1407 {-1, libcall, false}}}};
1408
/* memset expansion strategies used with the Skylake cost table.
   Same layout as the memcpy table: two stringop_algs entries, each a list
   of {size bound, algorithm, flag} triples closed by a bound of -1.
   NOTE(review): the meaning of the two entries (presumably 32-bit vs.
   64-bit) and of the boolean flag is not visible here -- confirm.  */
1409static stringop_algs skylake_memset[2] = {
1410 {libcall, {{6, loop_1_byte, true},
1411 {24, loop, true},
1412 {8192, rep_prefix_4_byte, true},
1413 {-1, libcall, false}}},
6e559c70 1414 {libcall, {{24, loop, true}, {512, unrolled_loop, false},
c234d831
UB
1415 {-1, libcall, false}}}};
1416
/* Cost table for tuning for Skylake.
   This is a positional initializer, so the order of the entries must match
   the field order of struct processor_costs (declared elsewhere).  Values
   wrapped in COSTS_N_INSNS are expressed relative to an add instruction;
   the bare integers in the move-cost section use the scale stated next to
   them (a reg-reg move counts as 2).  */
1417static const
1418struct processor_costs skylake_cost = {
1419 COSTS_N_INSNS (1), /* cost of an add instruction */
1420 COSTS_N_INSNS (1)+1, /* cost of a lea instruction (one unit above an add) */
1421 COSTS_N_INSNS (1), /* variable shift costs */
1422 COSTS_N_INSNS (1), /* constant shift costs */
1423 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1424 COSTS_N_INSNS (4), /* HI */
1425 COSTS_N_INSNS (3), /* SI */
a2ef9558
MT
1426 COSTS_N_INSNS (3), /* DI */
1427 COSTS_N_INSNS (3)}, /* other */
c234d831 1428 0, /* cost of multiply per each bit set */
02308bd3
MT
1429 /* Expanding div/mod currently doesn't consider parallelism. So the cost
1430 model is not realistic. We compensate by increasing the latencies a bit. */
1431 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
1432 COSTS_N_INSNS (11), /* HI */
1433 COSTS_N_INSNS (14), /* SI */
c234d831
UB
1434 COSTS_N_INSNS (76), /* DI */
1435 COSTS_N_INSNS (76)}, /* other */
1436 COSTS_N_INSNS (1), /* cost of movsx */
1437 COSTS_N_INSNS (0), /* cost of movzx (modelled as free) */
1438 8, /* "large" insn */
1439 17, /* MOVE_RATIO */
1440
1441 6, /* cost for loading QImode using movzbl */
1442 {4, 4, 4}, /* cost of loading integer registers
1443 in QImode, HImode and SImode.
1444 Relative to reg-reg move (2). */
001e7337 1445 {6, 6, 3}, /* cost of storing integer registers */
c234d831
UB
1446 2, /* cost of reg,reg fld/fst */
1447 {6, 6, 8}, /* cost of loading fp registers
1448 in SFmode, DFmode and XFmode */
1449 {6, 6, 10}, /* cost of storing fp registers
1450 in SFmode, DFmode and XFmode */
1451 2, /* cost of moving MMX register */
1452 {6, 6}, /* cost of loading MMX registers
1453 in SImode and DImode */
1454 {6, 6}, /* cost of storing MMX registers
1455 in SImode and DImode */
1456 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1457 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1458 in 32,64,128,256 and 512-bit */
1459 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
001e7337 1460 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
c234d831
UB
1461 in 32,64,128,256 and 512-bit */
1462 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1463 2, 2, /* SSE->integer and integer->SSE moves */
1464 20, 8, /* Gather load static, per_elt. */
1465 22, 10, /* Gather store static, per_elt. */
1466 64, /* size of l1 cache. */
1467 512, /* size of l2 cache. */
1468 64, /* size of prefetch block */
1469 6, /* number of parallel prefetches */
1470 3, /* Branch cost */
1471 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1472 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1473 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1474 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1475 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1476 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1477
1478 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1479 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1480 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1481 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1482 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1483 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1484 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1485 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1486 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1487 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1488 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1489 skylake_memcpy,
1490 skylake_memset,
1491 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1492 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1493 "16:11:8", /* Loop alignment. */
1494 "16:11:8", /* Jump alignment. */
1495 "0:0:8", /* Label alignment. */
1496 "16", /* Func alignment. */
c234d831 1497};
64766e8d
JH
1498 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1499 very small blocks it is better to use a loop. For large blocks, libcall can
1500 do non-temporal accesses and beat inline considerably. */
/* memcpy expansion strategies used with the BTVER1 cost table.
   Two stringop_algs entries, each a list of {size bound, algorithm, flag}
   triples; a bound of -1 closes each list.  The first entry ends in a rep
   prefix for every remaining size, the second ends in a libcall.
   NOTE(review): which entry applies to which target mode is not visible
   here -- confirm.  */
1501static stringop_algs btver1_memcpy[2] = {
1502 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1503 {-1, rep_prefix_4_byte, false}}},
1504 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1505 {-1, libcall, false}}}};
/* memset expansion strategies used with the BTVER1 cost table.
   Same layout as the memcpy table: two stringop_algs entries made of
   {size bound, algorithm, flag} triples, each list closed by a -1 bound
   that selects libcall.  */
1506static stringop_algs btver1_memset[2] = {
1507 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1508 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1509 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1510 {-1, libcall, false}}}};
/* Cost table for tuning for BTVER1.
   Positional initializer: the order of the entries must match the field
   order of struct processor_costs (declared elsewhere).  COSTS_N_INSNS
   values are relative to an add instruction; the bare integers in the
   move-cost section are latency*2, as the comment inside the table says.
   NOTE(review): unlike most tables in this file this one is not declared
   static -- presumably it is referenced from another translation unit;
   confirm before changing linkage.  */
1511const struct processor_costs btver1_cost = {
1512 COSTS_N_INSNS (1), /* cost of an add instruction */
1513 COSTS_N_INSNS (2), /* cost of a lea instruction */
1514 COSTS_N_INSNS (1), /* variable shift costs */
1515 COSTS_N_INSNS (1), /* constant shift costs */
1516 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1517 COSTS_N_INSNS (4), /* HI */
1518 COSTS_N_INSNS (3), /* SI */
1519 COSTS_N_INSNS (4), /* DI */
1520 COSTS_N_INSNS (5)}, /* other */
1521 0, /* cost of multiply per each bit set */
1522 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1523 COSTS_N_INSNS (35), /* HI */
1524 COSTS_N_INSNS (51), /* SI */
1525 COSTS_N_INSNS (83), /* DI */
1526 COSTS_N_INSNS (83)}, /* other */
1527 COSTS_N_INSNS (1), /* cost of movsx */
1528 COSTS_N_INSNS (1), /* cost of movzx */
1529 8, /* "large" insn */
1530 9, /* MOVE_RATIO */
df41dbaf
JH
1531
1532 /* All move costs are relative to integer->integer move times 2 and thus
1533 they are latency*2. */
1534 8, /* cost for loading QImode using movzbl */
1535 {6, 8, 6}, /* cost of loading integer registers
64766e8d
JH
1536 in QImode, HImode and SImode.
1537 Relative to reg-reg move (2). */
df41dbaf 1538 {6, 8, 6}, /* cost of storing integer registers */
64766e8d 1539 4, /* cost of reg,reg fld/fst */
df41dbaf 1540 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1541 in SFmode, DFmode and XFmode */
df41dbaf 1542 {12, 12, 38}, /* cost of storing fp registers
64766e8d 1543 in SFmode, DFmode and XFmode */
df41dbaf
JH
1544 4, /* cost of moving MMX register */
1545 {10, 10}, /* cost of loading MMX registers
64766e8d 1546 in SImode and DImode */
df41dbaf 1547 {12, 12}, /* cost of storing MMX registers
64766e8d 1548 in SImode and DImode */
df41dbaf 1549 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
b7167993 1550 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
df41dbaf 1551 in 32,64,128,256 and 512-bit */
b7167993
RB
1552 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1553 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
df41dbaf 1554 in 32,64,128,256 and 512-bit */
b7167993 1555 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
df41dbaf 1556 14, 14, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1557 10, 10, /* Gather load static, per_elt. */
1558 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1559 32, /* size of l1 cache. */
1560 512, /* size of l2 cache. */
1561 64, /* size of prefetch block */
1562 100, /* number of parallel prefetches */
1563 2, /* Branch cost */
1564 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1565 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1566 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1567 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1568 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1569 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1570
c53c148c 1571 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1572 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1573 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1574 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1575 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1576 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1577 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1578 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1579 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1580 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
64766e8d
JH
1581 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1582 btver1_memcpy,
1583 btver1_memset,
f6fd8f2b
JH
1584 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1585 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1586 "16:11:8", /* Loop alignment. */
1587 "16:8:8", /* Jump alignment. */
1588 "0:0:8", /* Label alignment. */
1589 "11", /* Func alignment. */
64766e8d
JH
1590};
1591
/* memcpy expansion strategies used with the BTVER2 cost table.
   The values are identical to the BTVER1 memcpy table.  Two stringop_algs
   entries, each a list of {size bound, algorithm, flag} triples closed by
   a bound of -1.  NOTE(review): which entry applies to which target mode
   is not visible here -- confirm.  */
1592static stringop_algs btver2_memcpy[2] = {
1593 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1594 {-1, rep_prefix_4_byte, false}}},
1595 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1596 {-1, libcall, false}}}};
/* memset expansion strategies used with the BTVER2 cost table.
   The values are identical to the BTVER1 memset table.  Each of the two
   entries is a list of {size bound, algorithm, flag} triples whose final
   -1 bound selects libcall.  */
1597static stringop_algs btver2_memset[2] = {
1598 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1599 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1600 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1601 {-1, libcall, false}}}};
/* Cost table for tuning for BTVER2.
   Positional initializer: the order of the entries must match the field
   order of struct processor_costs (declared elsewhere).  COSTS_N_INSNS
   values are relative to an add instruction; the bare integers in the
   move-cost section are latency*2, as the comment inside the table says.
   NOTE(review): like btver1_cost this table is not declared static --
   presumably it is referenced from another translation unit; confirm
   before changing linkage.  */
1602const struct processor_costs btver2_cost = {
1603 COSTS_N_INSNS (1), /* cost of an add instruction */
1604 COSTS_N_INSNS (2), /* cost of a lea instruction */
1605 COSTS_N_INSNS (1), /* variable shift costs */
1606 COSTS_N_INSNS (1), /* constant shift costs */
1607 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1608 COSTS_N_INSNS (4), /* HI */
1609 COSTS_N_INSNS (3), /* SI */
1610 COSTS_N_INSNS (4), /* DI */
1611 COSTS_N_INSNS (5)}, /* other */
1612 0, /* cost of multiply per each bit set */
1613 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1614 COSTS_N_INSNS (35), /* HI */
1615 COSTS_N_INSNS (51), /* SI */
1616 COSTS_N_INSNS (83), /* DI */
1617 COSTS_N_INSNS (83)}, /* other */
1618 COSTS_N_INSNS (1), /* cost of movsx */
1619 COSTS_N_INSNS (1), /* cost of movzx */
1620 8, /* "large" insn */
1621 9, /* MOVE_RATIO */
df41dbaf
JH
1622
1623 /* All move costs are relative to integer->integer move times 2 and thus
1624 they are latency*2. */
1625 8, /* cost for loading QImode using movzbl */
1626 {8, 8, 6}, /* cost of loading integer registers
64766e8d
JH
1627 in QImode, HImode and SImode.
1628 Relative to reg-reg move (2). */
df41dbaf 1629 {8, 8, 6}, /* cost of storing integer registers */
64766e8d 1630 4, /* cost of reg,reg fld/fst */
df41dbaf 1631 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1632 in SFmode, DFmode and XFmode */
df41dbaf 1633 {12, 12, 38}, /* cost of storing fp registers
64766e8d 1634 in SFmode, DFmode and XFmode */
df41dbaf
JH
1635 4, /* cost of moving MMX register */
1636 {10, 10}, /* cost of loading MMX registers
64766e8d 1637 in SImode and DImode */
df41dbaf 1638 {12, 12}, /* cost of storing MMX registers
64766e8d 1639 in SImode and DImode */
df41dbaf 1640 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
b7167993 1641 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
df41dbaf 1642 in 32,64,128,256 and 512-bit */
b7167993
RB
1643 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
1644 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
df41dbaf 1645 in 32,64,128,256 and 512-bit */
b7167993 1646 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
df41dbaf 1647 14, 14, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1648 10, 10, /* Gather load static, per_elt. */
1649 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1650 32, /* size of l1 cache. */
1651 2048, /* size of l2 cache. */
1652 64, /* size of prefetch block */
1653 100, /* number of parallel prefetches */
1654 2, /* Branch cost */
1655 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1656 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1657 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1658 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1659 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1660 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1661
c53c148c 1662 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1663 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1664 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1665 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1666 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1667 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1668 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1669 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1670 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1671 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
64766e8d
JH
1672 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1673 btver2_memcpy,
1674 btver2_memset,
f6fd8f2b
JH
1675 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1676 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1677 "16:11:8", /* Loop alignment. */
1678 "16:8:8", /* Jump alignment. */
1679 "0:0:8", /* Label alignment. */
1680 "11", /* Func alignment. */
64766e8d
JH
1681};
1682
/* memcpy expansion strategies used with the Pentium 4 cost table.
   The first entry is a list of {size bound, algorithm, flag} triples
   closed by a bound of -1.  The second entry is DUMMY_STRINGOP_ALGS, the
   libcall-only placeholder defined at the top of this file.
   NOTE(review): presumably the placeholder covers a target mode this CPU
   is never tuned for -- confirm.  */
1683static stringop_algs pentium4_memcpy[2] = {
1684 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1685 DUMMY_STRINGOP_ALGS};
/* memset expansion strategies used with the Pentium 4 cost table.
   The first entry is a list of {size bound, algorithm, flag} triples whose
   final -1 bound selects libcall.  The second entry is the libcall-only
   DUMMY_STRINGOP_ALGS placeholder.  */
1686static stringop_algs pentium4_memset[2] = {
1687 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1688 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1689 DUMMY_STRINGOP_ALGS};
1690
/* Cost table for tuning for Pentium 4.
   Positional initializer: the order of the entries must match the field
   order of struct processor_costs (declared elsewhere).  COSTS_N_INSNS
   values are relative to an add instruction; the bare integers in the
   move-cost section are latency*2, as the comment inside the table says.
   All four alignment entries at the end are NULL.  */
1691static const
1692struct processor_costs pentium4_cost = {
1693 COSTS_N_INSNS (1), /* cost of an add instruction */
1694 COSTS_N_INSNS (3), /* cost of a lea instruction */
1695 COSTS_N_INSNS (4), /* variable shift costs */
1696 COSTS_N_INSNS (4), /* constant shift costs */
1697 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1698 COSTS_N_INSNS (15), /* HI */
1699 COSTS_N_INSNS (15), /* SI */
1700 COSTS_N_INSNS (15), /* DI */
1701 COSTS_N_INSNS (15)}, /* other */
1702 0, /* cost of multiply per each bit set */
1703 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1704 COSTS_N_INSNS (56), /* HI */
1705 COSTS_N_INSNS (56), /* SI */
1706 COSTS_N_INSNS (56), /* DI */
1707 COSTS_N_INSNS (56)}, /* other */
1708 COSTS_N_INSNS (1), /* cost of movsx */
1709 COSTS_N_INSNS (1), /* cost of movzx */
1710 16, /* "large" insn */
1711 6, /* MOVE_RATIO */
df41dbaf
JH
1712
1713 /* All move costs are relative to integer->integer move times 2 and thus
1714 they are latency*2. */
1715 5, /* cost for loading QImode using movzbl */
64766e8d
JH
1716 {4, 5, 4}, /* cost of loading integer registers
1717 in QImode, HImode and SImode.
1718 Relative to reg-reg move (2). */
1719 {2, 3, 2}, /* cost of storing integer registers */
df41dbaf
JH
1720 12, /* cost of reg,reg fld/fst */
1721 {14, 14, 14}, /* cost of loading fp registers
64766e8d 1722 in SFmode, DFmode and XFmode */
df41dbaf 1723 {14, 14, 14}, /* cost of storing fp registers
64766e8d 1724 in SFmode, DFmode and XFmode */
df41dbaf
JH
1725 12, /* cost of moving MMX register */
1726 {16, 16}, /* cost of loading MMX registers
64766e8d 1727 in SImode and DImode */
df41dbaf 1728 {16, 16}, /* cost of storing MMX registers
64766e8d 1729 in SImode and DImode */
df41dbaf
JH
1730 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1731 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1732 in 32,64,128,256 and 512-bit */
1733 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1734 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1735 in 32,64,128,256 and 512-bit */
1736 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1737 20, 12, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1738 16, 16, /* Gather load static, per_elt. */
1739 16, 16, /* Gather store static, per_elt. */
64766e8d
JH
1740 8, /* size of l1 cache. */
1741 256, /* size of l2 cache. */
1742 64, /* size of prefetch block */
1743 6, /* number of parallel prefetches */
1744 2, /* Branch cost */
1745 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1746 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1747 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1748 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1749 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1750 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
6065f444 1751
c53c148c 1752 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1753 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1754 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1755 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1756 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1757 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1758 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1759 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1760 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1761 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
64766e8d
JH
1762 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1763 pentium4_memcpy,
1764 pentium4_memset,
f6fd8f2b
JH
1765 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1766 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1767 NULL, /* Loop alignment (none given). */
1768 NULL, /* Jump alignment (none given). */
1769 NULL, /* Label alignment (none given). */
1770 NULL, /* Func alignment (none given). */
64766e8d
JH
1771};
1772
/* memcpy expansion strategies used with the Nocona cost table.
   Two stringop_algs entries, each a list of {size bound, algorithm, flag}
   triples closed by a bound of -1.  NOTE(review): which entry applies to
   which target mode (presumably 32-bit vs. 64-bit) is not visible here --
   confirm.  */
1773static stringop_algs nocona_memcpy[2] = {
1774 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1775 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1776 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1777
/* memset expansion strategies used with the Nocona cost table.
   Two stringop_algs entries, each a list of {size bound, algorithm, flag}
   triples; in both lists the final -1 bound selects libcall.  */
1778static stringop_algs nocona_memset[2] = {
1779 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1780 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1781 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1782 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1783
/* Cost table for tuning for Nocona.
   Positional initializer: the order of the entries must match the field
   order of struct processor_costs (declared elsewhere).  COSTS_N_INSNS
   values are relative to an add instruction; the bare integers in the
   move-cost section are latency*2, as the comment inside the table says.
   All four alignment entries at the end are NULL.  */
1784static const
1785struct processor_costs nocona_cost = {
1786 COSTS_N_INSNS (1), /* cost of an add instruction */
1787 COSTS_N_INSNS (1), /* cost of a lea instruction */
1788 COSTS_N_INSNS (1), /* variable shift costs */
1789 COSTS_N_INSNS (1), /* constant shift costs */
1790 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1791 COSTS_N_INSNS (10), /* HI */
1792 COSTS_N_INSNS (10), /* SI */
1793 COSTS_N_INSNS (10), /* DI */
1794 COSTS_N_INSNS (10)}, /* other */
1795 0, /* cost of multiply per each bit set */
1796 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1797 COSTS_N_INSNS (66), /* HI */
1798 COSTS_N_INSNS (66), /* SI */
1799 COSTS_N_INSNS (66), /* DI */
1800 COSTS_N_INSNS (66)}, /* other */
1801 COSTS_N_INSNS (1), /* cost of movsx */
1802 COSTS_N_INSNS (1), /* cost of movzx */
1803 16, /* "large" insn */
1804 17, /* MOVE_RATIO */
df41dbaf
JH
1805
1806 /* All move costs are relative to integer->integer move times 2 and thus
1807 they are latency*2. */
64766e8d
JH
1808 4, /* cost for loading QImode using movzbl */
1809 {4, 4, 4}, /* cost of loading integer registers
1810 in QImode, HImode and SImode.
1811 Relative to reg-reg move (2). */
1812 {4, 4, 4}, /* cost of storing integer registers */
df41dbaf
JH
1813 12, /* cost of reg,reg fld/fst */
1814 {14, 14, 14}, /* cost of loading fp registers
64766e8d 1815 in SFmode, DFmode and XFmode */
df41dbaf 1816 {14, 14, 14}, /* cost of storing fp registers
64766e8d 1817 in SFmode, DFmode and XFmode */
df41dbaf 1818 14, /* cost of moving MMX register */
64766e8d
JH
1819 {12, 12}, /* cost of loading MMX registers
1820 in SImode and DImode */
1821 {12, 12}, /* cost of storing MMX registers
1822 in SImode and DImode */
df41dbaf
JH
1823 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
1824 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
1825 in 32,64,128,256 and 512-bit */
1826 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
1827 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
1828 in 32,64,128,256 and 512-bit */
1829 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
1830 20, 12, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1831 12, 12, /* Gather load static, per_elt. */
1832 12, 12, /* Gather store static, per_elt. */
64766e8d
JH
1833 8, /* size of l1 cache. */
1834 1024, /* size of l2 cache. */
1835 64, /* size of prefetch block */
1836 8, /* number of parallel prefetches */
1837 1, /* Branch cost */
1838 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1839 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1840 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1841 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1842 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1843 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
6065f444 1844
c53c148c 1845 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1846 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1847 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
1848 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
c53c148c
JH
1849 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
1850 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
6065f444
JH
1851 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
1852 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
1853 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
1854 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
64766e8d
JH
1855 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1856 nocona_memcpy,
1857 nocona_memset,
f6fd8f2b
JH
1858 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1859 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1860 NULL, /* Loop alignment (none given). */
1861 NULL, /* Jump alignment (none given). */
1862 NULL, /* Label alignment (none given). */
1863 NULL, /* Func alignment (none given). */
64766e8d
JH
1864};
1865
/* memcpy expansion strategies used with the Atom cost table.
   Two stringop_algs entries, each a list of {size bound, algorithm, flag}
   triples closed by a bound of -1.  NOTE(review): which entry applies to
   which target mode (presumably 32-bit vs. 64-bit) is not visible here --
   confirm.  */
1866static stringop_algs atom_memcpy[2] = {
1867 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1868 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1869 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategies used with the Atom cost table.
   Two stringop_algs entries, each a list of {size bound, algorithm, flag}
   triples; in both lists the final -1 bound selects libcall.  */
1870static stringop_algs atom_memset[2] = {
1871 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1872 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1873 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1874 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for tuning for Atom.
   Positional initializer: the order of the entries must match the field
   order of struct processor_costs (declared elsewhere).  COSTS_N_INSNS
   values are relative to an add instruction; the bare integers in the
   move-cost section are latency*2, as the comment inside the table says.  */
1875static const
1876struct processor_costs atom_cost = {
1877 COSTS_N_INSNS (1), /* cost of an add instruction */
1878 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above an add) */
1879 COSTS_N_INSNS (1), /* variable shift costs */
1880 COSTS_N_INSNS (1), /* constant shift costs */
1881 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1882 COSTS_N_INSNS (4), /* HI */
1883 COSTS_N_INSNS (3), /* SI */
1884 COSTS_N_INSNS (4), /* DI */
1885 COSTS_N_INSNS (2)}, /* other */
1886 0, /* cost of multiply per each bit set */
1887 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1888 COSTS_N_INSNS (26), /* HI */
1889 COSTS_N_INSNS (42), /* SI */
1890 COSTS_N_INSNS (74), /* DI */
1891 COSTS_N_INSNS (74)}, /* other */
1892 COSTS_N_INSNS (1), /* cost of movsx */
1893 COSTS_N_INSNS (1), /* cost of movzx */
1894 8, /* "large" insn */
1895 17, /* MOVE_RATIO */
df41dbaf
JH
1896
1897 /* All move costs are relative to integer->integer move times 2 and thus
1898 they are latency*2. */
1899 6, /* cost for loading QImode using movzbl */
1900 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
1901 in QImode, HImode and SImode.
1902 Relative to reg-reg move (2). */
df41dbaf 1903 {6, 6, 6}, /* cost of storing integer registers */
64766e8d 1904 4, /* cost of reg,reg fld/fst */
df41dbaf 1905 {6, 6, 18}, /* cost of loading fp registers
64766e8d 1906 in SFmode, DFmode and XFmode */
df41dbaf 1907 {14, 14, 24}, /* cost of storing fp registers
64766e8d
JH
1908 in SFmode, DFmode and XFmode */
1909 2, /* cost of moving MMX register */
1910 {8, 8}, /* cost of loading MMX registers
1911 in SImode and DImode */
df41dbaf 1912 {10, 10}, /* cost of storing MMX registers
64766e8d 1913 in SImode and DImode */
df41dbaf
JH
1914 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1915 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
1916 in 32,64,128,256 and 512-bit */
1917 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
1918 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1919 in 32,64,128,256 and 512-bit */
1920 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
1921 8, 6, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1922 8, 8, /* Gather load static, per_elt. */
1923 8, 8, /* Gather store static, per_elt. */
64766e8d
JH
1924 32, /* size of l1 cache. */
1925 256, /* size of l2 cache. */
1926 64, /* size of prefetch block */
1927 6, /* number of parallel prefetches */
1928 3, /* Branch cost */
1929 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1930 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1931 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1932 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1933 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1934 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 1935
c53c148c 1936 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1937 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1938 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1939 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
1940 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1941 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1942 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
1943 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
1944 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
1945 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
64766e8d
JH
1946 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1947 atom_memcpy,
1948 atom_memset,
f6fd8f2b
JH
1949 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1950 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
1951 "16", /* Loop alignment. */
1952 "16:8:8", /* Jump alignment. */
1953 "0:0:8", /* Label alignment. */
1954 "16", /* Func alignment. */
64766e8d
JH
1955};
1956
/* memcpy expansion strategies used with the SLM cost table.
   The values are identical to the Atom memcpy table.  Two stringop_algs
   entries, each a list of {size bound, algorithm, flag} triples closed by
   a bound of -1.  NOTE(review): which entry applies to which target mode
   is not visible here -- confirm.  */
1957static stringop_algs slm_memcpy[2] = {
1958 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1959 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1960 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategies used with the SLM cost table.
   The values are identical to the Atom memset table.  Each of the two
   entries is a list of {size bound, algorithm, flag} triples whose final
   -1 bound selects libcall.  */
1961static stringop_algs slm_memset[2] = {
1962 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1963 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1964 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1965 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for tuning for SLM.
   Positional initializer: the order of the entries must match the field
   order of struct processor_costs (declared elsewhere).  COSTS_N_INSNS
   values are relative to an add instruction; the bare integers in the
   move-cost section are latency*2, as the comment inside the table says.  */
1966static const
1967struct processor_costs slm_cost = {
1968 COSTS_N_INSNS (1), /* cost of an add instruction */
1969 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above an add) */
1970 COSTS_N_INSNS (1), /* variable shift costs */
1971 COSTS_N_INSNS (1), /* constant shift costs */
1972 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1973 COSTS_N_INSNS (3), /* HI */
1974 COSTS_N_INSNS (3), /* SI */
1975 COSTS_N_INSNS (4), /* DI */
1976 COSTS_N_INSNS (2)}, /* other */
1977 0, /* cost of multiply per each bit set */
1978 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1979 COSTS_N_INSNS (26), /* HI */
1980 COSTS_N_INSNS (42), /* SI */
1981 COSTS_N_INSNS (74), /* DI */
1982 COSTS_N_INSNS (74)}, /* other */
1983 COSTS_N_INSNS (1), /* cost of movsx */
1984 COSTS_N_INSNS (1), /* cost of movzx */
1985 8, /* "large" insn */
1986 17, /* MOVE_RATIO */
df41dbaf
JH
1987
1988 /* All move costs are relative to integer->integer move times 2 and thus
1989 they are latency*2. */
1990 8, /* cost for loading QImode using movzbl */
1991 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
1992 in QImode, HImode and SImode.
1993 Relative to reg-reg move (2). */
df41dbaf
JH
1994 {6, 6, 6}, /* cost of storing integer registers */
1995 2, /* cost of reg,reg fld/fst */
1996 {8, 8, 18}, /* cost of loading fp registers
64766e8d 1997 in SFmode, DFmode and XFmode */
df41dbaf 1998 {6, 6, 18}, /* cost of storing fp registers
64766e8d
JH
1999 in SFmode, DFmode and XFmode */
2000 2, /* cost of moving MMX register */
2001 {8, 8}, /* cost of loading MMX registers
2002 in SImode and DImode */
df41dbaf 2003 {6, 6}, /* cost of storing MMX registers
64766e8d 2004 in SImode and DImode */
df41dbaf
JH
2005 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2006 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2007 in 32,64,128,256 and 512-bit */
2008 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2009 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2010 in 32,64,128,256 and 512-bit */
2011 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2012 8, 6, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
2013 8, 8, /* Gather load static, per_elt. */
2014 8, 8, /* Gather store static, per_elt. */
64766e8d
JH
2015 32, /* size of l1 cache. */
2016 256, /* size of l2 cache. */
2017 64, /* size of prefetch block */
2018 6, /* number of parallel prefetches */
2019 3, /* Branch cost */
2020 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2021 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2022 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2023 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2024 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2025 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2026
c53c148c 2027 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2028 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2029 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2030 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
2031 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2032 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
2033 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2034 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2035 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2036 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
64766e8d
JH
2037 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2038 slm_memcpy,
2039 slm_memset,
f6fd8f2b
JH
2040 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2041 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2042 "16", /* Loop alignment. */
2043 "16:8:8", /* Jump alignment. */
2044 "0:0:8", /* Label alignment. */
2045 "16", /* Func alignment. */
64766e8d
JH
2046};
2047
/* memcpy expansion strategies used with the generic Intel cost table.
   The values are identical to the Atom and SLM memcpy tables.  Two
   stringop_algs entries, each a list of {size bound, algorithm, flag}
   triples closed by a bound of -1.  NOTE(review): which entry applies to
   which target mode is not visible here -- confirm.  */
2048static stringop_algs intel_memcpy[2] = {
2049 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2050 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2051 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategies used with the generic Intel cost table.
   The values are identical to the Atom and SLM memset tables.  Each of
   the two entries is a list of {size bound, algorithm, flag} triples whose
   final -1 bound selects libcall.  */
2052static stringop_algs intel_memset[2] = {
2053 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2054 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2055 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2056 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table named intel_cost.  The initializer is positional, so every
   entry must stay in the member order of struct processor_costs (declared
   outside this file); the trailing comment on each entry names the member it
   fills.  Per the note at the top of this file, COSTS_N_INSNS (N) is assumed
   to be (N)*4, so "COSTS_N_INSNS (1) + 1" is a cost slightly above that of a
   single instruction.  */
2057static const
2058struct processor_costs intel_cost = {
2059 COSTS_N_INSNS (1), /* cost of an add instruction */
2060 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2061 COSTS_N_INSNS (1), /* variable shift costs */
2062 COSTS_N_INSNS (1), /* constant shift costs */
2063 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2064 COSTS_N_INSNS (3), /* HI */
2065 COSTS_N_INSNS (3), /* SI */
2066 COSTS_N_INSNS (4), /* DI */
2067 COSTS_N_INSNS (2)}, /* other */
2068 0, /* cost of multiply per each bit set */
2069 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2070 COSTS_N_INSNS (26), /* HI */
2071 COSTS_N_INSNS (42), /* SI */
2072 COSTS_N_INSNS (74), /* DI */
2073 COSTS_N_INSNS (74)}, /* other */
2074 COSTS_N_INSNS (1), /* cost of movsx */
2075 COSTS_N_INSNS (1), /* cost of movzx */
2076 8, /* "large" insn */
2077 17, /* MOVE_RATIO */
df41dbaf
JH
2078
2079 /* All move costs are relative to integer->integer move times 2 and thus
2080 they are latency*2. */
af863030 2081 6, /* cost for loading QImode using movzbl */
64766e8d
JH
2082 {4, 4, 4}, /* cost of loading integer registers
2083 in QImode, HImode and SImode.
2084 Relative to reg-reg move (2). */
af863030
JH
2085 {6, 6, 6}, /* cost of storing integer registers */
2086 2, /* cost of reg,reg fld/fst */
2087 {6, 6, 8}, /* cost of loading fp registers
64766e8d 2088 in SFmode, DFmode and XFmode */
af863030 2089 {6, 6, 10}, /* cost of storing fp registers
64766e8d
JH
2090 in SFmode, DFmode and XFmode */
2091 2, /* cost of moving MMX register */
af863030 2092 {6, 6}, /* cost of loading MMX registers
64766e8d 2093 in SImode and DImode */
af863030 2094 {6, 6}, /* cost of storing MMX registers
64766e8d 2095 in SImode and DImode */
df41dbaf
JH
2096 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2097 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2098 in 32,64,128,256 and 512-bit */
2099 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2100 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2101 in 32,64,128,256 and 512-bit */
2102 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2103 4, 4, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
2104 6, 6, /* Gather load static, per_elt. */
2105 6, 6, /* Gather store static, per_elt. */
64766e8d
JH
2106 32, /* size of l1 cache. */
2107 256, /* size of l2 cache. */
2108 64, /* size of prefetch block */
2109 6, /* number of parallel prefetches */
2110 3, /* Branch cost */
2111 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2112 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2113 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2114 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2115 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2116 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2117
c53c148c 2118 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
6065f444
JH
2119 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2120 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2121 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
c53c148c
JH
2122 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2123 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
2124 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2125 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2126 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2127 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
64766e8d
JH
2128 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2129 intel_memcpy,
2130 intel_memset,
f6fd8f2b
JH
2131 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2132 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2133 "16", /* Loop alignment. */
2134 "16:8:8", /* Jump alignment. */
2135 "0:0:8", /* Label alignment. */
2136 "16", /* Func alignment. */
64766e8d
JH
2137};
2138
2139/* Generic should produce code tuned for Core-i7 (and newer chips)
2140 and btver1 (and newer chips). */
2141
/* memcpy expansion strategies referenced by the generic_cost table below.
   Both rows use a loop up to bound 32, a rep-prefixed algorithm up to bound
   8192 (rep_prefix_4_byte in the first row, rep_prefix_8_byte in the second)
   and libcall for the final -1 entry.
   NOTE(review): the rows presumably correspond to 32-bit and 64-bit code;
   confirm against the stringop_algs declaration outside this file.  */
2142static stringop_algs generic_memcpy[2] = {
2143 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2144 {-1, libcall, false}}},
2145 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2146 {-1, libcall, false}}}};
/* memset expansion strategies referenced by the generic_cost table below.
   The entries are identical to those of generic_memcpy: loop up to bound 32,
   a rep-prefixed algorithm up to bound 8192, then libcall for the -1 entry.
   NOTE(review): field semantics are defined by the stringop_algs declaration
   outside this file -- confirm there.  */
2147static stringop_algs generic_memset[2] = {
2148 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2149 {-1, libcall, false}}},
2150 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2151 {-1, libcall, false}}}};
/* Cost table named generic_cost.  The initializer is positional, so every
   entry must stay in the member order of struct processor_costs (declared
   outside this file); the trailing comment on each entry names the member it
   fills.  Per the note at the top of this file, COSTS_N_INSNS (N) is assumed
   to be (N)*4.  */
2152static const
2153struct processor_costs generic_cost = {
2154 COSTS_N_INSNS (1), /* cost of an add instruction */
ef9eec0b 2155 /* Setting cost to 2 makes our current implementation of synth_mult result in
64766e8d
JH
2156 use of unnecessary temporary registers causing regression on several
2157 SPECfp benchmarks. */
2158 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2159 COSTS_N_INSNS (1), /* variable shift costs */
2160 COSTS_N_INSNS (1), /* constant shift costs */
2161 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2162 COSTS_N_INSNS (4), /* HI */
2163 COSTS_N_INSNS (3), /* SI */
2164 COSTS_N_INSNS (4), /* DI */
7c080ade 2165 COSTS_N_INSNS (4)}, /* other */
64766e8d 2166 0, /* cost of multiply per each bit set */
7c080ade
JH
2167 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2168 COSTS_N_INSNS (22), /* HI */
2169 COSTS_N_INSNS (30), /* SI */
64766e8d
JH
2170 COSTS_N_INSNS (74), /* DI */
2171 COSTS_N_INSNS (74)}, /* other */
2172 COSTS_N_INSNS (1), /* cost of movsx */
2173 COSTS_N_INSNS (1), /* cost of movzx */
2174 8, /* "large" insn */
2175 17, /* MOVE_RATIO */
df41dbaf
JH
2176
2177 /* All move costs are relative to integer->integer move times 2 and thus
2178 they are latency*2. */
d555138e
JH
2179 6, /* cost for loading QImode using movzbl */
2180 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
2181 in QImode, HImode and SImode.
2182 Relative to reg-reg move (2). */
af863030 2183 {6, 6, 6}, /* cost of storing integer registers */
64766e8d 2184 4, /* cost of reg,reg fld/fst */
af863030 2185 {6, 6, 12}, /* cost of loading fp registers
64766e8d 2186 in SFmode, DFmode and XFmode */
af863030 2187 {6, 6, 12}, /* cost of storing fp registers
64766e8d
JH
2188 in SFmode, DFmode and XFmode */
2189 2, /* cost of moving MMX register */
af863030 2190 {6, 6}, /* cost of loading MMX registers
64766e8d 2191 in SImode and DImode */
af863030 2192 {6, 6}, /* cost of storing MMX registers
64766e8d 2193 in SImode and DImode */
df41dbaf
JH
2194 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2195 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2196 in 32,64,128,256 and 512-bit */
7c080ade 2197 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
df41dbaf
JH
2198 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2199 in 32,64,128,256 and 512-bit */
7c080ade
JH
2200 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
2201 6, 6, /* SSE->integer and integer->SSE moves */
2202 18, 6, /* Gather load static, per_elt. */
2203 18, 6, /* Gather store static, per_elt. */
64766e8d
JH
2204 32, /* size of l1 cache. */
2205 512, /* size of l2 cache. */
2206 64, /* size of prefetch block */
2207 6, /* number of parallel prefetches */
2208 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2209 value is increased to the perhaps more appropriate value of 5. */
2210 3, /* Branch cost */
ef9eec0b 2211 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
7c080ade 2212 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
e8e3054e 2213 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
ef9eec0b
JH
2214 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2215 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
e8e3054e 2216 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
6065f444 2217
ef9eec0b
JH
2218 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2219 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2220 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2221 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2222 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2223 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
e8e3054e
JH
2224 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2225 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2226 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2227 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
7c080ade 2228 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
64766e8d
JH
2229 generic_memcpy,
2230 generic_memset,
e8e3054e
JH
2231 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2232 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2233 "16:11:8", /* Loop alignment. */
2234 "16:11:8", /* Jump alignment. */
2235 "0:0:8", /* Label alignment. */
2236 "16", /* Func alignment. */
64766e8d
JH
2237};
2238
2239/* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion strategies referenced by the core_cost table below.
   Unlike the other tables in this part of the file, the bounded entries here
   set the boolean flag to true; only the final -1 libcall entries leave it
   false.
   NOTE(review): the meaning of that flag and of the two rows is defined by
   the stringop_algs declaration outside this file -- confirm there.  */
2240static stringop_algs core_memcpy[2] = {
2241 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2242 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2243 {-1, libcall, false}}}};
/* memset expansion strategies referenced by the core_cost table below.
   The first row steps through loop_1_byte, loop and rep_prefix_4_byte as the
   bound grows; the second row uses loop and rep_prefix_8_byte.  Both rows end
   with a -1 libcall entry.
   NOTE(review): field semantics are defined by the stringop_algs declaration
   outside this file -- confirm there.  */
2244static stringop_algs core_memset[2] = {
2245 {libcall, {{6, loop_1_byte, true},
2246 {24, loop, true},
2247 {8192, rep_prefix_4_byte, true},
2248 {-1, libcall, false}}},
2249 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2250 {-1, libcall, false}}}};
2251
/* Cost table named core_cost.  The initializer is positional, so every
   entry must stay in the member order of struct processor_costs (declared
   outside this file); the trailing comment on each entry names the member it
   fills.  Per the note at the top of this file, COSTS_N_INSNS (N) is assumed
   to be (N)*4.  */
2252static const
2253struct processor_costs core_cost = {
2254 COSTS_N_INSNS (1), /* cost of an add instruction */
2255 /* On all chips taken into consideration lea is 2 cycles and more. With
2256 this cost however our current implementation of synth_mult results in
2257 use of unnecessary temporary registers causing regression on several
2258 SPECfp benchmarks. */
2259 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2260 COSTS_N_INSNS (1), /* variable shift costs */
2261 COSTS_N_INSNS (1), /* constant shift costs */
2262 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2263 COSTS_N_INSNS (4), /* HI */
2264 COSTS_N_INSNS (3), /* SI */
a2ef9558
MT
2265 /* Here we tune for Sandybridge or newer. */
2266 COSTS_N_INSNS (3), /* DI */
2267 COSTS_N_INSNS (3)}, /* other */
64766e8d 2268 0, /* cost of multiply per each bit set */
02308bd3
MT
2269 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2270 model is not realistic. We compensate by increasing the latencies a bit. */
2271 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2272 COSTS_N_INSNS (11), /* HI */
2273 COSTS_N_INSNS (14), /* SI */
ffa3ce53
JH
2274 COSTS_N_INSNS (81), /* DI */
2275 COSTS_N_INSNS (81)}, /* other */
64766e8d
JH
2276 COSTS_N_INSNS (1), /* cost of movsx */
2277 COSTS_N_INSNS (1), /* cost of movzx */
2278 8, /* "large" insn */
2279 17, /* MOVE_RATIO */
df41dbaf
JH
2280
2281 /* All move costs are relative to integer->integer move times 2 and thus
2282 they are latency*2. */
ffa3ce53 2283 6, /* cost for loading QImode using movzbl */
64766e8d
JH
2284 {4, 4, 4}, /* cost of loading integer registers
2285 in QImode, HImode and SImode.
2286 Relative to reg-reg move (2). */
ffa3ce53
JH
2287 {6, 6, 6}, /* cost of storing integer registers */
2288 2, /* cost of reg,reg fld/fst */
2289 {6, 6, 8}, /* cost of loading fp registers
64766e8d 2290 in SFmode, DFmode and XFmode */
af863030 2291 {6, 6, 10}, /* cost of storing fp registers
64766e8d
JH
2292 in SFmode, DFmode and XFmode */
2293 2, /* cost of moving MMX register */
ffa3ce53 2294 {6, 6}, /* cost of loading MMX registers
64766e8d 2295 in SImode and DImode */
ffa3ce53 2296 {6, 6}, /* cost of storing MMX registers
64766e8d 2297 in SImode and DImode */
df41dbaf
JH
2298 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2299 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2300 in 32,64,128,256 and 512-bit */
2301 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2302 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2303 in 32,64,128,256 and 512-bit */
2304 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2305 2, 2, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
2306 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2307 rec. throughput 6.
2308 So 5 uops statically and one uop per load.
 NOTE(review): the same mnemonic is named twice above with different
 figures; one of the two is presumably a different gather variant --
 confirm against the instruction tables. */
2309 10, 6, /* Gather load static, per_elt. */
2310 10, 6, /* Gather store static, per_elt. */
64766e8d
JH
2311 64, /* size of l1 cache. */
2312 512, /* size of l2 cache. */
2313 64, /* size of prefetch block */
2314 6, /* number of parallel prefetches */
2315 /* FIXME perhaps more appropriate value is 5. */
2316 3, /* Branch cost */
ef9eec0b
JH
2317 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2318 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
ffa3ce53 2319 /* 10-24 (NOTE(review): presumably the FDIV latency range; the entry below uses the upper end, 24 -- confirm) */
ef9eec0b
JH
2320 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2321 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2322 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
ffa3ce53 2323 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
6065f444 2324
c53c148c 2325 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2326 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2327 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2328 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
2329 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2330 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
2331 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2332 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2333 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2334 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
64766e8d
JH
2335 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2336 core_memcpy,
2337 core_memset,
f6fd8f2b
JH
2338 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2339 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
7dc58b50
ML
2340 "16:11:8", /* Loop alignment. */
2341 "16:11:8", /* Jump alignment. */
2342 "0:0:8", /* Label alignment. */
2343 "16", /* Func alignment. */
64766e8d
JH
2344};
2345