]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/i386/x86-tune-costs.h
re PR fortran/85954 (ICE in make_ssa_name_fn, at tree-ssanames.c:266)
[thirdparty/gcc.git] / gcc / config / i386 / x86-tune-costs.h
CommitLineData
/* Costs of operations of individual x86 CPUs.
   Copyright (C) 1988-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   COSTS_N_BYTES (N) therefore expands to (N) * 2; the argument is
   parenthesized so that expressions such as COSTS_N_BYTES (a + b) scale
   as a whole.  */
#define COSTS_N_BYTES(N) ((N) * 2)

/* Placeholder stringop_algs initializer that names libcall both as its
   leading algorithm and in its single table entry.  It fills the second
   slot of the per-CPU memcpy/memset tables below.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
29
30static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
33static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
36
37const
38struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39 COSTS_N_BYTES (2), /* cost of an add instruction */
40 COSTS_N_BYTES (3), /* cost of a lea instruction */
41 COSTS_N_BYTES (2), /* variable shift costs */
42 COSTS_N_BYTES (3), /* constant shift costs */
43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
44 COSTS_N_BYTES (3), /* HI */
45 COSTS_N_BYTES (3), /* SI */
46 COSTS_N_BYTES (3), /* DI */
47 COSTS_N_BYTES (5)}, /* other */
48 0, /* cost of multiply per each bit set */
49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
50 COSTS_N_BYTES (3), /* HI */
51 COSTS_N_BYTES (3), /* SI */
52 COSTS_N_BYTES (3), /* DI */
53 COSTS_N_BYTES (5)}, /* other */
54 COSTS_N_BYTES (3), /* cost of movsx */
55 COSTS_N_BYTES (3), /* cost of movzx */
56 0, /* "large" insn */
57 2, /* MOVE_RATIO */
df41dbaf
JH
58
59 /* All move costs are relative to integer->integer move times 2. */
64766e8d
JH
60 2, /* cost for loading QImode using movzbl */
61 {2, 2, 2}, /* cost of loading integer registers
62 in QImode, HImode and SImode.
63 Relative to reg-reg move (2). */
64 {2, 2, 2}, /* cost of storing integer registers */
65 2, /* cost of reg,reg fld/fst */
66 {2, 2, 2}, /* cost of loading fp registers
67 in SFmode, DFmode and XFmode */
68 {2, 2, 2}, /* cost of storing fp registers
69 in SFmode, DFmode and XFmode */
70 3, /* cost of moving MMX register */
71 {3, 3}, /* cost of loading MMX registers
72 in SImode and DImode */
73 {3, 3}, /* cost of storing MMX registers
74 in SImode and DImode */
df41dbaf
JH
75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
77 in 32,64,128,256 and 512-bit */
78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load
79 in 128bit, 256bit and 512bit */
80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
81 in 32,64,128,256 and 512-bit */
82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
83 in 128bit, 256bit and 512bit */
84 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
85 5, 0, /* Gather load static, per_elt. */
86 5, 0, /* Gather store static, per_elt. */
64766e8d
JH
87 0, /* size of l1 cache */
88 0, /* size of l2 cache */
89 0, /* size of prefetch block */
90 0, /* number of parallel prefetches */
91 2, /* Branch cost */
92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
95 COSTS_N_BYTES (2), /* cost of FABS instruction. */
96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
6065f444 98
c53c148c 99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
6065f444
JH
100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
c53c148c
JH
103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
6065f444
JH
105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
64766e8d
JH
109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
110 ix86_size_memcpy,
111 ix86_size_memset,
f6fd8f2b
JH
112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
114};
115
116/* Processor costs (relative to an add) */
117static stringop_algs i386_memcpy[2] = {
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
119 DUMMY_STRINGOP_ALGS};
120static stringop_algs i386_memset[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 DUMMY_STRINGOP_ALGS};
123
124static const
125struct processor_costs i386_cost = { /* 386 specific costs */
126 COSTS_N_INSNS (1), /* cost of an add instruction */
127 COSTS_N_INSNS (1), /* cost of a lea instruction */
128 COSTS_N_INSNS (3), /* variable shift costs */
129 COSTS_N_INSNS (2), /* constant shift costs */
130 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
131 COSTS_N_INSNS (6), /* HI */
132 COSTS_N_INSNS (6), /* SI */
133 COSTS_N_INSNS (6), /* DI */
134 COSTS_N_INSNS (6)}, /* other */
135 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
136 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
137 COSTS_N_INSNS (23), /* HI */
138 COSTS_N_INSNS (23), /* SI */
139 COSTS_N_INSNS (23), /* DI */
140 COSTS_N_INSNS (23)}, /* other */
141 COSTS_N_INSNS (3), /* cost of movsx */
142 COSTS_N_INSNS (2), /* cost of movzx */
143 15, /* "large" insn */
144 3, /* MOVE_RATIO */
df41dbaf
JH
145
146 /* All move costs are relative to integer->integer move times 2 and thus
147 they are latency*2. */
64766e8d
JH
148 4, /* cost for loading QImode using movzbl */
149 {2, 4, 2}, /* cost of loading integer registers
150 in QImode, HImode and SImode.
151 Relative to reg-reg move (2). */
152 {2, 4, 2}, /* cost of storing integer registers */
153 2, /* cost of reg,reg fld/fst */
154 {8, 8, 8}, /* cost of loading fp registers
155 in SFmode, DFmode and XFmode */
156 {8, 8, 8}, /* cost of storing fp registers
157 in SFmode, DFmode and XFmode */
158 2, /* cost of moving MMX register */
159 {4, 8}, /* cost of loading MMX registers
160 in SImode and DImode */
161 {4, 8}, /* cost of storing MMX registers
162 in SImode and DImode */
df41dbaf
JH
163 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
164 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
165 in 32,64,128,256 and 512-bit */
166 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
167 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
168 in 32,64,128,256 and 512-bit */
169 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
170 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
171 4, 4, /* Gather load static, per_elt. */
172 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
173 0, /* size of l1 cache */
174 0, /* size of l2 cache */
175 0, /* size of prefetch block */
176 0, /* number of parallel prefetches */
177 1, /* Branch cost */
178 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
179 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
180 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
181 COSTS_N_INSNS (22), /* cost of FABS instruction. */
182 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
183 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
6065f444 184
c53c148c 185 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
186 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
187 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
188 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
c53c148c
JH
189 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
190 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
6065f444
JH
191 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
192 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
193 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
194 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
64766e8d
JH
195 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
196 i386_memcpy,
197 i386_memset,
f6fd8f2b
JH
198 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
199 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
200};
201
202static stringop_algs i486_memcpy[2] = {
203 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
204 DUMMY_STRINGOP_ALGS};
205static stringop_algs i486_memset[2] = {
206 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
207 DUMMY_STRINGOP_ALGS};
208
209static const
210struct processor_costs i486_cost = { /* 486 specific costs */
211 COSTS_N_INSNS (1), /* cost of an add instruction */
212 COSTS_N_INSNS (1), /* cost of a lea instruction */
213 COSTS_N_INSNS (3), /* variable shift costs */
214 COSTS_N_INSNS (2), /* constant shift costs */
215 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
216 COSTS_N_INSNS (12), /* HI */
217 COSTS_N_INSNS (12), /* SI */
218 COSTS_N_INSNS (12), /* DI */
219 COSTS_N_INSNS (12)}, /* other */
220 1, /* cost of multiply per each bit set */
221 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
222 COSTS_N_INSNS (40), /* HI */
223 COSTS_N_INSNS (40), /* SI */
224 COSTS_N_INSNS (40), /* DI */
225 COSTS_N_INSNS (40)}, /* other */
226 COSTS_N_INSNS (3), /* cost of movsx */
227 COSTS_N_INSNS (2), /* cost of movzx */
228 15, /* "large" insn */
229 3, /* MOVE_RATIO */
df41dbaf
JH
230
231 /* All move costs are relative to integer->integer move times 2 and thus
232 they are latency*2. */
64766e8d
JH
233 4, /* cost for loading QImode using movzbl */
234 {2, 4, 2}, /* cost of loading integer registers
235 in QImode, HImode and SImode.
236 Relative to reg-reg move (2). */
237 {2, 4, 2}, /* cost of storing integer registers */
238 2, /* cost of reg,reg fld/fst */
239 {8, 8, 8}, /* cost of loading fp registers
240 in SFmode, DFmode and XFmode */
241 {8, 8, 8}, /* cost of storing fp registers
242 in SFmode, DFmode and XFmode */
243 2, /* cost of moving MMX register */
244 {4, 8}, /* cost of loading MMX registers
245 in SImode and DImode */
246 {4, 8}, /* cost of storing MMX registers
247 in SImode and DImode */
df41dbaf
JH
248 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
249 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
250 in 32,64,128,256 and 512-bit */
251 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
252 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
253 in 32,64,128,256 and 512-bit */
254 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
255 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
256 4, 4, /* Gather load static, per_elt. */
257 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
258 4, /* size of l1 cache. 486 has 8kB cache
259 shared for code and data, so 4kB is
260 not really precise. */
261 4, /* size of l2 cache */
262 0, /* size of prefetch block */
263 0, /* number of parallel prefetches */
264 1, /* Branch cost */
265 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
266 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
267 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
268 COSTS_N_INSNS (3), /* cost of FABS instruction. */
269 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
270 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
6065f444 271
c53c148c 272 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
273 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
274 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
275 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
c53c148c
JH
276 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
277 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
6065f444
JH
278 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
279 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
280 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
281 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
64766e8d
JH
282 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
283 i486_memcpy,
284 i486_memset,
f6fd8f2b
JH
285 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
286 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
287};
288
289static stringop_algs pentium_memcpy[2] = {
290 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
291 DUMMY_STRINGOP_ALGS};
292static stringop_algs pentium_memset[2] = {
293 {libcall, {{-1, rep_prefix_4_byte, false}}},
294 DUMMY_STRINGOP_ALGS};
295
296static const
297struct processor_costs pentium_cost = {
298 COSTS_N_INSNS (1), /* cost of an add instruction */
299 COSTS_N_INSNS (1), /* cost of a lea instruction */
300 COSTS_N_INSNS (4), /* variable shift costs */
301 COSTS_N_INSNS (1), /* constant shift costs */
302 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
303 COSTS_N_INSNS (11), /* HI */
304 COSTS_N_INSNS (11), /* SI */
305 COSTS_N_INSNS (11), /* DI */
306 COSTS_N_INSNS (11)}, /* other */
307 0, /* cost of multiply per each bit set */
308 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
309 COSTS_N_INSNS (25), /* HI */
310 COSTS_N_INSNS (25), /* SI */
311 COSTS_N_INSNS (25), /* DI */
312 COSTS_N_INSNS (25)}, /* other */
313 COSTS_N_INSNS (3), /* cost of movsx */
314 COSTS_N_INSNS (2), /* cost of movzx */
315 8, /* "large" insn */
316 6, /* MOVE_RATIO */
df41dbaf
JH
317
318 /* All move costs are relative to integer->integer move times 2 and thus
319 they are latency*2. */
64766e8d
JH
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
df41dbaf
JH
335 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
336 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
337 in 32,64,128,256 and 512-bit */
338 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
339 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
340 in 32,64,128,256 and 512-bit */
341 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
342 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
343 4, 4, /* Gather load static, per_elt. */
344 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
345 8, /* size of l1 cache. */
346 8, /* size of l2 cache */
347 0, /* size of prefetch block */
348 0, /* number of parallel prefetches */
349 2, /* Branch cost */
350 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
351 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
352 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
353 COSTS_N_INSNS (1), /* cost of FABS instruction. */
354 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
355 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
6065f444 356
c53c148c 357 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
358 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
359 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
360 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
c53c148c
JH
361 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
362 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
363 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
364 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
365 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
366 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
64766e8d
JH
367 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
368 pentium_memcpy,
369 pentium_memset,
f6fd8f2b
JH
370 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
371 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
372};
373
374static const
375struct processor_costs lakemont_cost = {
376 COSTS_N_INSNS (1), /* cost of an add instruction */
377 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
378 COSTS_N_INSNS (1), /* variable shift costs */
379 COSTS_N_INSNS (1), /* constant shift costs */
380 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
381 COSTS_N_INSNS (11), /* HI */
382 COSTS_N_INSNS (11), /* SI */
383 COSTS_N_INSNS (11), /* DI */
384 COSTS_N_INSNS (11)}, /* other */
385 0, /* cost of multiply per each bit set */
386 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
387 COSTS_N_INSNS (25), /* HI */
388 COSTS_N_INSNS (25), /* SI */
389 COSTS_N_INSNS (25), /* DI */
390 COSTS_N_INSNS (25)}, /* other */
391 COSTS_N_INSNS (3), /* cost of movsx */
392 COSTS_N_INSNS (2), /* cost of movzx */
393 8, /* "large" insn */
394 17, /* MOVE_RATIO */
df41dbaf
JH
395
396 /* All move costs are relative to integer->integer move times 2 and thus
397 they are latency*2. */
64766e8d
JH
398 6, /* cost for loading QImode using movzbl */
399 {2, 4, 2}, /* cost of loading integer registers
400 in QImode, HImode and SImode.
401 Relative to reg-reg move (2). */
402 {2, 4, 2}, /* cost of storing integer registers */
403 2, /* cost of reg,reg fld/fst */
404 {2, 2, 6}, /* cost of loading fp registers
405 in SFmode, DFmode and XFmode */
406 {4, 4, 6}, /* cost of storing fp registers
407 in SFmode, DFmode and XFmode */
408 8, /* cost of moving MMX register */
409 {8, 8}, /* cost of loading MMX registers
410 in SImode and DImode */
411 {8, 8}, /* cost of storing MMX registers
412 in SImode and DImode */
df41dbaf
JH
413 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
414 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
415 in 32,64,128,256 and 512-bit */
416 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
417 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
418 in 32,64,128,256 and 512-bit */
419 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
420 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
421 4, 4, /* Gather load static, per_elt. */
422 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
423 8, /* size of l1 cache. */
424 8, /* size of l2 cache */
425 0, /* size of prefetch block */
426 0, /* number of parallel prefetches */
427 2, /* Branch cost */
428 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
429 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
430 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
431 COSTS_N_INSNS (1), /* cost of FABS instruction. */
432 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
433 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
6065f444 434
c53c148c 435 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
436 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
437 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
438 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
439 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
440 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
6065f444
JH
441 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
442 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
443 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
444 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
64766e8d
JH
445 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
446 pentium_memcpy,
447 pentium_memset,
f6fd8f2b
JH
448 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
449 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
450};
451
452/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
453 (we ensure the alignment). For small blocks inline loop is still a
454 noticeable win, for bigger blocks either rep movsl or rep movsb is
455 way to go. Rep movsb has apparently more expensive startup time in CPU,
456 but after 4K the difference is down in the noise. */
457static stringop_algs pentiumpro_memcpy[2] = {
458 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
459 {8192, rep_prefix_4_byte, false},
460 {-1, rep_prefix_1_byte, false}}},
461 DUMMY_STRINGOP_ALGS};
462static stringop_algs pentiumpro_memset[2] = {
463 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
464 {8192, rep_prefix_4_byte, false},
465 {-1, libcall, false}}},
466 DUMMY_STRINGOP_ALGS};
467static const
468struct processor_costs pentiumpro_cost = {
469 COSTS_N_INSNS (1), /* cost of an add instruction */
470 COSTS_N_INSNS (1), /* cost of a lea instruction */
471 COSTS_N_INSNS (1), /* variable shift costs */
472 COSTS_N_INSNS (1), /* constant shift costs */
473 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
474 COSTS_N_INSNS (4), /* HI */
475 COSTS_N_INSNS (4), /* SI */
476 COSTS_N_INSNS (4), /* DI */
477 COSTS_N_INSNS (4)}, /* other */
478 0, /* cost of multiply per each bit set */
479 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
480 COSTS_N_INSNS (17), /* HI */
481 COSTS_N_INSNS (17), /* SI */
482 COSTS_N_INSNS (17), /* DI */
483 COSTS_N_INSNS (17)}, /* other */
484 COSTS_N_INSNS (1), /* cost of movsx */
485 COSTS_N_INSNS (1), /* cost of movzx */
486 8, /* "large" insn */
487 6, /* MOVE_RATIO */
df41dbaf
JH
488
489 /* All move costs are relative to integer->integer move times 2 and thus
490 they are latency*2. */
64766e8d
JH
491 2, /* cost for loading QImode using movzbl */
492 {4, 4, 4}, /* cost of loading integer registers
493 in QImode, HImode and SImode.
494 Relative to reg-reg move (2). */
495 {2, 2, 2}, /* cost of storing integer registers */
496 2, /* cost of reg,reg fld/fst */
497 {2, 2, 6}, /* cost of loading fp registers
498 in SFmode, DFmode and XFmode */
499 {4, 4, 6}, /* cost of storing fp registers
500 in SFmode, DFmode and XFmode */
501 2, /* cost of moving MMX register */
502 {2, 2}, /* cost of loading MMX registers
503 in SImode and DImode */
504 {2, 2}, /* cost of storing MMX registers
505 in SImode and DImode */
df41dbaf
JH
506 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
507 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
508 in 32,64,128,256 and 512-bit */
509 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
510 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
511 in 32,64,128,256 and 512-bit */
512 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
513 3, 3, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
514 4, 4, /* Gather load static, per_elt. */
515 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
516 8, /* size of l1 cache. */
517 256, /* size of l2 cache */
518 32, /* size of prefetch block */
519 6, /* number of parallel prefetches */
520 2, /* Branch cost */
521 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
522 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
523 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
524 COSTS_N_INSNS (2), /* cost of FABS instruction. */
525 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
526 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
6065f444 527
c53c148c 528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
530 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
532 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
533 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
6065f444
JH
534 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
535 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
536 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
537 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
64766e8d
JH
538 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
539 pentiumpro_memcpy,
540 pentiumpro_memset,
f6fd8f2b
JH
541 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
542 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
543};
544
545static stringop_algs geode_memcpy[2] = {
546 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
547 DUMMY_STRINGOP_ALGS};
548static stringop_algs geode_memset[2] = {
549 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
550 DUMMY_STRINGOP_ALGS};
551static const
552struct processor_costs geode_cost = {
553 COSTS_N_INSNS (1), /* cost of an add instruction */
554 COSTS_N_INSNS (1), /* cost of a lea instruction */
555 COSTS_N_INSNS (2), /* variable shift costs */
556 COSTS_N_INSNS (1), /* constant shift costs */
557 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
558 COSTS_N_INSNS (4), /* HI */
559 COSTS_N_INSNS (7), /* SI */
560 COSTS_N_INSNS (7), /* DI */
561 COSTS_N_INSNS (7)}, /* other */
562 0, /* cost of multiply per each bit set */
563 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
564 COSTS_N_INSNS (23), /* HI */
565 COSTS_N_INSNS (39), /* SI */
566 COSTS_N_INSNS (39), /* DI */
567 COSTS_N_INSNS (39)}, /* other */
568 COSTS_N_INSNS (1), /* cost of movsx */
569 COSTS_N_INSNS (1), /* cost of movzx */
570 8, /* "large" insn */
571 4, /* MOVE_RATIO */
df41dbaf
JH
572
573 /* All move costs are relative to integer->integer move times 2 and thus
574 they are latency*2. */
575 2, /* cost for loading QImode using movzbl */
576 {2, 2, 2}, /* cost of loading integer registers
64766e8d
JH
577 in QImode, HImode and SImode.
578 Relative to reg-reg move (2). */
df41dbaf
JH
579 {2, 2, 2}, /* cost of storing integer registers */
580 2, /* cost of reg,reg fld/fst */
581 {2, 2, 2}, /* cost of loading fp registers
64766e8d
JH
582 in SFmode, DFmode and XFmode */
583 {4, 6, 6}, /* cost of storing fp registers
584 in SFmode, DFmode and XFmode */
585
586 2, /* cost of moving MMX register */
587 {2, 2}, /* cost of loading MMX registers
588 in SImode and DImode */
589 {2, 2}, /* cost of storing MMX registers
590 in SImode and DImode */
df41dbaf
JH
591 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
592 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
593 in 32,64,128,256 and 512-bit */
594 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
595 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
596 in 32,64,128,256 and 512-bit */
597 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
598 6, 6, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
599 2, 2, /* Gather load static, per_elt. */
600 2, 2, /* Gather store static, per_elt. */
64766e8d
JH
601 64, /* size of l1 cache. */
602 128, /* size of l2 cache. */
603 32, /* size of prefetch block */
604 1, /* number of parallel prefetches */
605 1, /* Branch cost */
606 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
607 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
608 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
609 COSTS_N_INSNS (1), /* cost of FABS instruction. */
610 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
611 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
6065f444 612
c53c148c 613 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
614 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
615 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
616 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
c53c148c
JH
617 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
618 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
6065f444
JH
619 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
620 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
621 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
622 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
64766e8d
JH
623 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
624 geode_memcpy,
625 geode_memset,
f6fd8f2b
JH
626 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
627 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
628};
629
630static stringop_algs k6_memcpy[2] = {
631 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632 DUMMY_STRINGOP_ALGS};
633static stringop_algs k6_memset[2] = {
634 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635 DUMMY_STRINGOP_ALGS};
636static const
637struct processor_costs k6_cost = {
638 COSTS_N_INSNS (1), /* cost of an add instruction */
639 COSTS_N_INSNS (2), /* cost of a lea instruction */
640 COSTS_N_INSNS (1), /* variable shift costs */
641 COSTS_N_INSNS (1), /* constant shift costs */
642 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
643 COSTS_N_INSNS (3), /* HI */
644 COSTS_N_INSNS (3), /* SI */
645 COSTS_N_INSNS (3), /* DI */
646 COSTS_N_INSNS (3)}, /* other */
647 0, /* cost of multiply per each bit set */
648 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
649 COSTS_N_INSNS (18), /* HI */
650 COSTS_N_INSNS (18), /* SI */
651 COSTS_N_INSNS (18), /* DI */
652 COSTS_N_INSNS (18)}, /* other */
653 COSTS_N_INSNS (2), /* cost of movsx */
654 COSTS_N_INSNS (2), /* cost of movzx */
655 8, /* "large" insn */
656 4, /* MOVE_RATIO */
df41dbaf
JH
657
658 /* All move costs are relative to integer->integer move times 2 and thus
659 they are latency*2. */
64766e8d
JH
660 3, /* cost for loading QImode using movzbl */
661 {4, 5, 4}, /* cost of loading integer registers
662 in QImode, HImode and SImode.
663 Relative to reg-reg move (2). */
664 {2, 3, 2}, /* cost of storing integer registers */
665 4, /* cost of reg,reg fld/fst */
666 {6, 6, 6}, /* cost of loading fp registers
667 in SFmode, DFmode and XFmode */
668 {4, 4, 4}, /* cost of storing fp registers
669 in SFmode, DFmode and XFmode */
670 2, /* cost of moving MMX register */
671 {2, 2}, /* cost of loading MMX registers
672 in SImode and DImode */
673 {2, 2}, /* cost of storing MMX registers
674 in SImode and DImode */
df41dbaf
JH
675 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
676 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
677 in 32,64,128,256 and 512-bit */
678 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
679 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
680 in 32,64,128,256 and 512-bit */
681 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
682 6, 6, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
683 2, 2, /* Gather load static, per_elt. */
684 2, 2, /* Gather store static, per_elt. */
64766e8d
JH
685 32, /* size of l1 cache. */
686 32, /* size of l2 cache. Some models
687 have integrated l2 cache, but
688 optimizing for k6 is not important
689 enough to worry about that. */
690 32, /* size of prefetch block */
691 1, /* number of parallel prefetches */
692 1, /* Branch cost */
693 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
694 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
695 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
696 COSTS_N_INSNS (2), /* cost of FABS instruction. */
697 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
698 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
6065f444 699
c53c148c 700 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
701 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
702 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
703 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
c53c148c
JH
704 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
705 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
6065f444
JH
706 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
707 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
708 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
709 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
64766e8d
JH
710 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
711 k6_memcpy,
712 k6_memset,
f6fd8f2b
JH
713 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
714 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
715};
716
717/* For some reason, Athlon deals better with REP prefix (relative to loops)
718 compared to K8. Alignment becomes important after 8 bytes for memcpy and
719 128 bytes for memset. */
720static stringop_algs athlon_memcpy[2] = {
721 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
722 DUMMY_STRINGOP_ALGS};
723static stringop_algs athlon_memset[2] = {
724 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
725 DUMMY_STRINGOP_ALGS};
726static const
727struct processor_costs athlon_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (2), /* cost of a lea instruction */
730 COSTS_N_INSNS (1), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (5), /* HI */
734 COSTS_N_INSNS (5), /* SI */
735 COSTS_N_INSNS (5), /* DI */
736 COSTS_N_INSNS (5)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (26), /* HI */
740 COSTS_N_INSNS (42), /* SI */
741 COSTS_N_INSNS (74), /* DI */
742 COSTS_N_INSNS (74)}, /* other */
743 COSTS_N_INSNS (1), /* cost of movsx */
744 COSTS_N_INSNS (1), /* cost of movzx */
745 8, /* "large" insn */
746 9, /* MOVE_RATIO */
df41dbaf
JH
747
748 /* All move costs are relative to integer->integer move times 2 and thus
749 they are latency*2. */
64766e8d
JH
750 4, /* cost for loading QImode using movzbl */
751 {3, 4, 3}, /* cost of loading integer registers
752 in QImode, HImode and SImode.
753 Relative to reg-reg move (2). */
754 {3, 4, 3}, /* cost of storing integer registers */
755 4, /* cost of reg,reg fld/fst */
756 {4, 4, 12}, /* cost of loading fp registers
757 in SFmode, DFmode and XFmode */
758 {6, 6, 8}, /* cost of storing fp registers
759 in SFmode, DFmode and XFmode */
760 2, /* cost of moving MMX register */
761 {4, 4}, /* cost of loading MMX registers
762 in SImode and DImode */
763 {4, 4}, /* cost of storing MMX registers
764 in SImode and DImode */
df41dbaf
JH
765 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
766 {4, 4, 6, 12, 24}, /* cost of loading SSE registers
767 in 32,64,128,256 and 512-bit */
768 {4, 4, 6, 12, 24}, /* cost of unaligned loads. */
769 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
770 in 32,64,128,256 and 512-bit */
771 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
772 5, 5, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
773 4, 4, /* Gather load static, per_elt. */
774 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
775 64, /* size of l1 cache. */
776 256, /* size of l2 cache. */
777 64, /* size of prefetch block */
778 6, /* number of parallel prefetches */
779 5, /* Branch cost */
780 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
781 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
782 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
783 COSTS_N_INSNS (2), /* cost of FABS instruction. */
784 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
785 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 786
c53c148c 787 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
788 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
789 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
790 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
791 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
792 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
793 /* 11-16 */
794 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
795 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
796 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
797 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
64766e8d
JH
798 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
799 athlon_memcpy,
800 athlon_memset,
f6fd8f2b
JH
801 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
802 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
803};
804
805/* K8 has optimized REP instruction for medium sized blocks, but for very
806 small blocks it is better to use loop. For large blocks, libcall can
807 do nontemporary accesses and beat inline considerably. */
808static stringop_algs k8_memcpy[2] = {
809 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
810 {-1, rep_prefix_4_byte, false}}},
811 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
812 {-1, libcall, false}}}};
813static stringop_algs k8_memset[2] = {
814 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
815 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
816 {libcall, {{48, unrolled_loop, false},
817 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
818static const
819struct processor_costs k8_cost = {
820 COSTS_N_INSNS (1), /* cost of an add instruction */
821 COSTS_N_INSNS (2), /* cost of a lea instruction */
822 COSTS_N_INSNS (1), /* variable shift costs */
823 COSTS_N_INSNS (1), /* constant shift costs */
824 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
825 COSTS_N_INSNS (4), /* HI */
826 COSTS_N_INSNS (3), /* SI */
827 COSTS_N_INSNS (4), /* DI */
828 COSTS_N_INSNS (5)}, /* other */
829 0, /* cost of multiply per each bit set */
830 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
831 COSTS_N_INSNS (26), /* HI */
832 COSTS_N_INSNS (42), /* SI */
833 COSTS_N_INSNS (74), /* DI */
834 COSTS_N_INSNS (74)}, /* other */
835 COSTS_N_INSNS (1), /* cost of movsx */
836 COSTS_N_INSNS (1), /* cost of movzx */
837 8, /* "large" insn */
838 9, /* MOVE_RATIO */
df41dbaf
JH
839
840 /* All move costs are relative to integer->integer move times 2 and thus
841 they are latency*2. */
64766e8d
JH
842 4, /* cost for loading QImode using movzbl */
843 {3, 4, 3}, /* cost of loading integer registers
844 in QImode, HImode and SImode.
845 Relative to reg-reg move (2). */
846 {3, 4, 3}, /* cost of storing integer registers */
847 4, /* cost of reg,reg fld/fst */
848 {4, 4, 12}, /* cost of loading fp registers
849 in SFmode, DFmode and XFmode */
850 {6, 6, 8}, /* cost of storing fp registers
851 in SFmode, DFmode and XFmode */
852 2, /* cost of moving MMX register */
853 {3, 3}, /* cost of loading MMX registers
854 in SImode and DImode */
855 {4, 4}, /* cost of storing MMX registers
856 in SImode and DImode */
df41dbaf
JH
857 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
858 {4, 3, 6, 12, 24}, /* cost of loading SSE registers
859 in 32,64,128,256 and 512-bit */
860 {4, 3, 6, 12, 24}, /* cost of unaligned loads. */
861 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
862 in 32,64,128,256 and 512-bit */
863 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
864 5, 5, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
865 4, 4, /* Gather load static, per_elt. */
866 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
867 64, /* size of l1 cache. */
868 512, /* size of l2 cache. */
869 64, /* size of prefetch block */
870 /* New AMD processors never drop prefetches; if they cannot be performed
871 immediately, they are queued. We set number of simultaneous prefetches
872 to a large constant to reflect this (it probably is not a good idea not
873 to limit number of prefetches at all, as their execution also takes some
874 time). */
875 100, /* number of parallel prefetches */
876 3, /* Branch cost */
877 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
878 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
879 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
880 COSTS_N_INSNS (2), /* cost of FABS instruction. */
881 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
882 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 883
c53c148c 884 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
885 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
886 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
887 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
888 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
889 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
890 /* 11-16 */
891 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
892 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
893 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
894 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
64766e8d
JH
895 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
896 k8_memcpy,
897 k8_memset,
f6fd8f2b
JH
898 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
899 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
64766e8d
JH
900};
901
902/* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
903 very small blocks it is better to use loop. For large blocks, libcall can
904 do nontemporary accesses and beat inline considerably. */
905static stringop_algs amdfam10_memcpy[2] = {
906 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}};
910static stringop_algs amdfam10_memset[2] = {
911 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
912 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
913 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
914 {-1, libcall, false}}}};
915struct processor_costs amdfam10_cost = {
916 COSTS_N_INSNS (1), /* cost of an add instruction */
917 COSTS_N_INSNS (2), /* cost of a lea instruction */
918 COSTS_N_INSNS (1), /* variable shift costs */
919 COSTS_N_INSNS (1), /* constant shift costs */
920 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
921 COSTS_N_INSNS (4), /* HI */
922 COSTS_N_INSNS (3), /* SI */
923 COSTS_N_INSNS (4), /* DI */
924 COSTS_N_INSNS (5)}, /* other */
925 0, /* cost of multiply per each bit set */
926 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
927 COSTS_N_INSNS (35), /* HI */
928 COSTS_N_INSNS (51), /* SI */
929 COSTS_N_INSNS (83), /* DI */
930 COSTS_N_INSNS (83)}, /* other */
931 COSTS_N_INSNS (1), /* cost of movsx */
932 COSTS_N_INSNS (1), /* cost of movzx */
933 8, /* "large" insn */
934 9, /* MOVE_RATIO */
df41dbaf
JH
935
936 /* All move costs are relative to integer->integer move times 2 and thus
937 they are latency*2. */
64766e8d
JH
938 4, /* cost for loading QImode using movzbl */
939 {3, 4, 3}, /* cost of loading integer registers
940 in QImode, HImode and SImode.
941 Relative to reg-reg move (2). */
942 {3, 4, 3}, /* cost of storing integer registers */
943 4, /* cost of reg,reg fld/fst */
944 {4, 4, 12}, /* cost of loading fp registers
945 in SFmode, DFmode and XFmode */
946 {6, 6, 8}, /* cost of storing fp registers
947 in SFmode, DFmode and XFmode */
948 2, /* cost of moving MMX register */
949 {3, 3}, /* cost of loading MMX registers
950 in SImode and DImode */
951 {4, 4}, /* cost of storing MMX registers
952 in SImode and DImode */
df41dbaf
JH
953 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
954 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
955 in 32,64,128,256 and 512-bit */
956 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
957 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
958 in 32,64,128,256 and 512-bit */
959 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
960 3, 3, /* SSE->integer and integer->SSE moves */
64766e8d
JH
961 /* On K8:
962 MOVD reg64, xmmreg Double FSTORE 4
963 MOVD reg32, xmmreg Double FSTORE 4
964 On AMDFAM10:
965 MOVD reg64, xmmreg Double FADD 3
966 1/1 1/1
967 MOVD reg32, xmmreg Double FADD 3
968 1/1 1/1 */
a4fe6139
JH
969 4, 4, /* Gather load static, per_elt. */
970 4, 4, /* Gather store static, per_elt. */
64766e8d
JH
971 64, /* size of l1 cache. */
972 512, /* size of l2 cache. */
973 64, /* size of prefetch block */
974 /* New AMD processors never drop prefetches; if they cannot be performed
975 immediately, they are queued. We set number of simultaneous prefetches
976 to a large constant to reflect this (it probably is not a good idea not
977 to limit number of prefetches at all, as their execution also takes some
978 time). */
979 100, /* number of parallel prefetches */
980 2, /* Branch cost */
981 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
982 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
983 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
984 COSTS_N_INSNS (2), /* cost of FABS instruction. */
985 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
986 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 987
c53c148c 988 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
989 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
990 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
991 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
992 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
993 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
994 /* 11-16 */
995 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
996 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
997 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
998 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
64766e8d
JH
999 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1000 amdfam10_memcpy,
1001 amdfam10_memset,
f6fd8f2b
JH
1002 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1003 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
1004};
1005
1006/* BDVER1 has optimized REP instruction for medium sized blocks, but for
1007 very small blocks it is better to use loop. For large blocks, libcall
1008 can do nontemporary accesses and beat inline considerably. */
1009static stringop_algs bdver1_memcpy[2] = {
1010 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1011 {-1, rep_prefix_4_byte, false}}},
1012 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1013 {-1, libcall, false}}}};
1014static stringop_algs bdver1_memset[2] = {
1015 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1016 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1017 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1018 {-1, libcall, false}}}};
1019
1020const struct processor_costs bdver1_cost = {
1021 COSTS_N_INSNS (1), /* cost of an add instruction */
1022 COSTS_N_INSNS (1), /* cost of a lea instruction */
1023 COSTS_N_INSNS (1), /* variable shift costs */
1024 COSTS_N_INSNS (1), /* constant shift costs */
1025 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1026 COSTS_N_INSNS (4), /* HI */
1027 COSTS_N_INSNS (4), /* SI */
1028 COSTS_N_INSNS (6), /* DI */
1029 COSTS_N_INSNS (6)}, /* other */
1030 0, /* cost of multiply per each bit set */
1031 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1032 COSTS_N_INSNS (35), /* HI */
1033 COSTS_N_INSNS (51), /* SI */
1034 COSTS_N_INSNS (83), /* DI */
1035 COSTS_N_INSNS (83)}, /* other */
1036 COSTS_N_INSNS (1), /* cost of movsx */
1037 COSTS_N_INSNS (1), /* cost of movzx */
1038 8, /* "large" insn */
1039 9, /* MOVE_RATIO */
df41dbaf
JH
1040
1041 /* All move costs are relative to integer->integer move times 2 and thus
1042 they are latency*2. */
1043 8, /* cost for loading QImode using movzbl */
1044 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
1045 in QImode, HImode and SImode.
1046 Relative to reg-reg move (2). */
df41dbaf
JH
1047 {8, 8, 8}, /* cost of storing integer registers */
1048 4, /* cost of reg,reg fld/fst */
1049 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1050 in SFmode, DFmode and XFmode */
df41dbaf 1051 {10, 10, 18}, /* cost of storing fp registers
64766e8d 1052 in SFmode, DFmode and XFmode */
df41dbaf
JH
1053 4, /* cost of moving MMX register */
1054 {12, 12}, /* cost of loading MMX registers
64766e8d 1055 in SImode and DImode */
df41dbaf 1056 {10, 10}, /* cost of storing MMX registers
64766e8d 1057 in SImode and DImode */
df41dbaf
JH
1058 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1059 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1060 in 32,64,128,256 and 512-bit */
1061 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1062 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1063 in 32,64,128,256 and 512-bit */
1064 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1065 16, 20, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1066 12, 12, /* Gather load static, per_elt. */
1067 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1068 16, /* size of l1 cache. */
1069 2048, /* size of l2 cache. */
1070 64, /* size of prefetch block */
1071 /* New AMD processors never drop prefetches; if they cannot be performed
1072 immediately, they are queued. We set number of simultaneous prefetches
1073 to a large constant to reflect this (it probably is not a good idea not
1074 to limit number of prefetches at all, as their execution also takes some
1075 time). */
1076 100, /* number of parallel prefetches */
1077 2, /* Branch cost */
1078 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1079 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1080 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1081 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1082 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1083 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1084
c53c148c 1085 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1086 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1087 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1088 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1089 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1090 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1091 /* 9-24 */
1092 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1093 /* 9-27 */
1094 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1095 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1096 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d
JH
1097 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1098 bdver1_memcpy,
1099 bdver1_memset,
f6fd8f2b
JH
1100 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1101 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
64766e8d
JH
1102};
1103
1104/* BDVER2 has optimized REP instruction for medium sized blocks, but for
1105 very small blocks it is better to use loop. For large blocks, libcall
1106 can do nontemporary accesses and beat inline considerably. */
1107
1108static stringop_algs bdver2_memcpy[2] = {
1109 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1110 {-1, rep_prefix_4_byte, false}}},
1111 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1112 {-1, libcall, false}}}};
1113static stringop_algs bdver2_memset[2] = {
1114 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1115 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1116 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1117 {-1, libcall, false}}}};
1118
1119const struct processor_costs bdver2_cost = {
1120 COSTS_N_INSNS (1), /* cost of an add instruction */
1121 COSTS_N_INSNS (1), /* cost of a lea instruction */
1122 COSTS_N_INSNS (1), /* variable shift costs */
1123 COSTS_N_INSNS (1), /* constant shift costs */
1124 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1125 COSTS_N_INSNS (4), /* HI */
1126 COSTS_N_INSNS (4), /* SI */
1127 COSTS_N_INSNS (6), /* DI */
1128 COSTS_N_INSNS (6)}, /* other */
1129 0, /* cost of multiply per each bit set */
1130 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1131 COSTS_N_INSNS (35), /* HI */
1132 COSTS_N_INSNS (51), /* SI */
1133 COSTS_N_INSNS (83), /* DI */
1134 COSTS_N_INSNS (83)}, /* other */
1135 COSTS_N_INSNS (1), /* cost of movsx */
1136 COSTS_N_INSNS (1), /* cost of movzx */
1137 8, /* "large" insn */
1138 9, /* MOVE_RATIO */
df41dbaf
JH
1139
1140 /* All move costs are relative to integer->integer move times 2 and thus
1141 they are latency*2. */
1142 8, /* cost for loading QImode using movzbl */
1143 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
1144 in QImode, HImode and SImode.
1145 Relative to reg-reg move (2). */
df41dbaf
JH
1146 {8, 8, 8}, /* cost of storing integer registers */
1147 4, /* cost of reg,reg fld/fst */
1148 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1149 in SFmode, DFmode and XFmode */
df41dbaf 1150 {10, 10, 18}, /* cost of storing fp registers
64766e8d 1151 in SFmode, DFmode and XFmode */
df41dbaf
JH
1152 4, /* cost of moving MMX register */
1153 {12, 12}, /* cost of loading MMX registers
64766e8d 1154 in SImode and DImode */
df41dbaf 1155 {10, 10}, /* cost of storing MMX registers
64766e8d 1156 in SImode and DImode */
df41dbaf
JH
1157 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1158 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1159 in 32,64,128,256 and 512-bit */
1160 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1161 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1162 in 32,64,128,256 and 512-bit */
1163 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1164 16, 20, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1165 12, 12, /* Gather load static, per_elt. */
1166 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1167 16, /* size of l1 cache. */
1168 2048, /* size of l2 cache. */
1169 64, /* size of prefetch block */
1170 /* New AMD processors never drop prefetches; if they cannot be performed
1171 immediately, they are queued. We set number of simultaneous prefetches
1172 to a large constant to reflect this (it probably is not a good idea not
1173 to limit number of prefetches at all, as their execution also takes some
1174 time). */
1175 100, /* number of parallel prefetches */
1176 2, /* Branch cost */
1177 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1178 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1179 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1180 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1181 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1182 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1183
c53c148c 1184 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1185 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1186 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1187 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1188 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1189 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1190 /* 9-24 */
1191 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1192 /* 9-27 */
1193 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1194 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1195 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d
JH
1196 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1197 bdver2_memcpy,
1198 bdver2_memset,
f6fd8f2b
JH
1199 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1200 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
64766e8d
JH
1201};
1202
1203
1204 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1205 very small blocks it is better to use loop. For large blocks, libcall
1206 can do nontemporary accesses and beat inline considerably. */
1207static stringop_algs bdver3_memcpy[2] = {
1208 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1209 {-1, rep_prefix_4_byte, false}}},
1210 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1211 {-1, libcall, false}}}};
1212static stringop_algs bdver3_memset[2] = {
1213 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1214 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1215 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1216 {-1, libcall, false}}}};
1217struct processor_costs bdver3_cost = {
1218 COSTS_N_INSNS (1), /* cost of an add instruction */
1219 COSTS_N_INSNS (1), /* cost of a lea instruction */
1220 COSTS_N_INSNS (1), /* variable shift costs */
1221 COSTS_N_INSNS (1), /* constant shift costs */
1222 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1223 COSTS_N_INSNS (4), /* HI */
1224 COSTS_N_INSNS (4), /* SI */
1225 COSTS_N_INSNS (6), /* DI */
1226 COSTS_N_INSNS (6)}, /* other */
1227 0, /* cost of multiply per each bit set */
1228 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1229 COSTS_N_INSNS (35), /* HI */
1230 COSTS_N_INSNS (51), /* SI */
1231 COSTS_N_INSNS (83), /* DI */
1232 COSTS_N_INSNS (83)}, /* other */
1233 COSTS_N_INSNS (1), /* cost of movsx */
1234 COSTS_N_INSNS (1), /* cost of movzx */
1235 8, /* "large" insn */
1236 9, /* MOVE_RATIO */
df41dbaf
JH
1237
1238 /* All move costs are relative to integer->integer move times 2 and thus
1239 they are latency*2. */
1240 8, /* cost for loading QImode using movzbl */
1241 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
1242 in QImode, HImode and SImode.
1243 Relative to reg-reg move (2). */
df41dbaf
JH
1244 {8, 8, 8}, /* cost of storing integer registers */
1245 4, /* cost of reg,reg fld/fst */
1246 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1247 in SFmode, DFmode and XFmode */
df41dbaf 1248 {10, 10, 18}, /* cost of storing fp registers
64766e8d 1249 in SFmode, DFmode and XFmode */
df41dbaf
JH
1250 4, /* cost of moving MMX register */
1251 {12, 12}, /* cost of loading MMX registers
64766e8d 1252 in SImode and DImode */
df41dbaf 1253 {10, 10}, /* cost of storing MMX registers
64766e8d 1254 in SImode and DImode */
df41dbaf
JH
1255 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1256 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1257 in 32,64,128,256 and 512-bit */
1258 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1259 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1260 in 32,64,128,256 and 512-bit */
1261 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1262 16, 20, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1263 12, 12, /* Gather load static, per_elt. */
1264 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1265 16, /* size of l1 cache. */
1266 2048, /* size of l2 cache. */
1267 64, /* size of prefetch block */
1268 /* New AMD processors never drop prefetches; if they cannot be performed
1269 immediately, they are queued. We set number of simultaneous prefetches
1270 to a large constant to reflect this (it probably is not a good idea not
1271 to limit number of prefetches at all, as their execution also takes some
1272 time). */
1273 100, /* number of parallel prefetches */
1274 2, /* Branch cost */
1275 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1276 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1277 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1278 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1279 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1280 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1281
c53c148c 1282 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1283 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1284 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1285 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1286 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1287 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1288 /* 9-24 */
1289 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1290 /* 9-27 */
1291 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1292 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1293 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d
JH
1294 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1295 bdver3_memcpy,
1296 bdver3_memset,
f6fd8f2b
JH
1297 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1298 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
64766e8d
JH
1299};
1300
1301/* BDVER4 has optimized REP instruction for medium sized blocks, but for
1302 very small blocks it is better to use loop. For large blocks, libcall
1303 can do nontemporary accesses and beat inline considerably. */
1304static stringop_algs bdver4_memcpy[2] = {
1305 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1306 {-1, rep_prefix_4_byte, false}}},
1307 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1308 {-1, libcall, false}}}};
1309static stringop_algs bdver4_memset[2] = {
1310 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1311 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1312 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1313 {-1, libcall, false}}}};
1314struct processor_costs bdver4_cost = {
1315 COSTS_N_INSNS (1), /* cost of an add instruction */
1316 COSTS_N_INSNS (1), /* cost of a lea instruction */
1317 COSTS_N_INSNS (1), /* variable shift costs */
1318 COSTS_N_INSNS (1), /* constant shift costs */
1319 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1320 COSTS_N_INSNS (4), /* HI */
1321 COSTS_N_INSNS (4), /* SI */
1322 COSTS_N_INSNS (6), /* DI */
1323 COSTS_N_INSNS (6)}, /* other */
1324 0, /* cost of multiply per each bit set */
1325 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1326 COSTS_N_INSNS (35), /* HI */
1327 COSTS_N_INSNS (51), /* SI */
1328 COSTS_N_INSNS (83), /* DI */
1329 COSTS_N_INSNS (83)}, /* other */
1330 COSTS_N_INSNS (1), /* cost of movsx */
1331 COSTS_N_INSNS (1), /* cost of movzx */
1332 8, /* "large" insn */
1333 9, /* MOVE_RATIO */
df41dbaf
JH
1334
1335 /* All move costs are relative to integer->integer move times 2 and thus
1336 they are latency*2. */
1337 8, /* cost for loading QImode using movzbl */
1338 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
1339 in QImode, HImode and SImode.
1340 Relative to reg-reg move (2). */
df41dbaf
JH
1341 {8, 8, 8}, /* cost of storing integer registers */
1342 4, /* cost of reg,reg fld/fst */
1343 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1344 in SFmode, DFmode and XFmode */
df41dbaf 1345 {10, 10, 18}, /* cost of storing fp registers
64766e8d 1346 in SFmode, DFmode and XFmode */
df41dbaf
JH
1347 4, /* cost of moving MMX register */
1348 {12, 12}, /* cost of loading MMX registers
64766e8d 1349 in SImode and DImode */
df41dbaf 1350 {10, 10}, /* cost of storing MMX registers
64766e8d 1351 in SImode and DImode */
df41dbaf
JH
1352 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1353 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1354 in 32,64,128,256 and 512-bit */
1355 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1356 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1357 in 32,64,128,256 and 512-bit */
1358 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1359 16, 20, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1360 12, 12, /* Gather load static, per_elt. */
1361 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1362 16, /* size of l1 cache. */
1363 2048, /* size of l2 cache. */
1364 64, /* size of prefetch block */
1365 /* New AMD processors never drop prefetches; if they cannot be performed
1366 immediately, they are queued. We set number of simultaneous prefetches
1367 to a large constant to reflect this (it probably is not a good idea not
1368 to limit number of prefetches at all, as their execution also takes some
1369 time). */
1370 100, /* number of parallel prefetches */
1371 2, /* Branch cost */
1372 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1373 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1374 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1375 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1376 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1377 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1378
c53c148c 1379 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1380 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1381 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1382 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1383 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1384 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1385 /* 9-24 */
1386 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1387 /* 9-27 */
1388 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1389 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1390 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d
JH
1391 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1392 bdver4_memcpy,
1393 bdver4_memset,
f6fd8f2b
JH
1394 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1395 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
64766e8d
JH
1396};
1397
1398
1399/* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1400 very small blocks it is better to use a loop. For large blocks, a libcall
1401 can do non-temporal accesses and beat inline code considerably.
 NOTE(review): each entry presumably reads {algorithm for unknown size,
 {{max block size, algorithm, noalign}, ...}} with -1 meaning "no upper
 bound", and the two entries presumably cover 32-bit and 64-bit code
 (only the second uses rep_prefix_8_byte) -- confirm against the
 stringop_algs declaration. */
1402static stringop_algs znver1_memcpy[2] = {
1403 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1404 {-1, rep_prefix_4_byte, false}}},
1405 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1406 {-1, libcall, false}}}};
/* memset strategy table referenced by znver1_cost: loops for the smallest
 blocks, a REP prefix for mid-sized blocks and a libcall beyond the last
 listed size.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1407static stringop_algs znver1_memset[2] = {
1408 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1409 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1410 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1411 {-1, libcall, false}}}};
/* Cost table used when tuning for znver1. The initializers are positional:
 each value fills the next field of struct processor_costs, and the trailing
 comment on each line names that field. Instruction costs are expressed
 with COSTS_N_INSNS, i.e. relative to an add.
 NOTE(review): unlike the neighbouring cost tables this one is not declared
 const -- confirm whether that is intentional. */
1412struct processor_costs znver1_cost = {
1413 COSTS_N_INSNS (1), /* cost of an add instruction. */
1414 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1415 COSTS_N_INSNS (1), /* variable shift costs. */
1416 COSTS_N_INSNS (1), /* constant shift costs. */
1417 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1418 COSTS_N_INSNS (3), /* HI. */
1419 COSTS_N_INSNS (3), /* SI. */
6065f444
JH
1420 COSTS_N_INSNS (3), /* DI. */
1421 COSTS_N_INSNS (3)}, /* other. */
64766e8d
JH
1422 0, /* cost of multiply per each bit
1423 set. */
6065f444
JH
1424 /* Depending on parameters, idiv can get faster on ryzen. This is an upper
1425 bound. */
1426 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1427 COSTS_N_INSNS (22), /* HI. */
1428 COSTS_N_INSNS (30), /* SI. */
1429 COSTS_N_INSNS (45), /* DI. */
1430 COSTS_N_INSNS (45)}, /* other. */
64766e8d
JH
1431 COSTS_N_INSNS (1), /* cost of movsx. */
1432 COSTS_N_INSNS (1), /* cost of movzx. */
1433 8, /* "large" insn. */
1434 9, /* MOVE_RATIO. */
01118373 1435
df41dbaf
JH
1436 /* All move costs are relative to integer->integer move times 2 and thus
1437 they are latency*2. */
1438
01118373
JH
1439 /* reg-reg moves are done by renaming and thus they are even cheaper than
1440 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1441 to doubles of latencies, we do not model this correctly. It does not
1442 seem to make a practical difference to bump prices up even more. */
1443 6, /* cost for loading QImode using
64766e8d 1444 movzbl. */
01118373 1445 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
1446 in QImode, HImode and SImode.
1447 Relative to reg-reg move (2). */
01118373 1448 {8, 8, 8}, /* cost of storing integer
64766e8d
JH
1449 registers. */
1450 2, /* cost of reg,reg fld/fst. */
01118373 1451 {6, 6, 16}, /* cost of loading fp registers
64766e8d 1452 in SFmode, DFmode and XFmode. */
01118373 1453 {8, 8, 16}, /* cost of storing fp registers
64766e8d
JH
1454 in SFmode, DFmode and XFmode. */
1455 2, /* cost of moving MMX register. */
01118373 1456 {6, 6}, /* cost of loading MMX registers
64766e8d 1457 in SImode and DImode. */
01118373 1458 {8, 8}, /* cost of storing MMX registers
64766e8d 1459 in SImode and DImode. */
df41dbaf
JH
1460 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1461 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1462 in 32,64,128,256 and 512-bit. */
1463 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1464 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1465 in 32,64,128,256 and 512-bit. */
1466 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1467 6, 6, /* SSE->integer and integer->SSE moves. */
a4fe6139
JH
1468 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1469 throughput 12. Approx 9 uops do not depend on vector size and every load
1470 is 7 uops.
 NOTE(review): VGATHERDPD is named twice with different figures; one of
 the two is presumably a different gather variant -- confirm. */
1471 18, 8, /* Gather load static, per_elt. */
1472 18, 10, /* Gather store static, per_elt. */
64766e8d
JH
1473 32, /* size of l1 cache. */
1474 512, /* size of l2 cache. */
1475 64, /* size of prefetch block. */
1476 /* New AMD processors never drop prefetches; if they cannot be performed
1477 immediately, they are queued. We set number of simultaneous prefetches
1478 to a large constant to reflect this (it probably is not a good idea not
1479 to limit number of prefetches at all, as their execution also takes some
1480 time). */
1481 100, /* number of parallel prefetches. */
1482 3, /* Branch cost. */
6065f444
JH
1483 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1485 /* Latency of fdiv is 8-15. */
1486 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1489 /* Latency of fsqrt is 4-10. */
1490 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1491
c53c148c 1492 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1493 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1494 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1495 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1496 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1497 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1498 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1499 /* 9-13 */
1500 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1501 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1502 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
64766e8d
JH
1503 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1504 and it can execute 2 integer additions and 2 multiplications thus
1505 reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest
1506 that 4 works better than 6 probably due to register pressure.
1507
1508 Integer vector operations are taken by FP unit and execute 3 vector
1509 plus/minus operations per cycle but only one multiply. This is adjusted
1510 in ix86_reassociation_width. */
1511 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1512 znver1_memcpy,
1513 znver1_memset,
f6fd8f2b
JH
1514 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1515 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
64766e8d
JH
1516};
1517
c234d831
UB
1518/* skylake_cost should produce code tuned for the Skylake family of CPUs.
 The memcpy strategy table below is referenced from skylake_cost.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code,
 and the third field of each size entry is presumably a "noalign" flag
 -- confirm against the stringop_algs declaration. */
1519static stringop_algs skylake_memcpy[2] = {
1520 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
6e559c70 1521 {libcall, {{16, loop, false}, {512, unrolled_loop, false},
c234d831
UB
1522 {-1, libcall, false}}}};
1523
/* memset strategy table referenced by skylake_cost. The first entry steps
 from a byte loop through a loop and a 4-byte REP prefix to a libcall; the
 second uses a loop, then an unrolled loop, then a libcall.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1524static stringop_algs skylake_memset[2] = {
1525 {libcall, {{6, loop_1_byte, true},
1526 {24, loop, true},
1527 {8192, rep_prefix_4_byte, true},
1528 {-1, libcall, false}}},
6e559c70 1529 {libcall, {{24, loop, true}, {512, unrolled_loop, false},
c234d831
UB
1530 {-1, libcall, false}}}};
1531
/* Cost table used when tuning for Skylake. The initializers are positional:
 each value fills the next field of struct processor_costs, and the trailing
 comment on each line names that field. Instruction costs are expressed
 with COSTS_N_INSNS, i.e. relative to an add. */
1532static const
1533struct processor_costs skylake_cost = {
1534 COSTS_N_INSNS (1), /* cost of an add instruction */
1535 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
1536 COSTS_N_INSNS (1), /* variable shift costs */
1537 COSTS_N_INSNS (1), /* constant shift costs */
1538 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1539 COSTS_N_INSNS (4), /* HI */
1540 COSTS_N_INSNS (3), /* SI */
a2ef9558
MT
1541 COSTS_N_INSNS (3), /* DI */
1542 COSTS_N_INSNS (3)}, /* other */
c234d831 1543 0, /* cost of multiply per each bit set */
02308bd3
MT
1544 /* Expanding div/mod currently doesn't consider parallelism. So the cost
1545 model is not realistic. We compensate by increasing the latencies a bit. */
1546 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
1547 COSTS_N_INSNS (11), /* HI */
1548 COSTS_N_INSNS (14), /* SI */
c234d831
UB
1549 COSTS_N_INSNS (76), /* DI */
1550 COSTS_N_INSNS (76)}, /* other */
1551 COSTS_N_INSNS (1), /* cost of movsx */
1552 COSTS_N_INSNS (0), /* cost of movzx */
1553 8, /* "large" insn */
1554 17, /* MOVE_RATIO */
1555
1556 6, /* cost for loading QImode using movzbl */
1557 {4, 4, 4}, /* cost of loading integer registers
1558 in QImode, HImode and SImode.
1559 Relative to reg-reg move (2). */
001e7337 1560 {6, 6, 3}, /* cost of storing integer registers */
c234d831
UB
1561 2, /* cost of reg,reg fld/fst */
1562 {6, 6, 8}, /* cost of loading fp registers
1563 in SFmode, DFmode and XFmode */
1564 {6, 6, 10}, /* cost of storing fp registers
1565 in SFmode, DFmode and XFmode */
1566 2, /* cost of moving MMX register */
1567 {6, 6}, /* cost of loading MMX registers
1568 in SImode and DImode */
1569 {6, 6}, /* cost of storing MMX registers
1570 in SImode and DImode */
1571 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1572 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1573 in 32,64,128,256 and 512-bit */
1574 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
001e7337 1575 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
c234d831
UB
1576 in 32,64,128,256 and 512-bit */
1577 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1578 2, 2, /* SSE->integer and integer->SSE moves */
1579 20, 8, /* Gather load static, per_elt. */
1580 22, 10, /* Gather store static, per_elt. */
1581 64, /* size of l1 cache. */
1582 512, /* size of l2 cache. */
1583 64, /* size of prefetch block */
1584 6, /* number of parallel prefetches */
1585 3, /* Branch cost */
1586 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1587 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1588 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1589 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1590 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1591 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1592
1593 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1594 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1595 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1596 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1597 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1598 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1599 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1600 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1601 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1602 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1603 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1604 skylake_memcpy,
1605 skylake_memset,
1606 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1607 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1608};
64766e8d
JH
1609 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1610 very small blocks it is better to use a loop. For large blocks, a libcall can
1611 do non-temporal accesses and beat inline code considerably.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 (only the second uses rep_prefix_8_byte) -- confirm against the
 stringop_algs declaration. */
1612static stringop_algs btver1_memcpy[2] = {
1613 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1614 {-1, rep_prefix_4_byte, false}}},
1615 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1616 {-1, libcall, false}}}};
/* memset strategy table referenced by btver1_cost: loops for the smallest
 blocks, a REP prefix for mid-sized blocks and a libcall beyond the last
 listed size.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1617static stringop_algs btver1_memset[2] = {
1618 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1619 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1620 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1621 {-1, libcall, false}}}};
/* Cost table used when tuning for btver1. The initializers are positional:
 each value fills the next field of struct processor_costs, and the trailing
 comment on each line names that field. Instruction costs are expressed
 with COSTS_N_INSNS, i.e. relative to an add. */
1622const struct processor_costs btver1_cost = {
1623 COSTS_N_INSNS (1), /* cost of an add instruction */
1624 COSTS_N_INSNS (2), /* cost of a lea instruction */
1625 COSTS_N_INSNS (1), /* variable shift costs */
1626 COSTS_N_INSNS (1), /* constant shift costs */
1627 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1628 COSTS_N_INSNS (4), /* HI */
1629 COSTS_N_INSNS (3), /* SI */
1630 COSTS_N_INSNS (4), /* DI */
1631 COSTS_N_INSNS (5)}, /* other */
1632 0, /* cost of multiply per each bit set */
1633 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1634 COSTS_N_INSNS (35), /* HI */
1635 COSTS_N_INSNS (51), /* SI */
1636 COSTS_N_INSNS (83), /* DI */
1637 COSTS_N_INSNS (83)}, /* other */
1638 COSTS_N_INSNS (1), /* cost of movsx */
1639 COSTS_N_INSNS (1), /* cost of movzx */
1640 8, /* "large" insn */
1641 9, /* MOVE_RATIO */
df41dbaf
JH
1642
1643 /* All move costs are relative to integer->integer move times 2 and thus
1644 they are latency*2. */
1645 8, /* cost for loading QImode using movzbl */
1646 {6, 8, 6}, /* cost of loading integer registers
64766e8d
JH
1647 in QImode, HImode and SImode.
1648 Relative to reg-reg move (2). */
df41dbaf 1649 {6, 8, 6}, /* cost of storing integer registers */
64766e8d 1650 4, /* cost of reg,reg fld/fst */
df41dbaf 1651 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1652 in SFmode, DFmode and XFmode */
df41dbaf 1653 {12, 12, 38}, /* cost of storing fp registers
64766e8d 1654 in SFmode, DFmode and XFmode */
df41dbaf
JH
1655 4, /* cost of moving MMX register */
1656 {10, 10}, /* cost of loading MMX registers
64766e8d 1657 in SImode and DImode */
df41dbaf 1658 {12, 12}, /* cost of storing MMX registers
64766e8d 1659 in SImode and DImode */
df41dbaf
JH
1660 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1661 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1662 in 32,64,128,256 and 512-bit */
1663 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1664 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1665 in 32,64,128,256 and 512-bit */
1666 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1667 14, 14, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1668 10, 10, /* Gather load static, per_elt. */
1669 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1670 32, /* size of l1 cache. */
1671 512, /* size of l2 cache. */
1672 64, /* size of prefetch block */
1673 100, /* number of parallel prefetches */
1674 2, /* Branch cost */
1675 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1676 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1677 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1678 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1679 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1680 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1681
c53c148c 1682 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1683 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1684 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1685 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1686 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1687 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1688 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1689 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1690 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1691 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
64766e8d
JH
1692 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1693 btver1_memcpy,
1694 btver1_memset,
f6fd8f2b
JH
1695 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1696 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
1697};
1698
/* memcpy strategy table referenced by btver2_cost; the thresholds and
 algorithms are the same as in btver1_memcpy.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1699static stringop_algs btver2_memcpy[2] = {
1700 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1701 {-1, rep_prefix_4_byte, false}}},
1702 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1703 {-1, libcall, false}}}};
/* memset strategy table referenced by btver2_cost; the thresholds and
 algorithms are the same as in btver1_memset.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1704static stringop_algs btver2_memset[2] = {
1705 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1706 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1707 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1708 {-1, libcall, false}}}};
/* Cost table used when tuning for btver2. The initializers are positional:
 each value fills the next field of struct processor_costs, and the trailing
 comment on each line names that field. Instruction costs are expressed
 with COSTS_N_INSNS, i.e. relative to an add. */
1709const struct processor_costs btver2_cost = {
1710 COSTS_N_INSNS (1), /* cost of an add instruction */
1711 COSTS_N_INSNS (2), /* cost of a lea instruction */
1712 COSTS_N_INSNS (1), /* variable shift costs */
1713 COSTS_N_INSNS (1), /* constant shift costs */
1714 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1715 COSTS_N_INSNS (4), /* HI */
1716 COSTS_N_INSNS (3), /* SI */
1717 COSTS_N_INSNS (4), /* DI */
1718 COSTS_N_INSNS (5)}, /* other */
1719 0, /* cost of multiply per each bit set */
1720 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1721 COSTS_N_INSNS (35), /* HI */
1722 COSTS_N_INSNS (51), /* SI */
1723 COSTS_N_INSNS (83), /* DI */
1724 COSTS_N_INSNS (83)}, /* other */
1725 COSTS_N_INSNS (1), /* cost of movsx */
1726 COSTS_N_INSNS (1), /* cost of movzx */
1727 8, /* "large" insn */
1728 9, /* MOVE_RATIO */
df41dbaf
JH
1729
1730 /* All move costs are relative to integer->integer move times 2 and thus
1731 they are latency*2. */
1732 8, /* cost for loading QImode using movzbl */
1733 {8, 8, 6}, /* cost of loading integer registers
64766e8d
JH
1734 in QImode, HImode and SImode.
1735 Relative to reg-reg move (2). */
df41dbaf 1736 {8, 8, 6}, /* cost of storing integer registers */
64766e8d 1737 4, /* cost of reg,reg fld/fst */
df41dbaf 1738 {12, 12, 28}, /* cost of loading fp registers
64766e8d 1739 in SFmode, DFmode and XFmode */
df41dbaf 1740 {12, 12, 38}, /* cost of storing fp registers
64766e8d 1741 in SFmode, DFmode and XFmode */
df41dbaf
JH
1742 4, /* cost of moving MMX register */
1743 {10, 10}, /* cost of loading MMX registers
64766e8d 1744 in SImode and DImode */
df41dbaf 1745 {12, 12}, /* cost of storing MMX registers
64766e8d 1746 in SImode and DImode */
df41dbaf
JH
1747 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1748 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1749 in 32,64,128,256 and 512-bit */
1750 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1751 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1752 in 32,64,128,256 and 512-bit */
1753 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1754 14, 14, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1755 10, 10, /* Gather load static, per_elt. */
1756 10, 10, /* Gather store static, per_elt. */
64766e8d
JH
1757 32, /* size of l1 cache. */
1758 2048, /* size of l2 cache. */
1759 64, /* size of prefetch block */
1760 100, /* number of parallel prefetches */
1761 2, /* Branch cost */
1762 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1763 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1764 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1765 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1766 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1767 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1768
c53c148c 1769 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1770 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1771 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1772 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1773 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1774 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1775 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1776 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1777 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1778 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
64766e8d
JH
1779 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1780 btver2_memcpy,
1781 btver2_memset,
f6fd8f2b
JH
1782 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1783 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
1784};
1785
/* memcpy strategy table referenced by pentium4_cost. Only the first entry
 carries real thresholds; the second is the DUMMY_STRINGOP_ALGS placeholder,
 which always selects a libcall.
 NOTE(review): the second entry is presumably the 64-bit one -- confirm
 against the stringop_algs declaration. */
1786static stringop_algs pentium4_memcpy[2] = {
1787 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1788 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium4_cost. Only the first entry
 carries real thresholds; the second is the DUMMY_STRINGOP_ALGS placeholder,
 which always selects a libcall.
 NOTE(review): the second entry is presumably the 64-bit one -- confirm
 against the stringop_algs declaration. */
1789static stringop_algs pentium4_memset[2] = {
1790 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1791 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1792 DUMMY_STRINGOP_ALGS};
1793
/* Cost table used when tuning for Pentium 4. The initializers are
 positional: each value fills the next field of struct processor_costs, and
 the trailing comment on each line names that field. Instruction costs are
 expressed with COSTS_N_INSNS, i.e. relative to an add. */
1794static const
1795struct processor_costs pentium4_cost = {
1796 COSTS_N_INSNS (1), /* cost of an add instruction */
1797 COSTS_N_INSNS (3), /* cost of a lea instruction */
1798 COSTS_N_INSNS (4), /* variable shift costs */
1799 COSTS_N_INSNS (4), /* constant shift costs */
1800 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1801 COSTS_N_INSNS (15), /* HI */
1802 COSTS_N_INSNS (15), /* SI */
1803 COSTS_N_INSNS (15), /* DI */
1804 COSTS_N_INSNS (15)}, /* other */
1805 0, /* cost of multiply per each bit set */
1806 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1807 COSTS_N_INSNS (56), /* HI */
1808 COSTS_N_INSNS (56), /* SI */
1809 COSTS_N_INSNS (56), /* DI */
1810 COSTS_N_INSNS (56)}, /* other */
1811 COSTS_N_INSNS (1), /* cost of movsx */
1812 COSTS_N_INSNS (1), /* cost of movzx */
1813 16, /* "large" insn */
1814 6, /* MOVE_RATIO */
df41dbaf
JH
1815
1816 /* All move costs are relative to integer->integer move times 2 and thus
1817 they are latency*2. */
1818 5, /* cost for loading QImode using movzbl */
64766e8d
JH
1819 {4, 5, 4}, /* cost of loading integer registers
1820 in QImode, HImode and SImode.
1821 Relative to reg-reg move (2). */
1822 {2, 3, 2}, /* cost of storing integer registers */
df41dbaf
JH
1823 12, /* cost of reg,reg fld/fst */
1824 {14, 14, 14}, /* cost of loading fp registers
64766e8d 1825 in SFmode, DFmode and XFmode */
df41dbaf 1826 {14, 14, 14}, /* cost of storing fp registers
64766e8d 1827 in SFmode, DFmode and XFmode */
df41dbaf
JH
1828 12, /* cost of moving MMX register */
1829 {16, 16}, /* cost of loading MMX registers
64766e8d 1830 in SImode and DImode */
df41dbaf 1831 {16, 16}, /* cost of storing MMX registers
64766e8d 1832 in SImode and DImode */
df41dbaf
JH
1833 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1834 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1835 in 32,64,128,256 and 512-bit */
1836 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1837 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1838 in 32,64,128,256 and 512-bit */
1839 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1840 20, 12, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1841 16, 16, /* Gather load static, per_elt. */
1842 16, 16, /* Gather store static, per_elt. */
64766e8d
JH
1843 8, /* size of l1 cache. */
1844 256, /* size of l2 cache. */
1845 64, /* size of prefetch block */
1846 6, /* number of parallel prefetches */
1847 2, /* Branch cost */
1848 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1849 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1850 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1851 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1852 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1853 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
6065f444 1854
c53c148c 1855 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1856 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1857 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1858 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1859 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1860 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1861 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1862 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1863 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1864 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
64766e8d
JH
1865 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1866 pentium4_memcpy,
1867 pentium4_memset,
f6fd8f2b
JH
1868 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1869 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
1870};
1871
/* memcpy strategy table referenced by nocona_cost. The second entry
 switches from an 8-byte REP prefix back to an unrolled loop for blocks up
 to 100000 before falling back to a libcall.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1872static stringop_algs nocona_memcpy[2] = {
1873 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1874 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1875 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1876
/* memset strategy table referenced by nocona_cost: loops for the smallest
 blocks, a REP prefix for mid-sized blocks and a libcall beyond the last
 listed size.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1877static stringop_algs nocona_memset[2] = {
1878 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1879 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1880 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1881 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1882
/* Cost table used when tuning for Nocona. The initializers are positional:
 each value fills the next field of struct processor_costs, and the trailing
 comment on each line names that field. Instruction costs are expressed
 with COSTS_N_INSNS, i.e. relative to an add. */
1883static const
1884struct processor_costs nocona_cost = {
1885 COSTS_N_INSNS (1), /* cost of an add instruction */
1886 COSTS_N_INSNS (1), /* cost of a lea instruction */
1887 COSTS_N_INSNS (1), /* variable shift costs */
1888 COSTS_N_INSNS (1), /* constant shift costs */
1889 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1890 COSTS_N_INSNS (10), /* HI */
1891 COSTS_N_INSNS (10), /* SI */
1892 COSTS_N_INSNS (10), /* DI */
1893 COSTS_N_INSNS (10)}, /* other */
1894 0, /* cost of multiply per each bit set */
1895 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1896 COSTS_N_INSNS (66), /* HI */
1897 COSTS_N_INSNS (66), /* SI */
1898 COSTS_N_INSNS (66), /* DI */
1899 COSTS_N_INSNS (66)}, /* other */
1900 COSTS_N_INSNS (1), /* cost of movsx */
1901 COSTS_N_INSNS (1), /* cost of movzx */
1902 16, /* "large" insn */
1903 17, /* MOVE_RATIO */
df41dbaf
JH
1904
1905 /* All move costs are relative to integer->integer move times 2 and thus
1906 they are latency*2. */
64766e8d
JH
1907 4, /* cost for loading QImode using movzbl */
1908 {4, 4, 4}, /* cost of loading integer registers
1909 in QImode, HImode and SImode.
1910 Relative to reg-reg move (2). */
1911 {4, 4, 4}, /* cost of storing integer registers */
df41dbaf
JH
1912 12, /* cost of reg,reg fld/fst */
1913 {14, 14, 14}, /* cost of loading fp registers
64766e8d 1914 in SFmode, DFmode and XFmode */
df41dbaf 1915 {14, 14, 14}, /* cost of storing fp registers
64766e8d 1916 in SFmode, DFmode and XFmode */
df41dbaf 1917 14, /* cost of moving MMX register */
64766e8d
JH
1918 {12, 12}, /* cost of loading MMX registers
1919 in SImode and DImode */
1920 {12, 12}, /* cost of storing MMX registers
1921 in SImode and DImode */
df41dbaf
JH
1922 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
1923 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
1924 in 32,64,128,256 and 512-bit */
1925 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
1926 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
1927 in 32,64,128,256 and 512-bit */
1928 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
1929 20, 12, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
1930 12, 12, /* Gather load static, per_elt. */
1931 12, 12, /* Gather store static, per_elt. */
64766e8d
JH
1932 8, /* size of l1 cache. */
1933 1024, /* size of l2 cache. */
1934 64, /* size of prefetch block */
1935 8, /* number of parallel prefetches */
1936 1, /* Branch cost */
1937 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1938 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1939 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1940 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1941 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1942 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
6065f444 1943
c53c148c 1944 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1945 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1946 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
1947 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
c53c148c
JH
1948 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
1949 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
6065f444
JH
1950 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
1951 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
1952 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
1953 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
64766e8d
JH
1954 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1955 nocona_memcpy,
1956 nocona_memset,
f6fd8f2b
JH
1957 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1958 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
1959};
1960
/* memcpy strategy table referenced by atom_cost: a loop for the smallest
 blocks, then REP prefixes; only the second entry ends in a libcall.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1961static stringop_algs atom_memcpy[2] = {
1962 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1963 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1964 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table referenced by atom_cost: loops for the smallest
 blocks, a REP prefix for mid-sized blocks and a libcall beyond the last
 listed size.
 NOTE(review): the two entries presumably cover 32-bit and 64-bit code
 -- confirm against the stringop_algs declaration. */
1965static stringop_algs atom_memset[2] = {
1966 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1967 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1968 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1969 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table used when tuning for Atom. The initializers are positional:
 each value fills the next field of struct processor_costs, and the trailing
 comment on each line names that field. Instruction costs are expressed
 with COSTS_N_INSNS, i.e. relative to an add. */
1970static const
1971struct processor_costs atom_cost = {
1972 COSTS_N_INSNS (1), /* cost of an add instruction */
1973 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1974 COSTS_N_INSNS (1), /* variable shift costs */
1975 COSTS_N_INSNS (1), /* constant shift costs */
1976 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1977 COSTS_N_INSNS (4), /* HI */
1978 COSTS_N_INSNS (3), /* SI */
1979 COSTS_N_INSNS (4), /* DI */
1980 COSTS_N_INSNS (2)}, /* other */
1981 0, /* cost of multiply per each bit set */
1982 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1983 COSTS_N_INSNS (26), /* HI */
1984 COSTS_N_INSNS (42), /* SI */
1985 COSTS_N_INSNS (74), /* DI */
1986 COSTS_N_INSNS (74)}, /* other */
1987 COSTS_N_INSNS (1), /* cost of movsx */
1988 COSTS_N_INSNS (1), /* cost of movzx */
1989 8, /* "large" insn */
1990 17, /* MOVE_RATIO */
df41dbaf
JH
1991
1992 /* All move costs are relative to integer->integer move times 2 and thus
1993 they are latency*2. */
1994 6, /* cost for loading QImode using movzbl */
1995 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
1996 in QImode, HImode and SImode.
1997 Relative to reg-reg move (2). */
df41dbaf 1998 {6, 6, 6}, /* cost of storing integer registers */
64766e8d 1999 4, /* cost of reg,reg fld/fst */
df41dbaf 2000 {6, 6, 18}, /* cost of loading fp registers
64766e8d 2001 in SFmode, DFmode and XFmode */
df41dbaf 2002 {14, 14, 24}, /* cost of storing fp registers
64766e8d
JH
2003 in SFmode, DFmode and XFmode */
2004 2, /* cost of moving MMX register */
2005 {8, 8}, /* cost of loading MMX registers
2006 in SImode and DImode */
df41dbaf 2007 {10, 10}, /* cost of storing MMX registers
64766e8d 2008 in SImode and DImode */
df41dbaf
JH
2009 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2010 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2011 in 32,64,128,256 and 512-bit */
2012 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2013 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2014 in 32,64,128,256 and 512-bit */
2015 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2016 8, 6, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
2017 8, 8, /* Gather load static, per_elt. */
2018 8, 8, /* Gather store static, per_elt. */
64766e8d
JH
2019 32, /* size of l1 cache. */
2020 256, /* size of l2 cache. */
2021 64, /* size of prefetch block */
2022 6, /* number of parallel prefetches */
2023 3, /* Branch cost */
2024 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2025 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2026 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2027 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2028 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2029 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2030
c53c148c 2031 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2032 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2033 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2034 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
2035 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2036 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
2037 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
2038 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
2039 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
2040 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
64766e8d
JH
2041 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2042 atom_memcpy,
2043 atom_memset,
f6fd8f2b
JH
2044 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2045 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
2046};
2047
/* memcpy expansion strategy table used by slm_cost.  Each of the two
   elements is a leading algorithm followed by a list of
   {size bound, algorithm, flag} entries ending with a bound of -1.
   NOTE(review): presumably the leading algorithm is the one used when the
   size is not known, and element [0] applies to 32-bit code and [1] to
   64-bit code (only [1] uses rep_prefix_8_byte) -- confirm against the
   stringop_algs declaration, which is not visible here.  */
2048static stringop_algs slm_memcpy[2] = {
2049 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2050 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2051 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table used by slm_cost.  Same layout as the
   memcpy tables: a leading algorithm, then {size bound, algorithm, flag}
   entries ending with a bound of -1.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2052static stringop_algs slm_memset[2] = {
2053 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2054 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2055 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2056 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation cost table slm_cost.  The entries are positional initializers
   of struct processor_costs (declared outside this view), so their order
   must not change; the comment beside each entry names what it costs.
   String operations use slm_memcpy and slm_memset.
   NOTE(review): "slm" presumably stands for Silvermont -- confirm.  */
2057static const
2058struct processor_costs slm_cost = {
2059 COSTS_N_INSNS (1), /* cost of an add instruction */
2060 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2061 COSTS_N_INSNS (1), /* variable shift costs */
2062 COSTS_N_INSNS (1), /* constant shift costs */
2063 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2064 COSTS_N_INSNS (3), /* HI */
2065 COSTS_N_INSNS (3), /* SI */
2066 COSTS_N_INSNS (4), /* DI */
2067 COSTS_N_INSNS (2)}, /* other */
2068 0, /* cost of multiply per each bit set */
2069 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2070 COSTS_N_INSNS (26), /* HI */
2071 COSTS_N_INSNS (42), /* SI */
2072 COSTS_N_INSNS (74), /* DI */
2073 COSTS_N_INSNS (74)}, /* other */
2074 COSTS_N_INSNS (1), /* cost of movsx */
2075 COSTS_N_INSNS (1), /* cost of movzx */
2076 8, /* "large" insn */
2077 17, /* MOVE_RATIO */
df41dbaf
JH
2078
2079 /* All move costs are relative to integer->integer move times 2 and thus
2080 they are latency*2. */
2081 8, /* cost for loading QImode using movzbl */
2082 {8, 8, 8}, /* cost of loading integer registers
64766e8d
JH
2083 in QImode, HImode and SImode.
2084 Relative to reg-reg move (2). */
df41dbaf
JH
2085 {6, 6, 6}, /* cost of storing integer registers */
2086 2, /* cost of reg,reg fld/fst */
2087 {8, 8, 18}, /* cost of loading fp registers
64766e8d 2088 in SFmode, DFmode and XFmode */
df41dbaf 2089 {6, 6, 18}, /* cost of storing fp registers
64766e8d
JH
2090 in SFmode, DFmode and XFmode */
2091 2, /* cost of moving MMX register */
2092 {8, 8}, /* cost of loading MMX registers
2093 in SImode and DImode */
df41dbaf 2094 {6, 6}, /* cost of storing MMX registers
64766e8d 2095 in SImode and DImode */
df41dbaf
JH
2096 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2097 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2098 in 32,64,128,256 and 512-bit */
2099 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2100 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2101 in 32,64,128,256 and 512-bit */
2102 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2103 8, 6, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
2104 8, 8, /* Gather load static, per_elt. */
2105 8, 8, /* Gather store static, per_elt. */
64766e8d
JH
2106 32, /* size of l1 cache. */
2107 256, /* size of l2 cache. */
2108 64, /* size of prefetch block */
2109 6, /* number of parallel prefetches */
2110 3, /* Branch cost */
2111 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2112 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2113 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2114 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2115 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2116 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2117
c53c148c 2118 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2119 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2120 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2121 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
2122 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2123 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
2124 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2125 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2126 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2127 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
64766e8d
JH
2128 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2129 slm_memcpy,
2130 slm_memset,
f6fd8f2b
JH
2131 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2132 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
2133};
2134
/* memcpy expansion strategy table used by intel_cost.  Its entries are
   identical to those of slm_memcpy: a leading algorithm, then
   {size bound, algorithm, flag} entries ending with a bound of -1.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2135static stringop_algs intel_memcpy[2] = {
2136 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2137 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2138 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategy table used by intel_cost.  Its entries are
   identical to those of slm_memset: a leading algorithm, then
   {size bound, algorithm, flag} entries ending with a bound of -1.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2139static stringop_algs intel_memset[2] = {
2140 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2141 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2142 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2143 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation cost table intel_cost.  The entries are positional initializers
   of struct processor_costs (declared outside this view), so their order
   must not change; the comment beside each entry names what it costs.
   String operations use intel_memcpy and intel_memset.
   NOTE(review): presumably selected by the "intel" tuning target --
   confirm.  */
2144static const
2145struct processor_costs intel_cost = {
2146 COSTS_N_INSNS (1), /* cost of an add instruction */
2147 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2148 COSTS_N_INSNS (1), /* variable shift costs */
2149 COSTS_N_INSNS (1), /* constant shift costs */
2150 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2151 COSTS_N_INSNS (3), /* HI */
2152 COSTS_N_INSNS (3), /* SI */
2153 COSTS_N_INSNS (4), /* DI */
2154 COSTS_N_INSNS (2)}, /* other */
2155 0, /* cost of multiply per each bit set */
2156 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2157 COSTS_N_INSNS (26), /* HI */
2158 COSTS_N_INSNS (42), /* SI */
2159 COSTS_N_INSNS (74), /* DI */
2160 COSTS_N_INSNS (74)}, /* other */
2161 COSTS_N_INSNS (1), /* cost of movsx */
2162 COSTS_N_INSNS (1), /* cost of movzx */
2163 8, /* "large" insn */
2164 17, /* MOVE_RATIO */
df41dbaf
JH
2165
2166 /* All move costs are relative to integer->integer move times 2 and thus
2167 they are latency*2. */
af863030 2168 6, /* cost for loading QImode using movzbl */
64766e8d
JH
2169 {4, 4, 4}, /* cost of loading integer registers
2170 in QImode, HImode and SImode.
2171 Relative to reg-reg move (2). */
af863030
JH
2172 {6, 6, 6}, /* cost of storing integer registers */
2173 2, /* cost of reg,reg fld/fst */
2174 {6, 6, 8}, /* cost of loading fp registers
64766e8d 2175 in SFmode, DFmode and XFmode */
af863030 2176 {6, 6, 10}, /* cost of storing fp registers
64766e8d
JH
2177 in SFmode, DFmode and XFmode */
2178 2, /* cost of moving MMX register */
af863030 2179 {6, 6}, /* cost of loading MMX registers
64766e8d 2180 in SImode and DImode */
af863030 2181 {6, 6}, /* cost of storing MMX registers
64766e8d 2182 in SImode and DImode */
df41dbaf
JH
2183 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2184 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2185 in 32,64,128,256 and 512-bit */
2186 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2187 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2188 in 32,64,128,256 and 512-bit */
2189 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2190 4, 4, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
2191 6, 6, /* Gather load static, per_elt. */
2192 6, 6, /* Gather store static, per_elt. */
64766e8d
JH
2193 32, /* size of l1 cache. */
2194 256, /* size of l2 cache. */
2195 64, /* size of prefetch block */
2196 6, /* number of parallel prefetches */
2197 3, /* Branch cost */
2198 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2199 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2200 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2201 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2202 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2203 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2204
c53c148c 2205 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
6065f444
JH
2206 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2207 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2208 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
c53c148c
JH
2209 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2210 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
2211 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2212 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2213 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2214 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
64766e8d
JH
2215 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2216 intel_memcpy,
2217 intel_memset,
f6fd8f2b
JH
2218 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2219 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
2220};
2221
2222/* Generic should produce code tuned for Core-i7 (and newer chips)
2223 and btver1 (and newer chips). */
2224
/* memcpy expansion strategy table used by generic_cost.  Each element is a
   leading algorithm followed by {size bound, algorithm, flag} entries
   ending with a bound of -1.  The two elements differ only in using
   rep_prefix_4_byte versus rep_prefix_8_byte.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2225static stringop_algs generic_memcpy[2] = {
2226 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2227 {-1, libcall, false}}},
2228 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2229 {-1, libcall, false}}}};
/* memset expansion strategy table used by generic_cost.  Its entries are
   identical to those of generic_memcpy: a leading algorithm, then
   {size bound, algorithm, flag} entries ending with a bound of -1.
   NOTE(review): presumably element [0] is for 32-bit and [1] for 64-bit
   code -- confirm against the stringop_algs declaration.  */
2230static stringop_algs generic_memset[2] = {
2231 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2232 {-1, libcall, false}}},
2233 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2234 {-1, libcall, false}}}};
/* Operation cost table generic_cost.  The entries are positional
   initializers of struct processor_costs (declared outside this view), so
   their order must not change; the comment beside each entry names what it
   costs.  String operations use generic_memcpy and generic_memset.  */
2235static const
2236struct processor_costs generic_cost = {
2237 COSTS_N_INSNS (1), /* cost of an add instruction */
ef9eec0b 2238 /* Setting cost to 2 makes our current implementation of synth_mult result in
64766e8d
JH
2239 use of unnecessary temporary registers causing regression on several
2240 SPECfp benchmarks. */
2241 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2242 COSTS_N_INSNS (1), /* variable shift costs */
2243 COSTS_N_INSNS (1), /* constant shift costs */
2244 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2245 COSTS_N_INSNS (4), /* HI */
2246 COSTS_N_INSNS (3), /* SI */
2247 COSTS_N_INSNS (4), /* DI */
7c080ade 2248 COSTS_N_INSNS (4)}, /* other */
64766e8d 2249 0, /* cost of multiply per each bit set */
7c080ade
JH
2250 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2251 COSTS_N_INSNS (22), /* HI */
2252 COSTS_N_INSNS (30), /* SI */
64766e8d
JH
2253 COSTS_N_INSNS (74), /* DI */
2254 COSTS_N_INSNS (74)}, /* other */
2255 COSTS_N_INSNS (1), /* cost of movsx */
2256 COSTS_N_INSNS (1), /* cost of movzx */
2257 8, /* "large" insn */
2258 17, /* MOVE_RATIO */
df41dbaf
JH
2259
2260 /* All move costs are relative to integer->integer move times 2 and thus
2261 they are latency*2. */
d555138e
JH
2262 6, /* cost for loading QImode using movzbl */
2263 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
2264 in QImode, HImode and SImode.
2265 Relative to reg-reg move (2). */
af863030 2266 {6, 6, 6}, /* cost of storing integer registers */
64766e8d 2267 4, /* cost of reg,reg fld/fst */
af863030 2268 {6, 6, 12}, /* cost of loading fp registers
64766e8d 2269 in SFmode, DFmode and XFmode */
af863030 2270 {6, 6, 12}, /* cost of storing fp registers
64766e8d
JH
2271 in SFmode, DFmode and XFmode */
2272 2, /* cost of moving MMX register */
af863030 2273 {6, 6}, /* cost of loading MMX registers
64766e8d 2274 in SImode and DImode */
af863030 2275 {6, 6}, /* cost of storing MMX registers
64766e8d 2276 in SImode and DImode */
df41dbaf
JH
2277 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2278 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2279 in 32,64,128,256 and 512-bit */
7c080ade 2280 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
df41dbaf
JH
2281 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2282 in 32,64,128,256 and 512-bit */
7c080ade
JH
2283 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
2284 6, 6, /* SSE->integer and integer->SSE moves */
2285 18, 6, /* Gather load static, per_elt. */
2286 18, 6, /* Gather store static, per_elt. */
64766e8d
JH
2287 32, /* size of l1 cache. */
2288 512, /* size of l2 cache. */
2289 64, /* size of prefetch block */
2290 6, /* number of parallel prefetches */
2291 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2292 value is increased to perhaps more appropriate value of 5. */
2293 3, /* Branch cost */
ef9eec0b 2294 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
7c080ade 2295 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
e8e3054e 2296 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
ef9eec0b
JH
2297 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2298 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
e8e3054e 2299 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
6065f444 2300
ef9eec0b
JH
2301 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2302 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2303 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2304 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2305 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2306 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
e8e3054e
JH
2307 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2308 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2309 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2310 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
7c080ade 2311 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
64766e8d
JH
2312 generic_memcpy,
2313 generic_memset,
e8e3054e
JH
2314 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2315 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
64766e8d
JH
2316};
2317
2318/* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion strategy table used by core_cost.  Each element is a
   leading algorithm followed by {size bound, algorithm, flag} entries
   ending with a bound of -1.  Unlike the slm/intel/generic tables, the
   size-bounded entries here set the trailing flag to true.
   NOTE(review): the meaning of that flag is defined with stringop_algs,
   which is not visible here; presumably element [0] is for 32-bit and [1]
   for 64-bit code -- confirm.  */
2319static stringop_algs core_memcpy[2] = {
2320 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2321 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2322 {-1, libcall, false}}}};
/* memset expansion strategy table used by core_cost.  Each element is a
   leading algorithm followed by {size bound, algorithm, flag} entries
   ending with a bound of -1; the size-bounded entries set the trailing
   flag to true.
   NOTE(review): the meaning of that flag is defined with stringop_algs,
   which is not visible here; presumably element [0] is for 32-bit and [1]
   for 64-bit code -- confirm.  */
2323static stringop_algs core_memset[2] = {
2324 {libcall, {{6, loop_1_byte, true},
2325 {24, loop, true},
2326 {8192, rep_prefix_4_byte, true},
2327 {-1, libcall, false}}},
2328 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2329 {-1, libcall, false}}}};
2330
/* Operation cost table core_cost.  The entries are positional initializers
   of struct processor_costs (declared outside this view), so their order
   must not change; the comment beside each entry names what it costs.
   String operations use core_memcpy and core_memset.  */
2331static const
2332struct processor_costs core_cost = {
2333 COSTS_N_INSNS (1), /* cost of an add instruction */
2334 /* On all chips taken into consideration lea is 2 cycles and more. With
2335 this cost however our current implementation of synth_mult results in
2336 use of unnecessary temporary registers causing regression on several
2337 SPECfp benchmarks. */
2338 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2339 COSTS_N_INSNS (1), /* variable shift costs */
2340 COSTS_N_INSNS (1), /* constant shift costs */
2341 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2342 COSTS_N_INSNS (4), /* HI */
2343 COSTS_N_INSNS (3), /* SI */
a2ef9558
MT
2344 /* Here we tune for Sandybridge or newer. */
2345 COSTS_N_INSNS (3), /* DI */
2346 COSTS_N_INSNS (3)}, /* other */
64766e8d 2347 0, /* cost of multiply per each bit set */
02308bd3
MT
2348 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2349 model is not realistic. We compensate by increasing the latencies a bit. */
2350 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2351 COSTS_N_INSNS (11), /* HI */
2352 COSTS_N_INSNS (14), /* SI */
ffa3ce53
JH
2353 COSTS_N_INSNS (81), /* DI */
2354 COSTS_N_INSNS (81)}, /* other */
64766e8d
JH
2355 COSTS_N_INSNS (1), /* cost of movsx */
2356 COSTS_N_INSNS (1), /* cost of movzx */
2357 8, /* "large" insn */
2358 17, /* MOVE_RATIO */
df41dbaf
JH
2359
2360 /* All move costs are relative to integer->integer move times 2 and thus
2361 they are latency*2. */
ffa3ce53 2362 6, /* cost for loading QImode using movzbl */
64766e8d
JH
2363 {4, 4, 4}, /* cost of loading integer registers
2364 in QImode, HImode and SImode.
2365 Relative to reg-reg move (2). */
ffa3ce53
JH
2366 {6, 6, 6}, /* cost of storing integer registers */
2367 2, /* cost of reg,reg fld/fst */
2368 {6, 6, 8}, /* cost of loading fp registers
64766e8d 2369 in SFmode, DFmode and XFmode */
af863030 2370 {6, 6, 10}, /* cost of storing fp registers
64766e8d
JH
2371 in SFmode, DFmode and XFmode */
2372 2, /* cost of moving MMX register */
ffa3ce53 2373 {6, 6}, /* cost of loading MMX registers
64766e8d 2374 in SImode and DImode */
ffa3ce53 2375 {6, 6}, /* cost of storing MMX registers
64766e8d 2376 in SImode and DImode */
df41dbaf
JH
2377 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2378 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2379 in 32,64,128,256 and 512-bit */
2380 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2381 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2382 in 32,64,128,256 and 512-bit */
2383 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2384 2, 2, /* SSE->integer and integer->SSE moves */
a4fe6139
JH
2385 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2386 rec. throughput 6.
2387 So 5 uops statically and one uop per load. */
 /* NOTE(review): the comment above names VGATHERDPD twice with different
    figures; one of the two is presumably meant to be a different gather
    variant -- confirm.  */
2388 10, 6, /* Gather load static, per_elt. */
2389 10, 6, /* Gather store static, per_elt. */
64766e8d
JH
2390 64, /* size of l1 cache. */
2391 512, /* size of l2 cache. */
2392 64, /* size of prefetch block */
2393 6, /* number of parallel prefetches */
2394 /* FIXME perhaps more appropriate value is 5. */
2395 3, /* Branch cost */
ef9eec0b
JH
2396 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2397 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
ffa3ce53 2398 /* 10-24 */
ef9eec0b
JH
2399 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2400 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2401 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
ffa3ce53 2402 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
6065f444 2403
c53c148c 2404 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2405 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2406 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2407 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
2408 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2409 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
2410 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2411 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2412 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2413 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
64766e8d
JH
2414 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2415 core_memcpy,
2416 core_memset,
f6fd8f2b
JH
2417 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2418 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
64766e8d
JH
2419};
2420