]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/i386/x86-tune-costs.h
asan.c (create_cond_insert_point): Do not update edge count.
[thirdparty/gcc.git] / gcc / config / i386 / x86-tune-costs.h
CommitLineData
64766e8d
JH
1
2/* Processor costs (relative to an add) */
3/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
/* Size-based cost unit: COSTS_N_BYTES (2) evaluates to 4, so under the
   assumption stated above a 2-byte addition costs the same as
   COSTS_N_INSNS (1). */
4#define COSTS_N_BYTES(N) ((N) * 2)
5
/* Placeholder stringop_algs initializer that names libcall in every slot.
   The tables below use it as their second array element when only the
   first element is tuned. */
6#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
7
/* memcpy strategy table referenced by ix86_size_cost.  Both elements name
   rep_prefix_1_byte for the single {-1, ...} size bucket.
   NOTE(review): the two elements presumably correspond to 32-bit and
   64-bit code, and -1 presumably means "no upper bound"; confirm against
   the declaration of stringop_algs. */
8static stringop_algs ix86_size_memcpy[2] = {
9 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
10 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy table referenced by ix86_size_cost.  Same contents as
   ix86_size_memcpy: rep_prefix_1_byte in both elements. */
11static stringop_algs ix86_size_memset[2] = {
12 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
13 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
14
/* Cost table for tuning for size.  Instruction entries are written with
   COSTS_N_BYTES, i.e. they are size estimates rather than latencies.
   The initializer is positional: the order of entries must match the
   declaration of struct processor_costs, which is not visible here.
   Unlike the per-CPU tables below, this object is not static. */
15const
16struct processor_costs ix86_size_cost = {/* costs for tuning for size */
17 COSTS_N_BYTES (2), /* cost of an add instruction */
18 COSTS_N_BYTES (3), /* cost of a lea instruction */
19 COSTS_N_BYTES (2), /* variable shift costs */
20 COSTS_N_BYTES (3), /* constant shift costs */
21 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
22 COSTS_N_BYTES (3), /* HI */
23 COSTS_N_BYTES (3), /* SI */
24 COSTS_N_BYTES (3), /* DI */
25 COSTS_N_BYTES (5)}, /* other */
26 0, /* cost of multiply per each bit set */
27 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
28 COSTS_N_BYTES (3), /* HI */
29 COSTS_N_BYTES (3), /* SI */
30 COSTS_N_BYTES (3), /* DI */
31 COSTS_N_BYTES (5)}, /* other */
32 COSTS_N_BYTES (3), /* cost of movsx */
33 COSTS_N_BYTES (3), /* cost of movzx */
34 0, /* "large" insn */
35 2, /* MOVE_RATIO */
36 2, /* cost for loading QImode using movzbl */
37 {2, 2, 2}, /* cost of loading integer registers
38 in QImode, HImode and SImode.
39 Relative to reg-reg move (2). */
40 {2, 2, 2}, /* cost of storing integer registers */
41 2, /* cost of reg,reg fld/fst */
42 {2, 2, 2}, /* cost of loading fp registers
43 in SFmode, DFmode and XFmode */
44 {2, 2, 2}, /* cost of storing fp registers
45 in SFmode, DFmode and XFmode */
46 3, /* cost of moving MMX register */
47 {3, 3}, /* cost of loading MMX registers
48 in SImode and DImode */
49 {3, 3}, /* cost of storing MMX registers
50 in SImode and DImode */
51 3, /* cost of moving SSE register */
52 {3, 3, 3}, /* cost of loading SSE registers
53 in SImode, DImode and TImode */
54 {3, 3, 3}, /* cost of storing SSE registers
55 in SImode, DImode and TImode */
56 3, /* MMX or SSE register to integer */
57 0, /* size of l1 cache */
58 0, /* size of l2 cache */
59 0, /* size of prefetch block */
60 0, /* number of parallel prefetches */
61 2, /* Branch cost */
62 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
63 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
64 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
65 COSTS_N_BYTES (2), /* cost of FABS instruction. */
66 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
67 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
6065f444 68
c53c148c 69 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
6065f444
JH
70 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
71 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
72 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
c53c148c
JH
73 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
74 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
6065f444
JH
75 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
76 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
77 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
78 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
64766e8d
JH
79 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
80 ix86_size_memcpy,
81 ix86_size_memset,
82 1, /* scalar_stmt_cost. */
83 1, /* scalar_load_cost. */
84 1, /* scalar_store_cost. */
85 1, /* vec_stmt_cost. */
86 1, /* vec_to_scalar_cost. */
87 1, /* scalar_to_vec_cost. */
88 1, /* vec_align_load_cost. */
89 1, /* vec_unalign_load_cost. */
90 1, /* vec_store_cost. */
91 1, /* cond_taken_branch_cost. */
92 1, /* cond_not_taken_branch_cost. */
93};
94
95/* Processor costs (relative to an add) */
/* memcpy strategy table referenced by i386_cost: rep_prefix_1_byte in the
   first element, the libcall placeholder in the second. */
96static stringop_algs i386_memcpy[2] = {
97 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
98 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i386_cost; same contents as
   i386_memcpy. */
99static stringop_algs i386_memset[2] = {
100 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
101 DUMMY_STRINGOP_ALGS};
102
/* Cost table for the 386.  Instruction entries use COSTS_N_INSNS units.
   The initializer is positional: the order of entries must match the
   declaration of struct processor_costs, which is not visible here.
   All cache and prefetch entries are 0. */
103static const
104struct processor_costs i386_cost = { /* 386 specific costs */
105 COSTS_N_INSNS (1), /* cost of an add instruction */
106 COSTS_N_INSNS (1), /* cost of a lea instruction */
107 COSTS_N_INSNS (3), /* variable shift costs */
108 COSTS_N_INSNS (2), /* constant shift costs */
109 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
110 COSTS_N_INSNS (6), /* HI */
111 COSTS_N_INSNS (6), /* SI */
112 COSTS_N_INSNS (6), /* DI */
113 COSTS_N_INSNS (6)}, /* other */
114 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
115 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
116 COSTS_N_INSNS (23), /* HI */
117 COSTS_N_INSNS (23), /* SI */
118 COSTS_N_INSNS (23), /* DI */
119 COSTS_N_INSNS (23)}, /* other */
120 COSTS_N_INSNS (3), /* cost of movsx */
121 COSTS_N_INSNS (2), /* cost of movzx */
122 15, /* "large" insn */
123 3, /* MOVE_RATIO */
124 4, /* cost for loading QImode using movzbl */
125 {2, 4, 2}, /* cost of loading integer registers
126 in QImode, HImode and SImode.
127 Relative to reg-reg move (2). */
128 {2, 4, 2}, /* cost of storing integer registers */
129 2, /* cost of reg,reg fld/fst */
130 {8, 8, 8}, /* cost of loading fp registers
131 in SFmode, DFmode and XFmode */
132 {8, 8, 8}, /* cost of storing fp registers
133 in SFmode, DFmode and XFmode */
134 2, /* cost of moving MMX register */
135 {4, 8}, /* cost of loading MMX registers
136 in SImode and DImode */
137 {4, 8}, /* cost of storing MMX registers
138 in SImode and DImode */
139 2, /* cost of moving SSE register */
140 {4, 8, 16}, /* cost of loading SSE registers
141 in SImode, DImode and TImode */
142 {4, 8, 16}, /* cost of storing SSE registers
143 in SImode, DImode and TImode */
144 3, /* MMX or SSE register to integer */
145 0, /* size of l1 cache */
146 0, /* size of l2 cache */
147 0, /* size of prefetch block */
148 0, /* number of parallel prefetches */
149 1, /* Branch cost */
150 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
151 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
152 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
153 COSTS_N_INSNS (22), /* cost of FABS instruction. */
154 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
155 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
6065f444 156
c53c148c 157 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
158 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
159 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
160 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
c53c148c
JH
161 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
162 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
6065f444
JH
163 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
164 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
165 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
166 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
64766e8d
JH
167 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
168 i386_memcpy,
169 i386_memset,
170 1, /* scalar_stmt_cost. */
171 1, /* scalar_load_cost. */
172 1, /* scalar_store_cost. */
173 1, /* vec_stmt_cost. */
174 1, /* vec_to_scalar_cost. */
175 1, /* scalar_to_vec_cost. */
176 1, /* vec_align_load_cost. */
177 2, /* vec_unalign_load_cost. */
178 1, /* vec_store_cost. */
179 3, /* cond_taken_branch_cost. */
180 1, /* cond_not_taken_branch_cost. */
181};
182
/* memcpy strategy table referenced by i486_cost: rep_prefix_4_byte in the
   first element, the libcall placeholder in the second. */
183static stringop_algs i486_memcpy[2] = {
184 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
185 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i486_cost; same contents as
   i486_memcpy. */
186static stringop_algs i486_memset[2] = {
187 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
188 DUMMY_STRINGOP_ALGS};
189
/* Cost table for the 486.  Instruction entries use COSTS_N_INSNS units.
   The initializer is positional: the order of entries must match the
   declaration of struct processor_costs, which is not visible here. */
190static const
191struct processor_costs i486_cost = { /* 486 specific costs */
192 COSTS_N_INSNS (1), /* cost of an add instruction */
193 COSTS_N_INSNS (1), /* cost of a lea instruction */
194 COSTS_N_INSNS (3), /* variable shift costs */
195 COSTS_N_INSNS (2), /* constant shift costs */
196 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
197 COSTS_N_INSNS (12), /* HI */
198 COSTS_N_INSNS (12), /* SI */
199 COSTS_N_INSNS (12), /* DI */
200 COSTS_N_INSNS (12)}, /* other */
/* NOTE(review): the next entry is a raw 1, whereas i386_cost writes
   COSTS_N_INSNS (1) in the same position; confirm this is intended. */
201 1, /* cost of multiply per each bit set */
202 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
203 COSTS_N_INSNS (40), /* HI */
204 COSTS_N_INSNS (40), /* SI */
205 COSTS_N_INSNS (40), /* DI */
206 COSTS_N_INSNS (40)}, /* other */
207 COSTS_N_INSNS (3), /* cost of movsx */
208 COSTS_N_INSNS (2), /* cost of movzx */
209 15, /* "large" insn */
210 3, /* MOVE_RATIO */
211 4, /* cost for loading QImode using movzbl */
212 {2, 4, 2}, /* cost of loading integer registers
213 in QImode, HImode and SImode.
214 Relative to reg-reg move (2). */
215 {2, 4, 2}, /* cost of storing integer registers */
216 2, /* cost of reg,reg fld/fst */
217 {8, 8, 8}, /* cost of loading fp registers
218 in SFmode, DFmode and XFmode */
219 {8, 8, 8}, /* cost of storing fp registers
220 in SFmode, DFmode and XFmode */
221 2, /* cost of moving MMX register */
222 {4, 8}, /* cost of loading MMX registers
223 in SImode and DImode */
224 {4, 8}, /* cost of storing MMX registers
225 in SImode and DImode */
226 2, /* cost of moving SSE register */
227 {4, 8, 16}, /* cost of loading SSE registers
228 in SImode, DImode and TImode */
229 {4, 8, 16}, /* cost of storing SSE registers
230 in SImode, DImode and TImode */
231 3, /* MMX or SSE register to integer */
232 4, /* size of l1 cache. 486 has 8kB cache
233 shared for code and data, so 4kB is
234 not really precise. */
235 4, /* size of l2 cache */
236 0, /* size of prefetch block */
237 0, /* number of parallel prefetches */
238 1, /* Branch cost */
239 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
240 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
241 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
242 COSTS_N_INSNS (3), /* cost of FABS instruction. */
243 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
244 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
6065f444 245
c53c148c 246 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
247 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
248 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
249 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
c53c148c
JH
250 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
251 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
6065f444
JH
252 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
/* NOTE(review): DIVSD is 74 while FDIV and DIVSS in this table are 73;
   the other SSE entries here mirror their x87 counterparts.  Confirm the
   difference is intended. */
253 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
254 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
255 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
64766e8d
JH
256 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
257 i486_memcpy,
258 i486_memset,
259 1, /* scalar_stmt_cost. */
260 1, /* scalar_load_cost. */
261 1, /* scalar_store_cost. */
262 1, /* vec_stmt_cost. */
263 1, /* vec_to_scalar_cost. */
264 1, /* scalar_to_vec_cost. */
265 1, /* vec_align_load_cost. */
266 2, /* vec_unalign_load_cost. */
267 1, /* vec_store_cost. */
268 3, /* cond_taken_branch_cost. */
269 1, /* cond_not_taken_branch_cost. */
270};
271
/* memcpy strategy table referenced by both pentium_cost and lakemont_cost.
   The first element names rep_prefix_4_byte for the bucket bounded by 256
   and libcall for the final -1 bucket (presumably "up to 256 bytes" and
   "anything larger"; confirm against stringop_algs).  The second element
   is the libcall placeholder. */
272static stringop_algs pentium_memcpy[2] = {
273 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
274 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by both pentium_cost and lakemont_cost.
   The first element names rep_prefix_4_byte for the single -1 bucket; the
   second element is the libcall placeholder. */
275static stringop_algs pentium_memset[2] = {
276 {libcall, {{-1, rep_prefix_4_byte, false}}},
277 DUMMY_STRINGOP_ALGS};
278
/* Cost table for the Pentium.  Instruction entries use COSTS_N_INSNS
   units.  The initializer is positional: the order of entries must match
   the declaration of struct processor_costs, which is not visible here. */
279static const
280struct processor_costs pentium_cost = {
281 COSTS_N_INSNS (1), /* cost of an add instruction */
282 COSTS_N_INSNS (1), /* cost of a lea instruction */
283 COSTS_N_INSNS (4), /* variable shift costs */
284 COSTS_N_INSNS (1), /* constant shift costs */
285 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
286 COSTS_N_INSNS (11), /* HI */
287 COSTS_N_INSNS (11), /* SI */
288 COSTS_N_INSNS (11), /* DI */
289 COSTS_N_INSNS (11)}, /* other */
290 0, /* cost of multiply per each bit set */
291 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
292 COSTS_N_INSNS (25), /* HI */
293 COSTS_N_INSNS (25), /* SI */
294 COSTS_N_INSNS (25), /* DI */
295 COSTS_N_INSNS (25)}, /* other */
296 COSTS_N_INSNS (3), /* cost of movsx */
297 COSTS_N_INSNS (2), /* cost of movzx */
298 8, /* "large" insn */
299 6, /* MOVE_RATIO */
300 6, /* cost for loading QImode using movzbl */
301 {2, 4, 2}, /* cost of loading integer registers
302 in QImode, HImode and SImode.
303 Relative to reg-reg move (2). */
304 {2, 4, 2}, /* cost of storing integer registers */
305 2, /* cost of reg,reg fld/fst */
306 {2, 2, 6}, /* cost of loading fp registers
307 in SFmode, DFmode and XFmode */
308 {4, 4, 6}, /* cost of storing fp registers
309 in SFmode, DFmode and XFmode */
310 8, /* cost of moving MMX register */
311 {8, 8}, /* cost of loading MMX registers
312 in SImode and DImode */
313 {8, 8}, /* cost of storing MMX registers
314 in SImode and DImode */
315 2, /* cost of moving SSE register */
316 {4, 8, 16}, /* cost of loading SSE registers
317 in SImode, DImode and TImode */
318 {4, 8, 16}, /* cost of storing SSE registers
319 in SImode, DImode and TImode */
320 3, /* MMX or SSE register to integer */
321 8, /* size of l1 cache. */
322 8, /* size of l2 cache */
323 0, /* size of prefetch block */
324 0, /* number of parallel prefetches */
325 2, /* Branch cost */
326 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
327 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
328 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
329 COSTS_N_INSNS (1), /* cost of FABS instruction. */
330 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
331 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
6065f444 332
c53c148c 333 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
334 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
335 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
336 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
c53c148c
JH
337 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
338 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
339 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
340 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
341 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
342 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
64766e8d
JH
343 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
344 pentium_memcpy,
345 pentium_memset,
346 1, /* scalar_stmt_cost. */
347 1, /* scalar_load_cost. */
348 1, /* scalar_store_cost. */
349 1, /* vec_stmt_cost. */
350 1, /* vec_to_scalar_cost. */
351 1, /* scalar_to_vec_cost. */
352 1, /* vec_align_load_cost. */
353 2, /* vec_unalign_load_cost. */
354 1, /* vec_store_cost. */
355 3, /* cond_taken_branch_cost. */
356 1, /* cond_not_taken_branch_cost. */
357};
358
/* Cost table for Lakemont.  It has no stringop tables of its own and
   reuses pentium_memcpy and pentium_memset.  The lea entry is
   COSTS_N_INSNS (1) + 1, i.e. marginally more than an add.  The
   initializer is positional: the order of entries must match the
   declaration of struct processor_costs, which is not visible here. */
359static const
360struct processor_costs lakemont_cost = {
361 COSTS_N_INSNS (1), /* cost of an add instruction */
362 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
363 COSTS_N_INSNS (1), /* variable shift costs */
364 COSTS_N_INSNS (1), /* constant shift costs */
365 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
366 COSTS_N_INSNS (11), /* HI */
367 COSTS_N_INSNS (11), /* SI */
368 COSTS_N_INSNS (11), /* DI */
369 COSTS_N_INSNS (11)}, /* other */
370 0, /* cost of multiply per each bit set */
371 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
372 COSTS_N_INSNS (25), /* HI */
373 COSTS_N_INSNS (25), /* SI */
374 COSTS_N_INSNS (25), /* DI */
375 COSTS_N_INSNS (25)}, /* other */
376 COSTS_N_INSNS (3), /* cost of movsx */
377 COSTS_N_INSNS (2), /* cost of movzx */
378 8, /* "large" insn */
379 17, /* MOVE_RATIO */
380 6, /* cost for loading QImode using movzbl */
381 {2, 4, 2}, /* cost of loading integer registers
382 in QImode, HImode and SImode.
383 Relative to reg-reg move (2). */
384 {2, 4, 2}, /* cost of storing integer registers */
385 2, /* cost of reg,reg fld/fst */
386 {2, 2, 6}, /* cost of loading fp registers
387 in SFmode, DFmode and XFmode */
388 {4, 4, 6}, /* cost of storing fp registers
389 in SFmode, DFmode and XFmode */
390 8, /* cost of moving MMX register */
391 {8, 8}, /* cost of loading MMX registers
392 in SImode and DImode */
393 {8, 8}, /* cost of storing MMX registers
394 in SImode and DImode */
395 2, /* cost of moving SSE register */
396 {4, 8, 16}, /* cost of loading SSE registers
397 in SImode, DImode and TImode */
398 {4, 8, 16}, /* cost of storing SSE registers
399 in SImode, DImode and TImode */
400 3, /* MMX or SSE register to integer */
401 8, /* size of l1 cache. */
402 8, /* size of l2 cache */
403 0, /* size of prefetch block */
404 0, /* number of parallel prefetches */
405 2, /* Branch cost */
406 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
407 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
408 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
409 COSTS_N_INSNS (1), /* cost of FABS instruction. */
410 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
411 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
6065f444 412
c53c148c 413 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
414 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
415 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
416 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
417 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
418 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
6065f444
JH
419 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
420 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
421 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
422 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
64766e8d
JH
423 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
424 pentium_memcpy,
425 pentium_memset,
426 1, /* scalar_stmt_cost. */
427 1, /* scalar_load_cost. */
428 1, /* scalar_store_cost. */
429 1, /* vec_stmt_cost. */
430 1, /* vec_to_scalar_cost. */
431 1, /* scalar_to_vec_cost. */
432 1, /* vec_align_load_cost. */
433 2, /* vec_unalign_load_cost. */
434 1, /* vec_store_cost. */
435 3, /* cond_taken_branch_cost. */
436 1, /* cond_not_taken_branch_cost. */
437};
438
439/* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
440 (we ensure the alignment). For small blocks an inline loop is still a
441 noticeable win; for bigger blocks either rep movsl or rep movsb is
442 the way to go. Rep movsb apparently has a more expensive startup time in
443 the CPU, but after 4K the difference is down in the noise. */
/* memcpy strategy table referenced by pentiumpro_cost.  The first element
   lists four buckets bounded by 128, 1024, 8192 and -1, naming loop,
   unrolled_loop, rep_prefix_4_byte and rep_prefix_1_byte respectively.
   The second element is the libcall placeholder. */
444static stringop_algs pentiumpro_memcpy[2] = {
445 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
446 {8192, rep_prefix_4_byte, false},
447 {-1, rep_prefix_1_byte, false}}},
448 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentiumpro_cost.  The first element
   lists three buckets bounded by 1024, 8192 and -1, naming unrolled_loop,
   rep_prefix_4_byte and libcall respectively.  The second element is the
   libcall placeholder. */
449static stringop_algs pentiumpro_memset[2] = {
450 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
451 {8192, rep_prefix_4_byte, false},
452 {-1, libcall, false}}},
453 DUMMY_STRINGOP_ALGS};
/* Cost table for the PentiumPro.  Instruction entries use COSTS_N_INSNS
   units.  The initializer is positional: the order of entries must match
   the declaration of struct processor_costs, which is not visible here. */
454static const
455struct processor_costs pentiumpro_cost = {
456 COSTS_N_INSNS (1), /* cost of an add instruction */
457 COSTS_N_INSNS (1), /* cost of a lea instruction */
458 COSTS_N_INSNS (1), /* variable shift costs */
459 COSTS_N_INSNS (1), /* constant shift costs */
460 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
461 COSTS_N_INSNS (4), /* HI */
462 COSTS_N_INSNS (4), /* SI */
463 COSTS_N_INSNS (4), /* DI */
464 COSTS_N_INSNS (4)}, /* other */
465 0, /* cost of multiply per each bit set */
466 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
467 COSTS_N_INSNS (17), /* HI */
468 COSTS_N_INSNS (17), /* SI */
469 COSTS_N_INSNS (17), /* DI */
470 COSTS_N_INSNS (17)}, /* other */
471 COSTS_N_INSNS (1), /* cost of movsx */
472 COSTS_N_INSNS (1), /* cost of movzx */
473 8, /* "large" insn */
474 6, /* MOVE_RATIO */
475 2, /* cost for loading QImode using movzbl */
476 {4, 4, 4}, /* cost of loading integer registers
477 in QImode, HImode and SImode.
478 Relative to reg-reg move (2). */
479 {2, 2, 2}, /* cost of storing integer registers */
480 2, /* cost of reg,reg fld/fst */
481 {2, 2, 6}, /* cost of loading fp registers
482 in SFmode, DFmode and XFmode */
483 {4, 4, 6}, /* cost of storing fp registers
484 in SFmode, DFmode and XFmode */
485 2, /* cost of moving MMX register */
486 {2, 2}, /* cost of loading MMX registers
487 in SImode and DImode */
488 {2, 2}, /* cost of storing MMX registers
489 in SImode and DImode */
490 2, /* cost of moving SSE register */
491 {2, 2, 8}, /* cost of loading SSE registers
492 in SImode, DImode and TImode */
493 {2, 2, 8}, /* cost of storing SSE registers
494 in SImode, DImode and TImode */
495 3, /* MMX or SSE register to integer */
496 8, /* size of l1 cache. */
497 256, /* size of l2 cache */
498 32, /* size of prefetch block */
499 6, /* number of parallel prefetches */
500 2, /* Branch cost */
501 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
502 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
503 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
504 COSTS_N_INSNS (2), /* cost of FABS instruction. */
505 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
506 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
6065f444 507
c53c148c 508 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
509 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
510 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
511 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
512 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
513 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
6065f444
JH
514 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
515 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
516 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
517 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
64766e8d
JH
518 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
519 pentiumpro_memcpy,
520 pentiumpro_memset,
521 1, /* scalar_stmt_cost. */
522 1, /* scalar_load_cost. */
523 1, /* scalar_store_cost. */
524 1, /* vec_stmt_cost. */
525 1, /* vec_to_scalar_cost. */
526 1, /* scalar_to_vec_cost. */
527 1, /* vec_align_load_cost. */
528 2, /* vec_unalign_load_cost. */
529 1, /* vec_store_cost. */
530 3, /* cond_taken_branch_cost. */
531 1, /* cond_not_taken_branch_cost. */
532};
533
/* memcpy strategy table referenced by geode_cost.  The first element names
   rep_prefix_4_byte for the bucket bounded by 256 and libcall for the
   final -1 bucket; the second element is the libcall placeholder. */
534static stringop_algs geode_memcpy[2] = {
535 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
536 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by geode_cost; same contents as
   geode_memcpy. */
537static stringop_algs geode_memset[2] = {
538 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
539 DUMMY_STRINGOP_ALGS};
/* Cost table for Geode.  Unlike most tables here, the multiply and divide
   costs differ per mode (QI, HI, SI, DI, other).  The initializer is
   positional: the order of entries must match the declaration of
   struct processor_costs, which is not visible here. */
540static const
541struct processor_costs geode_cost = {
542 COSTS_N_INSNS (1), /* cost of an add instruction */
543 COSTS_N_INSNS (1), /* cost of a lea instruction */
544 COSTS_N_INSNS (2), /* variable shift costs */
545 COSTS_N_INSNS (1), /* constant shift costs */
546 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
547 COSTS_N_INSNS (4), /* HI */
548 COSTS_N_INSNS (7), /* SI */
549 COSTS_N_INSNS (7), /* DI */
550 COSTS_N_INSNS (7)}, /* other */
551 0, /* cost of multiply per each bit set */
552 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
553 COSTS_N_INSNS (23), /* HI */
554 COSTS_N_INSNS (39), /* SI */
555 COSTS_N_INSNS (39), /* DI */
556 COSTS_N_INSNS (39)}, /* other */
557 COSTS_N_INSNS (1), /* cost of movsx */
558 COSTS_N_INSNS (1), /* cost of movzx */
559 8, /* "large" insn */
560 4, /* MOVE_RATIO */
561 1, /* cost for loading QImode using movzbl */
562 {1, 1, 1}, /* cost of loading integer registers
563 in QImode, HImode and SImode.
564 Relative to reg-reg move (2). */
565 {1, 1, 1}, /* cost of storing integer registers */
566 1, /* cost of reg,reg fld/fst */
567 {1, 1, 1}, /* cost of loading fp registers
568 in SFmode, DFmode and XFmode */
569 {4, 6, 6}, /* cost of storing fp registers
570 in SFmode, DFmode and XFmode */
571
572 2, /* cost of moving MMX register */
573 {2, 2}, /* cost of loading MMX registers
574 in SImode and DImode */
575 {2, 2}, /* cost of storing MMX registers
576 in SImode and DImode */
577 2, /* cost of moving SSE register */
578 {2, 2, 8}, /* cost of loading SSE registers
579 in SImode, DImode and TImode */
580 {2, 2, 8}, /* cost of storing SSE registers
581 in SImode, DImode and TImode */
582 3, /* MMX or SSE register to integer */
583 64, /* size of l1 cache. */
584 128, /* size of l2 cache. */
585 32, /* size of prefetch block */
586 1, /* number of parallel prefetches */
587 1, /* Branch cost */
588 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
589 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
590 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
591 COSTS_N_INSNS (1), /* cost of FABS instruction. */
592 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
593 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
6065f444 594
c53c148c 595 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
596 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
597 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
598 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
c53c148c
JH
599 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
600 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
6065f444
JH
601 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
602 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
603 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
604 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
64766e8d
JH
605 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
606 geode_memcpy,
607 geode_memset,
608 1, /* scalar_stmt_cost. */
609 1, /* scalar_load_cost. */
610 1, /* scalar_store_cost. */
611 1, /* vec_stmt_cost. */
612 1, /* vec_to_scalar_cost. */
613 1, /* scalar_to_vec_cost. */
614 1, /* vec_align_load_cost. */
615 2, /* vec_unalign_load_cost. */
616 1, /* vec_store_cost. */
617 3, /* cond_taken_branch_cost. */
618 1, /* cond_not_taken_branch_cost. */
619};
620
/* memcpy strategy table referenced by k6_cost.  The first element names
   rep_prefix_4_byte for the bucket bounded by 256 and libcall for the
   final -1 bucket; the second element is the libcall placeholder. */
621static stringop_algs k6_memcpy[2] = {
622 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
623 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by k6_cost; same contents as
   k6_memcpy. */
624static stringop_algs k6_memset[2] = {
625 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
626 DUMMY_STRINGOP_ALGS};
/* Cost table for the K6.  Instruction entries use COSTS_N_INSNS units.
   The initializer is positional: the order of entries must match the
   declaration of struct processor_costs, which is not visible here. */
627static const
628struct processor_costs k6_cost = {
629 COSTS_N_INSNS (1), /* cost of an add instruction */
630 COSTS_N_INSNS (2), /* cost of a lea instruction */
631 COSTS_N_INSNS (1), /* variable shift costs */
632 COSTS_N_INSNS (1), /* constant shift costs */
633 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
634 COSTS_N_INSNS (3), /* HI */
635 COSTS_N_INSNS (3), /* SI */
636 COSTS_N_INSNS (3), /* DI */
637 COSTS_N_INSNS (3)}, /* other */
638 0, /* cost of multiply per each bit set */
639 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
640 COSTS_N_INSNS (18), /* HI */
641 COSTS_N_INSNS (18), /* SI */
642 COSTS_N_INSNS (18), /* DI */
643 COSTS_N_INSNS (18)}, /* other */
644 COSTS_N_INSNS (2), /* cost of movsx */
645 COSTS_N_INSNS (2), /* cost of movzx */
646 8, /* "large" insn */
647 4, /* MOVE_RATIO */
648 3, /* cost for loading QImode using movzbl */
649 {4, 5, 4}, /* cost of loading integer registers
650 in QImode, HImode and SImode.
651 Relative to reg-reg move (2). */
652 {2, 3, 2}, /* cost of storing integer registers */
653 4, /* cost of reg,reg fld/fst */
654 {6, 6, 6}, /* cost of loading fp registers
655 in SFmode, DFmode and XFmode */
656 {4, 4, 4}, /* cost of storing fp registers
657 in SFmode, DFmode and XFmode */
658 2, /* cost of moving MMX register */
659 {2, 2}, /* cost of loading MMX registers
660 in SImode and DImode */
661 {2, 2}, /* cost of storing MMX registers
662 in SImode and DImode */
663 2, /* cost of moving SSE register */
664 {2, 2, 8}, /* cost of loading SSE registers
665 in SImode, DImode and TImode */
666 {2, 2, 8}, /* cost of storing SSE registers
667 in SImode, DImode and TImode */
668 6, /* MMX or SSE register to integer */
669 32, /* size of l1 cache. */
670 32, /* size of l2 cache. Some models
671 have integrated l2 cache, but
672 optimizing for k6 is not important
673 enough to worry about that. */
674 32, /* size of prefetch block */
675 1, /* number of parallel prefetches */
676 1, /* Branch cost */
677 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
678 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
679 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
680 COSTS_N_INSNS (2), /* cost of FABS instruction. */
681 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
682 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
6065f444 683
c53c148c 684 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
685 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
686 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
687 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
c53c148c
JH
688 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
689 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
6065f444
JH
690 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
691 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
692 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
693 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
64766e8d
JH
694 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
695 k6_memcpy,
696 k6_memset,
697 1, /* scalar_stmt_cost. */
698 1, /* scalar_load_cost. */
699 1, /* scalar_store_cost. */
700 1, /* vec_stmt_cost. */
701 1, /* vec_to_scalar_cost. */
702 1, /* scalar_to_vec_cost. */
703 1, /* vec_align_load_cost. */
704 2, /* vec_unalign_load_cost. */
705 1, /* vec_store_cost. */
706 3, /* cond_taken_branch_cost. */
707 1, /* cond_not_taken_branch_cost. */
708};
709
710/* For some reason, Athlon deals better with REP prefix (relative to loops)
711 compared to K8. Alignment becomes important after 8 bytes for memcpy and
712 128 bytes for memset. */
/* memcpy strategy table referenced by athlon_cost.  The first element
   names rep_prefix_4_byte for the bucket bounded by 2048 and libcall for
   the final -1 bucket; the second element is the libcall placeholder. */
713static stringop_algs athlon_memcpy[2] = {
714 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
715 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by athlon_cost; same contents as
   athlon_memcpy. */
716static stringop_algs athlon_memset[2] = {
717 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
718 DUMMY_STRINGOP_ALGS};
/* Cost table for the Athlon.  Divide costs grow with the operand mode
   (QI, HI, SI, DI).  The initializer is positional: the order of entries
   must match the declaration of struct processor_costs, which is not
   visible here. */
719static const
720struct processor_costs athlon_cost = {
721 COSTS_N_INSNS (1), /* cost of an add instruction */
722 COSTS_N_INSNS (2), /* cost of a lea instruction */
723 COSTS_N_INSNS (1), /* variable shift costs */
724 COSTS_N_INSNS (1), /* constant shift costs */
725 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
726 COSTS_N_INSNS (5), /* HI */
727 COSTS_N_INSNS (5), /* SI */
728 COSTS_N_INSNS (5), /* DI */
729 COSTS_N_INSNS (5)}, /* other */
730 0, /* cost of multiply per each bit set */
731 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
732 COSTS_N_INSNS (26), /* HI */
733 COSTS_N_INSNS (42), /* SI */
734 COSTS_N_INSNS (74), /* DI */
735 COSTS_N_INSNS (74)}, /* other */
736 COSTS_N_INSNS (1), /* cost of movsx */
737 COSTS_N_INSNS (1), /* cost of movzx */
738 8, /* "large" insn */
739 9, /* MOVE_RATIO */
740 4, /* cost for loading QImode using movzbl */
741 {3, 4, 3}, /* cost of loading integer registers
742 in QImode, HImode and SImode.
743 Relative to reg-reg move (2). */
744 {3, 4, 3}, /* cost of storing integer registers */
745 4, /* cost of reg,reg fld/fst */
746 {4, 4, 12}, /* cost of loading fp registers
747 in SFmode, DFmode and XFmode */
748 {6, 6, 8}, /* cost of storing fp registers
749 in SFmode, DFmode and XFmode */
750 2, /* cost of moving MMX register */
751 {4, 4}, /* cost of loading MMX registers
752 in SImode and DImode */
753 {4, 4}, /* cost of storing MMX registers
754 in SImode and DImode */
755 2, /* cost of moving SSE register */
756 {4, 4, 6}, /* cost of loading SSE registers
757 in SImode, DImode and TImode */
758 {4, 4, 5}, /* cost of storing SSE registers
759 in SImode, DImode and TImode */
760 5, /* MMX or SSE register to integer */
761 64, /* size of l1 cache. */
762 256, /* size of l2 cache. */
763 64, /* size of prefetch block */
764 6, /* number of parallel prefetches */
765 5, /* Branch cost */
766 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
767 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
768 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
769 COSTS_N_INSNS (2), /* cost of FABS instruction. */
770 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
771 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 772
c53c148c 773 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
774 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
775 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
776 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
777 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
778 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
779 /* 11-16 -- NOTE(review): presumably a latency range for the DIVSS entry below; confirm. */
780 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
781 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
782 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
783 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
64766e8d
JH
784 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
785 athlon_memcpy,
786 athlon_memset,
787 1, /* scalar_stmt_cost. */
788 1, /* scalar_load_cost. */
789 1, /* scalar_store_cost. */
790 1, /* vec_stmt_cost. */
791 1, /* vec_to_scalar_cost. */
792 1, /* scalar_to_vec_cost. */
793 1, /* vec_align_load_cost. */
794 2, /* vec_unalign_load_cost. */
795 1, /* vec_store_cost. */
796 3, /* cond_taken_branch_cost. */
797 1, /* cond_not_taken_branch_cost. */
798};
799
800/* K8 has an optimized REP instruction for medium-sized blocks, but for very
801 small blocks it is better to use a loop. For large blocks, a libcall can
802 use non-temporal accesses and beat inline code considerably. */
803static stringop_algs k8_memcpy[2] = { /* memcpy strategies referenced by k8_cost */
804 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
805 {-1, rep_prefix_4_byte, false}}},
806 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
807 {-1, libcall, false}}}};
808static stringop_algs k8_memset[2] = { /* memset strategies referenced by k8_cost */
809 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
810 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
811 {libcall, {{48, unrolled_loop, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP below) -- confirm */
812 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
813static const
814struct processor_costs k8_cost = { /* Positional cost table; each entry is named by its trailing comment. */
815 COSTS_N_INSNS (1), /* cost of an add instruction */
816 COSTS_N_INSNS (2), /* cost of a lea instruction */
817 COSTS_N_INSNS (1), /* variable shift costs */
818 COSTS_N_INSNS (1), /* constant shift costs */
819 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
820 COSTS_N_INSNS (4), /* HI */
821 COSTS_N_INSNS (3), /* SI */
822 COSTS_N_INSNS (4), /* DI */
823 COSTS_N_INSNS (5)}, /* other */
824 0, /* cost of multiply per each bit set */
825 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
826 COSTS_N_INSNS (26), /* HI */
827 COSTS_N_INSNS (42), /* SI */
828 COSTS_N_INSNS (74), /* DI */
829 COSTS_N_INSNS (74)}, /* other */
830 COSTS_N_INSNS (1), /* cost of movsx */
831 COSTS_N_INSNS (1), /* cost of movzx */
832 8, /* "large" insn */
833 9, /* MOVE_RATIO */
834 4, /* cost for loading QImode using movzbl */
835 {3, 4, 3}, /* cost of loading integer registers
836 in QImode, HImode and SImode.
837 Relative to reg-reg move (2). */
838 {3, 4, 3}, /* cost of storing integer registers */
839 4, /* cost of reg,reg fld/fst */
840 {4, 4, 12}, /* cost of loading fp registers
841 in SFmode, DFmode and XFmode */
842 {6, 6, 8}, /* cost of storing fp registers
843 in SFmode, DFmode and XFmode */
844 2, /* cost of moving MMX register */
845 {3, 3}, /* cost of loading MMX registers
846 in SImode and DImode */
847 {4, 4}, /* cost of storing MMX registers
848 in SImode and DImode */
849 2, /* cost of moving SSE register */
850 {4, 3, 6}, /* cost of loading SSE registers
851 in SImode, DImode and TImode */
852 {4, 4, 5}, /* cost of storing SSE registers
853 in SImode, DImode and TImode */
854 5, /* MMX or SSE register to integer */
855 64, /* size of l1 cache. */
856 512, /* size of l2 cache. */
857 64, /* size of prefetch block */
858 /* New AMD processors never drop prefetches; if they cannot be performed
859 immediately, they are queued. We set number of simultaneous prefetches
860 to a large constant to reflect this (it probably is not a good idea not
861 to limit number of prefetches at all, as their execution also takes some
862 time). */
863 100, /* number of parallel prefetches */
864 3, /* Branch cost */
865 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
866 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
867 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
868 COSTS_N_INSNS (2), /* cost of FABS instruction. */
869 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
870 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 871
c53c148c 872 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
873 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
874 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
875 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
876 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
877 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
878 /* 11-16 (presumably the DIVSS latency range in cycles; confirm) */
879 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
880 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
881 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
882 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
64766e8d
JH
883 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
884 k8_memcpy,
885 k8_memset,
886 4, /* scalar_stmt_cost. */
887 2, /* scalar_load_cost. */
888 2, /* scalar_store_cost. */
889 5, /* vec_stmt_cost. */
890 0, /* vec_to_scalar_cost. */
891 2, /* scalar_to_vec_cost. */
892 2, /* vec_align_load_cost. */
893 3, /* vec_unalign_load_cost. */
894 3, /* vec_store_cost. */
895 3, /* cond_taken_branch_cost. */
896 2, /* cond_not_taken_branch_cost. */
897};
898
899/* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
900 very small blocks it is better to use a loop. For large blocks, a libcall can
901 use non-temporal accesses and beat inline code considerably. */
902static stringop_algs amdfam10_memcpy[2] = { /* memcpy strategies referenced by amdfam10_cost; same thresholds as k8_memcpy */
903 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
904 {-1, rep_prefix_4_byte, false}}},
905 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
906 {-1, libcall, false}}}};
907static stringop_algs amdfam10_memset[2] = { /* memset strategies referenced by amdfam10_cost; same thresholds as k8_memset */
908 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
909 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
910 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
911 {-1, libcall, false}}}};
912struct processor_costs amdfam10_cost = { /* Positional cost table; each entry is named by its trailing comment. */
913 COSTS_N_INSNS (1), /* cost of an add instruction */
914 COSTS_N_INSNS (2), /* cost of a lea instruction */
915 COSTS_N_INSNS (1), /* variable shift costs */
916 COSTS_N_INSNS (1), /* constant shift costs */
917 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
918 COSTS_N_INSNS (4), /* HI */
919 COSTS_N_INSNS (3), /* SI */
920 COSTS_N_INSNS (4), /* DI */
921 COSTS_N_INSNS (5)}, /* other */
922 0, /* cost of multiply per each bit set */
923 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
924 COSTS_N_INSNS (35), /* HI */
925 COSTS_N_INSNS (51), /* SI */
926 COSTS_N_INSNS (83), /* DI */
927 COSTS_N_INSNS (83)}, /* other */
928 COSTS_N_INSNS (1), /* cost of movsx */
929 COSTS_N_INSNS (1), /* cost of movzx */
930 8, /* "large" insn */
931 9, /* MOVE_RATIO */
932 4, /* cost for loading QImode using movzbl */
933 {3, 4, 3}, /* cost of loading integer registers
934 in QImode, HImode and SImode.
935 Relative to reg-reg move (2). */
936 {3, 4, 3}, /* cost of storing integer registers */
937 4, /* cost of reg,reg fld/fst */
938 {4, 4, 12}, /* cost of loading fp registers
939 in SFmode, DFmode and XFmode */
940 {6, 6, 8}, /* cost of storing fp registers
941 in SFmode, DFmode and XFmode */
942 2, /* cost of moving MMX register */
943 {3, 3}, /* cost of loading MMX registers
944 in SImode and DImode */
945 {4, 4}, /* cost of storing MMX registers
946 in SImode and DImode */
947 2, /* cost of moving SSE register */
948 {4, 4, 3}, /* cost of loading SSE registers
949 in SImode, DImode and TImode */
950 {4, 4, 5}, /* cost of storing SSE registers
951 in SImode, DImode and TImode */
952 3, /* MMX or SSE register to integer */
953 /* Reference MOVD timings; the AMDFAM10 figure matches the 3 used above.  On K8:
954 MOVD reg64, xmmreg Double FSTORE 4
955 MOVD reg32, xmmreg Double FSTORE 4
956 On AMDFAM10:
957 MOVD reg64, xmmreg Double FADD 3
958 1/1 1/1
959 MOVD reg32, xmmreg Double FADD 3
960 1/1 1/1 */
961 64, /* size of l1 cache. */
962 512, /* size of l2 cache. */
963 64, /* size of prefetch block */
964 /* New AMD processors never drop prefetches; if they cannot be performed
965 immediately, they are queued. We set number of simultaneous prefetches
966 to a large constant to reflect this (it probably is not a good idea not
967 to limit number of prefetches at all, as their execution also takes some
968 time). */
969 100, /* number of parallel prefetches */
970 2, /* Branch cost */
971 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
972 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
973 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
974 COSTS_N_INSNS (2), /* cost of FABS instruction. */
975 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
976 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 977
c53c148c 978 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
979 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
980 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
981 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
982 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
983 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
984 /* 11-16 (presumably the DIVSS latency range in cycles; confirm) */
985 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
986 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
987 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
988 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
64766e8d
JH
989 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
990 amdfam10_memcpy,
991 amdfam10_memset,
992 4, /* scalar_stmt_cost. */
993 2, /* scalar_load_cost. */
994 2, /* scalar_store_cost. */
995 6, /* vec_stmt_cost. */
996 0, /* vec_to_scalar_cost. */
997 2, /* scalar_to_vec_cost. */
998 2, /* vec_align_load_cost. */
999 2, /* vec_unalign_load_cost. */
1000 2, /* vec_store_cost. */
1001 2, /* cond_taken_branch_cost. */
1002 1, /* cond_not_taken_branch_cost. */
1003};
1004
1005/* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1006 very small blocks it is better to use a loop. For large blocks, a libcall
1007 can use non-temporal accesses and beat inline code considerably. */
1008static stringop_algs bdver1_memcpy[2] = { /* memcpy strategies referenced by bdver1_cost; same thresholds as k8_memcpy */
1009 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1010 {-1, rep_prefix_4_byte, false}}},
1011 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1012 {-1, libcall, false}}}};
1013static stringop_algs bdver1_memset[2] = { /* memset strategies referenced by bdver1_cost; same thresholds as k8_memset */
1014 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1015 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1016 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1017 {-1, libcall, false}}}};
1018
1019const struct processor_costs bdver1_cost = { /* Positional cost table; each entry is named by its trailing comment. */
1020 COSTS_N_INSNS (1), /* cost of an add instruction */
1021 COSTS_N_INSNS (1), /* cost of a lea instruction */
1022 COSTS_N_INSNS (1), /* variable shift costs */
1023 COSTS_N_INSNS (1), /* constant shift costs */
1024 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1025 COSTS_N_INSNS (4), /* HI */
1026 COSTS_N_INSNS (4), /* SI */
1027 COSTS_N_INSNS (6), /* DI */
1028 COSTS_N_INSNS (6)}, /* other */
1029 0, /* cost of multiply per each bit set */
1030 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1031 COSTS_N_INSNS (35), /* HI */
1032 COSTS_N_INSNS (51), /* SI */
1033 COSTS_N_INSNS (83), /* DI */
1034 COSTS_N_INSNS (83)}, /* other */
1035 COSTS_N_INSNS (1), /* cost of movsx */
1036 COSTS_N_INSNS (1), /* cost of movzx */
1037 8, /* "large" insn */
1038 9, /* MOVE_RATIO */
1039 4, /* cost for loading QImode using movzbl */
1040 {5, 5, 4}, /* cost of loading integer registers
1041 in QImode, HImode and SImode.
1042 Relative to reg-reg move (2). */
1043 {4, 4, 4}, /* cost of storing integer registers */
1044 2, /* cost of reg,reg fld/fst */
1045 {5, 5, 12}, /* cost of loading fp registers
1046 in SFmode, DFmode and XFmode */
1047 {4, 4, 8}, /* cost of storing fp registers
1048 in SFmode, DFmode and XFmode */
1049 2, /* cost of moving MMX register */
1050 {4, 4}, /* cost of loading MMX registers
1051 in SImode and DImode */
1052 {4, 4}, /* cost of storing MMX registers
1053 in SImode and DImode */
1054 2, /* cost of moving SSE register */
1055 {4, 4, 4}, /* cost of loading SSE registers
1056 in SImode, DImode and TImode */
1057 {4, 4, 4}, /* cost of storing SSE registers
1058 in SImode, DImode and TImode */
1059 2, /* MMX or SSE register to integer */
1060 /* NOTE(review): the timings below are for K8 and AMDFAM10, not BDVER1; confirm they still apply here.  On K8:
1061 MOVD reg64, xmmreg Double FSTORE 4
1062 MOVD reg32, xmmreg Double FSTORE 4
1063 On AMDFAM10:
1064 MOVD reg64, xmmreg Double FADD 3
1065 1/1 1/1
1066 MOVD reg32, xmmreg Double FADD 3
1067 1/1 1/1 */
1068 16, /* size of l1 cache. */
1069 2048, /* size of l2 cache. */
1070 64, /* size of prefetch block */
1071 /* New AMD processors never drop prefetches; if they cannot be performed
1072 immediately, they are queued. We set number of simultaneous prefetches
1073 to a large constant to reflect this (it probably is not a good idea not
1074 to limit number of prefetches at all, as their execution also takes some
1075 time). */
1076 100, /* number of parallel prefetches */
1077 2, /* Branch cost */
1078 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1079 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1080 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1081 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1082 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1083 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1084
c53c148c 1085 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1086 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1087 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1088 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1089 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1090 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1091 /* 9-24 (presumably the DIVSS latency range in cycles; confirm) */
1092 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1093 /* 9-27 (presumably the DIVSD latency range in cycles; confirm) */
1094 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1095 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1096 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d
JH
1097 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1098 bdver1_memcpy,
1099 bdver1_memset,
1100 6, /* scalar_stmt_cost. */
1101 4, /* scalar_load_cost. */
1102 4, /* scalar_store_cost. */
1103 6, /* vec_stmt_cost. */
1104 0, /* vec_to_scalar_cost. */
1105 2, /* scalar_to_vec_cost. */
1106 4, /* vec_align_load_cost. */
1107 4, /* vec_unalign_load_cost. */
1108 4, /* vec_store_cost. */
1109 4, /* cond_taken_branch_cost. */
1110 2, /* cond_not_taken_branch_cost. */
1111};
1112
1113/* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1114 very small blocks it is better to use a loop. For large blocks, a libcall
1115 can use non-temporal accesses and beat inline code considerably. */
1116
1117static stringop_algs bdver2_memcpy[2] = { /* memcpy strategies referenced by bdver2_cost; same thresholds as k8_memcpy */
1118 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1119 {-1, rep_prefix_4_byte, false}}},
1120 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1121 {-1, libcall, false}}}};
1122static stringop_algs bdver2_memset[2] = { /* memset strategies referenced by bdver2_cost; same thresholds as k8_memset */
1123 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1124 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1125 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1126 {-1, libcall, false}}}};
1127
1128const struct processor_costs bdver2_cost = { /* Positional cost table; each entry is named by its trailing comment. */
1129 COSTS_N_INSNS (1), /* cost of an add instruction */
1130 COSTS_N_INSNS (1), /* cost of a lea instruction */
1131 COSTS_N_INSNS (1), /* variable shift costs */
1132 COSTS_N_INSNS (1), /* constant shift costs */
1133 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1134 COSTS_N_INSNS (4), /* HI */
1135 COSTS_N_INSNS (4), /* SI */
1136 COSTS_N_INSNS (6), /* DI */
1137 COSTS_N_INSNS (6)}, /* other */
1138 0, /* cost of multiply per each bit set */
1139 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1140 COSTS_N_INSNS (35), /* HI */
1141 COSTS_N_INSNS (51), /* SI */
1142 COSTS_N_INSNS (83), /* DI */
1143 COSTS_N_INSNS (83)}, /* other */
1144 COSTS_N_INSNS (1), /* cost of movsx */
1145 COSTS_N_INSNS (1), /* cost of movzx */
1146 8, /* "large" insn */
1147 9, /* MOVE_RATIO */
1148 4, /* cost for loading QImode using movzbl */
1149 {5, 5, 4}, /* cost of loading integer registers
1150 in QImode, HImode and SImode.
1151 Relative to reg-reg move (2). */
1152 {4, 4, 4}, /* cost of storing integer registers */
1153 2, /* cost of reg,reg fld/fst */
1154 {5, 5, 12}, /* cost of loading fp registers
1155 in SFmode, DFmode and XFmode */
1156 {4, 4, 8}, /* cost of storing fp registers
1157 in SFmode, DFmode and XFmode */
1158 2, /* cost of moving MMX register */
1159 {4, 4}, /* cost of loading MMX registers
1160 in SImode and DImode */
1161 {4, 4}, /* cost of storing MMX registers
1162 in SImode and DImode */
1163 2, /* cost of moving SSE register */
1164 {4, 4, 4}, /* cost of loading SSE registers
1165 in SImode, DImode and TImode */
1166 {4, 4, 4}, /* cost of storing SSE registers
1167 in SImode, DImode and TImode */
1168 2, /* MMX or SSE register to integer */
1169 /* NOTE(review): the timings below are for K8 and AMDFAM10, not BDVER2; confirm they still apply here.  On K8:
1170 MOVD reg64, xmmreg Double FSTORE 4
1171 MOVD reg32, xmmreg Double FSTORE 4
1172 On AMDFAM10:
1173 MOVD reg64, xmmreg Double FADD 3
1174 1/1 1/1
1175 MOVD reg32, xmmreg Double FADD 3
1176 1/1 1/1 */
1177 16, /* size of l1 cache. */
1178 2048, /* size of l2 cache. */
1179 64, /* size of prefetch block */
1180 /* New AMD processors never drop prefetches; if they cannot be performed
1181 immediately, they are queued. We set number of simultaneous prefetches
1182 to a large constant to reflect this (it probably is not a good idea not
1183 to limit number of prefetches at all, as their execution also takes some
1184 time). */
1185 100, /* number of parallel prefetches */
1186 2, /* Branch cost */
1187 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1188 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1189 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1190 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1191 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1192 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1193
c53c148c 1194 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1195 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1196 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1197 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1198 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1199 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1200 /* 9-24 (presumably the DIVSS latency range in cycles; confirm) */
1201 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1202 /* 9-27 (presumably the DIVSD latency range in cycles; confirm) */
1203 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1204 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1205 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d
JH
1206 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1207 bdver2_memcpy,
1208 bdver2_memset,
1209 6, /* scalar_stmt_cost. */
1210 4, /* scalar_load_cost. */
1211 4, /* scalar_store_cost. */
1212 6, /* vec_stmt_cost. */
1213 0, /* vec_to_scalar_cost. */
1214 2, /* scalar_to_vec_cost. */
1215 4, /* vec_align_load_cost. */
1216 4, /* vec_unalign_load_cost. */
1217 4, /* vec_store_cost. */
1218 4, /* cond_taken_branch_cost. */
1219 2, /* cond_not_taken_branch_cost. */
1220};
1221
1222
1223 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1224 very small blocks it is better to use a loop. For large blocks, a libcall
1225 can use non-temporal accesses and beat inline code considerably. */
1226static stringop_algs bdver3_memcpy[2] = { /* memcpy strategies referenced by bdver3_cost; same thresholds as k8_memcpy */
1227 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1228 {-1, rep_prefix_4_byte, false}}},
1229 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1230 {-1, libcall, false}}}};
1231static stringop_algs bdver3_memset[2] = { /* memset strategies referenced by bdver3_cost; same thresholds as k8_memset */
1232 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1233 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1234 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1235 {-1, libcall, false}}}};
1236struct processor_costs bdver3_cost = { /* Positional cost table; each entry is named by its trailing comment. */
1237 COSTS_N_INSNS (1), /* cost of an add instruction */
1238 COSTS_N_INSNS (1), /* cost of a lea instruction */
1239 COSTS_N_INSNS (1), /* variable shift costs */
1240 COSTS_N_INSNS (1), /* constant shift costs */
1241 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1242 COSTS_N_INSNS (4), /* HI */
1243 COSTS_N_INSNS (4), /* SI */
1244 COSTS_N_INSNS (6), /* DI */
1245 COSTS_N_INSNS (6)}, /* other */
1246 0, /* cost of multiply per each bit set */
1247 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1248 COSTS_N_INSNS (35), /* HI */
1249 COSTS_N_INSNS (51), /* SI */
1250 COSTS_N_INSNS (83), /* DI */
1251 COSTS_N_INSNS (83)}, /* other */
1252 COSTS_N_INSNS (1), /* cost of movsx */
1253 COSTS_N_INSNS (1), /* cost of movzx */
1254 8, /* "large" insn */
1255 9, /* MOVE_RATIO */
1256 4, /* cost for loading QImode using movzbl */
1257 {5, 5, 4}, /* cost of loading integer registers
1258 in QImode, HImode and SImode.
1259 Relative to reg-reg move (2). */
1260 {4, 4, 4}, /* cost of storing integer registers */
1261 2, /* cost of reg,reg fld/fst */
1262 {5, 5, 12}, /* cost of loading fp registers
1263 in SFmode, DFmode and XFmode */
1264 {4, 4, 8}, /* cost of storing fp registers
1265 in SFmode, DFmode and XFmode */
1266 2, /* cost of moving MMX register */
1267 {4, 4}, /* cost of loading MMX registers
1268 in SImode and DImode */
1269 {4, 4}, /* cost of storing MMX registers
1270 in SImode and DImode */
1271 2, /* cost of moving SSE register */
1272 {4, 4, 4}, /* cost of loading SSE registers
1273 in SImode, DImode and TImode */
1274 {4, 4, 4}, /* cost of storing SSE registers
1275 in SImode, DImode and TImode */
1276 2, /* MMX or SSE register to integer */
1277 16, /* size of l1 cache. */
1278 2048, /* size of l2 cache. */
1279 64, /* size of prefetch block */
1280 /* New AMD processors never drop prefetches; if they cannot be performed
1281 immediately, they are queued. We set number of simultaneous prefetches
1282 to a large constant to reflect this (it probably is not a good idea not
1283 to limit number of prefetches at all, as their execution also takes some
1284 time). */
1285 100, /* number of parallel prefetches */
1286 2, /* Branch cost */
1287 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1288 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1289 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1290 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1291 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1292 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1293
c53c148c 1294 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1295 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1296 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1297 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1298 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1299 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1300 /* 9-24 (presumably the DIVSS latency range in cycles; confirm) */
1301 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1302 /* 9-27 (presumably the DIVSD latency range in cycles; confirm) */
1303 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1304 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1305 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d
JH
1306 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1307 bdver3_memcpy,
1308 bdver3_memset,
1309 6, /* scalar_stmt_cost. */
1310 4, /* scalar_load_cost. */
1311 4, /* scalar_store_cost. */
1312 6, /* vec_stmt_cost. */
1313 0, /* vec_to_scalar_cost. */
1314 2, /* scalar_to_vec_cost. */
1315 4, /* vec_align_load_cost. */
1316 4, /* vec_unalign_load_cost. */
1317 4, /* vec_store_cost. */
1318 4, /* cond_taken_branch_cost. */
1319 2, /* cond_not_taken_branch_cost. */
1320};
1321
1322/* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1323 very small blocks it is better to use a loop. For large blocks, a libcall
1324 can use non-temporal accesses and beat inline code considerably. */
1325static stringop_algs bdver4_memcpy[2] = { /* memcpy strategies referenced by bdver4_cost; same thresholds as k8_memcpy */
1326 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1327 {-1, rep_prefix_4_byte, false}}},
1328 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1329 {-1, libcall, false}}}};
1330static stringop_algs bdver4_memset[2] = { /* memset strategies referenced by bdver4_cost; same thresholds as k8_memset */
1331 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1332 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1333 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1334 {-1, libcall, false}}}};
1335struct processor_costs bdver4_cost = { /* Positional cost table; each entry is named by its trailing comment. */
1336 COSTS_N_INSNS (1), /* cost of an add instruction */
1337 COSTS_N_INSNS (1), /* cost of a lea instruction */
1338 COSTS_N_INSNS (1), /* variable shift costs */
1339 COSTS_N_INSNS (1), /* constant shift costs */
1340 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1341 COSTS_N_INSNS (4), /* HI */
1342 COSTS_N_INSNS (4), /* SI */
1343 COSTS_N_INSNS (6), /* DI */
1344 COSTS_N_INSNS (6)}, /* other */
1345 0, /* cost of multiply per each bit set */
1346 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1347 COSTS_N_INSNS (35), /* HI */
1348 COSTS_N_INSNS (51), /* SI */
1349 COSTS_N_INSNS (83), /* DI */
1350 COSTS_N_INSNS (83)}, /* other */
1351 COSTS_N_INSNS (1), /* cost of movsx */
1352 COSTS_N_INSNS (1), /* cost of movzx */
1353 8, /* "large" insn */
1354 9, /* MOVE_RATIO */
1355 4, /* cost for loading QImode using movzbl */
1356 {5, 5, 4}, /* cost of loading integer registers
1357 in QImode, HImode and SImode.
1358 Relative to reg-reg move (2). */
1359 {4, 4, 4}, /* cost of storing integer registers */
1360 2, /* cost of reg,reg fld/fst */
1361 {5, 5, 12}, /* cost of loading fp registers
1362 in SFmode, DFmode and XFmode */
1363 {4, 4, 8}, /* cost of storing fp registers
1364 in SFmode, DFmode and XFmode */
1365 2, /* cost of moving MMX register */
1366 {4, 4}, /* cost of loading MMX registers
1367 in SImode and DImode */
1368 {4, 4}, /* cost of storing MMX registers
1369 in SImode and DImode */
1370 2, /* cost of moving SSE register */
1371 {4, 4, 4}, /* cost of loading SSE registers
1372 in SImode, DImode and TImode */
1373 {4, 4, 4}, /* cost of storing SSE registers
1374 in SImode, DImode and TImode */
1375 2, /* MMX or SSE register to integer */
1376 16, /* size of l1 cache. */
1377 2048, /* size of l2 cache. */
1378 64, /* size of prefetch block */
1379 /* New AMD processors never drop prefetches; if they cannot be performed
1380 immediately, they are queued. We set number of simultaneous prefetches
1381 to a large constant to reflect this (it probably is not a good idea not
1382 to limit number of prefetches at all, as their execution also takes some
1383 time). */
1384 100, /* number of parallel prefetches */
1385 2, /* Branch cost */
1386 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1387 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1388 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1389 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1390 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1391 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
6065f444 1392
c53c148c 1393 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1394 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1395 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1396 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1397 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1398 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1399 /* 9-24 (presumably the DIVSS latency range in cycles; confirm) */
1400 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1401 /* 9-27 (presumably the DIVSD latency range in cycles; confirm) */
1402 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1403 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1404 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
64766e8d
JH
1405 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1406 bdver4_memcpy,
1407 bdver4_memset,
1408 6, /* scalar_stmt_cost. */
1409 4, /* scalar_load_cost. */
1410 4, /* scalar_store_cost. */
1411 6, /* vec_stmt_cost. */
1412 0, /* vec_to_scalar_cost. */
1413 2, /* scalar_to_vec_cost. */
1414 4, /* vec_align_load_cost. */
1415 4, /* vec_unalign_load_cost. */
1416 4, /* vec_store_cost. */
1417 4, /* cond_taken_branch_cost. */
1418 2, /* cond_not_taken_branch_cost. */
1419};
1420
1421
1422/* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1423 very small blocks it is better to use a loop. For large blocks, a libcall
1424 can use non-temporal accesses and beat inline code considerably. */
1425static stringop_algs znver1_memcpy[2] = { /* memcpy strategies for ZNVER1; same thresholds as k8_memcpy */
1426 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1427 {-1, rep_prefix_4_byte, false}}},
1428 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1429 {-1, libcall, false}}}};
1430static stringop_algs znver1_memset[2] = { /* memset strategies for ZNVER1; same thresholds as k8_memset */
1431 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1432 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1433 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, /* NOTE(review): presumably the 64-bit variant (8-byte REP) -- confirm */
1434 {-1, libcall, false}}}};
1435struct processor_costs znver1_cost = {
1436 COSTS_N_INSNS (1), /* cost of an add instruction. */
1437 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1438 COSTS_N_INSNS (1), /* variable shift costs. */
1439 COSTS_N_INSNS (1), /* constant shift costs. */
1440 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1441 COSTS_N_INSNS (3), /* HI. */
1442 COSTS_N_INSNS (3), /* SI. */
6065f444
JH
1443 COSTS_N_INSNS (3), /* DI. */
1444 COSTS_N_INSNS (3)}, /* other. */
64766e8d
JH
1445 0, /* cost of multiply per each bit
1446 set. */
6065f444
JH
1447 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1448 bound. */
1449 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1450 COSTS_N_INSNS (22), /* HI. */
1451 COSTS_N_INSNS (30), /* SI. */
1452 COSTS_N_INSNS (45), /* DI. */
1453 COSTS_N_INSNS (45)}, /* other. */
64766e8d
JH
1454 COSTS_N_INSNS (1), /* cost of movsx. */
1455 COSTS_N_INSNS (1), /* cost of movzx. */
1456 8, /* "large" insn. */
1457 9, /* MOVE_RATIO. */
01118373
JH
1458
1459 /* reg-reg moves are done by renaming and thus they are even cheaper than
1460 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1461 to doubles of latencies, we do not model this correctly. It does not
1462 seem to make practical difference to bump prices up even more. */
1463 6, /* cost for loading QImode using
64766e8d 1464 movzbl. */
01118373 1465 {6, 6, 6}, /* cost of loading integer registers
64766e8d
JH
1466 in QImode, HImode and SImode.
1467 Relative to reg-reg move (2). */
01118373 1468 {8, 8, 8}, /* cost of storing integer
64766e8d
JH
1469 registers. */
1470 2, /* cost of reg,reg fld/fst. */
01118373 1471 {6, 6, 16}, /* cost of loading fp registers
64766e8d 1472 in SFmode, DFmode and XFmode. */
01118373 1473 {8, 8, 16}, /* cost of storing fp registers
64766e8d
JH
1474 in SFmode, DFmode and XFmode. */
1475 2, /* cost of moving MMX register. */
01118373 1476 {6, 6}, /* cost of loading MMX registers
64766e8d 1477 in SImode and DImode. */
01118373 1478 {8, 8}, /* cost of storing MMX registers
64766e8d
JH
1479 in SImode and DImode. */
1480 2, /* cost of moving SSE register. */
01118373 1481 {6, 6, 6}, /* cost of loading SSE registers
64766e8d 1482 in SImode, DImode and TImode. */
01118373 1483 {8, 8, 8}, /* cost of storing SSE registers
64766e8d 1484 in SImode, DImode and TImode. */
01118373 1485 6, /* MMX or SSE register to integer. */
64766e8d
JH
1486 32, /* size of l1 cache. */
1487 512, /* size of l2 cache. */
1488 64, /* size of prefetch block. */
1489 /* New AMD processors never drop prefetches; if they cannot be performed
1490 immediately, they are queued. We set number of simultaneous prefetches
1491 to a large constant to reflect this (it probably is not a good idea not
1492 to limit number of prefetches at all, as their execution also takes some
1493 time). */
1494 100, /* number of parallel prefetches. */
1495 3, /* Branch cost. */
6065f444
JH
1496 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1497 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1498 /* Latency of fdiv is 8-15. */
1499 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1500 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1501 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1502 /* Latency of fsqrt is 4-10. */
1503 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1504
c53c148c 1505 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1506 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1507 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1508 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1509 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1510 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1511 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1512 /* 9-13 */
1513 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1514 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1515 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
64766e8d
JH
1516 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1517 and it can execute 2 integer additions and 2 multiplications thus
1518 reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest
1519 that 4 works better than 6 probably due to register pressure.
1520
1521 Integer vector operations are taken by FP unit and execute 3 vector
1522 plus/minus operations per cycle but only one multiply. This is adjusted
1523 in ix86_reassociation_width. */
1524 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1525 znver1_memcpy,
1526 znver1_memset,
1527 6, /* scalar_stmt_cost. */
1528 4, /* scalar load_cost. */
1529 4, /* scalar_store_cost. */
1530 6, /* vec_stmt_cost. */
1531 0, /* vec_to_scalar_cost. */
1532 2, /* scalar_to_vec_cost. */
1533 4, /* vec_align_load_cost. */
1534 4, /* vec_unalign_load_cost. */
1535 4, /* vec_store_cost. */
1536 4, /* cond_taken_branch_cost. */
1537 2, /* cond_not_taken_branch_cost. */
1538};
1539
1540 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1541 very small blocks it is better to use loop. For large blocks, libcall can
1542 do non-temporal accesses and beat inline considerably. */
/* memcpy strategy table plugged into btver1_cost.  It has two entries; each
   holds a leading algorithm (libcall) followed by a list of
   {size bound, algorithm, flag} triples with the bounds in increasing order
   and -1 last.
   NOTE(review): -1 presumably means "no upper bound", and entries [0]/[1]
   presumably serve 32-bit/64-bit code (only [1] uses rep_prefix_8_byte) --
   confirm against the stringop_algs definition.
   NOTE(review): entry [0] ends in rep_prefix_4_byte rather than libcall,
   unlike entry [1] and the remark about large blocks above -- confirm this
   is intended.  */
1543static stringop_algs btver1_memcpy[2] = {
1544 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1545 {-1, rep_prefix_4_byte, false}}},
1546 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1547 {-1, libcall, false}}}};
/* memset strategy table plugged into btver1_cost.  Same layout as the
   memcpy table: two entries, each a leading algorithm (libcall) plus
   {size bound, algorithm, flag} triples ending with a -1 bound.  Both
   entries finish with libcall.
   NOTE(review): the meaning of the bound, the -1 sentinel and the boolean
   flag is not visible here -- confirm against the stringop_algs
   definition.  */
1548static stringop_algs btver1_memset[2] = {
1549 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1550 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1551 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1552 {-1, libcall, false}}}};
/* Tuning costs for btver1, written as a positional initializer of
   struct processor_costs: the trailing comment on each entry names the
   field it fills, so entry order must match the struct declaration.
   COSTS_N_INSNS (N) entries are instruction costs relative to an add (see
   the note at the top of this file); the integer load costs are stated
   relative to a reg-reg move cost of 2.  The string-operation strategies
   are btver1_memcpy and btver1_memset.
   NOTE(review): the leading line numbers and the bare hash / "JH" lines
   look like git-blame extraction residue rather than part of the
   initializer -- confirm against the upstream file.  */
1553const struct processor_costs btver1_cost = {
1554 COSTS_N_INSNS (1), /* cost of an add instruction */
1555 COSTS_N_INSNS (2), /* cost of a lea instruction */
1556 COSTS_N_INSNS (1), /* variable shift costs */
1557 COSTS_N_INSNS (1), /* constant shift costs */
1558 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1559 COSTS_N_INSNS (4), /* HI */
1560 COSTS_N_INSNS (3), /* SI */
1561 COSTS_N_INSNS (4), /* DI */
1562 COSTS_N_INSNS (5)}, /* other */
1563 0, /* cost of multiply per each bit set */
1564 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1565 COSTS_N_INSNS (35), /* HI */
1566 COSTS_N_INSNS (51), /* SI */
1567 COSTS_N_INSNS (83), /* DI */
1568 COSTS_N_INSNS (83)}, /* other */
1569 COSTS_N_INSNS (1), /* cost of movsx */
1570 COSTS_N_INSNS (1), /* cost of movzx */
1571 8, /* "large" insn */
1572 9, /* MOVE_RATIO */
1573 4, /* cost for loading QImode using movzbl */
1574 {3, 4, 3}, /* cost of loading integer registers
1575 in QImode, HImode and SImode.
1576 Relative to reg-reg move (2). */
1577 {3, 4, 3}, /* cost of storing integer registers */
1578 4, /* cost of reg,reg fld/fst */
1579 {4, 4, 12}, /* cost of loading fp registers
1580 in SFmode, DFmode and XFmode */
1581 {6, 6, 8}, /* cost of storing fp registers
1582 in SFmode, DFmode and XFmode */
1583 2, /* cost of moving MMX register */
1584 {3, 3}, /* cost of loading MMX registers
1585 in SImode and DImode */
1586 {4, 4}, /* cost of storing MMX registers
1587 in SImode and DImode */
1588 2, /* cost of moving SSE register */
1589 {4, 4, 3}, /* cost of loading SSE registers
1590 in SImode, DImode and TImode */
1591 {4, 4, 5}, /* cost of storing SSE registers
1592 in SImode, DImode and TImode */
1593 3, /* MMX or SSE register to integer */
1594 /* On K8:
1595 MOVD reg64, xmmreg Double FSTORE 4
1596 MOVD reg32, xmmreg Double FSTORE 4
1597 On AMDFAM10:
1598 MOVD reg64, xmmreg Double FADD 3
1599 1/1 1/1
1600 MOVD reg32, xmmreg Double FADD 3
1601 1/1 1/1 */
 /* NOTE(review): the figures above are quoted for K8 and AMDFAM10, not for
    btver1 -- confirm they still justify the value of 3 used here.  */
1602 32, /* size of l1 cache. */
1603 512, /* size of l2 cache. */
1604 64, /* size of prefetch block */
1605 100, /* number of parallel prefetches */
1606 2, /* Branch cost */
1607 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1608 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1609 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1610 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1611 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1612 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1613
c53c148c 1614 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1615 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1616 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1617 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1618 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1619 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1620 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1621 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1622 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1623 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
64766e8d
JH
1624 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1625 btver1_memcpy,
1626 btver1_memset,
1627 4, /* scalar_stmt_cost. */
1628 2, /* scalar load_cost. */
1629 2, /* scalar_store_cost. */
1630 6, /* vec_stmt_cost. */
1631 0, /* vec_to_scalar_cost. */
1632 2, /* scalar_to_vec_cost. */
1633 2, /* vec_align_load_cost. */
1634 2, /* vec_unalign_load_cost. */
1635 2, /* vec_store_cost. */
1636 2, /* cond_taken_branch_cost. */
1637 1, /* cond_not_taken_branch_cost. */
1638};
1639
/* memcpy strategy table plugged into btver2_cost.  Its two entries carry
   the same algorithms and size bounds as btver1_memcpy: a leading libcall
   followed by {size bound, algorithm, flag} triples ending with a -1 bound.
   NOTE(review): -1 presumably means "no upper bound" and entries [0]/[1]
   presumably serve 32-bit/64-bit code (only [1] uses rep_prefix_8_byte) --
   confirm against the stringop_algs definition.  */
1640static stringop_algs btver2_memcpy[2] = {
1641 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1642 {-1, rep_prefix_4_byte, false}}},
1643 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1644 {-1, libcall, false}}}};
/* memset strategy table plugged into btver2_cost.  Its two entries carry
   the same algorithms and size bounds as btver1_memset; both finish with a
   libcall triple whose bound is -1.
   NOTE(review): the meaning of the bound, the -1 sentinel and the boolean
   flag is not visible here -- confirm against the stringop_algs
   definition.  */
1645static stringop_algs btver2_memset[2] = {
1646 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1647 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1648 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1649 {-1, libcall, false}}}};
/* Tuning costs for btver2, written as a positional initializer of
   struct processor_costs: the trailing comment on each entry names the
   field it fills, so entry order must match the struct declaration.
   COSTS_N_INSNS (N) entries are instruction costs relative to an add (see
   the note at the top of this file); the integer load costs are stated
   relative to a reg-reg move cost of 2.  Compared with btver1_cost the
   numeric entries differ only in the l2 cache size and in the DIVSD,
   SQRTSS and SQRTSD costs; the string-operation strategies are
   btver2_memcpy and btver2_memset.
   NOTE(review): the leading line numbers and the bare hash / "JH" lines
   look like git-blame extraction residue rather than part of the
   initializer -- confirm against the upstream file.  */
1650const struct processor_costs btver2_cost = {
1651 COSTS_N_INSNS (1), /* cost of an add instruction */
1652 COSTS_N_INSNS (2), /* cost of a lea instruction */
1653 COSTS_N_INSNS (1), /* variable shift costs */
1654 COSTS_N_INSNS (1), /* constant shift costs */
1655 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1656 COSTS_N_INSNS (4), /* HI */
1657 COSTS_N_INSNS (3), /* SI */
1658 COSTS_N_INSNS (4), /* DI */
1659 COSTS_N_INSNS (5)}, /* other */
1660 0, /* cost of multiply per each bit set */
1661 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1662 COSTS_N_INSNS (35), /* HI */
1663 COSTS_N_INSNS (51), /* SI */
1664 COSTS_N_INSNS (83), /* DI */
1665 COSTS_N_INSNS (83)}, /* other */
1666 COSTS_N_INSNS (1), /* cost of movsx */
1667 COSTS_N_INSNS (1), /* cost of movzx */
1668 8, /* "large" insn */
1669 9, /* MOVE_RATIO */
1670 4, /* cost for loading QImode using movzbl */
1671 {3, 4, 3}, /* cost of loading integer registers
1672 in QImode, HImode and SImode.
1673 Relative to reg-reg move (2). */
1674 {3, 4, 3}, /* cost of storing integer registers */
1675 4, /* cost of reg,reg fld/fst */
1676 {4, 4, 12}, /* cost of loading fp registers
1677 in SFmode, DFmode and XFmode */
1678 {6, 6, 8}, /* cost of storing fp registers
1679 in SFmode, DFmode and XFmode */
1680 2, /* cost of moving MMX register */
1681 {3, 3}, /* cost of loading MMX registers
1682 in SImode and DImode */
1683 {4, 4}, /* cost of storing MMX registers
1684 in SImode and DImode */
1685 2, /* cost of moving SSE register */
1686 {4, 4, 3}, /* cost of loading SSE registers
1687 in SImode, DImode and TImode */
1688 {4, 4, 5}, /* cost of storing SSE registers
1689 in SImode, DImode and TImode */
1690 3, /* MMX or SSE register to integer */
1691 /* On K8:
1692 MOVD reg64, xmmreg Double FSTORE 4
1693 MOVD reg32, xmmreg Double FSTORE 4
1694 On AMDFAM10:
1695 MOVD reg64, xmmreg Double FADD 3
1696 1/1 1/1
1697 MOVD reg32, xmmreg Double FADD 3
1698 1/1 1/1 */
 /* NOTE(review): the figures above are quoted for K8 and AMDFAM10, not for
    btver2 -- confirm they still justify the value of 3 used here.  */
1699 32, /* size of l1 cache. */
1700 2048, /* size of l2 cache. */
1701 64, /* size of prefetch block */
1702 100, /* number of parallel prefetches */
1703 2, /* Branch cost */
1704 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1705 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1706 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1707 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1708 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1709 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
6065f444 1710
c53c148c 1711 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1712 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1713 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1714 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
c53c148c
JH
1715 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1716 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
1717 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1718 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1719 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1720 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
64766e8d
JH
1721 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1722 btver2_memcpy,
1723 btver2_memset,
1724 4, /* scalar_stmt_cost. */
1725 2, /* scalar load_cost. */
1726 2, /* scalar_store_cost. */
1727 6, /* vec_stmt_cost. */
1728 0, /* vec_to_scalar_cost. */
1729 2, /* scalar_to_vec_cost. */
1730 2, /* vec_align_load_cost. */
1731 2, /* vec_unalign_load_cost. */
1732 2, /* vec_store_cost. */
1733 2, /* cond_taken_branch_cost. */
1734 1, /* cond_not_taken_branch_cost. */
1735};
1736
/* memcpy strategy table plugged into pentium4_cost.  Entry [0] lists
   {size bound, algorithm, flag} triples after a leading libcall; entry [1]
   is DUMMY_STRINGOP_ALGS, which the top of this file defines as
   {libcall, {{-1, libcall, false}}}.
   NOTE(review): presumably [1] is the 64-bit slot and is a placeholder
   here -- confirm against the stringop_algs definition.  */
1737static stringop_algs pentium4_memcpy[2] = {
1738 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1739 DUMMY_STRINGOP_ALGS};
/* memset strategy table plugged into pentium4_cost.  Entry [0] steps
   through loop_1_byte, loop and rep_prefix_4_byte as the size bound grows
   and ends with libcall under the -1 bound; entry [1] is
   DUMMY_STRINGOP_ALGS, defined at the top of this file as
   {libcall, {{-1, libcall, false}}}.
   NOTE(review): presumably [1] is the 64-bit slot and is a placeholder
   here -- confirm against the stringop_algs definition.  */
1740static stringop_algs pentium4_memset[2] = {
1741 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1742 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1743 DUMMY_STRINGOP_ALGS};
1744
/* Tuning costs for pentium4, written as a positional initializer of
   struct processor_costs: the trailing comment on each entry names the
   field it fills, so entry order must match the struct declaration.
   COSTS_N_INSNS (N) entries are instruction costs relative to an add (see
   the note at the top of this file); the integer load costs are stated
   relative to a reg-reg move cost of 2.  The string-operation strategies
   are pentium4_memcpy and pentium4_memset.
   NOTE(review): the leading line numbers and the bare hash / "JH" lines
   look like git-blame extraction residue rather than part of the
   initializer -- confirm against the upstream file.  */
1745static const
1746struct processor_costs pentium4_cost = {
1747 COSTS_N_INSNS (1), /* cost of an add instruction */
1748 COSTS_N_INSNS (3), /* cost of a lea instruction */
1749 COSTS_N_INSNS (4), /* variable shift costs */
1750 COSTS_N_INSNS (4), /* constant shift costs */
1751 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1752 COSTS_N_INSNS (15), /* HI */
1753 COSTS_N_INSNS (15), /* SI */
1754 COSTS_N_INSNS (15), /* DI */
1755 COSTS_N_INSNS (15)}, /* other */
1756 0, /* cost of multiply per each bit set */
1757 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1758 COSTS_N_INSNS (56), /* HI */
1759 COSTS_N_INSNS (56), /* SI */
1760 COSTS_N_INSNS (56), /* DI */
1761 COSTS_N_INSNS (56)}, /* other */
1762 COSTS_N_INSNS (1), /* cost of movsx */
1763 COSTS_N_INSNS (1), /* cost of movzx */
1764 16, /* "large" insn */
1765 6, /* MOVE_RATIO */
1766 2, /* cost for loading QImode using movzbl */
1767 {4, 5, 4}, /* cost of loading integer registers
1768 in QImode, HImode and SImode.
1769 Relative to reg-reg move (2). */
1770 {2, 3, 2}, /* cost of storing integer registers */
1771 2, /* cost of reg,reg fld/fst */
1772 {2, 2, 6}, /* cost of loading fp registers
1773 in SFmode, DFmode and XFmode */
1774 {4, 4, 6}, /* cost of storing fp registers
1775 in SFmode, DFmode and XFmode */
1776 2, /* cost of moving MMX register */
1777 {2, 2}, /* cost of loading MMX registers
1778 in SImode and DImode */
1779 {2, 2}, /* cost of storing MMX registers
1780 in SImode and DImode */
1781 12, /* cost of moving SSE register */
1782 {12, 12, 12}, /* cost of loading SSE registers
1783 in SImode, DImode and TImode */
1784 {2, 2, 8}, /* cost of storing SSE registers
1785 in SImode, DImode and TImode */
1786 10, /* MMX or SSE register to integer */
1787 8, /* size of l1 cache. */
1788 256, /* size of l2 cache. */
1789 64, /* size of prefetch block */
1790 6, /* number of parallel prefetches */
1791 2, /* Branch cost */
1792 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1793 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1794 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1795 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1796 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1797 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
6065f444 1798
c53c148c 1799 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1800 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1801 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1802 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
c53c148c
JH
1803 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1804 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1805 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1806 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1807 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1808 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
64766e8d
JH
1809 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1810 pentium4_memcpy,
1811 pentium4_memset,
1812 1, /* scalar_stmt_cost. */
1813 1, /* scalar load_cost. */
1814 1, /* scalar_store_cost. */
1815 1, /* vec_stmt_cost. */
1816 1, /* vec_to_scalar_cost. */
1817 1, /* scalar_to_vec_cost. */
1818 1, /* vec_align_load_cost. */
1819 2, /* vec_unalign_load_cost. */
1820 1, /* vec_store_cost. */
1821 3, /* cond_taken_branch_cost. */
1822 1, /* cond_not_taken_branch_cost. */
1823};
1824
/* memcpy strategy table plugged into nocona_cost.  Entry [0] matches entry
   [0] of pentium4_memcpy; entry [1] uses loop, rep_prefix_8_byte and
   unrolled_loop for growing size bounds and ends with libcall under the -1
   bound.
   NOTE(review): -1 presumably means "no upper bound" and entries [0]/[1]
   presumably serve 32-bit/64-bit code -- confirm against the stringop_algs
   definition.  */
1825static stringop_algs nocona_memcpy[2] = {
1826 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1827 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1828 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1829
/* memset strategy table plugged into nocona_cost.  Entry [0] matches entry
   [0] of pentium4_memset; entry [1] uses loop, unrolled_loop and
   rep_prefix_8_byte for growing size bounds.  Both entries end with libcall
   under the -1 bound.
   NOTE(review): -1 presumably means "no upper bound" and entries [0]/[1]
   presumably serve 32-bit/64-bit code -- confirm against the stringop_algs
   definition.  */
1830static stringop_algs nocona_memset[2] = {
1831 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1832 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1833 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1834 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1835
/* Tuning costs for nocona, written as a positional initializer of
   struct processor_costs: the trailing comment on each entry names the
   field it fills, so entry order must match the struct declaration.
   COSTS_N_INSNS (N) entries are instruction costs relative to an add (see
   the note at the top of this file); the integer load costs are stated
   relative to a reg-reg move cost of 2.  The string-operation strategies
   are nocona_memcpy and nocona_memset.
   NOTE(review): the leading line numbers and the bare hash / "JH" lines
   look like git-blame extraction residue rather than part of the
   initializer -- confirm against the upstream file.  */
1836static const
1837struct processor_costs nocona_cost = {
1838 COSTS_N_INSNS (1), /* cost of an add instruction */
1839 COSTS_N_INSNS (1), /* cost of a lea instruction */
1840 COSTS_N_INSNS (1), /* variable shift costs */
1841 COSTS_N_INSNS (1), /* constant shift costs */
1842 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1843 COSTS_N_INSNS (10), /* HI */
1844 COSTS_N_INSNS (10), /* SI */
1845 COSTS_N_INSNS (10), /* DI */
1846 COSTS_N_INSNS (10)}, /* other */
1847 0, /* cost of multiply per each bit set */
1848 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1849 COSTS_N_INSNS (66), /* HI */
1850 COSTS_N_INSNS (66), /* SI */
1851 COSTS_N_INSNS (66), /* DI */
1852 COSTS_N_INSNS (66)}, /* other */
1853 COSTS_N_INSNS (1), /* cost of movsx */
1854 COSTS_N_INSNS (1), /* cost of movzx */
1855 16, /* "large" insn */
1856 17, /* MOVE_RATIO */
1857 4, /* cost for loading QImode using movzbl */
1858 {4, 4, 4}, /* cost of loading integer registers
1859 in QImode, HImode and SImode.
1860 Relative to reg-reg move (2). */
1861 {4, 4, 4}, /* cost of storing integer registers */
1862 3, /* cost of reg,reg fld/fst */
1863 {12, 12, 12}, /* cost of loading fp registers
1864 in SFmode, DFmode and XFmode */
1865 {4, 4, 4}, /* cost of storing fp registers
1866 in SFmode, DFmode and XFmode */
1867 6, /* cost of moving MMX register */
1868 {12, 12}, /* cost of loading MMX registers
1869 in SImode and DImode */
1870 {12, 12}, /* cost of storing MMX registers
1871 in SImode and DImode */
1872 6, /* cost of moving SSE register */
1873 {12, 12, 12}, /* cost of loading SSE registers
1874 in SImode, DImode and TImode */
1875 {12, 12, 12}, /* cost of storing SSE registers
1876 in SImode, DImode and TImode */
1877 8, /* MMX or SSE register to integer */
1878 8, /* size of l1 cache. */
1879 1024, /* size of l2 cache. */
1880 64, /* size of prefetch block */
1881 8, /* number of parallel prefetches */
1882 1, /* Branch cost */
1883 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1884 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1885 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1886 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1887 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1888 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
6065f444 1889
c53c148c 1890 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
6065f444
JH
1891 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1892 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
1893 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
c53c148c
JH
1894 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
1895 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
6065f444
JH
1896 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
1897 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
1898 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
1899 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
64766e8d
JH
1900 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1901 nocona_memcpy,
1902 nocona_memset,
1903 1, /* scalar_stmt_cost. */
1904 1, /* scalar load_cost. */
1905 1, /* scalar_store_cost. */
1906 1, /* vec_stmt_cost. */
1907 1, /* vec_to_scalar_cost. */
1908 1, /* scalar_to_vec_cost. */
1909 1, /* vec_align_load_cost. */
1910 2, /* vec_unalign_load_cost. */
1911 1, /* vec_store_cost. */
1912 3, /* cond_taken_branch_cost. */
1913 1, /* cond_not_taken_branch_cost. */
1914};
1915
/* memcpy strategy table plugged into atom_cost.  Two entries, each a
   leading libcall plus {size bound, algorithm, flag} triples with the
   bounds in increasing order and -1 last.  slm_memcpy and intel_memcpy
   carry the same values.
   NOTE(review): -1 presumably means "no upper bound" and entries [0]/[1]
   presumably serve 32-bit/64-bit code (only [1] uses rep_prefix_8_byte) --
   confirm against the stringop_algs definition.  */
1916static stringop_algs atom_memcpy[2] = {
1917 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1918 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1919 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table plugged into atom_cost.  Both entries step through
   loop, unrolled_loop and a rep_prefix algorithm as the size bound grows
   and end with libcall under the -1 bound.  slm_memset and intel_memset
   carry the same values.
   NOTE(review): the meaning of the bound, the -1 sentinel and the boolean
   flag is not visible here -- confirm against the stringop_algs
   definition.  */
1920static stringop_algs atom_memset[2] = {
1921 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1922 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1923 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1924 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Tuning costs for atom, written as a positional initializer of
   struct processor_costs: the trailing comment on each entry names the
   field it fills, so entry order must match the struct declaration.
   COSTS_N_INSNS (N) entries are instruction costs relative to an add (see
   the note at the top of this file); the integer load costs are stated
   relative to a reg-reg move cost of 2.  The string-operation strategies
   are atom_memcpy and atom_memset.
   NOTE(review): the leading line numbers and the bare hash / "JH" lines
   look like git-blame extraction residue rather than part of the
   initializer -- confirm against the upstream file.  */
1925static const
1926struct processor_costs atom_cost = {
1927 COSTS_N_INSNS (1), /* cost of an add instruction */
1928 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction: one unit above
    the add cost on the previous line */
1929 COSTS_N_INSNS (1), /* variable shift costs */
1930 COSTS_N_INSNS (1), /* constant shift costs */
1931 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1932 COSTS_N_INSNS (4), /* HI */
1933 COSTS_N_INSNS (3), /* SI */
1934 COSTS_N_INSNS (4), /* DI */
1935 COSTS_N_INSNS (2)}, /* other */
1936 0, /* cost of multiply per each bit set */
1937 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1938 COSTS_N_INSNS (26), /* HI */
1939 COSTS_N_INSNS (42), /* SI */
1940 COSTS_N_INSNS (74), /* DI */
1941 COSTS_N_INSNS (74)}, /* other */
1942 COSTS_N_INSNS (1), /* cost of movsx */
1943 COSTS_N_INSNS (1), /* cost of movzx */
1944 8, /* "large" insn */
1945 17, /* MOVE_RATIO */
1946 4, /* cost for loading QImode using movzbl */
1947 {4, 4, 4}, /* cost of loading integer registers
1948 in QImode, HImode and SImode.
1949 Relative to reg-reg move (2). */
1950 {4, 4, 4}, /* cost of storing integer registers */
1951 4, /* cost of reg,reg fld/fst */
1952 {12, 12, 12}, /* cost of loading fp registers
1953 in SFmode, DFmode and XFmode */
1954 {6, 6, 8}, /* cost of storing fp registers
1955 in SFmode, DFmode and XFmode */
1956 2, /* cost of moving MMX register */
1957 {8, 8}, /* cost of loading MMX registers
1958 in SImode and DImode */
1959 {8, 8}, /* cost of storing MMX registers
1960 in SImode and DImode */
1961 2, /* cost of moving SSE register */
1962 {8, 8, 8}, /* cost of loading SSE registers
1963 in SImode, DImode and TImode */
1964 {8, 8, 8}, /* cost of storing SSE registers
1965 in SImode, DImode and TImode */
1966 5, /* MMX or SSE register to integer */
1967 32, /* size of l1 cache. */
1968 256, /* size of l2 cache. */
1969 64, /* size of prefetch block */
1970 6, /* number of parallel prefetches */
1971 3, /* Branch cost */
1972 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1973 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1974 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1975 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1976 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1977 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 1978
c53c148c 1979 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
1980 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1981 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1982 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
1983 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1984 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
1985 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
1986 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
1987 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
1988 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
64766e8d
JH
1989 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1990 atom_memcpy,
1991 atom_memset,
1992 1, /* scalar_stmt_cost. */
1993 1, /* scalar load_cost. */
1994 1, /* scalar_store_cost. */
1995 1, /* vec_stmt_cost. */
1996 1, /* vec_to_scalar_cost. */
1997 1, /* scalar_to_vec_cost. */
1998 1, /* vec_align_load_cost. */
1999 2, /* vec_unalign_load_cost. */
2000 1, /* vec_store_cost. */
2001 3, /* cond_taken_branch_cost. */
2002 1, /* cond_not_taken_branch_cost. */
2003};
2004
/* memcpy strategy table plugged into slm_cost.  Two entries, each a
   leading libcall plus {size bound, algorithm, flag} triples with the
   bounds in increasing order and -1 last.  The values are the same as in
   atom_memcpy.
   NOTE(review): -1 presumably means "no upper bound" and entries [0]/[1]
   presumably serve 32-bit/64-bit code (only [1] uses rep_prefix_8_byte) --
   confirm against the stringop_algs definition.  */
2005static stringop_algs slm_memcpy[2] = {
2006 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2007 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2008 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table plugged into slm_cost.  Both entries step through
   loop, unrolled_loop and a rep_prefix algorithm as the size bound grows
   and end with libcall under the -1 bound.  The values are the same as in
   atom_memset.
   NOTE(review): the meaning of the bound, the -1 sentinel and the boolean
   flag is not visible here -- confirm against the stringop_algs
   definition.  */
2009static stringop_algs slm_memset[2] = {
2010 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2011 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2012 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2013 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Tuning costs for slm, written as a positional initializer of
   struct processor_costs: the trailing comment on each entry names the
   field it fills, so entry order must match the struct declaration.
   COSTS_N_INSNS (N) entries are instruction costs relative to an add (see
   the note at the top of this file); the integer load costs are stated
   relative to a reg-reg move cost of 2.  The string-operation strategies
   are slm_memcpy and slm_memset.
   NOTE(review): the leading line numbers and the bare hash / "JH" lines
   look like git-blame extraction residue rather than part of the
   initializer -- confirm against the upstream file.  */
2014static const
2015struct processor_costs slm_cost = {
2016 COSTS_N_INSNS (1), /* cost of an add instruction */
2017 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction: one unit above
    the add cost on the previous line */
2018 COSTS_N_INSNS (1), /* variable shift costs */
2019 COSTS_N_INSNS (1), /* constant shift costs */
2020 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2021 COSTS_N_INSNS (3), /* HI */
2022 COSTS_N_INSNS (3), /* SI */
2023 COSTS_N_INSNS (4), /* DI */
2024 COSTS_N_INSNS (2)}, /* other */
2025 0, /* cost of multiply per each bit set */
2026 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2027 COSTS_N_INSNS (26), /* HI */
2028 COSTS_N_INSNS (42), /* SI */
2029 COSTS_N_INSNS (74), /* DI */
2030 COSTS_N_INSNS (74)}, /* other */
2031 COSTS_N_INSNS (1), /* cost of movsx */
2032 COSTS_N_INSNS (1), /* cost of movzx */
2033 8, /* "large" insn */
2034 17, /* MOVE_RATIO */
2035 4, /* cost for loading QImode using movzbl */
2036 {4, 4, 4}, /* cost of loading integer registers
2037 in QImode, HImode and SImode.
2038 Relative to reg-reg move (2). */
2039 {4, 4, 4}, /* cost of storing integer registers */
2040 4, /* cost of reg,reg fld/fst */
2041 {12, 12, 12}, /* cost of loading fp registers
2042 in SFmode, DFmode and XFmode */
2043 {6, 6, 8}, /* cost of storing fp registers
2044 in SFmode, DFmode and XFmode */
2045 2, /* cost of moving MMX register */
2046 {8, 8}, /* cost of loading MMX registers
2047 in SImode and DImode */
2048 {8, 8}, /* cost of storing MMX registers
2049 in SImode and DImode */
2050 2, /* cost of moving SSE register */
2051 {8, 8, 8}, /* cost of loading SSE registers
2052 in SImode, DImode and TImode */
2053 {8, 8, 8}, /* cost of storing SSE registers
2054 in SImode, DImode and TImode */
2055 5, /* MMX or SSE register to integer */
2056 32, /* size of l1 cache. */
2057 256, /* size of l2 cache. */
2058 64, /* size of prefetch block */
2059 6, /* number of parallel prefetches */
2060 3, /* Branch cost */
2061 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2062 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2063 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2064 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2065 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2066 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2067
c53c148c 2068 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2069 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2070 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2071 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
2072 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2073 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
2074 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2075 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2076 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2077 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
64766e8d
JH
2078 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2079 slm_memcpy,
2080 slm_memset,
2081 1, /* scalar_stmt_cost. */
2082 1, /* scalar load_cost. */
2083 1, /* scalar_store_cost. */
2084 1, /* vec_stmt_cost. */
2085 4, /* vec_to_scalar_cost. */
2086 1, /* scalar_to_vec_cost. */
2087 1, /* vec_align_load_cost. */
2088 2, /* vec_unalign_load_cost. */
2089 1, /* vec_store_cost. */
2090 3, /* cond_taken_branch_cost. */
2091 1, /* cond_not_taken_branch_cost. */
2092};
2093
/* memcpy strategy table plugged into intel_cost.  Two entries, each a
   leading libcall plus {size bound, algorithm, flag} triples with the
   bounds in increasing order and -1 last.  The values are the same as in
   atom_memcpy and slm_memcpy.
   NOTE(review): -1 presumably means "no upper bound" and entries [0]/[1]
   presumably serve 32-bit/64-bit code (only [1] uses rep_prefix_8_byte) --
   confirm against the stringop_algs definition.  */
2094static stringop_algs intel_memcpy[2] = {
2095 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2096 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2097 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table plugged into intel_cost.  Both entries step
   through loop, unrolled_loop and a rep_prefix algorithm as the size bound
   grows and end with libcall under the -1 bound.  The values are the same
   as in atom_memset and slm_memset.
   NOTE(review): the meaning of the bound, the -1 sentinel and the boolean
   flag is not visible here -- confirm against the stringop_algs
   definition.  */
2098static stringop_algs intel_memset[2] = {
2099 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2100 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2101 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2102 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Tuning costs named intel_cost, written as a positional initializer of
   struct processor_costs: the trailing comment on each entry names the
   field it fills, so entry order must match the struct declaration.
   COSTS_N_INSNS (N) entries are instruction costs relative to an add (see
   the note at the top of this file); the integer load costs are stated
   relative to a reg-reg move cost of 2.  The string-operation strategies
   are intel_memcpy and intel_memset.
   NOTE(review): the leading line numbers and the bare hash / "JH" lines
   look like git-blame extraction residue rather than part of the
   initializer -- confirm against the upstream file.  */
2103static const
2104struct processor_costs intel_cost = {
2105 COSTS_N_INSNS (1), /* cost of an add instruction */
2106 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction: one unit above
    the add cost on the previous line */
2107 COSTS_N_INSNS (1), /* variable shift costs */
2108 COSTS_N_INSNS (1), /* constant shift costs */
2109 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2110 COSTS_N_INSNS (3), /* HI */
2111 COSTS_N_INSNS (3), /* SI */
2112 COSTS_N_INSNS (4), /* DI */
2113 COSTS_N_INSNS (2)}, /* other */
2114 0, /* cost of multiply per each bit set */
2115 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2116 COSTS_N_INSNS (26), /* HI */
2117 COSTS_N_INSNS (42), /* SI */
2118 COSTS_N_INSNS (74), /* DI */
2119 COSTS_N_INSNS (74)}, /* other */
2120 COSTS_N_INSNS (1), /* cost of movsx */
2121 COSTS_N_INSNS (1), /* cost of movzx */
2122 8, /* "large" insn */
2123 17, /* MOVE_RATIO */
2124 4, /* cost for loading QImode using movzbl */
2125 {4, 4, 4}, /* cost of loading integer registers
2126 in QImode, HImode and SImode.
2127 Relative to reg-reg move (2). */
2128 {4, 4, 4}, /* cost of storing integer registers */
2129 4, /* cost of reg,reg fld/fst */
2130 {12, 12, 12}, /* cost of loading fp registers
2131 in SFmode, DFmode and XFmode */
2132 {6, 6, 8}, /* cost of storing fp registers
2133 in SFmode, DFmode and XFmode */
2134 2, /* cost of moving MMX register */
2135 {8, 8}, /* cost of loading MMX registers
2136 in SImode and DImode */
2137 {8, 8}, /* cost of storing MMX registers
2138 in SImode and DImode */
2139 2, /* cost of moving SSE register */
2140 {8, 8, 8}, /* cost of loading SSE registers
2141 in SImode, DImode and TImode */
2142 {8, 8, 8}, /* cost of storing SSE registers
2143 in SImode, DImode and TImode */
2144 5, /* MMX or SSE register to integer */
2145 32, /* size of l1 cache. */
2146 256, /* size of l2 cache. */
2147 64, /* size of prefetch block */
2148 6, /* number of parallel prefetches */
2149 3, /* Branch cost */
2150 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2151 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2152 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2153 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2154 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2155 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2156
c53c148c 2157 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
6065f444
JH
2158 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2159 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2160 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
c53c148c
JH
2161 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2162 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
6065f444
JH
2163 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2164 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2165 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2166 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
64766e8d
JH
2167 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2168 intel_memcpy,
2169 intel_memset,
2170 1, /* scalar_stmt_cost. */
2171 1, /* scalar load_cost. */
2172 1, /* scalar_store_cost. */
2173 1, /* vec_stmt_cost. */
2174 4, /* vec_to_scalar_cost. */
2175 1, /* scalar_to_vec_cost. */
2176 1, /* vec_align_load_cost. */
2177 2, /* vec_unalign_load_cost. */
2178 1, /* vec_store_cost. */
2179 3, /* cond_taken_branch_cost. */
2180 1, /* cond_not_taken_branch_cost. */
2181};
2182
2183/* Generic should produce code tuned for Core-i7 (and newer chips)
2184 and btver1 (and newer chips). */
2185
/* memcpy strategy table for generic tuning.  Both entries use loop up to
   the bound 32, a rep_prefix algorithm up to the bound 8192 and libcall
   under the final -1 bound; they differ only in rep_prefix_4_byte for [0]
   versus rep_prefix_8_byte for [1].
   NOTE(review): presumably consumed by generic_cost and indexed by
   32-bit/64-bit mode -- confirm against generic_cost and the stringop_algs
   definition.  */
2186static stringop_algs generic_memcpy[2] = {
2187 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2188 {-1, libcall, false}}},
2189 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2190 {-1, libcall, false}}}};
/* memset strategy table for generic tuning.  Its contents are identical to
   generic_memcpy: loop up to the bound 32, rep_prefix_4_byte ([0]) or
   rep_prefix_8_byte ([1]) up to the bound 8192, and libcall under the final
   -1 bound.
   NOTE(review): presumably consumed by generic_cost and indexed by
   32-bit/64-bit mode -- confirm against generic_cost and the stringop_algs
   definition.  */
2191static stringop_algs generic_memset[2] = {
2192 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2193 {-1, libcall, false}}},
2194 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2195 {-1, libcall, false}}}};
/* Cost table for generic tuning.
   The initializer is positional: each value fills the next member of
   struct processor_costs, and the trailing comment on each line names the
   member it is meant for.  Values wrapped in COSTS_N_INSNS are instruction
   costs relative to an add (see the note at the top of this file); the
   bare integers are move/load/store costs, cache parameters and vectorizer
   costs.  String operations use generic_memcpy and generic_memset.  */
2196static const
2197struct processor_costs generic_cost = {
2198 COSTS_N_INSNS (1), /* cost of an add instruction */
2199 /* On all chips taken into consideration lea is 2 cycles and more. With
2200 this cost however our current implementation of synth_mult results in
2201 use of unnecessary temporary registers causing regression on several
2202 SPECfp benchmarks. */
2203 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; deliberately kept
 just above an add instead of 2 insns, for the
 synth_mult reason given above */
2204 COSTS_N_INSNS (1), /* variable shift costs */
2205 COSTS_N_INSNS (1), /* constant shift costs */
2206 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2207 COSTS_N_INSNS (4), /* HI */
2208 COSTS_N_INSNS (3), /* SI */
2209 COSTS_N_INSNS (4), /* DI */
2210 COSTS_N_INSNS (2)}, /* other */
2211 0, /* cost of multiply per each bit set */
2212 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2213 COSTS_N_INSNS (26), /* HI */
2214 COSTS_N_INSNS (42), /* SI */
2215 COSTS_N_INSNS (74), /* DI */
2216 COSTS_N_INSNS (74)}, /* other */
2217 COSTS_N_INSNS (1), /* cost of movsx */
2218 COSTS_N_INSNS (1), /* cost of movzx */
2219 8, /* "large" insn */
2220 17, /* MOVE_RATIO */
2221 4, /* cost for loading QImode using movzbl */
2222 {4, 4, 4}, /* cost of loading integer registers
2223 in QImode, HImode and SImode.
2224 Relative to reg-reg move (2). */
2225 {4, 4, 4}, /* cost of storing integer registers */
2226 4, /* cost of reg,reg fld/fst */
2227 {12, 12, 12}, /* cost of loading fp registers
2228 in SFmode, DFmode and XFmode */
2229 {6, 6, 8}, /* cost of storing fp registers
2230 in SFmode, DFmode and XFmode */
2231 2, /* cost of moving MMX register */
2232 {8, 8}, /* cost of loading MMX registers
2233 in SImode and DImode */
2234 {8, 8}, /* cost of storing MMX registers
2235 in SImode and DImode */
2236 2, /* cost of moving SSE register */
2237 {8, 8, 8}, /* cost of loading SSE registers
2238 in SImode, DImode and TImode */
2239 {8, 8, 8}, /* cost of storing SSE registers
2240 in SImode, DImode and TImode */
2241 5, /* MMX or SSE register to integer */
2242 32, /* size of l1 cache. */
2243 512, /* size of l2 cache. */
2244 64, /* size of prefetch block */
2245 6, /* number of parallel prefetches */
2246 /* Benchmarks show large regressions on the K8 sixtrack benchmark when
2247 this value is increased to the perhaps more appropriate value of 5. */
2248 3, /* Branch cost */
2249 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2250 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2251 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2252 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2253 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2254 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2255
/* NOTE(review): the SSE costs below mirror the x87 values above (8 for
   add/mul/FMA and even the "cheap" op, 20 for divide, 40 for sqrt);
   confirm they are intentional rather than placeholders.  */
c53c148c 2256 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
6065f444
JH
2257 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2258 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2259 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
c53c148c
JH
2260 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
2261 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
6065f444
JH
2262 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2263 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2264 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2265 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
64766e8d
JH
2266 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2267 generic_memcpy,
2268 generic_memset,
2269 1, /* scalar_stmt_cost. */
2270 1, /* scalar_load_cost. */
2271 1, /* scalar_store_cost. */
2272 1, /* vec_stmt_cost. */
2273 1, /* vec_to_scalar_cost. */
2274 1, /* scalar_to_vec_cost. */
2275 1, /* vec_align_load_cost. */
2276 2, /* vec_unalign_load_cost. */
2277 1, /* vec_store_cost. */
2278 3, /* cond_taken_branch_cost. */
2279 1, /* cond_not_taken_branch_cost. */
2280};
2281
2282/* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion strategy table referenced by core_cost.
   The first entry uses rep_prefix_4_byte up to 1024 and libcall beyond;
   the second uses an inline loop up to 24, rep_prefix_8_byte up to 128 and
   libcall beyond.  The size-bounded elements carry `true' as their third
   member; only the -1 terminators carry `false'.
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   variant, and the third member is a "no alignment needed" flag -- confirm
   against the stringop_algs definition.  */
2283static stringop_algs core_memcpy[2] = {
2284 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2285 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2286 {-1, libcall, false}}}};
/* memset expansion strategy table referenced by core_cost.
   The first entry steps through loop_1_byte (up to 6), loop (up to 24) and
   rep_prefix_4_byte (up to 8192) before falling back to libcall; the second
   uses loop (up to 24) and rep_prefix_8_byte (up to 512) before libcall.
   NOTE(review): presumably entry 0 is the 32-bit and entry 1 the 64-bit
   variant, and each triple is {max size, algorithm, noalign flag} --
   confirm against the stringop_algs definition.  */
2287static stringop_algs core_memset[2] = {
2288 {libcall, {{6, loop_1_byte, true},
2289 {24, loop, true},
2290 {8192, rep_prefix_4_byte, true},
2291 {-1, libcall, false}}},
2292 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2293 {-1, libcall, false}}}};
2294
/* Cost table for tuning for the Core family of CPUs.
   The initializer is positional: each value fills the next member of
   struct processor_costs, and the trailing comment on each line names the
   member it is meant for.  Values wrapped in COSTS_N_INSNS are instruction
   costs relative to an add (see the note at the top of this file); the
   bare integers are move/load/store costs, cache parameters and vectorizer
   costs.  String operations use core_memcpy and core_memset.  */
2295static const
2296struct processor_costs core_cost = {
2297 COSTS_N_INSNS (1), /* cost of an add instruction */
2298 /* On all chips taken into consideration lea is 2 cycles and more. With
2299 this cost however our current implementation of synth_mult results in
2300 use of unnecessary temporary registers causing regression on several
2301 SPECfp benchmarks. */
2302 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; deliberately kept
 just above an add instead of 2 insns, for the
 synth_mult reason given above */
2303 COSTS_N_INSNS (1), /* variable shift costs */
2304 COSTS_N_INSNS (1), /* constant shift costs */
2305 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2306 COSTS_N_INSNS (4), /* HI */
2307 COSTS_N_INSNS (3), /* SI */
2308 COSTS_N_INSNS (4), /* DI */
2309 COSTS_N_INSNS (2)}, /* other */
2310 0, /* cost of multiply per each bit set */
2311 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2312 COSTS_N_INSNS (26), /* HI */
2313 COSTS_N_INSNS (42), /* SI */
2314 COSTS_N_INSNS (74), /* DI */
2315 COSTS_N_INSNS (74)}, /* other */
2316 COSTS_N_INSNS (1), /* cost of movsx */
2317 COSTS_N_INSNS (1), /* cost of movzx */
2318 8, /* "large" insn */
2319 17, /* MOVE_RATIO */
2320 4, /* cost for loading QImode using movzbl */
2321 {4, 4, 4}, /* cost of loading integer registers
2322 in QImode, HImode and SImode.
2323 Relative to reg-reg move (2). */
2324 {4, 4, 4}, /* cost of storing integer registers */
2325 4, /* cost of reg,reg fld/fst */
2326 {12, 12, 12}, /* cost of loading fp registers
2327 in SFmode, DFmode and XFmode */
2328 {6, 6, 8}, /* cost of storing fp registers
2329 in SFmode, DFmode and XFmode */
2330 2, /* cost of moving MMX register */
2331 {8, 8}, /* cost of loading MMX registers
2332 in SImode and DImode */
2333 {8, 8}, /* cost of storing MMX registers
2334 in SImode and DImode */
2335 2, /* cost of moving SSE register */
2336 {8, 8, 8}, /* cost of loading SSE registers
2337 in SImode, DImode and TImode */
2338 {8, 8, 8}, /* cost of storing SSE registers
2339 in SImode, DImode and TImode */
2340 5, /* MMX or SSE register to integer */
2341 64, /* size of l1 cache. */
2342 512, /* size of l2 cache. */
2343 64, /* size of prefetch block */
2344 6, /* number of parallel prefetches */
2345 /* FIXME perhaps more appropriate value is 5. */
2346 3, /* Branch cost */
2347 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2348 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2349 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2350 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2351 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2352 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
6065f444 2353
c53c148c 2354 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
6065f444
JH
2355 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2356 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2357 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
c53c148c
JH
2358 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2359 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
6065f444
JH
2360 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2361 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2362 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2363 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
64766e8d
JH
2364 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2365 core_memcpy,
2366 core_memset,
2367 1, /* scalar_stmt_cost. */
2368 1, /* scalar_load_cost. */
2369 1, /* scalar_store_cost. */
2370 1, /* vec_stmt_cost. */
2371 1, /* vec_to_scalar_cost. */
2372 1, /* scalar_to_vec_cost. */
2373 1, /* vec_align_load_cost. */
2374 2, /* vec_unalign_load_cost. */
2375 1, /* vec_store_cost. */
2376 3, /* cond_taken_branch_cost. */
2377 1, /* cond_not_taken_branch_cost. */
2378};
2379