]>
Commit | Line | Data |
---|---|---|
/* Costs of operations of individual x86 CPUs.
   Copyright (C) 1988-2019 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   COSTS_N_BYTES (N) therefore expands to N * 2, so a 2-byte instruction
   gets the same cost as COSTS_N_INSNS (1) under that assumption.  */
#define COSTS_N_BYTES(N) ((N) * 2)

/* Placeholder string-operation table entry: libcall as the leading
   algorithm and a single {-1, libcall, false} size entry.  It fills the
   second slot of the per-CPU memcpy/memset tables below.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}

30 | static stringop_algs ix86_size_memcpy[2] = { | |
31 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, | |
32 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; | |
33 | static stringop_algs ix86_size_memset[2] = { | |
34 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, | |
35 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; | |
36 | ||
37 | const | |
38 | struct processor_costs ix86_size_cost = {/* costs for tuning for size */ | |
39 | COSTS_N_BYTES (2), /* cost of an add instruction */ | |
40 | COSTS_N_BYTES (3), /* cost of a lea instruction */ | |
41 | COSTS_N_BYTES (2), /* variable shift costs */ | |
42 | COSTS_N_BYTES (3), /* constant shift costs */ | |
43 | {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ | |
44 | COSTS_N_BYTES (3), /* HI */ | |
45 | COSTS_N_BYTES (3), /* SI */ | |
46 | COSTS_N_BYTES (3), /* DI */ | |
47 | COSTS_N_BYTES (5)}, /* other */ | |
48 | 0, /* cost of multiply per each bit set */ | |
49 | {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ | |
50 | COSTS_N_BYTES (3), /* HI */ | |
51 | COSTS_N_BYTES (3), /* SI */ | |
52 | COSTS_N_BYTES (3), /* DI */ | |
53 | COSTS_N_BYTES (5)}, /* other */ | |
54 | COSTS_N_BYTES (3), /* cost of movsx */ | |
55 | COSTS_N_BYTES (3), /* cost of movzx */ | |
56 | 0, /* "large" insn */ | |
57 | 2, /* MOVE_RATIO */ | |
df41dbaf JH |
58 | |
59 | /* All move costs are relative to integer->integer move times 2. */ | |
64766e8d JH |
60 | 2, /* cost for loading QImode using movzbl */ |
61 | {2, 2, 2}, /* cost of loading integer registers | |
62 | in QImode, HImode and SImode. | |
63 | Relative to reg-reg move (2). */ | |
64 | {2, 2, 2}, /* cost of storing integer registers */ | |
65 | 2, /* cost of reg,reg fld/fst */ | |
66 | {2, 2, 2}, /* cost of loading fp registers | |
67 | in SFmode, DFmode and XFmode */ | |
68 | {2, 2, 2}, /* cost of storing fp registers | |
69 | in SFmode, DFmode and XFmode */ | |
70 | 3, /* cost of moving MMX register */ | |
71 | {3, 3}, /* cost of loading MMX registers | |
72 | in SImode and DImode */ | |
73 | {3, 3}, /* cost of storing MMX registers | |
74 | in SImode and DImode */ | |
df41dbaf JH |
75 | 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ |
76 | {3, 3, 3, 3, 3}, /* cost of loading SSE registers | |
77 | in 32,64,128,256 and 512-bit */ | |
78 | {3, 3, 3, 3, 3}, /* cost of unaligned SSE load | |
79 | in 128bit, 256bit and 512bit */ | |
80 | {3, 3, 3, 3, 3}, /* cost of storing SSE registers | |
81 | in 32,64,128,256 and 512-bit */ | |
82 | {3, 3, 3, 3, 3}, /* cost of unaligned SSE store | |
83 | in 128bit, 256bit and 512bit */ | |
84 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
85 | 5, 0, /* Gather load static, per_elt. */ |
86 | 5, 0, /* Gather store static, per_elt. */ | |
64766e8d JH |
87 | 0, /* size of l1 cache */ |
88 | 0, /* size of l2 cache */ | |
89 | 0, /* size of prefetch block */ | |
90 | 0, /* number of parallel prefetches */ | |
91 | 2, /* Branch cost */ | |
92 | COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ | |
93 | COSTS_N_BYTES (2), /* cost of FMUL instruction. */ | |
94 | COSTS_N_BYTES (2), /* cost of FDIV instruction. */ | |
95 | COSTS_N_BYTES (2), /* cost of FABS instruction. */ | |
96 | COSTS_N_BYTES (2), /* cost of FCHS instruction. */ | |
97 | COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ | |
6065f444 | 98 | |
c53c148c | 99 | COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
100 | COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ |
101 | COSTS_N_BYTES (2), /* cost of MULSS instruction. */ | |
102 | COSTS_N_BYTES (2), /* cost of MULSD instruction. */ | |
c53c148c JH |
103 | COSTS_N_BYTES (2), /* cost of FMA SS instruction. */ |
104 | COSTS_N_BYTES (2), /* cost of FMA SD instruction. */ | |
6065f444 JH |
105 | COSTS_N_BYTES (2), /* cost of DIVSS instruction. */ |
106 | COSTS_N_BYTES (2), /* cost of DIVSD instruction. */ | |
107 | COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */ | |
108 | COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
109 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
110 | ix86_size_memcpy, | |
111 | ix86_size_memset, | |
f6fd8f2b JH |
112 | COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ |
113 | COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
114 | NULL, /* Loop alignment. */ |
115 | NULL, /* Jump alignment. */ | |
116 | NULL, /* Label alignment. */ | |
117 | NULL, /* Func alignment. */ | |
64766e8d JH |
118 | }; |
119 | ||
120 | /* Processor costs (relative to an add) */ | |
121 | static stringop_algs i386_memcpy[2] = { | |
122 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, | |
123 | DUMMY_STRINGOP_ALGS}; | |
124 | static stringop_algs i386_memset[2] = { | |
125 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, | |
126 | DUMMY_STRINGOP_ALGS}; | |
127 | ||
128 | static const | |
129 | struct processor_costs i386_cost = { /* 386 specific costs */ | |
130 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
131 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
132 | COSTS_N_INSNS (3), /* variable shift costs */ | |
133 | COSTS_N_INSNS (2), /* constant shift costs */ | |
134 | {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ | |
135 | COSTS_N_INSNS (6), /* HI */ | |
136 | COSTS_N_INSNS (6), /* SI */ | |
137 | COSTS_N_INSNS (6), /* DI */ | |
138 | COSTS_N_INSNS (6)}, /* other */ | |
139 | COSTS_N_INSNS (1), /* cost of multiply per each bit set */ | |
140 | {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ | |
141 | COSTS_N_INSNS (23), /* HI */ | |
142 | COSTS_N_INSNS (23), /* SI */ | |
143 | COSTS_N_INSNS (23), /* DI */ | |
144 | COSTS_N_INSNS (23)}, /* other */ | |
145 | COSTS_N_INSNS (3), /* cost of movsx */ | |
146 | COSTS_N_INSNS (2), /* cost of movzx */ | |
147 | 15, /* "large" insn */ | |
148 | 3, /* MOVE_RATIO */ | |
df41dbaf JH |
149 | |
150 | /* All move costs are relative to integer->integer move times 2 and thus | |
151 | they are latency*2. */ | |
64766e8d JH |
152 | 4, /* cost for loading QImode using movzbl */ |
153 | {2, 4, 2}, /* cost of loading integer registers | |
154 | in QImode, HImode and SImode. | |
155 | Relative to reg-reg move (2). */ | |
156 | {2, 4, 2}, /* cost of storing integer registers */ | |
157 | 2, /* cost of reg,reg fld/fst */ | |
158 | {8, 8, 8}, /* cost of loading fp registers | |
159 | in SFmode, DFmode and XFmode */ | |
160 | {8, 8, 8}, /* cost of storing fp registers | |
161 | in SFmode, DFmode and XFmode */ | |
162 | 2, /* cost of moving MMX register */ | |
163 | {4, 8}, /* cost of loading MMX registers | |
164 | in SImode and DImode */ | |
165 | {4, 8}, /* cost of storing MMX registers | |
166 | in SImode and DImode */ | |
df41dbaf JH |
167 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
168 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
169 | in 32,64,128,256 and 512-bit */ | |
170 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
171 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
172 | in 32,64,128,256 and 512-bit */ | |
173 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
174 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
175 | 4, 4, /* Gather load static, per_elt. */ |
176 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
177 | 0, /* size of l1 cache */ |
178 | 0, /* size of l2 cache */ | |
179 | 0, /* size of prefetch block */ | |
180 | 0, /* number of parallel prefetches */ | |
181 | 1, /* Branch cost */ | |
182 | COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ | |
183 | COSTS_N_INSNS (27), /* cost of FMUL instruction. */ | |
184 | COSTS_N_INSNS (88), /* cost of FDIV instruction. */ | |
185 | COSTS_N_INSNS (22), /* cost of FABS instruction. */ | |
186 | COSTS_N_INSNS (24), /* cost of FCHS instruction. */ | |
187 | COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ | |
6065f444 | 188 | |
c53c148c | 189 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
190 | COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ |
191 | COSTS_N_INSNS (27), /* cost of MULSS instruction. */ | |
192 | COSTS_N_INSNS (27), /* cost of MULSD instruction. */ | |
c53c148c JH |
193 | COSTS_N_INSNS (27), /* cost of FMA SS instruction. */ |
194 | COSTS_N_INSNS (27), /* cost of FMA SD instruction. */ | |
6065f444 JH |
195 | COSTS_N_INSNS (88), /* cost of DIVSS instruction. */ |
196 | COSTS_N_INSNS (88), /* cost of DIVSD instruction. */ | |
197 | COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */ | |
198 | COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
199 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
200 | i386_memcpy, | |
201 | i386_memset, | |
f6fd8f2b JH |
202 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
203 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
204 | "4", /* Loop alignment. */ |
205 | "4", /* Jump alignment. */ | |
206 | NULL, /* Label alignment. */ | |
207 | "4", /* Func alignment. */ | |
64766e8d JH |
208 | }; |
209 | ||
210 | static stringop_algs i486_memcpy[2] = { | |
211 | {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, | |
212 | DUMMY_STRINGOP_ALGS}; | |
213 | static stringop_algs i486_memset[2] = { | |
214 | {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, | |
215 | DUMMY_STRINGOP_ALGS}; | |
216 | ||
217 | static const | |
218 | struct processor_costs i486_cost = { /* 486 specific costs */ | |
219 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
220 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
221 | COSTS_N_INSNS (3), /* variable shift costs */ | |
222 | COSTS_N_INSNS (2), /* constant shift costs */ | |
223 | {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ | |
224 | COSTS_N_INSNS (12), /* HI */ | |
225 | COSTS_N_INSNS (12), /* SI */ | |
226 | COSTS_N_INSNS (12), /* DI */ | |
227 | COSTS_N_INSNS (12)}, /* other */ | |
228 | 1, /* cost of multiply per each bit set */ | |
229 | {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ | |
230 | COSTS_N_INSNS (40), /* HI */ | |
231 | COSTS_N_INSNS (40), /* SI */ | |
232 | COSTS_N_INSNS (40), /* DI */ | |
233 | COSTS_N_INSNS (40)}, /* other */ | |
234 | COSTS_N_INSNS (3), /* cost of movsx */ | |
235 | COSTS_N_INSNS (2), /* cost of movzx */ | |
236 | 15, /* "large" insn */ | |
237 | 3, /* MOVE_RATIO */ | |
df41dbaf JH |
238 | |
239 | /* All move costs are relative to integer->integer move times 2 and thus | |
240 | they are latency*2. */ | |
64766e8d JH |
241 | 4, /* cost for loading QImode using movzbl */ |
242 | {2, 4, 2}, /* cost of loading integer registers | |
243 | in QImode, HImode and SImode. | |
244 | Relative to reg-reg move (2). */ | |
245 | {2, 4, 2}, /* cost of storing integer registers */ | |
246 | 2, /* cost of reg,reg fld/fst */ | |
247 | {8, 8, 8}, /* cost of loading fp registers | |
248 | in SFmode, DFmode and XFmode */ | |
249 | {8, 8, 8}, /* cost of storing fp registers | |
250 | in SFmode, DFmode and XFmode */ | |
251 | 2, /* cost of moving MMX register */ | |
252 | {4, 8}, /* cost of loading MMX registers | |
253 | in SImode and DImode */ | |
254 | {4, 8}, /* cost of storing MMX registers | |
255 | in SImode and DImode */ | |
df41dbaf JH |
256 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
257 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
258 | in 32,64,128,256 and 512-bit */ | |
259 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
260 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
261 | in 32,64,128,256 and 512-bit */ | |
262 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
263 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
264 | 4, 4, /* Gather load static, per_elt. */ |
265 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
266 | 4, /* size of l1 cache. 486 has 8kB cache |
267 | shared for code and data, so 4kB is | |
268 | not really precise. */ | |
269 | 4, /* size of l2 cache */ | |
270 | 0, /* size of prefetch block */ | |
271 | 0, /* number of parallel prefetches */ | |
272 | 1, /* Branch cost */ | |
273 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ | |
274 | COSTS_N_INSNS (16), /* cost of FMUL instruction. */ | |
275 | COSTS_N_INSNS (73), /* cost of FDIV instruction. */ | |
276 | COSTS_N_INSNS (3), /* cost of FABS instruction. */ | |
277 | COSTS_N_INSNS (3), /* cost of FCHS instruction. */ | |
278 | COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ | |
6065f444 | 279 | |
c53c148c | 280 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
281 | COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ |
282 | COSTS_N_INSNS (16), /* cost of MULSS instruction. */ | |
283 | COSTS_N_INSNS (16), /* cost of MULSD instruction. */ | |
c53c148c JH |
284 | COSTS_N_INSNS (16), /* cost of FMA SS instruction. */ |
285 | COSTS_N_INSNS (16), /* cost of FMA SD instruction. */ | |
6065f444 JH |
286 | COSTS_N_INSNS (73), /* cost of DIVSS instruction. */ |
287 | COSTS_N_INSNS (74), /* cost of DIVSD instruction. */ | |
288 | COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */ | |
289 | COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
290 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
291 | i486_memcpy, | |
292 | i486_memset, | |
f6fd8f2b JH |
293 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
294 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
295 | "16", /* Loop alignment. */ |
296 | "16", /* Jump alignment. */ | |
297 | "0:0:8", /* Label alignment. */ | |
298 | "16", /* Func alignment. */ | |
64766e8d JH |
299 | }; |
300 | ||
301 | static stringop_algs pentium_memcpy[2] = { | |
302 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
303 | DUMMY_STRINGOP_ALGS}; | |
304 | static stringop_algs pentium_memset[2] = { | |
305 | {libcall, {{-1, rep_prefix_4_byte, false}}}, | |
306 | DUMMY_STRINGOP_ALGS}; | |
307 | ||
308 | static const | |
309 | struct processor_costs pentium_cost = { | |
310 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
311 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
312 | COSTS_N_INSNS (4), /* variable shift costs */ | |
313 | COSTS_N_INSNS (1), /* constant shift costs */ | |
314 | {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ | |
315 | COSTS_N_INSNS (11), /* HI */ | |
316 | COSTS_N_INSNS (11), /* SI */ | |
317 | COSTS_N_INSNS (11), /* DI */ | |
318 | COSTS_N_INSNS (11)}, /* other */ | |
319 | 0, /* cost of multiply per each bit set */ | |
320 | {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ | |
321 | COSTS_N_INSNS (25), /* HI */ | |
322 | COSTS_N_INSNS (25), /* SI */ | |
323 | COSTS_N_INSNS (25), /* DI */ | |
324 | COSTS_N_INSNS (25)}, /* other */ | |
325 | COSTS_N_INSNS (3), /* cost of movsx */ | |
326 | COSTS_N_INSNS (2), /* cost of movzx */ | |
327 | 8, /* "large" insn */ | |
328 | 6, /* MOVE_RATIO */ | |
df41dbaf JH |
329 | |
330 | /* All move costs are relative to integer->integer move times 2 and thus | |
331 | they are latency*2. */ | |
64766e8d JH |
332 | 6, /* cost for loading QImode using movzbl */ |
333 | {2, 4, 2}, /* cost of loading integer registers | |
334 | in QImode, HImode and SImode. | |
335 | Relative to reg-reg move (2). */ | |
336 | {2, 4, 2}, /* cost of storing integer registers */ | |
337 | 2, /* cost of reg,reg fld/fst */ | |
338 | {2, 2, 6}, /* cost of loading fp registers | |
339 | in SFmode, DFmode and XFmode */ | |
340 | {4, 4, 6}, /* cost of storing fp registers | |
341 | in SFmode, DFmode and XFmode */ | |
342 | 8, /* cost of moving MMX register */ | |
343 | {8, 8}, /* cost of loading MMX registers | |
344 | in SImode and DImode */ | |
345 | {8, 8}, /* cost of storing MMX registers | |
346 | in SImode and DImode */ | |
df41dbaf JH |
347 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
348 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
349 | in 32,64,128,256 and 512-bit */ | |
350 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
351 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
352 | in 32,64,128,256 and 512-bit */ | |
353 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
354 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
355 | 4, 4, /* Gather load static, per_elt. */ |
356 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
357 | 8, /* size of l1 cache. */ |
358 | 8, /* size of l2 cache */ | |
359 | 0, /* size of prefetch block */ | |
360 | 0, /* number of parallel prefetches */ | |
361 | 2, /* Branch cost */ | |
362 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ | |
363 | COSTS_N_INSNS (3), /* cost of FMUL instruction. */ | |
364 | COSTS_N_INSNS (39), /* cost of FDIV instruction. */ | |
365 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
366 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
367 | COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ | |
6065f444 | 368 | |
c53c148c | 369 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
370 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
371 | COSTS_N_INSNS (3), /* cost of MULSS instruction. */ | |
372 | COSTS_N_INSNS (3), /* cost of MULSD instruction. */ | |
c53c148c JH |
373 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
374 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
375 | COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ |
376 | COSTS_N_INSNS (39), /* cost of DIVSD instruction. */ | |
377 | COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */ | |
378 | COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
379 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
380 | pentium_memcpy, | |
381 | pentium_memset, | |
f6fd8f2b JH |
382 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
383 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
384 | "16:8:8", /* Loop alignment. */ |
385 | "16:8:8", /* Jump alignment. */ | |
386 | "0:0:8", /* Label alignment. */ | |
387 | "16", /* Func alignment. */ | |
64766e8d JH |
388 | }; |
389 | ||
390 | static const | |
391 | struct processor_costs lakemont_cost = { | |
392 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
393 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
394 | COSTS_N_INSNS (1), /* variable shift costs */ | |
395 | COSTS_N_INSNS (1), /* constant shift costs */ | |
396 | {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ | |
397 | COSTS_N_INSNS (11), /* HI */ | |
398 | COSTS_N_INSNS (11), /* SI */ | |
399 | COSTS_N_INSNS (11), /* DI */ | |
400 | COSTS_N_INSNS (11)}, /* other */ | |
401 | 0, /* cost of multiply per each bit set */ | |
402 | {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ | |
403 | COSTS_N_INSNS (25), /* HI */ | |
404 | COSTS_N_INSNS (25), /* SI */ | |
405 | COSTS_N_INSNS (25), /* DI */ | |
406 | COSTS_N_INSNS (25)}, /* other */ | |
407 | COSTS_N_INSNS (3), /* cost of movsx */ | |
408 | COSTS_N_INSNS (2), /* cost of movzx */ | |
409 | 8, /* "large" insn */ | |
410 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
411 | |
412 | /* All move costs are relative to integer->integer move times 2 and thus | |
413 | they are latency*2. */ | |
64766e8d JH |
414 | 6, /* cost for loading QImode using movzbl */ |
415 | {2, 4, 2}, /* cost of loading integer registers | |
416 | in QImode, HImode and SImode. | |
417 | Relative to reg-reg move (2). */ | |
418 | {2, 4, 2}, /* cost of storing integer registers */ | |
419 | 2, /* cost of reg,reg fld/fst */ | |
420 | {2, 2, 6}, /* cost of loading fp registers | |
421 | in SFmode, DFmode and XFmode */ | |
422 | {4, 4, 6}, /* cost of storing fp registers | |
423 | in SFmode, DFmode and XFmode */ | |
424 | 8, /* cost of moving MMX register */ | |
425 | {8, 8}, /* cost of loading MMX registers | |
426 | in SImode and DImode */ | |
427 | {8, 8}, /* cost of storing MMX registers | |
428 | in SImode and DImode */ | |
df41dbaf JH |
429 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
430 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
431 | in 32,64,128,256 and 512-bit */ | |
432 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
433 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
434 | in 32,64,128,256 and 512-bit */ | |
435 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
436 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
437 | 4, 4, /* Gather load static, per_elt. */ |
438 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
439 | 8, /* size of l1 cache. */ |
440 | 8, /* size of l2 cache */ | |
441 | 0, /* size of prefetch block */ | |
442 | 0, /* number of parallel prefetches */ | |
443 | 2, /* Branch cost */ | |
444 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ | |
445 | COSTS_N_INSNS (3), /* cost of FMUL instruction. */ | |
446 | COSTS_N_INSNS (39), /* cost of FDIV instruction. */ | |
447 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
448 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
449 | COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ | |
6065f444 | 450 | |
c53c148c | 451 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
452 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
453 | COSTS_N_INSNS (5), /* cost of MULSS instruction. */ | |
454 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
c53c148c JH |
455 | COSTS_N_INSNS (10), /* cost of FMA SS instruction. */ |
456 | COSTS_N_INSNS (10), /* cost of FMA SD instruction. */ | |
6065f444 JH |
457 | COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ |
458 | COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ | |
459 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ | |
460 | COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
461 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
462 | pentium_memcpy, | |
463 | pentium_memset, | |
f6fd8f2b JH |
464 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
465 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
466 | "16:8:8", /* Loop alignment. */ |
467 | "16:8:8", /* Jump alignment. */ | |
468 | "0:0:8", /* Label alignment. */ | |
469 | "16", /* Func alignment. */ | |
64766e8d JH |
470 | }; |
471 | ||
472 | /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes | |
473 | (we ensure the alignment). For small blocks inline loop is still a | |
474 | noticeable win, for bigger blocks either rep movsl or rep movsb is | |
475 | way to go. Rep movsb has apparently more expensive startup time in CPU, | |
476 | but after 4K the difference is down in the noise. */ | |
477 | static stringop_algs pentiumpro_memcpy[2] = { | |
478 | {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, | |
479 | {8192, rep_prefix_4_byte, false}, | |
480 | {-1, rep_prefix_1_byte, false}}}, | |
481 | DUMMY_STRINGOP_ALGS}; | |
482 | static stringop_algs pentiumpro_memset[2] = { | |
483 | {rep_prefix_4_byte, {{1024, unrolled_loop, false}, | |
484 | {8192, rep_prefix_4_byte, false}, | |
485 | {-1, libcall, false}}}, | |
486 | DUMMY_STRINGOP_ALGS}; | |
487 | static const | |
488 | struct processor_costs pentiumpro_cost = { | |
489 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
490 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
491 | COSTS_N_INSNS (1), /* variable shift costs */ | |
492 | COSTS_N_INSNS (1), /* constant shift costs */ | |
493 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ | |
494 | COSTS_N_INSNS (4), /* HI */ | |
495 | COSTS_N_INSNS (4), /* SI */ | |
496 | COSTS_N_INSNS (4), /* DI */ | |
497 | COSTS_N_INSNS (4)}, /* other */ | |
498 | 0, /* cost of multiply per each bit set */ | |
499 | {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ | |
500 | COSTS_N_INSNS (17), /* HI */ | |
501 | COSTS_N_INSNS (17), /* SI */ | |
502 | COSTS_N_INSNS (17), /* DI */ | |
503 | COSTS_N_INSNS (17)}, /* other */ | |
504 | COSTS_N_INSNS (1), /* cost of movsx */ | |
505 | COSTS_N_INSNS (1), /* cost of movzx */ | |
506 | 8, /* "large" insn */ | |
507 | 6, /* MOVE_RATIO */ | |
df41dbaf JH |
508 | |
509 | /* All move costs are relative to integer->integer move times 2 and thus | |
510 | they are latency*2. */ | |
64766e8d JH |
511 | 2, /* cost for loading QImode using movzbl */ |
512 | {4, 4, 4}, /* cost of loading integer registers | |
513 | in QImode, HImode and SImode. | |
514 | Relative to reg-reg move (2). */ | |
515 | {2, 2, 2}, /* cost of storing integer registers */ | |
516 | 2, /* cost of reg,reg fld/fst */ | |
517 | {2, 2, 6}, /* cost of loading fp registers | |
518 | in SFmode, DFmode and XFmode */ | |
519 | {4, 4, 6}, /* cost of storing fp registers | |
520 | in SFmode, DFmode and XFmode */ | |
521 | 2, /* cost of moving MMX register */ | |
522 | {2, 2}, /* cost of loading MMX registers | |
523 | in SImode and DImode */ | |
524 | {2, 2}, /* cost of storing MMX registers | |
525 | in SImode and DImode */ | |
df41dbaf JH |
526 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
527 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
528 | in 32,64,128,256 and 512-bit */ | |
529 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
530 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
531 | in 32,64,128,256 and 512-bit */ | |
532 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
533 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
534 | 4, 4, /* Gather load static, per_elt. */ |
535 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
536 | 8, /* size of l1 cache. */ |
537 | 256, /* size of l2 cache */ | |
538 | 32, /* size of prefetch block */ | |
539 | 6, /* number of parallel prefetches */ | |
540 | 2, /* Branch cost */ | |
541 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ | |
542 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ | |
543 | COSTS_N_INSNS (56), /* cost of FDIV instruction. */ | |
544 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
545 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
546 | COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ | |
6065f444 | 547 | |
c53c148c | 548 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
549 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
550 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
551 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
552 | COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ |
553 | COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ | |
6065f444 JH |
554 | COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ |
555 | COSTS_N_INSNS (18), /* cost of DIVSD instruction. */ | |
556 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ | |
557 | COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
558 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
559 | pentiumpro_memcpy, | |
560 | pentiumpro_memset, | |
f6fd8f2b JH |
561 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
562 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
563 | "16", /* Loop alignment. */ |
564 | "16:11:8", /* Jump alignment. */ | |
565 | "0:0:8", /* Label alignment. */ | |
566 | "16", /* Func alignment. */ | |
64766e8d JH |
567 | }; |
568 | ||
569 | static stringop_algs geode_memcpy[2] = { | |
570 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
571 | DUMMY_STRINGOP_ALGS}; | |
572 | static stringop_algs geode_memset[2] = { | |
573 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
574 | DUMMY_STRINGOP_ALGS}; | |
575 | static const | |
576 | struct processor_costs geode_cost = { | |
577 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
578 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
579 | COSTS_N_INSNS (2), /* variable shift costs */ | |
580 | COSTS_N_INSNS (1), /* constant shift costs */ | |
581 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
582 | COSTS_N_INSNS (4), /* HI */ | |
583 | COSTS_N_INSNS (7), /* SI */ | |
584 | COSTS_N_INSNS (7), /* DI */ | |
585 | COSTS_N_INSNS (7)}, /* other */ | |
586 | 0, /* cost of multiply per each bit set */ | |
587 | {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ | |
588 | COSTS_N_INSNS (23), /* HI */ | |
589 | COSTS_N_INSNS (39), /* SI */ | |
590 | COSTS_N_INSNS (39), /* DI */ | |
591 | COSTS_N_INSNS (39)}, /* other */ | |
592 | COSTS_N_INSNS (1), /* cost of movsx */ | |
593 | COSTS_N_INSNS (1), /* cost of movzx */ | |
594 | 8, /* "large" insn */ | |
595 | 4, /* MOVE_RATIO */ | |
df41dbaf JH |
596 | |
597 | /* All move costs are relative to integer->integer move times 2 and thus | |
598 | they are latency*2. */ | |
599 | 2, /* cost for loading QImode using movzbl */ | |
600 | {2, 2, 2}, /* cost of loading integer registers | |
64766e8d JH |
601 | in QImode, HImode and SImode. |
602 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
603 | {2, 2, 2}, /* cost of storing integer registers */ |
604 | 2, /* cost of reg,reg fld/fst */ | |
605 | {2, 2, 2}, /* cost of loading fp registers | |
64766e8d JH |
606 | in SFmode, DFmode and XFmode */ |
607 | {4, 6, 6}, /* cost of storing fp registers | |
608 | in SFmode, DFmode and XFmode */ | |
609 | ||
610 | 2, /* cost of moving MMX register */ | |
611 | {2, 2}, /* cost of loading MMX registers | |
612 | in SImode and DImode */ | |
613 | {2, 2}, /* cost of storing MMX registers | |
614 | in SImode and DImode */ | |
df41dbaf JH |
615 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
616 | {2, 2, 8, 16, 32}, /* cost of loading SSE registers | |
617 | in 32,64,128,256 and 512-bit */ | |
618 | {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ | |
619 | {2, 2, 8, 16, 32}, /* cost of storing SSE registers | |
620 | in 32,64,128,256 and 512-bit */ | |
621 | {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ | |
622 | 6, 6, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
623 | 2, 2, /* Gather load static, per_elt. */ |
624 | 2, 2, /* Gather store static, per_elt. */ | |
64766e8d JH |
625 | 64, /* size of l1 cache. */ |
626 | 128, /* size of l2 cache. */ | |
627 | 32, /* size of prefetch block */ | |
628 | 1, /* number of parallel prefetches */ | |
629 | 1, /* Branch cost */ | |
630 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
631 | COSTS_N_INSNS (11), /* cost of FMUL instruction. */ | |
632 | COSTS_N_INSNS (47), /* cost of FDIV instruction. */ | |
633 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
634 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
635 | COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ | |
6065f444 | 636 | |
c53c148c | 637 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
638 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
639 | COSTS_N_INSNS (11), /* cost of MULSS instruction. */ | |
640 | COSTS_N_INSNS (11), /* cost of MULSD instruction. */ | |
c53c148c JH |
641 | COSTS_N_INSNS (17), /* cost of FMA SS instruction. */ |
642 | COSTS_N_INSNS (17), /* cost of FMA SD instruction. */ | |
6065f444 JH |
643 | COSTS_N_INSNS (47), /* cost of DIVSS instruction. */ |
644 | COSTS_N_INSNS (47), /* cost of DIVSD instruction. */ | |
645 | COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */ | |
646 | COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
647 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
648 | geode_memcpy, | |
649 | geode_memset, | |
f6fd8f2b JH |
650 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
651 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
652 | NULL, /* Loop alignment. */ |
653 | NULL, /* Jump alignment. */ | |
654 | NULL, /* Label alignment. */ | |
655 | NULL, /* Func alignment. */ | |
64766e8d JH |
656 | }; |
657 | ||
/* memcpy expansion strategies used when tuning for K6 (referenced from
   k6_cost).  The second element is the DUMMY_STRINGOP_ALGS placeholder,
   i.e. libcall only.
   NOTE(review): each {N, alg, flag} entry presumably selects ALG for block
   sizes up to N bytes, with -1 as the catch-all, and element 0 / element 1
   presumably are the 32-bit / 64-bit variants -- confirm against the
   declaration of stringop_algs.  */
658 | static stringop_algs k6_memcpy[2] = { | |
659 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
660 | DUMMY_STRINGOP_ALGS}; | |
/* memset expansion strategies used when tuning for K6 (referenced from
   k6_cost).  Same layout as k6_memcpy; the second element is the
   DUMMY_STRINGOP_ALGS placeholder (libcall only).  */
661 | static stringop_algs k6_memset[2] = { | |
662 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
663 | DUMMY_STRINGOP_ALGS}; | |
/* Processor cost table for K6 tuning.  The initializer is positional, so
   the order of the entries must match the field order of
   struct processor_costs (declared outside this chunk).  As the inline
   comments describe, it lists in turn: integer instruction costs (via
   COSTS_N_INSNS), move costs relative to a reg-reg move (2), cache and
   prefetch parameters, x87 and SSE instruction costs, reassociation
   widths, the string-operation strategy tables, branch costs and the
   alignment strings for loops, jumps, labels and functions.  */
664 | static const | |
665 | struct processor_costs k6_cost = { | |
666 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
667 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
668 | COSTS_N_INSNS (1), /* variable shift costs */ | |
669 | COSTS_N_INSNS (1), /* constant shift costs */ | |
670 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
671 | COSTS_N_INSNS (3), /* HI */ | |
672 | COSTS_N_INSNS (3), /* SI */ | |
673 | COSTS_N_INSNS (3), /* DI */ | |
674 | COSTS_N_INSNS (3)}, /* other */ | |
675 | 0, /* cost of multiply per each bit set */ | |
676 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
677 | COSTS_N_INSNS (18), /* HI */ | |
678 | COSTS_N_INSNS (18), /* SI */ | |
679 | COSTS_N_INSNS (18), /* DI */ | |
680 | COSTS_N_INSNS (18)}, /* other */ | |
681 | COSTS_N_INSNS (2), /* cost of movsx */ | |
682 | COSTS_N_INSNS (2), /* cost of movzx */ | |
683 | 8, /* "large" insn */ | |
684 | 4, /* MOVE_RATIO */ | |
df41dbaf JH |
685 | |
686 | /* All move costs are relative to integer->integer move times 2 and thus | |
687 | they are latency*2. */ | |
64766e8d JH |
688 | 3, /* cost for loading QImode using movzbl */ |
689 | {4, 5, 4}, /* cost of loading integer registers | |
690 | in QImode, HImode and SImode. | |
691 | Relative to reg-reg move (2). */ | |
692 | {2, 3, 2}, /* cost of storing integer registers */ | |
693 | 4, /* cost of reg,reg fld/fst */ | |
694 | {6, 6, 6}, /* cost of loading fp registers | |
695 | in SFmode, DFmode and XFmode */ | |
696 | {4, 4, 4}, /* cost of storing fp registers | |
697 | in SFmode, DFmode and XFmode */ | |
698 | 2, /* cost of moving MMX register */ | |
699 | {2, 2}, /* cost of loading MMX registers | |
700 | in SImode and DImode */ | |
701 | {2, 2}, /* cost of storing MMX registers | |
702 | in SImode and DImode */ | |
df41dbaf JH |
703 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
704 | {2, 2, 8, 16, 32}, /* cost of loading SSE registers | |
705 | in 32,64,128,256 and 512-bit */ | |
706 | {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ | |
707 | {2, 2, 8, 16, 32}, /* cost of storing SSE registers | |
708 | in 32,64,128,256 and 512-bit */ | |
709 | {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ | |
710 | 6, 6, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
711 | 2, 2, /* Gather load static, per_elt. */ |
712 | 2, 2, /* Gather store static, per_elt. */ | |
64766e8d JH |
713 | 32, /* size of l1 cache. */ |
714 | 32, /* size of l2 cache. Some models | |
715 | have integrated l2 cache, but | |
716 | optimizing for k6 is not important | |
717 | enough to worry about that. */ | |
718 | 32, /* size of prefetch block */ | |
719 | 1, /* number of parallel prefetches */ | |
720 | 1, /* Branch cost */ | |
721 | COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ | |
722 | COSTS_N_INSNS (2), /* cost of FMUL instruction. */ | |
723 | COSTS_N_INSNS (56), /* cost of FDIV instruction. */ | |
724 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
725 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
726 | COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ | |
6065f444 | 727 | |
c53c148c | 728 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
729 | COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ |
730 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ | |
731 | COSTS_N_INSNS (2), /* cost of MULSD instruction. */ | |
c53c148c JH |
732 | COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ |
733 | COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ | |
6065f444 JH |
734 | COSTS_N_INSNS (56), /* cost of DIVSS instruction. */ |
735 | COSTS_N_INSNS (56), /* cost of DIVSD instruction. */ | |
736 | COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */ | |
737 | COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
738 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
739 | k6_memcpy, | |
740 | k6_memset, | |
f6fd8f2b JH |
741 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
742 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
743 | "32:8:8", /* Loop alignment. */ |
744 | "32:8:8", /* Jump alignment. */ | |
745 | "0:0:8", /* Label alignment. */ | |
746 | "32", /* Func alignment. */ | |
64766e8d JH |
747 | }; |
748 | ||
749 | /* For some reason, Athlon deals better with REP prefix (relative to loops) | |
750 | compared to K8. Alignment becomes important after 8 bytes for memcpy and | |
751 | 128 bytes for memset. */ | |
/* memcpy expansion strategies used when tuning for Athlon (referenced from
   athlon_cost).  The second element is the DUMMY_STRINGOP_ALGS placeholder,
   i.e. libcall only.
   NOTE(review): each {N, alg, flag} entry presumably selects ALG for block
   sizes up to N bytes, with -1 as the catch-all -- confirm against the
   declaration of stringop_algs.  */
752 | static stringop_algs athlon_memcpy[2] = { | |
753 | {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
754 | DUMMY_STRINGOP_ALGS}; | |
/* memset expansion strategies used when tuning for Athlon (referenced from
   athlon_cost).  Same layout as athlon_memcpy; the second element is the
   DUMMY_STRINGOP_ALGS placeholder (libcall only).  */
755 | static stringop_algs athlon_memset[2] = { | |
756 | {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
757 | DUMMY_STRINGOP_ALGS}; | |
/* Processor cost table for Athlon tuning.  The initializer is positional,
   so the order of the entries must match the field order of
   struct processor_costs (declared outside this chunk).  As the inline
   comments describe, it lists in turn: integer instruction costs (via
   COSTS_N_INSNS), move costs relative to a reg-reg move (2), cache and
   prefetch parameters, x87 and SSE instruction costs, reassociation
   widths, the string-operation strategy tables, branch costs and the
   alignment strings for loops, jumps, labels and functions.  */
758 | static const | |
759 | struct processor_costs athlon_cost = { | |
760 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
761 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
762 | COSTS_N_INSNS (1), /* variable shift costs */ | |
763 | COSTS_N_INSNS (1), /* constant shift costs */ | |
764 | {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ | |
765 | COSTS_N_INSNS (5), /* HI */ | |
766 | COSTS_N_INSNS (5), /* SI */ | |
767 | COSTS_N_INSNS (5), /* DI */ | |
768 | COSTS_N_INSNS (5)}, /* other */ | |
769 | 0, /* cost of multiply per each bit set */ | |
770 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
771 | COSTS_N_INSNS (26), /* HI */ | |
772 | COSTS_N_INSNS (42), /* SI */ | |
773 | COSTS_N_INSNS (74), /* DI */ | |
774 | COSTS_N_INSNS (74)}, /* other */ | |
775 | COSTS_N_INSNS (1), /* cost of movsx */ | |
776 | COSTS_N_INSNS (1), /* cost of movzx */ | |
777 | 8, /* "large" insn */ | |
778 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
779 | |
780 | /* All move costs are relative to integer->integer move times 2 and thus | |
781 | they are latency*2. */ | |
64766e8d JH |
782 | 4, /* cost for loading QImode using movzbl */ |
783 | {3, 4, 3}, /* cost of loading integer registers | |
784 | in QImode, HImode and SImode. | |
785 | Relative to reg-reg move (2). */ | |
786 | {3, 4, 3}, /* cost of storing integer registers */ | |
787 | 4, /* cost of reg,reg fld/fst */ | |
788 | {4, 4, 12}, /* cost of loading fp registers | |
789 | in SFmode, DFmode and XFmode */ | |
790 | {6, 6, 8}, /* cost of storing fp registers | |
791 | in SFmode, DFmode and XFmode */ | |
792 | 2, /* cost of moving MMX register */ | |
793 | {4, 4}, /* cost of loading MMX registers | |
794 | in SImode and DImode */ | |
795 | {4, 4}, /* cost of storing MMX registers | |
796 | in SImode and DImode */ | |
df41dbaf | 797 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
b7167993 | 798 | {4, 4, 12, 12, 24}, /* cost of loading SSE registers |
df41dbaf | 799 | in 32,64,128,256 and 512-bit */ |
b7167993 RB |
800 | {4, 4, 12, 12, 24}, /* cost of unaligned loads. */ |
801 | {4, 4, 10, 10, 20}, /* cost of storing SSE registers | |
df41dbaf | 802 | in 32,64,128,256 and 512-bit */ |
b7167993 | 803 | {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ |
df41dbaf | 804 | 5, 5, /* SSE->integer and integer->SSE moves */ |
a4fe6139 JH |
805 | 4, 4, /* Gather load static, per_elt. */ |
806 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
807 | 64, /* size of l1 cache. */ |
808 | 256, /* size of l2 cache. */ | |
809 | 64, /* size of prefetch block */ | |
810 | 6, /* number of parallel prefetches */ | |
811 | 5, /* Branch cost */ | |
812 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
813 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
814 | COSTS_N_INSNS (24), /* cost of FDIV instruction. */ | |
815 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
816 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
817 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 818 | |
c53c148c | 819 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
820 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
821 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
822 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
823 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
824 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ | |
6065f444 JH |
825 | /* 11-16 */ |
826 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ | |
827 | COSTS_N_INSNS (24), /* cost of DIVSD instruction. */ | |
828 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ | |
829 | COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
830 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
831 | athlon_memcpy, | |
832 | athlon_memset, | |
f6fd8f2b JH |
833 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
834 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
835 | "16:8:8", /* Loop alignment. */ |
836 | "16:8:8", /* Jump alignment. */ | |
837 | "0:0:8", /* Label alignment. */ | |
838 | "16", /* Func alignment. */ | |
64766e8d JH |
839 | }; |
840 | ||
841 | /* K8 has optimized REP instruction for medium sized blocks, but for very | |
842 | small blocks it is better to use loop. For large blocks, libcall can | |
843 | do non-temporal accesses and beat inline considerably. */ | |
/* memcpy expansion strategies used when tuning for K8 (referenced from
   k8_cost).  Unlike the K6 and Athlon tables, both elements are populated.
   NOTE(review): element 0 / element 1 presumably are the 32-bit / 64-bit
   variants (only the second uses rep_prefix_8_byte), and each
   {N, alg, flag} entry presumably selects ALG for block sizes up to N
   bytes with -1 as the catch-all -- confirm against the declaration of
   stringop_algs.  */
844 | static stringop_algs k8_memcpy[2] = { | |
845 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
846 | {-1, rep_prefix_4_byte, false}}}, | |
847 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
848 | {-1, libcall, false}}}}; | |
/* memset expansion strategies used when tuning for K8 (referenced from
   k8_cost).  Same layout as k8_memcpy, with both elements populated.  */
849 | static stringop_algs k8_memset[2] = { | |
850 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
851 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
852 | {libcall, {{48, unrolled_loop, false}, | |
853 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
/* Processor cost table for K8 tuning.  The initializer is positional, so
   the order of the entries must match the field order of
   struct processor_costs (declared outside this chunk).  As the inline
   comments describe, it lists in turn: integer instruction costs (via
   COSTS_N_INSNS), move costs relative to a reg-reg move (2), cache and
   prefetch parameters, x87 and SSE instruction costs, reassociation
   widths, the string-operation strategy tables, branch costs and the
   alignment strings for loops, jumps, labels and functions.  */
854 | static const | |
855 | struct processor_costs k8_cost = { | |
856 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
857 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
858 | COSTS_N_INSNS (1), /* variable shift costs */ | |
859 | COSTS_N_INSNS (1), /* constant shift costs */ | |
860 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
861 | COSTS_N_INSNS (4), /* HI */ | |
862 | COSTS_N_INSNS (3), /* SI */ | |
863 | COSTS_N_INSNS (4), /* DI */ | |
864 | COSTS_N_INSNS (5)}, /* other */ | |
865 | 0, /* cost of multiply per each bit set */ | |
866 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
867 | COSTS_N_INSNS (26), /* HI */ | |
868 | COSTS_N_INSNS (42), /* SI */ | |
869 | COSTS_N_INSNS (74), /* DI */ | |
870 | COSTS_N_INSNS (74)}, /* other */ | |
871 | COSTS_N_INSNS (1), /* cost of movsx */ | |
872 | COSTS_N_INSNS (1), /* cost of movzx */ | |
873 | 8, /* "large" insn */ | |
874 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
875 | |
876 | /* All move costs are relative to integer->integer move times 2 and thus | |
877 | they are latency*2. */ | |
64766e8d JH |
878 | 4, /* cost for loading QImode using movzbl */ |
879 | {3, 4, 3}, /* cost of loading integer registers | |
880 | in QImode, HImode and SImode. | |
881 | Relative to reg-reg move (2). */ | |
882 | {3, 4, 3}, /* cost of storing integer registers */ | |
883 | 4, /* cost of reg,reg fld/fst */ | |
884 | {4, 4, 12}, /* cost of loading fp registers | |
885 | in SFmode, DFmode and XFmode */ | |
886 | {6, 6, 8}, /* cost of storing fp registers | |
887 | in SFmode, DFmode and XFmode */ | |
888 | 2, /* cost of moving MMX register */ | |
889 | {3, 3}, /* cost of loading MMX registers | |
890 | in SImode and DImode */ | |
891 | {4, 4}, /* cost of storing MMX registers | |
892 | in SImode and DImode */ | |
df41dbaf | 893 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
b7167993 | 894 | {4, 3, 12, 12, 24}, /* cost of loading SSE registers |
df41dbaf | 895 | in 32,64,128,256 and 512-bit */ |
b7167993 RB |
896 | {4, 3, 12, 12, 24}, /* cost of unaligned loads. */ |
897 | {4, 4, 10, 10, 20}, /* cost of storing SSE registers | |
df41dbaf | 898 | in 32,64,128,256 and 512-bit */ |
b7167993 | 899 | {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ |
df41dbaf | 900 | 5, 5, /* SSE->integer and integer->SSE moves */ |
a4fe6139 JH |
901 | 4, 4, /* Gather load static, per_elt. */ |
902 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
903 | 64, /* size of l1 cache. */ |
904 | 512, /* size of l2 cache. */ | |
905 | 64, /* size of prefetch block */ | |
906 | /* New AMD processors never drop prefetches; if they cannot be performed | |
907 | immediately, they are queued. We set number of simultaneous prefetches | |
908 | to a large constant to reflect this (it probably is not a good idea not | |
909 | to limit number of prefetches at all, as their execution also takes some | |
910 | time). */ | |
911 | 100, /* number of parallel prefetches */ | |
912 | 3, /* Branch cost */ | |
913 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
914 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
915 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ | |
916 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
917 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
918 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 919 | |
c53c148c | 920 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
921 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
922 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
923 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
924 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
925 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ | |
6065f444 JH |
926 | /* 11-16 */ |
927 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ | |
928 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ | |
929 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ | |
930 | COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
931 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
932 | k8_memcpy, | |
933 | k8_memset, | |
f6fd8f2b JH |
934 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
935 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
936 | "16:8:8", /* Loop alignment. */ |
937 | "16:8:8", /* Jump alignment. */ | |
938 | "0:0:8", /* Label alignment. */ | |
939 | "16", /* Func alignment. */ | |
64766e8d JH |
940 | }; |
941 | ||
942 | /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for | |
943 | very small blocks it is better to use loop. For large blocks, libcall can | |
944 | do non-temporal accesses and beat inline considerably. */ | |
/* memcpy expansion strategies used when tuning for AMDFAM10 (referenced
   from amdfam10_cost).  Both elements are populated.
   NOTE(review): element 0 / element 1 presumably are the 32-bit / 64-bit
   variants (only the second uses rep_prefix_8_byte), and each
   {N, alg, flag} entry presumably selects ALG for block sizes up to N
   bytes with -1 as the catch-all -- confirm against the declaration of
   stringop_algs.  */
945 | static stringop_algs amdfam10_memcpy[2] = { | |
946 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
947 | {-1, rep_prefix_4_byte, false}}}, | |
948 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
949 | {-1, libcall, false}}}}; | |
/* memset expansion strategies used when tuning for AMDFAM10 (referenced
   from amdfam10_cost).  Same layout as amdfam10_memcpy, with both elements
   populated.  */
950 | static stringop_algs amdfam10_memset[2] = { | |
951 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
952 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
953 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
954 | {-1, libcall, false}}}}; | |
/* Processor cost table for AMDFAM10 tuning.  The initializer is
   positional, so the order of the entries must match the field order of
   struct processor_costs (declared outside this chunk).  As the inline
   comments describe, it lists in turn: integer instruction costs (via
   COSTS_N_INSNS), move costs relative to a reg-reg move (2), cache and
   prefetch parameters, x87 and SSE instruction costs, reassociation
   widths, the string-operation strategy tables, branch costs and the
   alignment strings for loops, jumps, labels and functions.
   NOTE(review): unlike k6_cost, athlon_cost and k8_cost this table is
   declared neither static nor const -- confirm whether the writable,
   externally visible definition is intentional.  */
955 | struct processor_costs amdfam10_cost = { | |
956 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
957 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
958 | COSTS_N_INSNS (1), /* variable shift costs */ | |
959 | COSTS_N_INSNS (1), /* constant shift costs */ | |
960 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
961 | COSTS_N_INSNS (4), /* HI */ | |
962 | COSTS_N_INSNS (3), /* SI */ | |
963 | COSTS_N_INSNS (4), /* DI */ | |
964 | COSTS_N_INSNS (5)}, /* other */ | |
965 | 0, /* cost of multiply per each bit set */ | |
966 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
967 | COSTS_N_INSNS (35), /* HI */ | |
968 | COSTS_N_INSNS (51), /* SI */ | |
969 | COSTS_N_INSNS (83), /* DI */ | |
970 | COSTS_N_INSNS (83)}, /* other */ | |
971 | COSTS_N_INSNS (1), /* cost of movsx */ | |
972 | COSTS_N_INSNS (1), /* cost of movzx */ | |
973 | 8, /* "large" insn */ | |
974 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
975 | |
976 | /* All move costs are relative to integer->integer move times 2 and thus | |
977 | they are latency*2. */ | |
64766e8d JH |
978 | 4, /* cost for loading QImode using movzbl */ |
979 | {3, 4, 3}, /* cost of loading integer registers | |
980 | in QImode, HImode and SImode. | |
981 | Relative to reg-reg move (2). */ | |
982 | {3, 4, 3}, /* cost of storing integer registers */ | |
983 | 4, /* cost of reg,reg fld/fst */ | |
984 | {4, 4, 12}, /* cost of loading fp registers | |
985 | in SFmode, DFmode and XFmode */ | |
986 | {6, 6, 8}, /* cost of storing fp registers | |
987 | in SFmode, DFmode and XFmode */ | |
988 | 2, /* cost of moving MMX register */ | |
989 | {3, 3}, /* cost of loading MMX registers | |
990 | in SImode and DImode */ | |
991 | {4, 4}, /* cost of storing MMX registers | |
992 | in SImode and DImode */ | |
df41dbaf JH |
993 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
994 | {4, 4, 3, 6, 12}, /* cost of loading SSE registers | |
995 | in 32,64,128,256 and 512-bit */ | |
996 | {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ | |
997 | {4, 4, 5, 10, 20}, /* cost of storing SSE registers | |
998 | in 32,64,128,256 and 512-bit */ | |
999 | {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ | |
1000 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
64766e8d JH |
1001 | /* On K8: |
1002 | MOVD reg64, xmmreg Double FSTORE 4 | |
1003 | MOVD reg32, xmmreg Double FSTORE 4 | |
1004 | On AMDFAM10: | |
1005 | MOVD reg64, xmmreg Double FADD 3 | |
1006 | 1/1 1/1 | |
1007 | MOVD reg32, xmmreg Double FADD 3 | |
1008 | 1/1 1/1 */ | |
a4fe6139 JH |
1009 | 4, 4, /* Gather load static, per_elt. */ |
1010 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
1011 | 64, /* size of l1 cache. */ |
1012 | 512, /* size of l2 cache. */ | |
1013 | 64, /* size of prefetch block */ | |
1014 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1015 | immediately, they are queued. We set number of simultaneous prefetches | |
1016 | to a large constant to reflect this (it probably is not a good idea not | |
1017 | to limit number of prefetches at all, as their execution also takes some | |
1018 | time). */ | |
1019 | 100, /* number of parallel prefetches */ | |
1020 | 2, /* Branch cost */ | |
1021 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
1022 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
1023 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ | |
1024 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1025 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1026 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 1027 | |
c53c148c | 1028 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1029 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1030 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
1031 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
1032 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
1033 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1034 | /* 11-16 */ |
1035 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ | |
1036 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ | |
1037 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ | |
1038 | COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1039 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1040 | amdfam10_memcpy, | |
1041 | amdfam10_memset, | |
f6fd8f2b JH |
1042 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1043 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1044 | "32:25:8", /* Loop alignment. */ |
1045 | "32:8:8", /* Jump alignment. */ | |
1046 | "0:0:8", /* Label alignment. */ | |
1047 | "32", /* Func alignment. */ | |
64766e8d JH |
1048 | }; |
1049 | ||
c727b835 | 1050 | /* BDVER has optimized REP instruction for medium sized blocks, but for |
64766e8d JH |
1051 | very small blocks it is better to use loop. For large blocks, libcall |
1052 | can do non-temporal accesses and beat inline considerably. */ | |
/* memcpy expansion strategies used when tuning for BDVER (referenced from
   bdver_cost).  Both elements are populated.
   NOTE(review): element 0 / element 1 presumably are the 32-bit / 64-bit
   variants (only the second uses rep_prefix_8_byte), and each
   {N, alg, flag} entry presumably selects ALG for block sizes up to N
   bytes with -1 as the catch-all -- confirm against the declaration of
   stringop_algs.  */
c727b835 | 1053 | static stringop_algs bdver_memcpy[2] = { |
64766e8d JH |
1054 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, |
1055 | {-1, rep_prefix_4_byte, false}}}, | |
1056 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1057 | {-1, libcall, false}}}}; | |
/* memset expansion strategies used when tuning for BDVER (referenced from
   bdver_cost).  Same layout as bdver_memcpy, with both elements
   populated.  */
c727b835 | 1058 | static stringop_algs bdver_memset[2] = { |
64766e8d JH |
1059 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, |
1060 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1061 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1062 | {-1, libcall, false}}}}; | |
1063 | ||
/* Processor cost table for BDVER tuning.  The initializer is positional,
   so the order of the entries must match the field order of
   struct processor_costs (declared outside this chunk).  As the inline
   comments describe, it lists in turn: integer instruction costs (via
   COSTS_N_INSNS), move costs relative to a reg-reg move (2), cache and
   prefetch parameters, x87 and SSE instruction costs, reassociation
   widths, the string-operation strategy tables, branch costs and the
   alignment strings for loops, jumps, labels and functions.
   NOTE(review): declared const but, unlike k6_cost, athlon_cost and
   k8_cost, without static -- confirm that the difference is intended.  */
c727b835 | 1064 | const struct processor_costs bdver_cost = { |
64766e8d JH |
1065 | COSTS_N_INSNS (1), /* cost of an add instruction */ |
1066 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
1067 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1068 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1069 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ | |
1070 | COSTS_N_INSNS (4), /* HI */ | |
1071 | COSTS_N_INSNS (4), /* SI */ | |
1072 | COSTS_N_INSNS (6), /* DI */ | |
1073 | COSTS_N_INSNS (6)}, /* other */ | |
1074 | 0, /* cost of multiply per each bit set */ | |
1075 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1076 | COSTS_N_INSNS (35), /* HI */ | |
1077 | COSTS_N_INSNS (51), /* SI */ | |
1078 | COSTS_N_INSNS (83), /* DI */ | |
1079 | COSTS_N_INSNS (83)}, /* other */ | |
1080 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1081 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1082 | 8, /* "large" insn */ | |
1083 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1084 | |
1085 | /* All move costs are relative to integer->integer move times 2 and thus | |
1086 | they are latency*2. */ | |
1087 | 8, /* cost for loading QImode using movzbl */ | |
1088 | {8, 8, 8}, /* cost of loading integer registers | |
64766e8d JH |
1089 | in QImode, HImode and SImode. |
1090 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
1091 | {8, 8, 8}, /* cost of storing integer registers */ |
1092 | 4, /* cost of reg,reg fld/fst */ | |
1093 | {12, 12, 28}, /* cost of loading fp registers | |
64766e8d | 1094 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1095 | {10, 10, 18}, /* cost of storing fp registers |
64766e8d | 1096 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1097 | 4, /* cost of moving MMX register */ |
1098 | {12, 12}, /* cost of loading MMX registers | |
64766e8d | 1099 | in SImode and DImode */ |
df41dbaf | 1100 | {10, 10}, /* cost of storing MMX registers |
64766e8d | 1101 | in SImode and DImode */ |
df41dbaf | 1102 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
b7167993 | 1103 | {12, 12, 10, 40, 60}, /* cost of loading SSE registers |
df41dbaf | 1104 | in 32,64,128,256 and 512-bit */ |
b7167993 RB |
1105 | {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ |
1106 | {10, 10, 10, 40, 60}, /* cost of storing SSE registers | |
df41dbaf | 1107 | in 32,64,128,256 and 512-bit */ |
b7167993 | 1108 | {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ |
df41dbaf | 1109 | 16, 20, /* SSE->integer and integer->SSE moves */ |
a4fe6139 JH |
1110 | 12, 12, /* Gather load static, per_elt. */ |
1111 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1112 | 16, /* size of l1 cache. */ |
1113 | 2048, /* size of l2 cache. */ | |
1114 | 64, /* size of prefetch block */ | |
1115 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1116 | immediately, they are queued. We set number of simultaneous prefetches | |
1117 | to a large constant to reflect this (it probably is not a good idea not | |
1118 | to limit number of prefetches at all, as their execution also takes some | |
1119 | time). */ | |
1120 | 100, /* number of parallel prefetches */ | |
1121 | 2, /* Branch cost */ | |
1122 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
1123 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ | |
1124 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ | |
1125 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1126 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1127 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ | |
6065f444 | 1128 | |
c53c148c | 1129 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1130 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1131 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ | |
1132 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ | |
c53c148c JH |
1133 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1134 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1135 | /* 9-24 */ |
1136 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ | |
1137 | /* 9-27 */ | |
1138 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ | |
1139 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ | |
1140 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ | |
64766e8d | 1141 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
c727b835 RB |
1142 | bdver_memcpy, |
1143 | bdver_memset, | |
f6fd8f2b JH |
1144 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1145 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1146 | "16:11:8", /* Loop alignment. */ |
1147 | "16:8:8", /* Jump alignment. */ | |
1148 | "0:0:8", /* Label alignment. */ | |
1149 | "11", /* Func alignment. */ | |
64766e8d JH |
1150 | }; |
1151 | ||
1152 | ||
1153 | /* ZNVER1 has optimized REP instruction for medium sized blocks, but for | |
1154 | very small blocks it is better to use loop. For large blocks, libcall | |
1155 | can do non-temporal accesses and beat inline considerably. */ | |
/* memcpy expansion strategies used when tuning for ZNVER1.  Both elements
   are populated.
   NOTE(review): element 0 / element 1 presumably are the 32-bit / 64-bit
   variants (only the second uses rep_prefix_8_byte), and each
   {N, alg, flag} entry presumably selects ALG for block sizes up to N
   bytes with -1 as the catch-all -- confirm against the declaration of
   stringop_algs.  */
1156 | static stringop_algs znver1_memcpy[2] = { | |
1157 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1158 | {-1, rep_prefix_4_byte, false}}}, | |
1159 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1160 | {-1, libcall, false}}}}; | |
/* memset expansion strategies used when tuning for ZNVER1.  Same layout
   as znver1_memcpy, with both elements populated.  */
1161 | static stringop_algs znver1_memset[2] = { | |
1162 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1163 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1164 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1165 | {-1, libcall, false}}}}; | |
1166 | struct processor_costs znver1_cost = { | |
1167 | COSTS_N_INSNS (1), /* cost of an add instruction. */ | |
1168 | COSTS_N_INSNS (1), /* cost of a lea instruction. */ | |
1169 | COSTS_N_INSNS (1), /* variable shift costs. */ | |
1170 | COSTS_N_INSNS (1), /* constant shift costs. */ | |
1171 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ | |
1172 | COSTS_N_INSNS (3), /* HI. */ | |
1173 | COSTS_N_INSNS (3), /* SI. */ | |
6065f444 JH |
1174 | COSTS_N_INSNS (3), /* DI. */ |
1175 | COSTS_N_INSNS (3)}, /* other. */ | |
64766e8d JH |
1176 | 0, /* cost of multiply per each bit |
1177 | set. */ | |
6065f444 JH |
1178 | /* Depending on parameters, idiv can get faster on ryzen. This is upper |
1179 | bound. */ | |
1180 | {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ | |
1181 | COSTS_N_INSNS (22), /* HI. */ | |
1182 | COSTS_N_INSNS (30), /* SI. */ | |
1183 | COSTS_N_INSNS (45), /* DI. */ | |
1184 | COSTS_N_INSNS (45)}, /* other. */ | |
64766e8d JH |
1185 | COSTS_N_INSNS (1), /* cost of movsx. */ |
1186 | COSTS_N_INSNS (1), /* cost of movzx. */ | |
1187 | 8, /* "large" insn. */ | |
1188 | 9, /* MOVE_RATIO. */ | |
01118373 | 1189 | |
df41dbaf JH |
1190 | /* All move costs are relative to integer->integer move times 2 and thus |
1191 | they are latency*2. */ | |
1192 | ||
01118373 JH |
1193 | /* reg-reg moves are done by renaming and thus they are even cheaper than |
1194 | 1 cycle. Because reg-reg move cost is 2 and the following tables correspond | |
1195 | to doubles of latencies, we do not model this correctly. It does not | |
1196 | seem to make practical difference to bump prices up even more. */ | |
1197 | 6, /* cost for loading QImode using | |
64766e8d | 1198 | movzbl. */ |
01118373 | 1199 | {6, 6, 6}, /* cost of loading integer registers |
64766e8d JH |
1200 | in QImode, HImode and SImode. |
1201 | Relative to reg-reg move (2). */ | |
01118373 | 1202 | {8, 8, 8}, /* cost of storing integer |
64766e8d JH |
1203 | registers. */ |
1204 | 2, /* cost of reg,reg fld/fst. */ | |
01118373 | 1205 | {6, 6, 16}, /* cost of loading fp registers |
64766e8d | 1206 | in SFmode, DFmode and XFmode. */ |
01118373 | 1207 | {8, 8, 16}, /* cost of storing fp registers |
64766e8d JH |
1208 | in SFmode, DFmode and XFmode. */ |
1209 | 2, /* cost of moving MMX register. */ | |
01118373 | 1210 | {6, 6}, /* cost of loading MMX registers |
64766e8d | 1211 | in SImode and DImode. */ |
01118373 | 1212 | {8, 8}, /* cost of storing MMX registers |
64766e8d | 1213 | in SImode and DImode. */ |
df41dbaf | 1214 | 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ |
b7167993 | 1215 | {6, 6, 6, 12, 24}, /* cost of loading SSE registers |
df41dbaf | 1216 | in 32,64,128,256 and 512-bit. */ |
b7167993 RB |
1217 | {6, 6, 6, 12, 24}, /* cost of unaligned loads. */ |
1218 | {8, 8, 8, 16, 32}, /* cost of storing SSE registers | |
df41dbaf | 1219 | in 32,64,128,256 and 512-bit. */ |
b7167993 | 1220 | {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ |
df41dbaf | 1221 | 6, 6, /* SSE->integer and integer->SSE moves. */ |
a4fe6139 JH |
1222 | /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, |
1223 | throughput 12. Approx 9 uops do not depend on vector size and every load | |
1224 | is 7 uops. */ | |
1225 | 18, 8, /* Gather load static, per_elt. */ | |
1226 | 18, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1227 | 32, /* size of l1 cache. */ |
1228 | 512, /* size of l2 cache. */ | |
1229 | 64, /* size of prefetch block. */ | |
1230 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1231 | immediately, they are queued. We set number of simultaneous prefetches | |
1232 | to a large constant to reflect this (it probably is not a good idea not | |
1233 | to limit number of prefetches at all, as their execution also takes some | |
1234 | time). */ | |
1235 | 100, /* number of parallel prefetches. */ | |
1236 | 3, /* Branch cost. */ | |
6065f444 JH |
1237 | COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ |
1238 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ | |
1239 | /* Latency of fdiv is 8-15. */ | |
1240 | COSTS_N_INSNS (15), /* cost of FDIV instruction. */ | |
1241 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
1242 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
1243 | /* Latency of fsqrt is 4-10. */ | |
1244 | COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ | |
1245 | ||
c53c148c | 1246 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1247 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1248 | COSTS_N_INSNS (3), /* cost of MULSS instruction. */ | |
1249 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
1250 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1251 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1252 | COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ |
1253 | /* Latency of divsd is 9-13. */ |
1254 | COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ | |
1255 | COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ | |
1256 | COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1257 | /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles |
1258 | and it can execute 2 integer additions and 2 multiplications thus | |
1259 | reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest |
1260 | that 4 works better than 6 probably due to register pressure. | |
1261 | ||
1262 | Integer vector operations are taken by FP unit and execute 3 vector | |
1263 | plus/minus operations per cycle but only one multiply. This is adjusted | |
1264 | in ix86_reassociation_width. */ | |
1265 | 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ | |
1266 | znver1_memcpy, | |
1267 | znver1_memset, | |
f6fd8f2b JH |
1268 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1269 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1270 | "16", /* Loop alignment. */ |
1271 | "16", /* Jump alignment. */ | |
1272 | "0:0:8", /* Label alignment. */ | |
1273 | "16", /* Func alignment. */ | |
64766e8d JH |
1274 | }; |
1275 | ||
2901f42f VK |
1276 | /* ZNVER2 has optimized REP instruction for medium sized blocks, but for |
1277 | very small blocks it is better to use loop. For large blocks, libcall | |
1278 | can do non-temporal accesses and beat inline considerably. */ |
1279 | static stringop_algs znver2_memcpy[2] = { | |
1280 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1281 | {-1, rep_prefix_4_byte, false}}}, | |
1282 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1283 | {-1, libcall, false}}}}; | |
1284 | static stringop_algs znver2_memset[2] = { | |
1285 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1286 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1287 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1288 | {-1, libcall, false}}}}; | |
1289 | ||
1290 | struct processor_costs znver2_cost = { | |
1291 | COSTS_N_INSNS (1), /* cost of an add instruction. */ | |
1292 | COSTS_N_INSNS (1), /* cost of a lea instruction. */ | |
1293 | COSTS_N_INSNS (1), /* variable shift costs. */ | |
1294 | COSTS_N_INSNS (1), /* constant shift costs. */ | |
1295 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ | |
1296 | COSTS_N_INSNS (3), /* HI. */ | |
1297 | COSTS_N_INSNS (3), /* SI. */ | |
1298 | COSTS_N_INSNS (3), /* DI. */ | |
1299 | COSTS_N_INSNS (3)}, /* other. */ | |
1300 | 0, /* cost of multiply per each bit | |
1301 | set. */ | |
1302 | /* Depending on parameters, idiv can get faster on ryzen. This is upper | |
1303 | bound. */ | |
1304 | {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ | |
1305 | COSTS_N_INSNS (22), /* HI. */ | |
1306 | COSTS_N_INSNS (30), /* SI. */ | |
1307 | COSTS_N_INSNS (45), /* DI. */ | |
1308 | COSTS_N_INSNS (45)}, /* other. */ | |
1309 | COSTS_N_INSNS (1), /* cost of movsx. */ | |
1310 | COSTS_N_INSNS (1), /* cost of movzx. */ | |
1311 | 8, /* "large" insn. */ | |
1312 | 9, /* MOVE_RATIO. */ | |
1313 | ||
1314 | /* All move costs are relative to integer->integer move times 2 and thus | |
1315 | they are latency*2. */ | |
1316 | ||
1317 | /* reg-reg moves are done by renaming and thus they are even cheaper than | |
1318 | 1 cycle. Because reg-reg move cost is 2 and following tables correspond | |
1319 | to doubles of latencies, we do not model this correctly. It does not | |
1320 | seem to make practical difference to bump prices up even more. */ | |
1321 | 6, /* cost for loading QImode using | |
1322 | movzbl. */ | |
1323 | {6, 6, 6}, /* cost of loading integer registers | |
1324 | in QImode, HImode and SImode. | |
1325 | Relative to reg-reg move (2). */ | |
1326 | {8, 8, 8}, /* cost of storing integer | |
1327 | registers. */ | |
1328 | 2, /* cost of reg,reg fld/fst. */ | |
1329 | {6, 6, 16}, /* cost of loading fp registers | |
1330 | in SFmode, DFmode and XFmode. */ | |
1331 | {8, 8, 16}, /* cost of storing fp registers | |
1332 | in SFmode, DFmode and XFmode. */ | |
1333 | 2, /* cost of moving MMX register. */ | |
1334 | {6, 6}, /* cost of loading MMX registers | |
1335 | in SImode and DImode. */ | |
1336 | {8, 8}, /* cost of storing MMX registers | |
1337 | in SImode and DImode. */ | |
1338 | 2, 3, 6, /* cost of moving XMM,YMM,ZMM | |
1339 | register. */ | |
1340 | {6, 6, 6, 10, 20}, /* cost of loading SSE registers | |
1341 | in 32,64,128,256 and 512-bit. */ | |
1342 | {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ | |
1343 | {8, 8, 8, 8, 16}, /* cost of storing SSE registers | |
1344 | in 32,64,128,256 and 512-bit. */ | |
1345 | {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ | |
1346 | 6, 6, /* SSE->integer and integer->SSE | |
1347 | moves. */ | |
1348 | /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, | |
1349 | throughput 12. Approx 9 uops do not depend on vector size and every load | |
1350 | is 7 uops. */ | |
1351 | 18, 8, /* Gather load static, per_elt. */ | |
1352 | 18, 10, /* Gather store static, per_elt. */ | |
1353 | 32, /* size of l1 cache. */ | |
1354 | 512, /* size of l2 cache. */ | |
1355 | 64, /* size of prefetch block. */ | |
1356 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1357 | immediately, they are queued. We set number of simultaneous prefetches | |
1358 | to a large constant to reflect this (it probably is not a good idea not | |
1359 | to limit number of prefetches at all, as their execution also takes some | |
1360 | time). */ | |
1361 | 100, /* number of parallel prefetches. */ | |
1362 | 3, /* Branch cost. */ | |
1363 | COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ | |
1364 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ | |
1365 | /* Latency of fdiv is 8-15. */ | |
1366 | COSTS_N_INSNS (15), /* cost of FDIV instruction. */ | |
1367 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
1368 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
1369 | /* Latency of fsqrt is 4-10. */ | |
1370 | COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ | |
1371 | ||
1372 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ | |
1373 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ | |
1374 | COSTS_N_INSNS (3), /* cost of MULSS instruction. */ | |
1375 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
1376 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ | |
1377 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
1378 | COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ | |
1379 | /* Latency of divsd is 9-13. */ |
1380 | COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ | |
1381 | COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ | |
1382 | COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ | |
1383 | /* Zen can execute 4 integer operations per cycle. FP operations | |
1384 | take 3 cycles and it can execute 2 integer additions and 2 | |
1385 | multiplications thus reassociation may make sense up to a width of 6. |
1386 | SPEC2k6 benchmarks suggest |
1387 | that 4 works better than 6 probably due to register pressure. | |
1388 | ||
1389 | Integer vector operations are taken by FP unit and execute 3 vector | |
1390 | plus/minus operations per cycle but only one multiply. This is adjusted | |
1391 | in ix86_reassociation_width. */ | |
1392 | 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ | |
1393 | znver2_memcpy, | |
1394 | znver2_memset, | |
1395 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ | |
1396 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
1397 | "16", /* Loop alignment. */ | |
1398 | "16", /* Jump alignment. */ | |
1399 | "0:0:8", /* Label alignment. */ | |
1400 | "16", /* Func alignment. */ | |
1401 | }; | |
1402 | ||
c234d831 UB |
1403 | /* skylake_cost should produce code tuned for Skylake family of CPUs. */ |
1404 | static stringop_algs skylake_memcpy[2] = { | |
1405 | {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, | |
6e559c70 | 1406 | {libcall, {{16, loop, false}, {512, unrolled_loop, false}, |
c234d831 UB |
1407 | {-1, libcall, false}}}}; |
1408 | ||
1409 | static stringop_algs skylake_memset[2] = { | |
1410 | {libcall, {{6, loop_1_byte, true}, | |
1411 | {24, loop, true}, | |
1412 | {8192, rep_prefix_4_byte, true}, | |
1413 | {-1, libcall, false}}}, | |
6e559c70 | 1414 | {libcall, {{24, loop, true}, {512, unrolled_loop, false}, |
c234d831 UB |
1415 | {-1, libcall, false}}}}; |
1416 | ||
1417 | static const | |
1418 | struct processor_costs skylake_cost = { | |
1419 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1420 | COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ | |
1421 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1422 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1423 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1424 | COSTS_N_INSNS (4), /* HI */ | |
1425 | COSTS_N_INSNS (3), /* SI */ | |
a2ef9558 MT |
1426 | COSTS_N_INSNS (3), /* DI */ |
1427 | COSTS_N_INSNS (3)}, /* other */ | |
c234d831 | 1428 | 0, /* cost of multiply per each bit set */ |
02308bd3 MT |
1429 | /* Expanding div/mod currently doesn't consider parallelism. So the cost |
1430 | model is not realistic. We compensate by increasing the latencies a bit. */ | |
1431 | {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ | |
1432 | COSTS_N_INSNS (11), /* HI */ | |
1433 | COSTS_N_INSNS (14), /* SI */ | |
c234d831 UB |
1434 | COSTS_N_INSNS (76), /* DI */ |
1435 | COSTS_N_INSNS (76)}, /* other */ | |
1436 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1437 | COSTS_N_INSNS (0), /* cost of movzx */ | |
1438 | 8, /* "large" insn */ | |
1439 | 17, /* MOVE_RATIO */ | |
1440 | ||
1441 | 6, /* cost for loading QImode using movzbl */ | |
1442 | {4, 4, 4}, /* cost of loading integer registers | |
1443 | in QImode, HImode and SImode. | |
1444 | Relative to reg-reg move (2). */ | |
001e7337 | 1445 | {6, 6, 3}, /* cost of storing integer registers */ |
c234d831 UB |
1446 | 2, /* cost of reg,reg fld/fst */ |
1447 | {6, 6, 8}, /* cost of loading fp registers | |
1448 | in SFmode, DFmode and XFmode */ | |
1449 | {6, 6, 10}, /* cost of storing fp registers | |
1450 | in SFmode, DFmode and XFmode */ | |
1451 | 2, /* cost of moving MMX register */ | |
1452 | {6, 6}, /* cost of loading MMX registers | |
1453 | in SImode and DImode */ | |
1454 | {6, 6}, /* cost of storing MMX registers | |
1455 | in SImode and DImode */ | |
1456 | 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ | |
1457 | {6, 6, 6, 10, 20}, /* cost of loading SSE registers | |
1458 | in 32,64,128,256 and 512-bit */ | |
1459 | {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ | |
001e7337 | 1460 | {8, 8, 8, 12, 24}, /* cost of storing SSE registers |
c234d831 UB |
1461 | in 32,64,128,256 and 512-bit */ |
1462 | {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ | |
1463 | 2, 2, /* SSE->integer and integer->SSE moves */ | |
1464 | 20, 8, /* Gather load static, per_elt. */ | |
1465 | 22, 10, /* Gather store static, per_elt. */ | |
1466 | 64, /* size of l1 cache. */ | |
1467 | 512, /* size of l2 cache. */ | |
1468 | 64, /* size of prefetch block */ | |
1469 | 6, /* number of parallel prefetches */ | |
1470 | 3, /* Branch cost */ | |
1471 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ | |
1472 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
1473 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ | |
1474 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
1475 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
1476 | COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ | |
1477 | ||
1478 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ | |
1479 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ | |
1480 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
1481 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
1482 | COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ | |
1483 | COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ | |
1484 | COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ | |
1485 | COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ | |
1486 | COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ | |
1487 | COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ | |
1488 | 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ | |
1489 | skylake_memcpy, | |
1490 | skylake_memset, | |
1491 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ | |
1492 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1493 | "16:11:8", /* Loop alignment. */ |
1494 | "16:11:8", /* Jump alignment. */ | |
1495 | "0:0:8", /* Label alignment. */ | |
1496 | "16", /* Func alignment. */ | |
c234d831 | 1497 | }; |
64766e8d JH |
1498 | /* BTVER1 has optimized REP instruction for medium sized blocks, but for |
1499 | very small blocks it is better to use loop. For large blocks, libcall can | |
1500 | do non-temporal accesses and beat inline considerably. */ |
1501 | static stringop_algs btver1_memcpy[2] = { | |
1502 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1503 | {-1, rep_prefix_4_byte, false}}}, | |
1504 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1505 | {-1, libcall, false}}}}; | |
1506 | static stringop_algs btver1_memset[2] = { | |
1507 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1508 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1509 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1510 | {-1, libcall, false}}}}; | |
1511 | const struct processor_costs btver1_cost = { | |
1512 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1513 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
1514 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1515 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1516 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1517 | COSTS_N_INSNS (4), /* HI */ | |
1518 | COSTS_N_INSNS (3), /* SI */ | |
1519 | COSTS_N_INSNS (4), /* DI */ | |
1520 | COSTS_N_INSNS (5)}, /* other */ | |
1521 | 0, /* cost of multiply per each bit set */ | |
1522 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1523 | COSTS_N_INSNS (35), /* HI */ | |
1524 | COSTS_N_INSNS (51), /* SI */ | |
1525 | COSTS_N_INSNS (83), /* DI */ | |
1526 | COSTS_N_INSNS (83)}, /* other */ | |
1527 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1528 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1529 | 8, /* "large" insn */ | |
1530 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1531 | |
1532 | /* All move costs are relative to integer->integer move times 2 and thus | |
1533 | they are latency*2. */ | |
1534 | 8, /* cost for loading QImode using movzbl */ | |
1535 | {6, 8, 6}, /* cost of loading integer registers | |
64766e8d JH |
1536 | in QImode, HImode and SImode. |
1537 | Relative to reg-reg move (2). */ | |
df41dbaf | 1538 | {6, 8, 6}, /* cost of storing integer registers */ |
64766e8d | 1539 | 4, /* cost of reg,reg fld/fst */ |
df41dbaf | 1540 | {12, 12, 28}, /* cost of loading fp registers |
64766e8d | 1541 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1542 | {12, 12, 38}, /* cost of storing fp registers |
64766e8d | 1543 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1544 | 4, /* cost of moving MMX register */ |
1545 | {10, 10}, /* cost of loading MMX registers | |
64766e8d | 1546 | in SImode and DImode */ |
df41dbaf | 1547 | {12, 12}, /* cost of storing MMX registers |
64766e8d | 1548 | in SImode and DImode */ |
df41dbaf | 1549 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
b7167993 | 1550 | {10, 10, 12, 48, 96}, /* cost of loading SSE registers |
df41dbaf | 1551 | in 32,64,128,256 and 512-bit */ |
b7167993 RB |
1552 | {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ |
1553 | {10, 10, 12, 48, 96}, /* cost of storing SSE registers | |
df41dbaf | 1554 | in 32,64,128,256 and 512-bit */ |
b7167993 | 1555 | {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ |
df41dbaf | 1556 | 14, 14, /* SSE->integer and integer->SSE moves */ |
a4fe6139 JH |
1557 | 10, 10, /* Gather load static, per_elt. */ |
1558 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1559 | 32, /* size of l1 cache. */ |
1560 | 512, /* size of l2 cache. */ | |
1561 | 64, /* size of prefetch block */ | |
1562 | 100, /* number of parallel prefetches */ | |
1563 | 2, /* Branch cost */ | |
1564 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
1565 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
1566 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ | |
1567 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1568 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1569 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 1570 | |
c53c148c | 1571 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1572 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1573 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ | |
1574 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
1575 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1576 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1577 | COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ |
1578 | COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ | |
1579 | COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ | |
1580 | COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1581 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1582 | btver1_memcpy, | |
1583 | btver1_memset, | |
f6fd8f2b JH |
1584 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1585 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1586 | "16:11:8", /* Loop alignment. */ |
1587 | "16:8:8", /* Jump alignment. */ | |
1588 | "0:0:8", /* Label alignment. */ | |
1589 | "11", /* Func alignment. */ | |
64766e8d JH |
1590 | }; |
1591 | ||
1592 | static stringop_algs btver2_memcpy[2] = { | |
1593 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1594 | {-1, rep_prefix_4_byte, false}}}, | |
1595 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1596 | {-1, libcall, false}}}}; | |
1597 | static stringop_algs btver2_memset[2] = { | |
1598 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1599 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1600 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1601 | {-1, libcall, false}}}}; | |
1602 | const struct processor_costs btver2_cost = { | |
1603 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1604 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
1605 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1606 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1607 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1608 | COSTS_N_INSNS (4), /* HI */ | |
1609 | COSTS_N_INSNS (3), /* SI */ | |
1610 | COSTS_N_INSNS (4), /* DI */ | |
1611 | COSTS_N_INSNS (5)}, /* other */ | |
1612 | 0, /* cost of multiply per each bit set */ | |
1613 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1614 | COSTS_N_INSNS (35), /* HI */ | |
1615 | COSTS_N_INSNS (51), /* SI */ | |
1616 | COSTS_N_INSNS (83), /* DI */ | |
1617 | COSTS_N_INSNS (83)}, /* other */ | |
1618 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1619 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1620 | 8, /* "large" insn */ | |
1621 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1622 | |
1623 | /* All move costs are relative to integer->integer move times 2 and thus | |
1624 | they are latency*2. */ | |
1625 | 8, /* cost for loading QImode using movzbl */ | |
1626 | {8, 8, 6}, /* cost of loading integer registers | |
64766e8d JH |
1627 | in QImode, HImode and SImode. |
1628 | Relative to reg-reg move (2). */ | |
df41dbaf | 1629 | {8, 8, 6}, /* cost of storing integer registers */ |
64766e8d | 1630 | 4, /* cost of reg,reg fld/fst */ |
df41dbaf | 1631 | {12, 12, 28}, /* cost of loading fp registers |
64766e8d | 1632 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1633 | {12, 12, 38}, /* cost of storing fp registers |
64766e8d | 1634 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1635 | 4, /* cost of moving MMX register */ |
1636 | {10, 10}, /* cost of loading MMX registers | |
64766e8d | 1637 | in SImode and DImode */ |
df41dbaf | 1638 | {12, 12}, /* cost of storing MMX registers |
64766e8d | 1639 | in SImode and DImode */ |
df41dbaf | 1640 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
b7167993 | 1641 | {10, 10, 12, 48, 96}, /* cost of loading SSE registers |
df41dbaf | 1642 | in 32,64,128,256 and 512-bit */ |
b7167993 RB |
1643 | {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ |
1644 | {10, 10, 12, 48, 96}, /* cost of storing SSE registers | |
df41dbaf | 1645 | in 32,64,128,256 and 512-bit */ |
b7167993 | 1646 | {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ |
df41dbaf | 1647 | 14, 14, /* SSE->integer and integer->SSE moves */ |
a4fe6139 JH |
1648 | 10, 10, /* Gather load static, per_elt. */ |
1649 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1650 | 32, /* size of l1 cache. */ |
1651 | 2048, /* size of l2 cache. */ | |
1652 | 64, /* size of prefetch block */ | |
1653 | 100, /* number of parallel prefetches */ | |
1654 | 2, /* Branch cost */ | |
1655 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
1656 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
1657 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ | |
1658 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1659 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1660 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 1661 | |
c53c148c | 1662 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1663 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1664 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ | |
1665 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
1666 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1667 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1668 | COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ |
1669 | COSTS_N_INSNS (19), /* cost of DIVSD instruction. */ | |
1670 | COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */ | |
1671 | COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1672 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1673 | btver2_memcpy, | |
1674 | btver2_memset, | |
f6fd8f2b JH |
1675 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1676 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1677 | "16:11:8", /* Loop alignment. */ |
1678 | "16:8:8", /* Jump alignment. */ | |
1679 | "0:0:8", /* Label alignment. */ | |
1680 | "11", /* Func alignment. */ | |
64766e8d JH |
1681 | }; |
1682 | ||
1683 | static stringop_algs pentium4_memcpy[2] = { | |
1684 | {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, | |
1685 | DUMMY_STRINGOP_ALGS}; | |
1686 | static stringop_algs pentium4_memset[2] = { | |
1687 | {libcall, {{6, loop_1_byte, false}, {48, loop, false}, | |
1688 | {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1689 | DUMMY_STRINGOP_ALGS}; | |
1690 | ||
1691 | static const | |
1692 | struct processor_costs pentium4_cost = { | |
1693 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1694 | COSTS_N_INSNS (3), /* cost of a lea instruction */ | |
1695 | COSTS_N_INSNS (4), /* variable shift costs */ | |
1696 | COSTS_N_INSNS (4), /* constant shift costs */ | |
1697 | {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ | |
1698 | COSTS_N_INSNS (15), /* HI */ | |
1699 | COSTS_N_INSNS (15), /* SI */ | |
1700 | COSTS_N_INSNS (15), /* DI */ | |
1701 | COSTS_N_INSNS (15)}, /* other */ | |
1702 | 0, /* cost of multiply per each bit set */ | |
1703 | {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ | |
1704 | COSTS_N_INSNS (56), /* HI */ | |
1705 | COSTS_N_INSNS (56), /* SI */ | |
1706 | COSTS_N_INSNS (56), /* DI */ | |
1707 | COSTS_N_INSNS (56)}, /* other */ | |
1708 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1709 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1710 | 16, /* "large" insn */ | |
1711 | 6, /* MOVE_RATIO */ | |
df41dbaf JH |
1712 | |
1713 | /* All move costs are relative to integer->integer move times 2 and thus | |
1714 | they are latency*2. */ | |
1715 | 5, /* cost for loading QImode using movzbl */ | |
64766e8d JH |
1716 | {4, 5, 4}, /* cost of loading integer registers |
1717 | in QImode, HImode and SImode. | |
1718 | Relative to reg-reg move (2). */ | |
1719 | {2, 3, 2}, /* cost of storing integer registers */ | |
df41dbaf JH |
1720 | 12, /* cost of reg,reg fld/fst */ |
1721 | {14, 14, 14}, /* cost of loading fp registers | |
64766e8d | 1722 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1723 | {14, 14, 14}, /* cost of storing fp registers |
64766e8d | 1724 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1725 | 12, /* cost of moving MMX register */ |
1726 | {16, 16}, /* cost of loading MMX registers | |
64766e8d | 1727 | in SImode and DImode */ |
df41dbaf | 1728 | {16, 16}, /* cost of storing MMX registers |
64766e8d | 1729 | in SImode and DImode */ |
df41dbaf JH |
1730 | 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ |
1731 | {16, 16, 16, 32, 64}, /* cost of loading SSE registers | |
1732 | in 32,64,128,256 and 512-bit */ | |
1733 | {32, 32, 32, 64, 128}, /* cost of unaligned loads. */ | |
1734 | {16, 16, 16, 32, 64}, /* cost of storing SSE registers | |
1735 | in 32,64,128,256 and 512-bit */ | |
1736 | {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ | |
1737 | 20, 12, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1738 | 16, 16, /* Gather load static, per_elt. */ |
1739 | 16, 16, /* Gather store static, per_elt. */ | |
64766e8d JH |
1740 | 8, /* size of l1 cache. */ |
1741 | 256, /* size of l2 cache. */ | |
1742 | 64, /* size of prefetch block */ | |
1743 | 6, /* number of parallel prefetches */ | |
1744 | 2, /* Branch cost */ | |
1745 | COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ | |
1746 | COSTS_N_INSNS (7), /* cost of FMUL instruction. */ | |
1747 | COSTS_N_INSNS (43), /* cost of FDIV instruction. */ | |
1748 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1749 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1750 | COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ | |
6065f444 | 1751 | |
c53c148c | 1752 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1753 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1754 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ | |
1755 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ | |
c53c148c JH |
1756 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1757 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1758 | COSTS_N_INSNS (23), /* cost of DIVSS instruction. */ |
1759 | COSTS_N_INSNS (38), /* cost of DIVSD instruction. */ | |
1760 | COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */ | |
1761 | COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1762 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1763 | pentium4_memcpy, | |
1764 | pentium4_memset, | |
f6fd8f2b JH |
1765 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
1766 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1767 | NULL, /* Loop alignment. */ |
1768 | NULL, /* Jump alignment. */ | |
1769 | NULL, /* Label alignment. */ | |
1770 | NULL, /* Func alignment. */ | |
64766e8d JH |
1771 | }; |
1772 | ||
1773 | static stringop_algs nocona_memcpy[2] = { | |
1774 | {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, | |
1775 | {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, | |
1776 | {100000, unrolled_loop, false}, {-1, libcall, false}}}}; | |
1777 | ||
1778 | static stringop_algs nocona_memset[2] = { | |
1779 | {libcall, {{6, loop_1_byte, false}, {48, loop, false}, | |
1780 | {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1781 | {libcall, {{24, loop, false}, {64, unrolled_loop, false}, | |
1782 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
1783 | ||
1784 | static const | |
1785 | struct processor_costs nocona_cost = { | |
1786 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1787 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
1788 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1789 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1790 | {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ | |
1791 | COSTS_N_INSNS (10), /* HI */ | |
1792 | COSTS_N_INSNS (10), /* SI */ | |
1793 | COSTS_N_INSNS (10), /* DI */ | |
1794 | COSTS_N_INSNS (10)}, /* other */ | |
1795 | 0, /* cost of multiply per each bit set */ | |
1796 | {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ | |
1797 | COSTS_N_INSNS (66), /* HI */ | |
1798 | COSTS_N_INSNS (66), /* SI */ | |
1799 | COSTS_N_INSNS (66), /* DI */ | |
1800 | COSTS_N_INSNS (66)}, /* other */ | |
1801 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1802 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1803 | 16, /* "large" insn */ | |
1804 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
1805 | |
1806 | /* All move costs are relative to integer->integer move times 2 and thus | |
1807 | they are latency*2. */ | |
64766e8d JH |
1808 | 4, /* cost for loading QImode using movzbl */ |
1809 | {4, 4, 4}, /* cost of loading integer registers | |
1810 | in QImode, HImode and SImode. | |
1811 | Relative to reg-reg move (2). */ | |
1812 | {4, 4, 4}, /* cost of storing integer registers */ | |
df41dbaf JH |
1813 | 12, /* cost of reg,reg fld/fst */ |
1814 | {14, 14, 14}, /* cost of loading fp registers | |
64766e8d | 1815 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1816 | {14, 14, 14}, /* cost of storing fp registers |
64766e8d | 1817 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1818 | 14, /* cost of moving MMX register */ |
64766e8d JH |
1819 | {12, 12}, /* cost of loading MMX registers |
1820 | in SImode and DImode */ | |
1821 | {12, 12}, /* cost of storing MMX registers | |
1822 | in SImode and DImode */ | |
df41dbaf JH |
1823 | 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ |
1824 | {12, 12, 12, 24, 48}, /* cost of loading SSE registers | |
1825 | in 32,64,128,256 and 512-bit */ | |
1826 | {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ | |
1827 | {12, 12, 12, 24, 48}, /* cost of storing SSE registers | |
1828 | in 32,64,128,256 and 512-bit */ | |
1829 | {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ | |
1830 | 20, 12, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1831 | 12, 12, /* Gather load static, per_elt. */ |
1832 | 12, 12, /* Gather store static, per_elt. */ | |
64766e8d JH |
1833 | 8, /* size of l1 cache. */ |
1834 | 1024, /* size of l2 cache. */ | |
1835 | 64, /* size of prefetch block */ | |
1836 | 8, /* number of parallel prefetches */ | |
1837 | 1, /* Branch cost */ | |
1838 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
1839 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ | |
1840 | COSTS_N_INSNS (40), /* cost of FDIV instruction. */ | |
1841 | COSTS_N_INSNS (3), /* cost of FABS instruction. */ | |
1842 | COSTS_N_INSNS (3), /* cost of FCHS instruction. */ | |
1843 | COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ | |
6065f444 | 1844 | |
c53c148c | 1845 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1846 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1847 | COSTS_N_INSNS (7), /* cost of MULSS instruction. */ | |
1848 | COSTS_N_INSNS (7), /* cost of MULSD instruction. */ | |
c53c148c JH |
1849 | COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ |
1850 | COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1851 | COSTS_N_INSNS (32), /* cost of DIVSS instruction. */ |
1852 | COSTS_N_INSNS (40), /* cost of DIVSD instruction. */ | |
1853 | COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ | |
1854 | COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1855 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1856 | nocona_memcpy, | |
1857 | nocona_memset, | |
f6fd8f2b JH |
1858 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
1859 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1860 | NULL, /* Loop alignment. */ |
1861 | NULL, /* Jump alignment. */ | |
1862 | NULL, /* Label alignment. */ | |
1863 | NULL, /* Func alignment. */ | |
64766e8d JH |
1864 | }; |
1865 | ||
/* memcpy expansion strategies referenced by the atom_cost initializer below.
   NOTE(review): the two array elements are presumably the 32-bit and 64-bit
   variants, and each inner triple presumably {max size, algorithm, noalign}
   with -1 meaning "no upper bound" -- confirm against the stringop_algs
   declaration.  */
1866 | static stringop_algs atom_memcpy[2] = { | |
1867 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, | |
1868 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, | |
1869 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
/* memset expansion strategies referenced by the atom_cost initializer below.
   NOTE(review): element/triple layout presumably matches the memcpy tables
   ({max size, algorithm, noalign}, -1 = no upper bound) -- confirm against
   the stringop_algs declaration.  */
1870 | static stringop_algs atom_memset[2] = { | |
1871 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, | |
1872 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1873 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, | |
1874 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
/* Cost table atom_cost.  NOTE(review): the name suggests Intel Atom tuning --
   confirm where it is selected.  The initializer is positional, so entry
   order must match the field order of struct processor_costs; the trailing
   comments are the only per-field labels.  */
1875 | static const | |
1876 | struct processor_costs atom_cost = { | |
1877 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1878 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
1879 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1880 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1881 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1882 | COSTS_N_INSNS (4), /* HI */ | |
1883 | COSTS_N_INSNS (3), /* SI */ | |
1884 | COSTS_N_INSNS (4), /* DI */ | |
1885 | COSTS_N_INSNS (2)}, /* other */ | |
1886 | 0, /* cost of multiply per each bit set */ | |
1887 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
1888 | COSTS_N_INSNS (26), /* HI */ | |
1889 | COSTS_N_INSNS (42), /* SI */ | |
1890 | COSTS_N_INSNS (74), /* DI */ | |
1891 | COSTS_N_INSNS (74)}, /* other */ | |
1892 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1893 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1894 | 8, /* "large" insn */ | |
1895 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
1896 | |
1897 | /* All move costs are relative to integer->integer move times 2 and thus | |
1898 | they are latency*2. */ | |
1899 | 6, /* cost for loading QImode using movzbl */ | |
1900 | {6, 6, 6}, /* cost of loading integer registers | |
64766e8d JH |
1901 | in QImode, HImode and SImode. |
1902 | Relative to reg-reg move (2). */ | |
df41dbaf | 1903 | {6, 6, 6}, /* cost of storing integer registers */ |
64766e8d | 1904 | 4, /* cost of reg,reg fld/fst */ |
df41dbaf | 1905 | {6, 6, 18}, /* cost of loading fp registers |
64766e8d | 1906 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1907 | {14, 14, 24}, /* cost of storing fp registers |
64766e8d JH |
1908 | in SFmode, DFmode and XFmode */ |
1909 | 2, /* cost of moving MMX register */ | |
1910 | {8, 8}, /* cost of loading MMX registers | |
1911 | in SImode and DImode */ | |
df41dbaf | 1912 | {10, 10}, /* cost of storing MMX registers |
64766e8d | 1913 | in SImode and DImode */ |
df41dbaf JH |
1914 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1915 | {8, 8, 8, 16, 32}, /* cost of loading SSE registers | |
1916 | in 32,64,128,256 and 512-bit */ | |
1917 | {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ | |
1918 | {8, 8, 8, 16, 32}, /* cost of storing SSE registers | |
1919 | in 32,64,128,256 and 512-bit */ | |
1920 | {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ | |
1921 | 8, 6, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1922 | 8, 8, /* Gather load static, per_elt. */ |
1923 | 8, 8, /* Gather store static, per_elt. */ | |
64766e8d JH |
1924 | 32, /* size of l1 cache. */ |
1925 | 256, /* size of l2 cache. */ | |
1926 | 64, /* size of prefetch block */ | |
1927 | 6, /* number of parallel prefetches */ | |
1928 | 3, /* Branch cost */ | |
1929 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ | |
1930 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ | |
1931 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ | |
1932 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ | |
1933 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ | |
1934 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ | |
6065f444 | 1935 | |
c53c148c | 1936 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1937 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1938 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
1939 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
c53c148c JH |
1940 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1941 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1942 | COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ |
1943 | COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ | |
1944 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ | |
1945 | COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1946 | 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ |
1947 | atom_memcpy, | |
1948 | atom_memset, | |
f6fd8f2b JH |
1949 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
1950 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
1951 | "16", /* Loop alignment. */ |
1952 | "16:8:8", /* Jump alignment. */ | |
1953 | "0:0:8", /* Label alignment. */ | |
1954 | "16", /* Func alignment. */ | |
64766e8d JH |
1955 | }; |
1956 | ||
/* memcpy expansion strategies referenced by the slm_cost initializer below.
   NOTE(review): the two array elements are presumably the 32-bit and 64-bit
   variants, and each inner triple presumably {max size, algorithm, noalign}
   with -1 meaning "no upper bound" -- confirm against the stringop_algs
   declaration.  */
1957 | static stringop_algs slm_memcpy[2] = { | |
1958 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, | |
1959 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, | |
1960 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
/* memset expansion strategies referenced by the slm_cost initializer below.
   NOTE(review): element/triple layout presumably matches the memcpy tables
   ({max size, algorithm, noalign}, -1 = no upper bound) -- confirm against
   the stringop_algs declaration.  */
1961 | static stringop_algs slm_memset[2] = { | |
1962 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, | |
1963 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1964 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, | |
1965 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
/* Cost table slm_cost.  NOTE(review): the name suggests Intel Silvermont
   tuning -- confirm where it is selected.  The initializer is positional, so
   entry order must match the field order of struct processor_costs; the
   trailing comments are the only per-field labels.  */
1966 | static const | |
1967 | struct processor_costs slm_cost = { | |
1968 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1969 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
1970 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1971 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1972 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1973 | COSTS_N_INSNS (3), /* HI */ | |
1974 | COSTS_N_INSNS (3), /* SI */ | |
1975 | COSTS_N_INSNS (4), /* DI */ | |
1976 | COSTS_N_INSNS (2)}, /* other */ | |
1977 | 0, /* cost of multiply per each bit set */ | |
1978 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
1979 | COSTS_N_INSNS (26), /* HI */ | |
1980 | COSTS_N_INSNS (42), /* SI */ | |
1981 | COSTS_N_INSNS (74), /* DI */ | |
1982 | COSTS_N_INSNS (74)}, /* other */ | |
1983 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1984 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1985 | 8, /* "large" insn */ | |
1986 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
1987 | |
1988 | /* All move costs are relative to integer->integer move times 2 and thus | |
1989 | they are latency*2. */ | |
1990 | 8, /* cost for loading QImode using movzbl */ | |
1991 | {8, 8, 8}, /* cost of loading integer registers | |
64766e8d JH |
1992 | in QImode, HImode and SImode. |
1993 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
1994 | {6, 6, 6}, /* cost of storing integer registers */ |
1995 | 2, /* cost of reg,reg fld/fst */ | |
1996 | {8, 8, 18}, /* cost of loading fp registers | |
64766e8d | 1997 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1998 | {6, 6, 18}, /* cost of storing fp registers |
64766e8d JH |
1999 | in SFmode, DFmode and XFmode */ |
2000 | 2, /* cost of moving MMX register */ | |
2001 | {8, 8}, /* cost of loading MMX registers | |
2002 | in SImode and DImode */ | |
df41dbaf | 2003 | {6, 6}, /* cost of storing MMX registers |
64766e8d | 2004 | in SImode and DImode */ |
df41dbaf JH |
2005 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
2006 | {8, 8, 8, 16, 32}, /* cost of loading SSE registers | |
2007 | in 32,64,128,256 and 512-bit */ | |
2008 | {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ | |
2009 | {8, 8, 8, 16, 32}, /* cost of storing SSE registers | |
2010 | in 32,64,128,256 and 512-bit */ | |
2011 | {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ | |
2012 | 8, 6, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
2013 | 8, 8, /* Gather load static, per_elt. */ |
2014 | 8, 8, /* Gather store static, per_elt. */ | |
64766e8d JH |
2015 | 32, /* size of l1 cache. */ |
2016 | 256, /* size of l2 cache. */ | |
2017 | 64, /* size of prefetch block */ | |
2018 | 6, /* number of parallel prefetches */ | |
2019 | 3, /* Branch cost */ | |
2020 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ | |
2021 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ | |
2022 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ | |
2023 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ | |
2024 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ | |
2025 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ | |
6065f444 | 2026 | |
c53c148c | 2027 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
2028 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2029 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
2030 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
c53c148c JH |
2031 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
2032 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
2033 | COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ |
2034 | COSTS_N_INSNS (69), /* cost of DIVSD instruction. */ | |
2035 | COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */ | |
2036 | COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
2037 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
2038 | slm_memcpy, | |
2039 | slm_memset, | |
f6fd8f2b JH |
2040 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2041 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
2042 | "16", /* Loop alignment. */ |
2043 | "16:8:8", /* Jump alignment. */ | |
2044 | "0:0:8", /* Label alignment. */ | |
2045 | "16", /* Func alignment. */ | |
64766e8d JH |
2046 | }; |
2047 | ||
/* memcpy expansion strategies referenced by the intel_cost initializer below.
   NOTE(review): the two array elements are presumably the 32-bit and 64-bit
   variants, and each inner triple presumably {max size, algorithm, noalign}
   with -1 meaning "no upper bound" -- confirm against the stringop_algs
   declaration.  */
2048 | static stringop_algs intel_memcpy[2] = { | |
2049 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, | |
2050 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, | |
2051 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
/* memset expansion strategies referenced by the intel_cost initializer below.
   NOTE(review): element/triple layout presumably matches the memcpy tables
   ({max size, algorithm, noalign}, -1 = no upper bound) -- confirm against
   the stringop_algs declaration.  */
2052 | static stringop_algs intel_memset[2] = { | |
2053 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, | |
2054 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
2055 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, | |
2056 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
/* Cost table intel_cost.  NOTE(review): presumably the table behind a
   generic Intel tuning option -- confirm where it is selected.  The
   initializer is positional, so entry order must match the field order of
   struct processor_costs; the trailing comments are the only per-field
   labels.  */
2057 | static const | |
2058 | struct processor_costs intel_cost = { | |
2059 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
2060 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
2061 | COSTS_N_INSNS (1), /* variable shift costs */ | |
2062 | COSTS_N_INSNS (1), /* constant shift costs */ | |
2063 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
2064 | COSTS_N_INSNS (3), /* HI */ | |
2065 | COSTS_N_INSNS (3), /* SI */ | |
2066 | COSTS_N_INSNS (4), /* DI */ | |
2067 | COSTS_N_INSNS (2)}, /* other */ | |
2068 | 0, /* cost of multiply per each bit set */ | |
2069 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
2070 | COSTS_N_INSNS (26), /* HI */ | |
2071 | COSTS_N_INSNS (42), /* SI */ | |
2072 | COSTS_N_INSNS (74), /* DI */ | |
2073 | COSTS_N_INSNS (74)}, /* other */ | |
2074 | COSTS_N_INSNS (1), /* cost of movsx */ | |
2075 | COSTS_N_INSNS (1), /* cost of movzx */ | |
2076 | 8, /* "large" insn */ | |
2077 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
2078 | |
2079 | /* All move costs are relative to integer->integer move times 2 and thus | |
2080 | they are latency*2. */ | |
af863030 | 2081 | 6, /* cost for loading QImode using movzbl */ |
64766e8d JH |
2082 | {4, 4, 4}, /* cost of loading integer registers |
2083 | in QImode, HImode and SImode. | |
2084 | Relative to reg-reg move (2). */ | |
af863030 JH |
2085 | {6, 6, 6}, /* cost of storing integer registers */ |
2086 | 2, /* cost of reg,reg fld/fst */ | |
2087 | {6, 6, 8}, /* cost of loading fp registers | |
64766e8d | 2088 | in SFmode, DFmode and XFmode */ |
af863030 | 2089 | {6, 6, 10}, /* cost of storing fp registers |
64766e8d JH |
2090 | in SFmode, DFmode and XFmode */ |
2091 | 2, /* cost of moving MMX register */ | |
af863030 | 2092 | {6, 6}, /* cost of loading MMX registers |
64766e8d | 2093 | in SImode and DImode */ |
af863030 | 2094 | {6, 6}, /* cost of storing MMX registers |
64766e8d | 2095 | in SImode and DImode */ |
df41dbaf JH |
2096 | 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ |
2097 | {6, 6, 6, 6, 6}, /* cost of loading SSE registers | |
2098 | in 32,64,128,256 and 512-bit */ | |
2099 | {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ | |
2100 | {6, 6, 6, 6, 6}, /* cost of storing SSE registers | |
2101 | in 32,64,128,256 and 512-bit */ | |
2102 | {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ | |
2103 | 4, 4, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
2104 | 6, 6, /* Gather load static, per_elt. */ |
2105 | 6, 6, /* Gather store static, per_elt. */ | |
64766e8d JH |
2106 | 32, /* size of l1 cache. */ |
2107 | 256, /* size of l2 cache. */ | |
2108 | 64, /* size of prefetch block */ | |
2109 | 6, /* number of parallel prefetches */ | |
2110 | 3, /* Branch cost */ | |
2111 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ | |
2112 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ | |
2113 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ | |
2114 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ | |
2115 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ | |
2116 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ | |
6065f444 | 2117 | |
3ff59baa | 2118 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
2119 | COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2120 | COSTS_N_INSNS (8), /* cost of MULSS instruction. */ | |
2121 | COSTS_N_INSNS (8), /* cost of MULSD instruction. */ | |
c53c148c JH |
2122 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
2123 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
2124 | COSTS_N_INSNS (20), /* cost of DIVSS instruction. */ |
2125 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ | |
2126 | COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */ | |
2127 | COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
2128 | 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
2129 | intel_memcpy, | |
2130 | intel_memset, | |
f6fd8f2b JH |
2131 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2132 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
2133 | "16", /* Loop alignment. */ |
2134 | "16:8:8", /* Jump alignment. */ | |
2135 | "0:0:8", /* Label alignment. */ | |
2136 | "16", /* Func alignment. */ | |
64766e8d JH |
2137 | }; |
2138 | ||
2139 | /* Generic should produce code tuned for Core-i7 (and newer chips) | |
2140 | and btver1 (and newer chips). */ | |
2141 | ||
/* memcpy expansion strategies referenced by the generic_cost initializer
   below.  NOTE(review): the two array elements are presumably the 32-bit and
   64-bit variants, and each inner triple presumably {max size, algorithm,
   noalign} with -1 meaning "no upper bound" -- confirm against the
   stringop_algs declaration.  */
2142 | static stringop_algs generic_memcpy[2] = { | |
2143 | {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, | |
2144 | {-1, libcall, false}}}, | |
2145 | {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, | |
2146 | {-1, libcall, false}}}}; | |
/* memset expansion strategies referenced by the generic_cost initializer
   below.  NOTE(review): element/triple layout presumably matches the memcpy
   tables ({max size, algorithm, noalign}, -1 = no upper bound) -- confirm
   against the stringop_algs declaration.  */
2147 | static stringop_algs generic_memset[2] = { | |
2148 | {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, | |
2149 | {-1, libcall, false}}}, | |
2150 | {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, | |
2151 | {-1, libcall, false}}}}; | |
/* Cost table generic_cost.  The initializer is positional, so entry order
   must match the field order of struct processor_costs; the trailing
   comments are the only per-field labels.  */
2152 | static const | |
2153 | struct processor_costs generic_cost = { | |
2154 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
ef9eec0b | 2155 | /* Setting cost to 2 makes our current implementation of synth_mult result in |
64766e8d JH |
2156 | use of unnecessary temporary registers causing regression on several |
2157 | SPECfp benchmarks. */ | |
2158 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
2159 | COSTS_N_INSNS (1), /* variable shift costs */ | |
2160 | COSTS_N_INSNS (1), /* constant shift costs */ | |
2161 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
2162 | COSTS_N_INSNS (4), /* HI */ | |
2163 | COSTS_N_INSNS (3), /* SI */ | |
2164 | COSTS_N_INSNS (4), /* DI */ | |
7c080ade | 2165 | COSTS_N_INSNS (4)}, /* other */ |
64766e8d | 2166 | 0, /* cost of multiply per each bit set */ |
7c080ade JH |
2167 | {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ |
2168 | COSTS_N_INSNS (22), /* HI */ | |
2169 | COSTS_N_INSNS (30), /* SI */ | |
64766e8d JH |
2170 | COSTS_N_INSNS (74), /* DI */ |
2171 | COSTS_N_INSNS (74)}, /* other */ | |
2172 | COSTS_N_INSNS (1), /* cost of movsx */ | |
2173 | COSTS_N_INSNS (1), /* cost of movzx */ | |
2174 | 8, /* "large" insn */ | |
2175 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
2176 | |
2177 | /* All move costs are relative to integer->integer move times 2 and thus | |
2178 | they are latency*2. */ | |
d555138e JH |
2179 | 6, /* cost for loading QImode using movzbl */ |
2180 | {6, 6, 6}, /* cost of loading integer registers | |
64766e8d JH |
2181 | in QImode, HImode and SImode. |
2182 | Relative to reg-reg move (2). */ | |
af863030 | 2183 | {6, 6, 6}, /* cost of storing integer registers */ |
64766e8d | 2184 | 4, /* cost of reg,reg fld/fst */ |
af863030 | 2185 | {6, 6, 12}, /* cost of loading fp registers |
64766e8d | 2186 | in SFmode, DFmode and XFmode */ |
af863030 | 2187 | {6, 6, 12}, /* cost of storing fp registers |
64766e8d JH |
2188 | in SFmode, DFmode and XFmode */ |
2189 | 2, /* cost of moving MMX register */ | |
af863030 | 2190 | {6, 6}, /* cost of loading MMX registers |
64766e8d | 2191 | in SImode and DImode */ |
af863030 | 2192 | {6, 6}, /* cost of storing MMX registers |
64766e8d | 2193 | in SImode and DImode */ |
df41dbaf JH |
2194 | 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ |
2195 | {6, 6, 6, 10, 15}, /* cost of loading SSE registers | |
2196 | in 32,64,128,256 and 512-bit */ | |
7c080ade | 2197 | {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ |
df41dbaf JH |
2198 | {6, 6, 6, 10, 15}, /* cost of storing SSE registers |
2199 | in 32,64,128,256 and 512-bit */ | |
7c080ade JH |
2200 | {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ |
2201 | 6, 6, /* SSE->integer and integer->SSE moves */ | |
2202 | 18, 6, /* Gather load static, per_elt. */ | |
2203 | 18, 6, /* Gather store static, per_elt. */ | |
64766e8d JH |
2204 | 32, /* size of l1 cache. */ |
2205 | 512, /* size of l2 cache. */ | |
2206 | 64, /* size of prefetch block */ | |
2207 | 6, /* number of parallel prefetches */ | |
2208 | /* Benchmarks show large regressions on K8 sixtrack benchmark when this | |
2209 | value is increased to perhaps more appropriate value of 5. */ | |
2210 | 3, /* Branch cost */ | |
ef9eec0b | 2211 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
7c080ade | 2212 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ |
e8e3054e | 2213 | COSTS_N_INSNS (17), /* cost of FDIV instruction. */ |
ef9eec0b JH |
2214 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
2215 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
e8e3054e | 2216 | COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ |
6065f444 | 2217 | |
ef9eec0b JH |
2218 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
2219 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ | |
2220 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
2221 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
2222 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ | |
2223 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
e8e3054e JH |
2224 | COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ |
2225 | COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ | |
2226 | COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ | |
2227 | COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ | |
7c080ade | 2228 | 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ |
64766e8d JH |
2229 | generic_memcpy, |
2230 | generic_memset, | |
e8e3054e JH |
2231 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
2232 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
2233 | "16:11:8", /* Loop alignment. */ |
2234 | "16:11:8", /* Jump alignment. */ | |
2235 | "0:0:8", /* Label alignment. */ | |
2236 | "16", /* Func alignment. */ | |
64766e8d JH |
2237 | }; |
2238 | ||
2239 | /* core_cost should produce code tuned for Core family of CPUs. */ | |
/* memcpy expansion strategies referenced by the core_cost initializer below.
   NOTE(review): the two array elements are presumably the 32-bit and 64-bit
   variants, and each inner triple presumably {max size, algorithm, noalign}
   with -1 meaning "no upper bound" -- confirm against the stringop_algs
   declaration.  */
2240 | static stringop_algs core_memcpy[2] = { | |
2241 | {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, | |
2242 | {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, | |
2243 | {-1, libcall, false}}}}; | |
/* memset expansion strategies referenced by the core_cost initializer below.
   NOTE(review): element/triple layout presumably matches the memcpy tables
   ({max size, algorithm, noalign}, -1 = no upper bound) -- confirm against
   the stringop_algs declaration.  */
2244 | static stringop_algs core_memset[2] = { | |
2245 | {libcall, {{6, loop_1_byte, true}, | |
2246 | {24, loop, true}, | |
2247 | {8192, rep_prefix_4_byte, true}, | |
2248 | {-1, libcall, false}}}, | |
2249 | {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, | |
2250 | {-1, libcall, false}}}}; | |
2251 | ||
/* Cost table core_cost.  The initializer is positional, so entry order must
   match the field order of struct processor_costs; the trailing comments are
   the only per-field labels.  */
2252 | static const | |
2253 | struct processor_costs core_cost = { | |
2254 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
2255 | /* On all chips taken into consideration lea is 2 cycles and more. With | |
2256 | this cost however our current implementation of synth_mult results in | |
2257 | use of unnecessary temporary registers causing regression on several | |
2258 | SPECfp benchmarks. */ | |
2259 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
2260 | COSTS_N_INSNS (1), /* variable shift costs */ | |
2261 | COSTS_N_INSNS (1), /* constant shift costs */ | |
2262 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
2263 | COSTS_N_INSNS (4), /* HI */ | |
2264 | COSTS_N_INSNS (3), /* SI */ | |
a2ef9558 MT |
2265 | /* Here we tune for Sandybridge or newer. */ |
2266 | COSTS_N_INSNS (3), /* DI */ | |
2267 | COSTS_N_INSNS (3)}, /* other */ | |
64766e8d | 2268 | 0, /* cost of multiply per each bit set */ |
02308bd3 MT |
2269 | /* Expanding div/mod currently doesn't consider parallelism. So the cost |
2270 | model is not realistic. We compensate by increasing the latencies a bit. */ | |
2271 | {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ | |
2272 | COSTS_N_INSNS (11), /* HI */ | |
2273 | COSTS_N_INSNS (14), /* SI */ | |
ffa3ce53 JH |
2274 | COSTS_N_INSNS (81), /* DI */ |
2275 | COSTS_N_INSNS (81)}, /* other */ | |
64766e8d JH |
2276 | COSTS_N_INSNS (1), /* cost of movsx */ |
2277 | COSTS_N_INSNS (1), /* cost of movzx */ | |
2278 | 8, /* "large" insn */ | |
2279 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
2280 | |
2281 | /* All move costs are relative to integer->integer move times 2 and thus | |
2282 | they are latency*2. */ | |
ffa3ce53 | 2283 | 6, /* cost for loading QImode using movzbl */ |
64766e8d JH |
2284 | {4, 4, 4}, /* cost of loading integer registers |
2285 | in QImode, HImode and SImode. | |
2286 | Relative to reg-reg move (2). */ | |
ffa3ce53 JH |
2287 | {6, 6, 6}, /* cost of storing integer registers */ |
2288 | 2, /* cost of reg,reg fld/fst */ | |
2289 | {6, 6, 8}, /* cost of loading fp registers | |
64766e8d | 2290 | in SFmode, DFmode and XFmode */ |
af863030 | 2291 | {6, 6, 10}, /* cost of storing fp registers |
64766e8d JH |
2292 | in SFmode, DFmode and XFmode */ |
2293 | 2, /* cost of moving MMX register */ | |
ffa3ce53 | 2294 | {6, 6}, /* cost of loading MMX registers |
64766e8d | 2295 | in SImode and DImode */ |
ffa3ce53 | 2296 | {6, 6}, /* cost of storing MMX registers |
64766e8d | 2297 | in SImode and DImode */ |
df41dbaf JH |
2298 | 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ |
2299 | {6, 6, 6, 6, 12}, /* cost of loading SSE registers | |
2300 | in 32,64,128,256 and 512-bit */ | |
2301 | {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ | |
2302 | {6, 6, 6, 6, 12}, /* cost of storing SSE registers | |
2303 | in 32,64,128,256 and 512-bit */ | |
2304 | {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ | |
2305 | 2, 2, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
2306 | /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, |
2307 | rec. throughput 6. | |
2308 | So 5 uops statically and one uops per load. */ | |
/* NOTE(review): the comment above names VGATHERDPD twice with different uop
   counts; one occurrence presumably refers to a different gather form --
   confirm against the instruction tables.  */
2309 | 10, 6, /* Gather load static, per_elt. */ | |
2310 | 10, 6, /* Gather store static, per_elt. */ | |
64766e8d JH |
2311 | 64, /* size of l1 cache. */ |
2312 | 512, /* size of l2 cache. */ | |
2313 | 64, /* size of prefetch block */ | |
2314 | 6, /* number of parallel prefetches */ | |
2315 | /* FIXME perhaps more appropriate value is 5. */ | |
2316 | 3, /* Branch cost */ | |
ef9eec0b JH |
2317 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
2318 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ | |
ffa3ce53 | 2319 | /* 10-24 */ |
ef9eec0b JH |
2320 | COSTS_N_INSNS (24), /* cost of FDIV instruction. */ |
2321 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
2322 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
ffa3ce53 | 2323 | COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ |
6065f444 | 2324 | |
c53c148c | 2325 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
2326 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2327 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
2328 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
c53c148c JH |
2329 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
2330 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
6065f444 JH |
2331 | COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ |
2332 | COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ | |
2333 | COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ | |
2334 | COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
2335 | 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ |
2336 | core_memcpy, | |
2337 | core_memset, | |
f6fd8f2b JH |
2338 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2339 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
7dc58b50 ML |
2340 | "16:11:8", /* Loop alignment. */ |
2341 | "16:11:8", /* Jump alignment. */ | |
2342 | "0:0:8", /* Label alignment. */ | |
2343 | "16", /* Func alignment. */ | |
64766e8d JH |
2344 | }; |
2345 |