]>
Commit | Line | Data |
---|---|---|
df41dbaf | 1 | /* Costs of operations of individual x86 CPUs. |
85ec4feb | 2 | Copyright (C) 1988-2018 Free Software Foundation, Inc. |
64766e8d | 3 | |
df41dbaf JH |
4 | This file is part of GCC. |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify | |
7 | it under the terms of the GNU General Public License as published by | |
8 | the Free Software Foundation; either version 3, or (at your option) | |
9 | any later version. | |
10 | ||
11 | GCC is distributed in the hope that it will be useful, | |
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | GNU General Public License for more details. | |
15 | ||
16 | Under Section 7 of GPL version 3, you are granted additional | |
17 | permissions described in the GCC Runtime Library Exception, version | |
18 | 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | You should have received a copy of the GNU General Public License and | |
21 | a copy of the GCC Runtime Library Exception along with this program; | |
22 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | <http://www.gnu.org/licenses/>. */ | |
64766e8d JH |
24 | /* Processor costs (relative to an add) */ |
25 | /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ | |
26 | #define COSTS_N_BYTES(N) ((N) * 2) | |
27 | ||
28 | #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}} | |
29 | ||
30 | static stringop_algs ix86_size_memcpy[2] = { | |
31 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, | |
32 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; | |
33 | static stringop_algs ix86_size_memset[2] = { | |
34 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, | |
35 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}}; | |
36 | ||
37 | const | |
38 | struct processor_costs ix86_size_cost = {/* costs for tuning for size */ | |
39 | COSTS_N_BYTES (2), /* cost of an add instruction */ | |
40 | COSTS_N_BYTES (3), /* cost of a lea instruction */ | |
41 | COSTS_N_BYTES (2), /* variable shift costs */ | |
42 | COSTS_N_BYTES (3), /* constant shift costs */ | |
43 | {COSTS_N_BYTES (3), /* cost of starting multiply for QI */ | |
44 | COSTS_N_BYTES (3), /* HI */ | |
45 | COSTS_N_BYTES (3), /* SI */ | |
46 | COSTS_N_BYTES (3), /* DI */ | |
47 | COSTS_N_BYTES (5)}, /* other */ | |
48 | 0, /* cost of multiply per each bit set */ | |
49 | {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */ | |
50 | COSTS_N_BYTES (3), /* HI */ | |
51 | COSTS_N_BYTES (3), /* SI */ | |
52 | COSTS_N_BYTES (3), /* DI */ | |
53 | COSTS_N_BYTES (5)}, /* other */ | |
54 | COSTS_N_BYTES (3), /* cost of movsx */ | |
55 | COSTS_N_BYTES (3), /* cost of movzx */ | |
56 | 0, /* "large" insn */ | |
57 | 2, /* MOVE_RATIO */ | |
df41dbaf JH |
58 | |
59 | /* All move costs are relative to integer->integer move times 2. */ | |
64766e8d JH |
60 | 2, /* cost for loading QImode using movzbl */ |
61 | {2, 2, 2}, /* cost of loading integer registers | |
62 | in QImode, HImode and SImode. | |
63 | Relative to reg-reg move (2). */ | |
64 | {2, 2, 2}, /* cost of storing integer registers */ | |
65 | 2, /* cost of reg,reg fld/fst */ | |
66 | {2, 2, 2}, /* cost of loading fp registers | |
67 | in SFmode, DFmode and XFmode */ | |
68 | {2, 2, 2}, /* cost of storing fp registers | |
69 | in SFmode, DFmode and XFmode */ | |
70 | 3, /* cost of moving MMX register */ | |
71 | {3, 3}, /* cost of loading MMX registers | |
72 | in SImode and DImode */ | |
73 | {3, 3}, /* cost of storing MMX registers | |
74 | in SImode and DImode */ | |
df41dbaf JH |
75 | 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */ |
76 | {3, 3, 3, 3, 3}, /* cost of loading SSE registers | |
77 | in 32,64,128,256 and 512-bit */ | |
78 | {3, 3, 3, 3, 3}, /* cost of unaligned SSE load | |
79 | in 32,64,128,256 and 512-bit */ | |
80 | {3, 3, 3, 3, 3}, /* cost of storing SSE registers | |
81 | in 32,64,128,256 and 512-bit */ | |
82 | {3, 3, 3, 3, 3}, /* cost of unaligned SSE store | |
83 | in 32,64,128,256 and 512-bit */ | |
84 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
85 | 5, 0, /* Gather load static, per_elt. */ |
86 | 5, 0, /* Gather store static, per_elt. */ | |
64766e8d JH |
87 | 0, /* size of l1 cache */ |
88 | 0, /* size of l2 cache */ | |
89 | 0, /* size of prefetch block */ | |
90 | 0, /* number of parallel prefetches */ | |
91 | 2, /* Branch cost */ | |
92 | COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */ | |
93 | COSTS_N_BYTES (2), /* cost of FMUL instruction. */ | |
94 | COSTS_N_BYTES (2), /* cost of FDIV instruction. */ | |
95 | COSTS_N_BYTES (2), /* cost of FABS instruction. */ | |
96 | COSTS_N_BYTES (2), /* cost of FCHS instruction. */ | |
97 | COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ | |
6065f444 | 98 | |
c53c148c | 99 | COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
100 | COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */ |
101 | COSTS_N_BYTES (2), /* cost of MULSS instruction. */ | |
102 | COSTS_N_BYTES (2), /* cost of MULSD instruction. */ | |
c53c148c JH |
103 | COSTS_N_BYTES (2), /* cost of FMA SS instruction. */ |
104 | COSTS_N_BYTES (2), /* cost of FMA SD instruction. */ | |
6065f444 JH |
105 | COSTS_N_BYTES (2), /* cost of DIVSS instruction. */ |
106 | COSTS_N_BYTES (2), /* cost of DIVSD instruction. */ | |
107 | COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */ | |
108 | COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
109 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
110 | ix86_size_memcpy, | |
111 | ix86_size_memset, | |
f6fd8f2b JH |
112 | COSTS_N_BYTES (1), /* cond_taken_branch_cost. */ |
113 | COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
114 | }; |
115 | ||
116 | /* Processor costs (relative to an add) */ | |
117 | static stringop_algs i386_memcpy[2] = { | |
118 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, | |
119 | DUMMY_STRINGOP_ALGS}; | |
120 | static stringop_algs i386_memset[2] = { | |
121 | {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}, | |
122 | DUMMY_STRINGOP_ALGS}; | |
123 | ||
124 | static const | |
125 | struct processor_costs i386_cost = { /* 386 specific costs */ | |
126 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
127 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
128 | COSTS_N_INSNS (3), /* variable shift costs */ | |
129 | COSTS_N_INSNS (2), /* constant shift costs */ | |
130 | {COSTS_N_INSNS (6), /* cost of starting multiply for QI */ | |
131 | COSTS_N_INSNS (6), /* HI */ | |
132 | COSTS_N_INSNS (6), /* SI */ | |
133 | COSTS_N_INSNS (6), /* DI */ | |
134 | COSTS_N_INSNS (6)}, /* other */ | |
135 | COSTS_N_INSNS (1), /* cost of multiply per each bit set */ | |
136 | {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */ | |
137 | COSTS_N_INSNS (23), /* HI */ | |
138 | COSTS_N_INSNS (23), /* SI */ | |
139 | COSTS_N_INSNS (23), /* DI */ | |
140 | COSTS_N_INSNS (23)}, /* other */ | |
141 | COSTS_N_INSNS (3), /* cost of movsx */ | |
142 | COSTS_N_INSNS (2), /* cost of movzx */ | |
143 | 15, /* "large" insn */ | |
144 | 3, /* MOVE_RATIO */ | |
df41dbaf JH |
145 | |
146 | /* All move costs are relative to integer->integer move times 2 and thus | |
147 | they are latency*2. */ | |
64766e8d JH |
148 | 4, /* cost for loading QImode using movzbl */ |
149 | {2, 4, 2}, /* cost of loading integer registers | |
150 | in QImode, HImode and SImode. | |
151 | Relative to reg-reg move (2). */ | |
152 | {2, 4, 2}, /* cost of storing integer registers */ | |
153 | 2, /* cost of reg,reg fld/fst */ | |
154 | {8, 8, 8}, /* cost of loading fp registers | |
155 | in SFmode, DFmode and XFmode */ | |
156 | {8, 8, 8}, /* cost of storing fp registers | |
157 | in SFmode, DFmode and XFmode */ | |
158 | 2, /* cost of moving MMX register */ | |
159 | {4, 8}, /* cost of loading MMX registers | |
160 | in SImode and DImode */ | |
161 | {4, 8}, /* cost of storing MMX registers | |
162 | in SImode and DImode */ | |
df41dbaf JH |
163 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
164 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
165 | in 32,64,128,256 and 512-bit */ | |
166 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
167 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
168 | in 32,64,128,256 and 512-bit */ | |
169 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
170 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
171 | 4, 4, /* Gather load static, per_elt. */ |
172 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
173 | 0, /* size of l1 cache */ |
174 | 0, /* size of l2 cache */ | |
175 | 0, /* size of prefetch block */ | |
176 | 0, /* number of parallel prefetches */ | |
177 | 1, /* Branch cost */ | |
178 | COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */ | |
179 | COSTS_N_INSNS (27), /* cost of FMUL instruction. */ | |
180 | COSTS_N_INSNS (88), /* cost of FDIV instruction. */ | |
181 | COSTS_N_INSNS (22), /* cost of FABS instruction. */ | |
182 | COSTS_N_INSNS (24), /* cost of FCHS instruction. */ | |
183 | COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ | |
6065f444 | 184 | |
c53c148c | 185 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
186 | COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */ |
187 | COSTS_N_INSNS (27), /* cost of MULSS instruction. */ | |
188 | COSTS_N_INSNS (27), /* cost of MULSD instruction. */ | |
c53c148c JH |
189 | COSTS_N_INSNS (27), /* cost of FMA SS instruction. */ |
190 | COSTS_N_INSNS (27), /* cost of FMA SD instruction. */ | |
6065f444 JH |
191 | COSTS_N_INSNS (88), /* cost of DIVSS instruction. */ |
192 | COSTS_N_INSNS (88), /* cost of DIVSD instruction. */ | |
193 | COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */ | |
194 | COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
195 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
196 | i386_memcpy, | |
197 | i386_memset, | |
f6fd8f2b JH |
198 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
199 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
200 | }; |
201 | ||
202 | static stringop_algs i486_memcpy[2] = { | |
203 | {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, | |
204 | DUMMY_STRINGOP_ALGS}; | |
205 | static stringop_algs i486_memset[2] = { | |
206 | {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}}, | |
207 | DUMMY_STRINGOP_ALGS}; | |
208 | ||
209 | static const | |
210 | struct processor_costs i486_cost = { /* 486 specific costs */ | |
211 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
212 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
213 | COSTS_N_INSNS (3), /* variable shift costs */ | |
214 | COSTS_N_INSNS (2), /* constant shift costs */ | |
215 | {COSTS_N_INSNS (12), /* cost of starting multiply for QI */ | |
216 | COSTS_N_INSNS (12), /* HI */ | |
217 | COSTS_N_INSNS (12), /* SI */ | |
218 | COSTS_N_INSNS (12), /* DI */ | |
219 | COSTS_N_INSNS (12)}, /* other */ | |
220 | 1, /* cost of multiply per each bit set */ | |
221 | {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */ | |
222 | COSTS_N_INSNS (40), /* HI */ | |
223 | COSTS_N_INSNS (40), /* SI */ | |
224 | COSTS_N_INSNS (40), /* DI */ | |
225 | COSTS_N_INSNS (40)}, /* other */ | |
226 | COSTS_N_INSNS (3), /* cost of movsx */ | |
227 | COSTS_N_INSNS (2), /* cost of movzx */ | |
228 | 15, /* "large" insn */ | |
229 | 3, /* MOVE_RATIO */ | |
df41dbaf JH |
230 | |
231 | /* All move costs are relative to integer->integer move times 2 and thus | |
232 | they are latency*2. */ | |
64766e8d JH |
233 | 4, /* cost for loading QImode using movzbl */ |
234 | {2, 4, 2}, /* cost of loading integer registers | |
235 | in QImode, HImode and SImode. | |
236 | Relative to reg-reg move (2). */ | |
237 | {2, 4, 2}, /* cost of storing integer registers */ | |
238 | 2, /* cost of reg,reg fld/fst */ | |
239 | {8, 8, 8}, /* cost of loading fp registers | |
240 | in SFmode, DFmode and XFmode */ | |
241 | {8, 8, 8}, /* cost of storing fp registers | |
242 | in SFmode, DFmode and XFmode */ | |
243 | 2, /* cost of moving MMX register */ | |
244 | {4, 8}, /* cost of loading MMX registers | |
245 | in SImode and DImode */ | |
246 | {4, 8}, /* cost of storing MMX registers | |
247 | in SImode and DImode */ | |
df41dbaf JH |
248 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
249 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
250 | in 32,64,128,256 and 512-bit */ | |
251 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
252 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
253 | in 32,64,128,256 and 512-bit */ | |
254 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
255 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
256 | 4, 4, /* Gather load static, per_elt. */ |
257 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
258 | 4, /* size of l1 cache. 486 has 8kB cache |
259 | shared for code and data, so 4kB is | |
260 | not really precise. */ | |
261 | 4, /* size of l2 cache */ | |
262 | 0, /* size of prefetch block */ | |
263 | 0, /* number of parallel prefetches */ | |
264 | 1, /* Branch cost */ | |
265 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ | |
266 | COSTS_N_INSNS (16), /* cost of FMUL instruction. */ | |
267 | COSTS_N_INSNS (73), /* cost of FDIV instruction. */ | |
268 | COSTS_N_INSNS (3), /* cost of FABS instruction. */ | |
269 | COSTS_N_INSNS (3), /* cost of FCHS instruction. */ | |
270 | COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ | |
6065f444 | 271 | |
c53c148c | 272 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
273 | COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ |
274 | COSTS_N_INSNS (16), /* cost of MULSS instruction. */ | |
275 | COSTS_N_INSNS (16), /* cost of MULSD instruction. */ | |
c53c148c JH |
276 | COSTS_N_INSNS (16), /* cost of FMA SS instruction. */ |
277 | COSTS_N_INSNS (16), /* cost of FMA SD instruction. */ | |
6065f444 JH |
278 | COSTS_N_INSNS (73), /* cost of DIVSS instruction. */ |
279 | COSTS_N_INSNS (74), /* cost of DIVSD instruction. */ | |
280 | COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */ | |
281 | COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
282 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
283 | i486_memcpy, | |
284 | i486_memset, | |
f6fd8f2b JH |
285 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
286 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
287 | }; |
288 | ||
289 | static stringop_algs pentium_memcpy[2] = { | |
290 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
291 | DUMMY_STRINGOP_ALGS}; | |
292 | static stringop_algs pentium_memset[2] = { | |
293 | {libcall, {{-1, rep_prefix_4_byte, false}}}, | |
294 | DUMMY_STRINGOP_ALGS}; | |
295 | ||
296 | static const | |
297 | struct processor_costs pentium_cost = { | |
298 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
299 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
300 | COSTS_N_INSNS (4), /* variable shift costs */ | |
301 | COSTS_N_INSNS (1), /* constant shift costs */ | |
302 | {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ | |
303 | COSTS_N_INSNS (11), /* HI */ | |
304 | COSTS_N_INSNS (11), /* SI */ | |
305 | COSTS_N_INSNS (11), /* DI */ | |
306 | COSTS_N_INSNS (11)}, /* other */ | |
307 | 0, /* cost of multiply per each bit set */ | |
308 | {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ | |
309 | COSTS_N_INSNS (25), /* HI */ | |
310 | COSTS_N_INSNS (25), /* SI */ | |
311 | COSTS_N_INSNS (25), /* DI */ | |
312 | COSTS_N_INSNS (25)}, /* other */ | |
313 | COSTS_N_INSNS (3), /* cost of movsx */ | |
314 | COSTS_N_INSNS (2), /* cost of movzx */ | |
315 | 8, /* "large" insn */ | |
316 | 6, /* MOVE_RATIO */ | |
df41dbaf JH |
317 | |
318 | /* All move costs are relative to integer->integer move times 2 and thus | |
319 | they are latency*2. */ | |
64766e8d JH |
320 | 6, /* cost for loading QImode using movzbl */ |
321 | {2, 4, 2}, /* cost of loading integer registers | |
322 | in QImode, HImode and SImode. | |
323 | Relative to reg-reg move (2). */ | |
324 | {2, 4, 2}, /* cost of storing integer registers */ | |
325 | 2, /* cost of reg,reg fld/fst */ | |
326 | {2, 2, 6}, /* cost of loading fp registers | |
327 | in SFmode, DFmode and XFmode */ | |
328 | {4, 4, 6}, /* cost of storing fp registers | |
329 | in SFmode, DFmode and XFmode */ | |
330 | 8, /* cost of moving MMX register */ | |
331 | {8, 8}, /* cost of loading MMX registers | |
332 | in SImode and DImode */ | |
333 | {8, 8}, /* cost of storing MMX registers | |
334 | in SImode and DImode */ | |
df41dbaf JH |
335 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
336 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
337 | in 32,64,128,256 and 512-bit */ | |
338 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
339 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
340 | in 32,64,128,256 and 512-bit */ | |
341 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
342 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
343 | 4, 4, /* Gather load static, per_elt. */ |
344 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
345 | 8, /* size of l1 cache. */ |
346 | 8, /* size of l2 cache */ | |
347 | 0, /* size of prefetch block */ | |
348 | 0, /* number of parallel prefetches */ | |
349 | 2, /* Branch cost */ | |
350 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ | |
351 | COSTS_N_INSNS (3), /* cost of FMUL instruction. */ | |
352 | COSTS_N_INSNS (39), /* cost of FDIV instruction. */ | |
353 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
354 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
355 | COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ | |
6065f444 | 356 | |
c53c148c | 357 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
358 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
359 | COSTS_N_INSNS (3), /* cost of MULSS instruction. */ | |
360 | COSTS_N_INSNS (3), /* cost of MULSD instruction. */ | |
c53c148c JH |
361 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
362 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
363 | COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ |
364 | COSTS_N_INSNS (39), /* cost of DIVSD instruction. */ | |
365 | COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */ | |
366 | COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
367 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
368 | pentium_memcpy, | |
369 | pentium_memset, | |
f6fd8f2b JH |
370 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
371 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
372 | }; |
373 | ||
374 | static const | |
375 | struct processor_costs lakemont_cost = { | |
376 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
377 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
378 | COSTS_N_INSNS (1), /* variable shift costs */ | |
379 | COSTS_N_INSNS (1), /* constant shift costs */ | |
380 | {COSTS_N_INSNS (11), /* cost of starting multiply for QI */ | |
381 | COSTS_N_INSNS (11), /* HI */ | |
382 | COSTS_N_INSNS (11), /* SI */ | |
383 | COSTS_N_INSNS (11), /* DI */ | |
384 | COSTS_N_INSNS (11)}, /* other */ | |
385 | 0, /* cost of multiply per each bit set */ | |
386 | {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */ | |
387 | COSTS_N_INSNS (25), /* HI */ | |
388 | COSTS_N_INSNS (25), /* SI */ | |
389 | COSTS_N_INSNS (25), /* DI */ | |
390 | COSTS_N_INSNS (25)}, /* other */ | |
391 | COSTS_N_INSNS (3), /* cost of movsx */ | |
392 | COSTS_N_INSNS (2), /* cost of movzx */ | |
393 | 8, /* "large" insn */ | |
394 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
395 | |
396 | /* All move costs are relative to integer->integer move times 2 and thus | |
397 | they are latency*2. */ | |
64766e8d JH |
398 | 6, /* cost for loading QImode using movzbl */ |
399 | {2, 4, 2}, /* cost of loading integer registers | |
400 | in QImode, HImode and SImode. | |
401 | Relative to reg-reg move (2). */ | |
402 | {2, 4, 2}, /* cost of storing integer registers */ | |
403 | 2, /* cost of reg,reg fld/fst */ | |
404 | {2, 2, 6}, /* cost of loading fp registers | |
405 | in SFmode, DFmode and XFmode */ | |
406 | {4, 4, 6}, /* cost of storing fp registers | |
407 | in SFmode, DFmode and XFmode */ | |
408 | 8, /* cost of moving MMX register */ | |
409 | {8, 8}, /* cost of loading MMX registers | |
410 | in SImode and DImode */ | |
411 | {8, 8}, /* cost of storing MMX registers | |
412 | in SImode and DImode */ | |
df41dbaf JH |
413 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
414 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
415 | in 32,64,128,256 and 512-bit */ | |
416 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
417 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
418 | in 32,64,128,256 and 512-bit */ | |
419 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
420 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
421 | 4, 4, /* Gather load static, per_elt. */ |
422 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
423 | 8, /* size of l1 cache. */ |
424 | 8, /* size of l2 cache */ | |
425 | 0, /* size of prefetch block */ | |
426 | 0, /* number of parallel prefetches */ | |
427 | 2, /* Branch cost */ | |
428 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ | |
429 | COSTS_N_INSNS (3), /* cost of FMUL instruction. */ | |
430 | COSTS_N_INSNS (39), /* cost of FDIV instruction. */ | |
431 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
432 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
433 | COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ | |
6065f444 | 434 | |
c53c148c | 435 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
436 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
437 | COSTS_N_INSNS (5), /* cost of MULSS instruction. */ | |
438 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
c53c148c JH |
439 | COSTS_N_INSNS (10), /* cost of FMA SS instruction. */ |
440 | COSTS_N_INSNS (10), /* cost of FMA SD instruction. */ | |
6065f444 JH |
441 | COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ |
442 | COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ | |
443 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ | |
444 | COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
445 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
446 | pentium_memcpy, | |
447 | pentium_memset, | |
f6fd8f2b JH |
448 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
449 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
450 | }; |
451 | ||
452 | /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes |
453 | (we ensure the alignment). For small blocks an inline loop is still a | |
454 | noticeable win; for bigger blocks either rep movsl or rep movsb is the | |
455 | way to go. Rep movsb apparently has a more expensive startup time in the | |
456 | CPU, but after 4K the difference is down in the noise. */ | |
457 | static stringop_algs pentiumpro_memcpy[2] = { | |
458 | {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false}, | |
459 | {8192, rep_prefix_4_byte, false}, | |
460 | {-1, rep_prefix_1_byte, false}}}, | |
461 | DUMMY_STRINGOP_ALGS}; | |
462 | static stringop_algs pentiumpro_memset[2] = { | |
463 | {rep_prefix_4_byte, {{1024, unrolled_loop, false}, | |
464 | {8192, rep_prefix_4_byte, false}, | |
465 | {-1, libcall, false}}}, | |
466 | DUMMY_STRINGOP_ALGS}; | |
467 | static const | |
468 | struct processor_costs pentiumpro_cost = { | |
469 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
470 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
471 | COSTS_N_INSNS (1), /* variable shift costs */ | |
472 | COSTS_N_INSNS (1), /* constant shift costs */ | |
473 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ | |
474 | COSTS_N_INSNS (4), /* HI */ | |
475 | COSTS_N_INSNS (4), /* SI */ | |
476 | COSTS_N_INSNS (4), /* DI */ | |
477 | COSTS_N_INSNS (4)}, /* other */ | |
478 | 0, /* cost of multiply per each bit set */ | |
479 | {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */ | |
480 | COSTS_N_INSNS (17), /* HI */ | |
481 | COSTS_N_INSNS (17), /* SI */ | |
482 | COSTS_N_INSNS (17), /* DI */ | |
483 | COSTS_N_INSNS (17)}, /* other */ | |
484 | COSTS_N_INSNS (1), /* cost of movsx */ | |
485 | COSTS_N_INSNS (1), /* cost of movzx */ | |
486 | 8, /* "large" insn */ | |
487 | 6, /* MOVE_RATIO */ | |
df41dbaf JH |
488 | |
489 | /* All move costs are relative to integer->integer move times 2 and thus | |
490 | they are latency*2. */ | |
64766e8d JH |
491 | 2, /* cost for loading QImode using movzbl */ |
492 | {4, 4, 4}, /* cost of loading integer registers | |
493 | in QImode, HImode and SImode. | |
494 | Relative to reg-reg move (2). */ | |
495 | {2, 2, 2}, /* cost of storing integer registers */ | |
496 | 2, /* cost of reg,reg fld/fst */ | |
497 | {2, 2, 6}, /* cost of loading fp registers | |
498 | in SFmode, DFmode and XFmode */ | |
499 | {4, 4, 6}, /* cost of storing fp registers | |
500 | in SFmode, DFmode and XFmode */ | |
501 | 2, /* cost of moving MMX register */ | |
502 | {2, 2}, /* cost of loading MMX registers | |
503 | in SImode and DImode */ | |
504 | {2, 2}, /* cost of storing MMX registers | |
505 | in SImode and DImode */ | |
df41dbaf JH |
506 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
507 | {4, 8, 16, 32, 64}, /* cost of loading SSE registers | |
508 | in 32,64,128,256 and 512-bit */ | |
509 | {4, 8, 16, 32, 64}, /* cost of unaligned loads. */ | |
510 | {4, 8, 16, 32, 64}, /* cost of storing SSE registers | |
511 | in 32,64,128,256 and 512-bit */ | |
512 | {4, 8, 16, 32, 64}, /* cost of unaligned stores. */ | |
513 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
514 | 4, 4, /* Gather load static, per_elt. */ |
515 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
516 | 8, /* size of l1 cache. */ |
517 | 256, /* size of l2 cache */ | |
518 | 32, /* size of prefetch block */ | |
519 | 6, /* number of parallel prefetches */ | |
520 | 2, /* Branch cost */ | |
521 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ | |
522 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ | |
523 | COSTS_N_INSNS (56), /* cost of FDIV instruction. */ | |
524 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
525 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
526 | COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ | |
6065f444 | 527 | |
c53c148c | 528 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
529 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
530 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
531 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
532 | COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ |
533 | COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ | |
6065f444 JH |
534 | COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ |
535 | COSTS_N_INSNS (18), /* cost of DIVSD instruction. */ | |
536 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ | |
537 | COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
538 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
539 | pentiumpro_memcpy, | |
540 | pentiumpro_memset, | |
f6fd8f2b JH |
541 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
542 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
543 | }; |
544 | ||
545 | static stringop_algs geode_memcpy[2] = { | |
546 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
547 | DUMMY_STRINGOP_ALGS}; | |
548 | static stringop_algs geode_memset[2] = { | |
549 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
550 | DUMMY_STRINGOP_ALGS}; | |
551 | static const | |
552 | struct processor_costs geode_cost = { | |
553 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
554 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
555 | COSTS_N_INSNS (2), /* variable shift costs */ | |
556 | COSTS_N_INSNS (1), /* constant shift costs */ | |
557 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
558 | COSTS_N_INSNS (4), /* HI */ | |
559 | COSTS_N_INSNS (7), /* SI */ | |
560 | COSTS_N_INSNS (7), /* DI */ | |
561 | COSTS_N_INSNS (7)}, /* other */ | |
562 | 0, /* cost of multiply per each bit set */ | |
563 | {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */ | |
564 | COSTS_N_INSNS (23), /* HI */ | |
565 | COSTS_N_INSNS (39), /* SI */ | |
566 | COSTS_N_INSNS (39), /* DI */ | |
567 | COSTS_N_INSNS (39)}, /* other */ | |
568 | COSTS_N_INSNS (1), /* cost of movsx */ | |
569 | COSTS_N_INSNS (1), /* cost of movzx */ | |
570 | 8, /* "large" insn */ | |
571 | 4, /* MOVE_RATIO */ | |
df41dbaf JH |
572 | |
573 | /* All move costs are relative to integer->integer move times 2 and thus | |
574 | they are latency*2. */ | |
575 | 2, /* cost for loading QImode using movzbl */ | |
576 | {2, 2, 2}, /* cost of loading integer registers | |
64766e8d JH |
577 | in QImode, HImode and SImode. |
578 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
579 | {2, 2, 2}, /* cost of storing integer registers */ |
580 | 2, /* cost of reg,reg fld/fst */ | |
581 | {2, 2, 2}, /* cost of loading fp registers | |
64766e8d JH |
582 | in SFmode, DFmode and XFmode */ |
583 | {4, 6, 6}, /* cost of storing fp registers | |
584 | in SFmode, DFmode and XFmode */ | |
585 | ||
586 | 2, /* cost of moving MMX register */ | |
587 | {2, 2}, /* cost of loading MMX registers | |
588 | in SImode and DImode */ | |
589 | {2, 2}, /* cost of storing MMX registers | |
590 | in SImode and DImode */ | |
df41dbaf JH |
591 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
592 | {2, 2, 8, 16, 32}, /* cost of loading SSE registers | |
593 | in 32,64,128,256 and 512-bit */ | |
594 | {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ | |
595 | {2, 2, 8, 16, 32}, /* cost of storing SSE registers | |
596 | in 32,64,128,256 and 512-bit */ | |
597 | {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ | |
598 | 6, 6, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
599 | 2, 2, /* Gather load static, per_elt. */ |
600 | 2, 2, /* Gather store static, per_elt. */ | |
64766e8d JH |
601 | 64, /* size of l1 cache. */ |
602 | 128, /* size of l2 cache. */ | |
603 | 32, /* size of prefetch block */ | |
604 | 1, /* number of parallel prefetches */ | |
605 | 1, /* Branch cost */ | |
606 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
607 | COSTS_N_INSNS (11), /* cost of FMUL instruction. */ | |
608 | COSTS_N_INSNS (47), /* cost of FDIV instruction. */ | |
609 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
610 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
611 | COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ | |
6065f444 | 612 | |
c53c148c | 613 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
614 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
615 | COSTS_N_INSNS (11), /* cost of MULSS instruction. */ | |
616 | COSTS_N_INSNS (11), /* cost of MULSD instruction. */ | |
c53c148c JH |
617 | COSTS_N_INSNS (17), /* cost of FMA SS instruction. */ |
618 | COSTS_N_INSNS (17), /* cost of FMA SD instruction. */ | |
6065f444 JH |
619 | COSTS_N_INSNS (47), /* cost of DIVSS instruction. */ |
620 | COSTS_N_INSNS (47), /* cost of DIVSD instruction. */ | |
621 | COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */ | |
622 | COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
623 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
624 | geode_memcpy, | |
625 | geode_memset, | |
f6fd8f2b JH |
626 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
627 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
628 | }; |
629 | ||
630 | static stringop_algs k6_memcpy[2] = { /* memcpy strategies used by k6_cost; entry [1] is the libcall-only dummy. */ | |
631 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
632 | DUMMY_STRINGOP_ALGS}; | |
633 | static stringop_algs k6_memset[2] = { /* memset strategies used by k6_cost; entry [1] is the libcall-only dummy. */ | |
634 | {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
635 | DUMMY_STRINGOP_ALGS}; | |
636 | static const | |
637 | struct processor_costs k6_cost = { /* costs for tuning for k6 */ | |
638 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
639 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
640 | COSTS_N_INSNS (1), /* variable shift costs */ | |
641 | COSTS_N_INSNS (1), /* constant shift costs */ | |
642 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
643 | COSTS_N_INSNS (3), /* HI */ | |
644 | COSTS_N_INSNS (3), /* SI */ | |
645 | COSTS_N_INSNS (3), /* DI */ | |
646 | COSTS_N_INSNS (3)}, /* other */ | |
647 | 0, /* cost of multiply per each bit set */ | |
648 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
649 | COSTS_N_INSNS (18), /* HI */ | |
650 | COSTS_N_INSNS (18), /* SI */ | |
651 | COSTS_N_INSNS (18), /* DI */ | |
652 | COSTS_N_INSNS (18)}, /* other */ | |
653 | COSTS_N_INSNS (2), /* cost of movsx */ | |
654 | COSTS_N_INSNS (2), /* cost of movzx */ | |
655 | 8, /* "large" insn */ | |
656 | 4, /* MOVE_RATIO */ | |
df41dbaf JH |
657 | |
658 | /* All move costs are relative to integer->integer move times 2 and thus | |
659 | they are latency*2. */ | |
64766e8d JH |
660 | 3, /* cost for loading QImode using movzbl */ |
661 | {4, 5, 4}, /* cost of loading integer registers | |
662 | in QImode, HImode and SImode. | |
663 | Relative to reg-reg move (2). */ | |
664 | {2, 3, 2}, /* cost of storing integer registers */ | |
665 | 4, /* cost of reg,reg fld/fst */ | |
666 | {6, 6, 6}, /* cost of loading fp registers | |
667 | in SFmode, DFmode and XFmode */ | |
668 | {4, 4, 4}, /* cost of storing fp registers | |
669 | in SFmode, DFmode and XFmode */ | |
670 | 2, /* cost of moving MMX register */ | |
671 | {2, 2}, /* cost of loading MMX registers | |
672 | in SImode and DImode */ | |
673 | {2, 2}, /* cost of storing MMX registers | |
674 | in SImode and DImode */ | |
df41dbaf JH |
675 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
676 | {2, 2, 8, 16, 32}, /* cost of loading SSE registers | |
677 | in 32,64,128,256 and 512-bit */ | |
678 | {2, 2, 8, 16, 32}, /* cost of unaligned loads. */ | |
679 | {2, 2, 8, 16, 32}, /* cost of storing SSE registers | |
680 | in 32,64,128,256 and 512-bit */ | |
681 | {2, 2, 8, 16, 32}, /* cost of unaligned stores. */ | |
682 | 6, 6, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
683 | 2, 2, /* Gather load static, per_elt. */ |
684 | 2, 2, /* Gather store static, per_elt. */ | |
64766e8d JH |
685 | 32, /* size of l1 cache. */ |
686 | 32, /* size of l2 cache. Some models | |
687 | have integrated l2 cache, but | |
688 | optimizing for k6 is not important | |
689 | enough to worry about that. */ | |
690 | 32, /* size of prefetch block */ | |
691 | 1, /* number of parallel prefetches */ | |
692 | 1, /* Branch cost */ | |
693 | COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */ | |
694 | COSTS_N_INSNS (2), /* cost of FMUL instruction. */ | |
695 | COSTS_N_INSNS (56), /* cost of FDIV instruction. */ | |
696 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
697 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
698 | COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ | |
6065f444 | 699 | |
c53c148c | 700 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
701 | COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ |
702 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ | |
703 | COSTS_N_INSNS (2), /* cost of MULSD instruction. */ | |
c53c148c JH |
704 | COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ |
705 | COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ | |
6065f444 JH |
706 | COSTS_N_INSNS (56), /* cost of DIVSS instruction. */ |
707 | COSTS_N_INSNS (56), /* cost of DIVSD instruction. */ | |
708 | COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */ | |
709 | COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
710 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
711 | k6_memcpy, | |
712 | k6_memset, | |
f6fd8f2b JH |
713 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
714 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
715 | }; |
716 | ||
717 | /* For some reason, Athlon deals better with REP prefix (relative to loops) | |
718 | compared to K8. Alignment becomes important after 8 bytes for memcpy and | |
719 | 128 bytes for memset. */ | |
720 | static stringop_algs athlon_memcpy[2] = { /* memcpy strategies used by athlon_cost; entry [1] is the libcall-only dummy. */ | |
721 | {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
722 | DUMMY_STRINGOP_ALGS}; | |
723 | static stringop_algs athlon_memset[2] = { /* memset strategies used by athlon_cost; entry [1] is the libcall-only dummy. */ | |
724 | {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
725 | DUMMY_STRINGOP_ALGS}; | |
726 | static const | |
727 | struct processor_costs athlon_cost = { /* costs for tuning for athlon */ | |
728 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
729 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
730 | COSTS_N_INSNS (1), /* variable shift costs */ | |
731 | COSTS_N_INSNS (1), /* constant shift costs */ | |
732 | {COSTS_N_INSNS (5), /* cost of starting multiply for QI */ | |
733 | COSTS_N_INSNS (5), /* HI */ | |
734 | COSTS_N_INSNS (5), /* SI */ | |
735 | COSTS_N_INSNS (5), /* DI */ | |
736 | COSTS_N_INSNS (5)}, /* other */ | |
737 | 0, /* cost of multiply per each bit set */ | |
738 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
739 | COSTS_N_INSNS (26), /* HI */ | |
740 | COSTS_N_INSNS (42), /* SI */ | |
741 | COSTS_N_INSNS (74), /* DI */ | |
742 | COSTS_N_INSNS (74)}, /* other */ | |
743 | COSTS_N_INSNS (1), /* cost of movsx */ | |
744 | COSTS_N_INSNS (1), /* cost of movzx */ | |
745 | 8, /* "large" insn */ | |
746 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
747 | |
748 | /* All move costs are relative to integer->integer move times 2 and thus | |
749 | they are latency*2. */ | |
64766e8d JH |
750 | 4, /* cost for loading QImode using movzbl */ |
751 | {3, 4, 3}, /* cost of loading integer registers | |
752 | in QImode, HImode and SImode. | |
753 | Relative to reg-reg move (2). */ | |
754 | {3, 4, 3}, /* cost of storing integer registers */ | |
755 | 4, /* cost of reg,reg fld/fst */ | |
756 | {4, 4, 12}, /* cost of loading fp registers | |
757 | in SFmode, DFmode and XFmode */ | |
758 | {6, 6, 8}, /* cost of storing fp registers | |
759 | in SFmode, DFmode and XFmode */ | |
760 | 2, /* cost of moving MMX register */ | |
761 | {4, 4}, /* cost of loading MMX registers | |
762 | in SImode and DImode */ | |
763 | {4, 4}, /* cost of storing MMX registers | |
764 | in SImode and DImode */ | |
df41dbaf JH |
765 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
766 | {4, 4, 6, 12, 24}, /* cost of loading SSE registers | |
767 | in 32,64,128,256 and 512-bit */ | |
768 | {4, 4, 6, 12, 24}, /* cost of unaligned loads. */ | |
769 | {4, 4, 5, 10, 20}, /* cost of storing SSE registers | |
770 | in 32,64,128,256 and 512-bit */ | |
771 | {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ | |
772 | 5, 5, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
773 | 4, 4, /* Gather load static, per_elt. */ |
774 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
775 | 64, /* size of l1 cache. */ |
776 | 256, /* size of l2 cache. */ | |
777 | 64, /* size of prefetch block */ | |
778 | 6, /* number of parallel prefetches */ | |
779 | 5, /* Branch cost */ | |
780 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
781 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
782 | COSTS_N_INSNS (24), /* cost of FDIV instruction. */ | |
783 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
784 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
785 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 786 | |
c53c148c | 787 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
788 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
789 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
790 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
791 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
792 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ | |
6065f444 JH |
793 | /* 11-16; the upper bound of the range is the DIVSS cost used below. */ |
794 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ | |
795 | COSTS_N_INSNS (24), /* cost of DIVSD instruction. */ | |
796 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ | |
797 | COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
798 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
799 | athlon_memcpy, | |
800 | athlon_memset, | |
f6fd8f2b JH |
801 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
802 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
803 | }; |
804 | ||
805 | /* K8 has an optimized REP instruction for medium-sized blocks, but for very | |
806 | small blocks it is better to use a loop. For large blocks, libcall can | |
807 | do non-temporal accesses and beat inline code considerably. */ | |
808 | static stringop_algs k8_memcpy[2] = { /* memcpy strategies used by k8_cost; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
809 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
810 | {-1, rep_prefix_4_byte, false}}}, | |
811 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
812 | {-1, libcall, false}}}}; | |
813 | static stringop_algs k8_memset[2] = { /* memset strategies used by k8_cost; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
814 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
815 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
816 | {libcall, {{48, unrolled_loop, false}, | |
817 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
818 | static const | |
819 | struct processor_costs k8_cost = { /* costs for tuning for k8 */ | |
820 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
821 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
822 | COSTS_N_INSNS (1), /* variable shift costs */ | |
823 | COSTS_N_INSNS (1), /* constant shift costs */ | |
824 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
825 | COSTS_N_INSNS (4), /* HI */ | |
826 | COSTS_N_INSNS (3), /* SI */ | |
827 | COSTS_N_INSNS (4), /* DI */ | |
828 | COSTS_N_INSNS (5)}, /* other */ | |
829 | 0, /* cost of multiply per each bit set */ | |
830 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
831 | COSTS_N_INSNS (26), /* HI */ | |
832 | COSTS_N_INSNS (42), /* SI */ | |
833 | COSTS_N_INSNS (74), /* DI */ | |
834 | COSTS_N_INSNS (74)}, /* other */ | |
835 | COSTS_N_INSNS (1), /* cost of movsx */ | |
836 | COSTS_N_INSNS (1), /* cost of movzx */ | |
837 | 8, /* "large" insn */ | |
838 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
839 | |
840 | /* All move costs are relative to integer->integer move times 2 and thus | |
841 | they are latency*2. */ | |
64766e8d JH |
842 | 4, /* cost for loading QImode using movzbl */ |
843 | {3, 4, 3}, /* cost of loading integer registers | |
844 | in QImode, HImode and SImode. | |
845 | Relative to reg-reg move (2). */ | |
846 | {3, 4, 3}, /* cost of storing integer registers */ | |
847 | 4, /* cost of reg,reg fld/fst */ | |
848 | {4, 4, 12}, /* cost of loading fp registers | |
849 | in SFmode, DFmode and XFmode */ | |
850 | {6, 6, 8}, /* cost of storing fp registers | |
851 | in SFmode, DFmode and XFmode */ | |
852 | 2, /* cost of moving MMX register */ | |
853 | {3, 3}, /* cost of loading MMX registers | |
854 | in SImode and DImode */ | |
855 | {4, 4}, /* cost of storing MMX registers | |
856 | in SImode and DImode */ | |
df41dbaf JH |
857 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
858 | {4, 3, 6, 12, 24}, /* cost of loading SSE registers | |
859 | in 32,64,128,256 and 512-bit */ | |
860 | {4, 3, 6, 12, 24}, /* cost of unaligned loads. */ | |
861 | {4, 4, 5, 10, 20}, /* cost of storing SSE registers | |
862 | in 32,64,128,256 and 512-bit */ | |
863 | {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ | |
864 | 5, 5, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
865 | 4, 4, /* Gather load static, per_elt. */ |
866 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
867 | 64, /* size of l1 cache. */ |
868 | 512, /* size of l2 cache. */ | |
869 | 64, /* size of prefetch block */ | |
870 | /* New AMD processors never drop prefetches; if they cannot be performed | |
871 | immediately, they are queued. We set the number of simultaneous | |
872 | prefetches to a large constant to reflect this (leaving the number of | |
873 | prefetches completely unlimited is probably not a good idea, as their | |
874 | execution also takes some time). */ | |
875 | 100, /* number of parallel prefetches */ | |
876 | 3, /* Branch cost */ | |
877 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
878 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
879 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ | |
880 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
881 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
882 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 883 | |
c53c148c | 884 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
885 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
886 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
887 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
888 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
889 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ | |
6065f444 JH |
890 | /* 11-16; the upper bound of the range is the DIVSS cost used below. */ |
891 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ | |
892 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ | |
893 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ | |
894 | COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
895 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
896 | k8_memcpy, | |
897 | k8_memset, | |
f6fd8f2b JH |
898 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
899 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
900 | }; |
901 | ||
902 | /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for | |
903 | very small blocks it is better to use a loop. For large blocks, libcall can | |
904 | do non-temporal accesses and beat inline code considerably. */ | |
905 | static stringop_algs amdfam10_memcpy[2] = { /* memcpy strategies used by amdfam10_cost; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
906 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
907 | {-1, rep_prefix_4_byte, false}}}, | |
908 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
909 | {-1, libcall, false}}}}; | |
910 | static stringop_algs amdfam10_memset[2] = { /* memset strategies used by amdfam10_cost; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
911 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
912 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
913 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
914 | {-1, libcall, false}}}}; | |
915 | struct processor_costs amdfam10_cost = { /* costs for tuning for amdfam10. NOTE(review): unlike k8_cost this table is neither static nor const -- confirm that is intended. */ | |
916 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
917 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
918 | COSTS_N_INSNS (1), /* variable shift costs */ | |
919 | COSTS_N_INSNS (1), /* constant shift costs */ | |
920 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
921 | COSTS_N_INSNS (4), /* HI */ | |
922 | COSTS_N_INSNS (3), /* SI */ | |
923 | COSTS_N_INSNS (4), /* DI */ | |
924 | COSTS_N_INSNS (5)}, /* other */ | |
925 | 0, /* cost of multiply per each bit set */ | |
926 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
927 | COSTS_N_INSNS (35), /* HI */ | |
928 | COSTS_N_INSNS (51), /* SI */ | |
929 | COSTS_N_INSNS (83), /* DI */ | |
930 | COSTS_N_INSNS (83)}, /* other */ | |
931 | COSTS_N_INSNS (1), /* cost of movsx */ | |
932 | COSTS_N_INSNS (1), /* cost of movzx */ | |
933 | 8, /* "large" insn */ | |
934 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
935 | |
936 | /* All move costs are relative to integer->integer move times 2 and thus | |
937 | they are latency*2. */ | |
64766e8d JH |
938 | 4, /* cost for loading QImode using movzbl */ |
939 | {3, 4, 3}, /* cost of loading integer registers | |
940 | in QImode, HImode and SImode. | |
941 | Relative to reg-reg move (2). */ | |
942 | {3, 4, 3}, /* cost of storing integer registers */ | |
943 | 4, /* cost of reg,reg fld/fst */ | |
944 | {4, 4, 12}, /* cost of loading fp registers | |
945 | in SFmode, DFmode and XFmode */ | |
946 | {6, 6, 8}, /* cost of storing fp registers | |
947 | in SFmode, DFmode and XFmode */ | |
948 | 2, /* cost of moving MMX register */ | |
949 | {3, 3}, /* cost of loading MMX registers | |
950 | in SImode and DImode */ | |
951 | {4, 4}, /* cost of storing MMX registers | |
952 | in SImode and DImode */ | |
df41dbaf JH |
953 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
954 | {4, 4, 3, 6, 12}, /* cost of loading SSE registers | |
955 | in 32,64,128,256 and 512-bit */ | |
956 | {4, 4, 3, 7, 12}, /* cost of unaligned loads. */ | |
957 | {4, 4, 5, 10, 20}, /* cost of storing SSE registers | |
958 | in 32,64,128,256 and 512-bit */ | |
959 | {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ | |
960 | 3, 3, /* SSE->integer and integer->SSE moves */ | |
64766e8d JH |
961 | /* On K8: |
962 | MOVD reg64, xmmreg Double FSTORE 4 | |
963 | MOVD reg32, xmmreg Double FSTORE 4 | |
964 | On AMDFAM10: | |
965 | MOVD reg64, xmmreg Double FADD 3 | |
966 | 1/1 1/1 | |
967 | MOVD reg32, xmmreg Double FADD 3 | |
968 | 1/1 1/1 */ | |
a4fe6139 JH |
969 | 4, 4, /* Gather load static, per_elt. */ |
970 | 4, 4, /* Gather store static, per_elt. */ | |
64766e8d JH |
971 | 64, /* size of l1 cache. */ |
972 | 512, /* size of l2 cache. */ | |
973 | 64, /* size of prefetch block */ | |
974 | /* New AMD processors never drop prefetches; if they cannot be performed | |
975 | immediately, they are queued. We set the number of simultaneous | |
976 | prefetches to a large constant to reflect this (leaving the number of | |
977 | prefetches completely unlimited is probably not a good idea, as their | |
978 | execution also takes some time). */ | |
979 | 100, /* number of parallel prefetches */ | |
980 | 2, /* Branch cost */ | |
981 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
982 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
983 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ | |
984 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
985 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
986 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 987 | |
c53c148c | 988 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
989 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
990 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
991 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
992 | COSTS_N_INSNS (8), /* cost of FMA SS instruction. */ |
993 | COSTS_N_INSNS (8), /* cost of FMA SD instruction. */ | |
6065f444 JH |
994 | /* 11-16; the upper bound of the range is the DIVSS cost used below. */ |
995 | COSTS_N_INSNS (16), /* cost of DIVSS instruction. */ | |
996 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ | |
997 | COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */ | |
998 | COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
999 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1000 | amdfam10_memcpy, | |
1001 | amdfam10_memset, | |
f6fd8f2b JH |
1002 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1003 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1004 | }; |
1005 | ||
1006 | /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for | |
1007 | very small blocks it is better to use a loop. For large blocks, libcall | |
1008 | can do non-temporal accesses and beat inline code considerably. */ | |
1009 | static stringop_algs bdver1_memcpy[2] = { /* memcpy strategies used by bdver1_cost; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
1010 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1011 | {-1, rep_prefix_4_byte, false}}}, | |
1012 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1013 | {-1, libcall, false}}}}; | |
1014 | static stringop_algs bdver1_memset[2] = { /* memset strategies used by bdver1_cost; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
1015 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1016 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1017 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1018 | {-1, libcall, false}}}}; | |
1019 | ||
1020 | const struct processor_costs bdver1_cost = { /* costs for tuning for bdver1 */ | |
1021 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1022 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
1023 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1024 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1025 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ | |
1026 | COSTS_N_INSNS (4), /* HI */ | |
1027 | COSTS_N_INSNS (4), /* SI */ | |
1028 | COSTS_N_INSNS (6), /* DI */ | |
1029 | COSTS_N_INSNS (6)}, /* other */ | |
1030 | 0, /* cost of multiply per each bit set */ | |
1031 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1032 | COSTS_N_INSNS (35), /* HI */ | |
1033 | COSTS_N_INSNS (51), /* SI */ | |
1034 | COSTS_N_INSNS (83), /* DI */ | |
1035 | COSTS_N_INSNS (83)}, /* other */ | |
1036 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1037 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1038 | 8, /* "large" insn */ | |
1039 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1040 | |
1041 | /* All move costs are relative to integer->integer move times 2 and thus | |
1042 | they are latency*2. */ | |
1043 | 8, /* cost for loading QImode using movzbl */ | |
1044 | {8, 8, 8}, /* cost of loading integer registers | |
64766e8d JH |
1045 | in QImode, HImode and SImode. |
1046 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
1047 | {8, 8, 8}, /* cost of storing integer registers */ |
1048 | 4, /* cost of reg,reg fld/fst */ | |
1049 | {12, 12, 28}, /* cost of loading fp registers | |
64766e8d | 1050 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1051 | {10, 10, 18}, /* cost of storing fp registers |
64766e8d | 1052 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1053 | 4, /* cost of moving MMX register */ |
1054 | {12, 12}, /* cost of loading MMX registers | |
64766e8d | 1055 | in SImode and DImode */ |
df41dbaf | 1056 | {10, 10}, /* cost of storing MMX registers |
64766e8d | 1057 | in SImode and DImode */ |
df41dbaf JH |
1058 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1059 | {12, 12, 10, 20, 30}, /* cost of loading SSE registers | |
1060 | in 32,64,128,256 and 512-bit */ | |
1061 | {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ | |
1062 | {10, 10, 10, 20, 30}, /* cost of storing SSE registers | |
1063 | in 32,64,128,256 and 512-bit */ | |
1064 | {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ | |
1065 | 16, 20, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1066 | 12, 12, /* Gather load static, per_elt. */ |
1067 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1068 | 16, /* size of l1 cache. */ |
1069 | 2048, /* size of l2 cache. */ | |
1070 | 64, /* size of prefetch block */ | |
1071 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1072 | immediately, they are queued. We set the number of simultaneous | |
1073 | prefetches to a large constant to reflect this (leaving the number of | |
1074 | prefetches completely unlimited is probably not a good idea, as their | |
1075 | execution also takes some time). */ | |
1076 | 100, /* number of parallel prefetches */ | |
1077 | 2, /* Branch cost */ | |
1078 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
1079 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ | |
1080 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ | |
1081 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1082 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1083 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ | |
6065f444 | 1084 | |
c53c148c | 1085 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1086 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1087 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ | |
1088 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ | |
c53c148c JH |
1089 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1090 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1091 | /* 9-24; the upper bound of the range is the DIVSS cost used below. */ |
1092 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ | |
1093 | /* 9-27; the upper bound of the range is the DIVSD cost used below. */ | |
1094 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ | |
1095 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ | |
1096 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1097 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1098 | bdver1_memcpy, | |
1099 | bdver1_memset, | |
f6fd8f2b JH |
1100 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1101 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1102 | }; |
1103 | ||
1104 | /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for | |
1105 | very small blocks it is better to use a loop. For large blocks, libcall | |
1106 | can do non-temporal accesses and beat inline code considerably. */ | |
1107 | ||
1108 | static stringop_algs bdver2_memcpy[2] = { /* memcpy strategies used by bdver2_cost; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
1109 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1110 | {-1, rep_prefix_4_byte, false}}}, | |
1111 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1112 | {-1, libcall, false}}}}; | |
1113 | static stringop_algs bdver2_memset[2] = { /* memset strategies used by bdver2_cost; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
1114 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1115 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1116 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1117 | {-1, libcall, false}}}}; | |
1118 | ||
1119 | const struct processor_costs bdver2_cost = { /* costs for tuning for bdver2 */ | |
1120 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1121 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
1122 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1123 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1124 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ | |
1125 | COSTS_N_INSNS (4), /* HI */ | |
1126 | COSTS_N_INSNS (4), /* SI */ | |
1127 | COSTS_N_INSNS (6), /* DI */ | |
1128 | COSTS_N_INSNS (6)}, /* other */ | |
1129 | 0, /* cost of multiply per each bit set */ | |
1130 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1131 | COSTS_N_INSNS (35), /* HI */ | |
1132 | COSTS_N_INSNS (51), /* SI */ | |
1133 | COSTS_N_INSNS (83), /* DI */ | |
1134 | COSTS_N_INSNS (83)}, /* other */ | |
1135 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1136 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1137 | 8, /* "large" insn */ | |
1138 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1139 | |
1140 | /* All move costs are relative to integer->integer move times 2 and thus | |
1141 | they are latency*2. */ | |
1142 | 8, /* cost for loading QImode using movzbl */ | |
1143 | {8, 8, 8}, /* cost of loading integer registers | |
64766e8d JH |
1144 | in QImode, HImode and SImode. |
1145 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
1146 | {8, 8, 8}, /* cost of storing integer registers */ |
1147 | 4, /* cost of reg,reg fld/fst */ | |
1148 | {12, 12, 28}, /* cost of loading fp registers | |
64766e8d | 1149 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1150 | {10, 10, 18}, /* cost of storing fp registers |
64766e8d | 1151 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1152 | 4, /* cost of moving MMX register */ |
1153 | {12, 12}, /* cost of loading MMX registers | |
64766e8d | 1154 | in SImode and DImode */ |
df41dbaf | 1155 | {10, 10}, /* cost of storing MMX registers |
64766e8d | 1156 | in SImode and DImode */ |
df41dbaf JH |
1157 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1158 | {12, 12, 10, 20, 30}, /* cost of loading SSE registers | |
1159 | in 32,64,128,256 and 512-bit */ | |
1160 | {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ | |
1161 | {10, 10, 10, 20, 30}, /* cost of storing SSE registers | |
1162 | in 32,64,128,256 and 512-bit */ | |
1163 | {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ | |
1164 | 16, 20, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1165 | 12, 12, /* Gather load static, per_elt. */ |
1166 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1167 | 16, /* size of l1 cache. */ |
1168 | 2048, /* size of l2 cache. */ | |
1169 | 64, /* size of prefetch block */ | |
1170 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1171 | immediately, they are queued. We set the number of simultaneous | |
1172 | prefetches to a large constant to reflect this (leaving the number of | |
1173 | prefetches completely unlimited is probably not a good idea, as their | |
1174 | execution also takes some time). */ | |
1175 | 100, /* number of parallel prefetches */ | |
1176 | 2, /* Branch cost */ | |
1177 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
1178 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ | |
1179 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ | |
1180 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1181 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1182 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ | |
6065f444 | 1183 | |
c53c148c | 1184 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1185 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1186 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ | |
1187 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ | |
c53c148c JH |
1188 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1189 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1190 | /* 9-24; the upper bound of the range is the DIVSS cost used below. */ |
1191 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ | |
1192 | /* 9-27; the upper bound of the range is the DIVSD cost used below. */ | |
1193 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ | |
1194 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ | |
1195 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1196 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1197 | bdver2_memcpy, | |
1198 | bdver2_memset, | |
f6fd8f2b JH |
1199 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1200 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1201 | }; |
1202 | ||
1203 | ||
1204 | /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for | |
1205 | very small blocks it is better to use a loop. For large blocks, libcall | |
1206 | can do non-temporal accesses and beat inline code considerably. */ | |
1207 | static stringop_algs bdver3_memcpy[2] = { /* memcpy strategy table for bdver3; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
1208 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1209 | {-1, rep_prefix_4_byte, false}}}, | |
1210 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1211 | {-1, libcall, false}}}}; | |
1212 | static stringop_algs bdver3_memset[2] = { /* memset strategy table for bdver3; entry [0] uses rep_prefix_4_byte, entry [1] rep_prefix_8_byte. */ | |
1213 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1214 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1215 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1216 | {-1, libcall, false}}}}; | |
1217 | struct processor_costs bdver3_cost = { | |
1218 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1219 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
1220 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1221 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1222 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ | |
1223 | COSTS_N_INSNS (4), /* HI */ | |
1224 | COSTS_N_INSNS (4), /* SI */ | |
1225 | COSTS_N_INSNS (6), /* DI */ | |
1226 | COSTS_N_INSNS (6)}, /* other */ | |
1227 | 0, /* cost of multiply per each bit set */ | |
1228 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1229 | COSTS_N_INSNS (35), /* HI */ | |
1230 | COSTS_N_INSNS (51), /* SI */ | |
1231 | COSTS_N_INSNS (83), /* DI */ | |
1232 | COSTS_N_INSNS (83)}, /* other */ | |
1233 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1234 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1235 | 8, /* "large" insn */ | |
1236 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1237 | |
1238 | /* All move costs are relative to integer->integer move times 2 and thus | |
1239 | they are latency*2. */ | |
1240 | 8, /* cost for loading QImode using movzbl */ | |
1241 | {8, 8, 8}, /* cost of loading integer registers | |
64766e8d JH |
1242 | in QImode, HImode and SImode. |
1243 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
1244 | {8, 8, 8}, /* cost of storing integer registers */ |
1245 | 4, /* cost of reg,reg fld/fst */ | |
1246 | {12, 12, 28}, /* cost of loading fp registers | |
64766e8d | 1247 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1248 | {10, 10, 18}, /* cost of storing fp registers |
64766e8d | 1249 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1250 | 4, /* cost of moving MMX register */ |
1251 | {12, 12}, /* cost of loading MMX registers | |
64766e8d | 1252 | in SImode and DImode */ |
df41dbaf | 1253 | {10, 10}, /* cost of storing MMX registers |
64766e8d | 1254 | in SImode and DImode */ |
df41dbaf JH |
1255 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1256 | {12, 12, 10, 20, 30}, /* cost of loading SSE registers | |
1257 | in 32,64,128,256 and 512-bit */ | |
1258 | {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ | |
1259 | {10, 10, 10, 20, 30}, /* cost of storing SSE registers | |
1260 | in 32,64,128,256 and 512-bit */ | |
1261 | {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ | |
1262 | 16, 20, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1263 | 12, 12, /* Gather load static, per_elt. */ |
1264 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1265 | 16, /* size of l1 cache. */ |
1266 | 2048, /* size of l2 cache. */ | |
1267 | 64, /* size of prefetch block */ | |
1268 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1269 | immediately, they are queued. We set number of simultaneous prefetches | |
1270 | to a large constant to reflect this (it probably is not a good idea not | |
1271 | to limit number of prefetches at all, as their execution also takes some | |
1272 | time). */ | |
1273 | 100, /* number of parallel prefetches */ | |
1274 | 2, /* Branch cost */ | |
1275 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
1276 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ | |
1277 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ | |
1278 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1279 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1280 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ | |
6065f444 | 1281 | |
c53c148c | 1282 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1283 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1284 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ | |
1285 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ | |
c53c148c JH |
1286 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1287 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1288 | /* 9-24 */ |
1289 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ | |
1290 | /* 9-27 */ | |
1291 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ | |
1292 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ | |
1293 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1294 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1295 | bdver3_memcpy, | |
1296 | bdver3_memset, | |
f6fd8f2b JH |
1297 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1298 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1299 | }; |
1300 | ||
1301 | /* BDVER4 has optimized REP instruction for medium sized blocks, but for | |
1302 | very small blocks it is better to use loop. For large blocks, libcall | |
1303 | can do non-temporal accesses and beat inline considerably. */ | |
1304 | static stringop_algs bdver4_memcpy[2] = { | |
1305 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1306 | {-1, rep_prefix_4_byte, false}}}, | |
1307 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1308 | {-1, libcall, false}}}}; | |
1309 | static stringop_algs bdver4_memset[2] = { | |
1310 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1311 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1312 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1313 | {-1, libcall, false}}}}; | |
1314 | struct processor_costs bdver4_cost = { | |
1315 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1316 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
1317 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1318 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1319 | {COSTS_N_INSNS (4), /* cost of starting multiply for QI */ | |
1320 | COSTS_N_INSNS (4), /* HI */ | |
1321 | COSTS_N_INSNS (4), /* SI */ | |
1322 | COSTS_N_INSNS (6), /* DI */ | |
1323 | COSTS_N_INSNS (6)}, /* other */ | |
1324 | 0, /* cost of multiply per each bit set */ | |
1325 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1326 | COSTS_N_INSNS (35), /* HI */ | |
1327 | COSTS_N_INSNS (51), /* SI */ | |
1328 | COSTS_N_INSNS (83), /* DI */ | |
1329 | COSTS_N_INSNS (83)}, /* other */ | |
1330 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1331 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1332 | 8, /* "large" insn */ | |
1333 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1334 | |
1335 | /* All move costs are relative to integer->integer move times 2 and thus | |
1336 | they are latency*2. */ | |
1337 | 8, /* cost for loading QImode using movzbl */ | |
1338 | {8, 8, 8}, /* cost of loading integer registers | |
64766e8d JH |
1339 | in QImode, HImode and SImode. |
1340 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
1341 | {8, 8, 8}, /* cost of storing integer registers */ |
1342 | 4, /* cost of reg,reg fld/fst */ | |
1343 | {12, 12, 28}, /* cost of loading fp registers | |
64766e8d | 1344 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1345 | {10, 10, 18}, /* cost of storing fp registers |
64766e8d | 1346 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1347 | 4, /* cost of moving MMX register */ |
1348 | {12, 12}, /* cost of loading MMX registers | |
64766e8d | 1349 | in SImode and DImode */ |
df41dbaf | 1350 | {10, 10}, /* cost of storing MMX registers |
64766e8d | 1351 | in SImode and DImode */ |
df41dbaf JH |
1352 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1353 | {12, 12, 10, 20, 30}, /* cost of loading SSE registers | |
1354 | in 32,64,128,256 and 512-bit */ | |
1355 | {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ | |
1356 | {10, 10, 10, 20, 30}, /* cost of storing SSE registers | |
1357 | in 32,64,128,256 and 512-bit */ | |
1358 | {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ | |
1359 | 16, 20, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1360 | 12, 12, /* Gather load static, per_elt. */ |
1361 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1362 | 16, /* size of l1 cache. */ |
1363 | 2048, /* size of l2 cache. */ | |
1364 | 64, /* size of prefetch block */ | |
1365 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1366 | immediately, they are queued. We set number of simultaneous prefetches | |
1367 | to a large constant to reflect this (it probably is not a good idea not | |
1368 | to limit number of prefetches at all, as their execution also takes some | |
1369 | time). */ | |
1370 | 100, /* number of parallel prefetches */ | |
1371 | 2, /* Branch cost */ | |
1372 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
1373 | COSTS_N_INSNS (6), /* cost of FMUL instruction. */ | |
1374 | COSTS_N_INSNS (42), /* cost of FDIV instruction. */ | |
1375 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1376 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1377 | COSTS_N_INSNS (52), /* cost of FSQRT instruction. */ | |
6065f444 | 1378 | |
c53c148c | 1379 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1380 | COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1381 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ | |
1382 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ | |
c53c148c JH |
1383 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1384 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1385 | /* 9-24 */ |
1386 | COSTS_N_INSNS (24), /* cost of DIVSS instruction. */ | |
1387 | /* 9-27 */ | |
1388 | COSTS_N_INSNS (27), /* cost of DIVSD instruction. */ | |
1389 | COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ | |
1390 | COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1391 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1392 | bdver4_memcpy, | |
1393 | bdver4_memset, | |
f6fd8f2b JH |
1394 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1395 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1396 | }; |
1397 | ||
1398 | ||
1399 | /* ZNVER1 has optimized REP instruction for medium sized blocks, but for | |
1400 | very small blocks it is better to use loop. For large blocks, libcall | |
1401 | can do non-temporal accesses and beat inline considerably. */ | |
1402 | static stringop_algs znver1_memcpy[2] = { | |
1403 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1404 | {-1, rep_prefix_4_byte, false}}}, | |
1405 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1406 | {-1, libcall, false}}}}; | |
1407 | static stringop_algs znver1_memset[2] = { | |
1408 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1409 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1410 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1411 | {-1, libcall, false}}}}; | |
1412 | struct processor_costs znver1_cost = { | |
1413 | COSTS_N_INSNS (1), /* cost of an add instruction. */ | |
1414 | COSTS_N_INSNS (1), /* cost of a lea instruction. */ | |
1415 | COSTS_N_INSNS (1), /* variable shift costs. */ | |
1416 | COSTS_N_INSNS (1), /* constant shift costs. */ | |
1417 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ | |
1418 | COSTS_N_INSNS (3), /* HI. */ | |
1419 | COSTS_N_INSNS (3), /* SI. */ | |
6065f444 JH |
1420 | COSTS_N_INSNS (3), /* DI. */ |
1421 | COSTS_N_INSNS (3)}, /* other. */ | |
64766e8d JH |
1422 | 0, /* cost of multiply per each bit |
1423 | set. */ | |
6065f444 JH |
1424 | /* Depending on parameters, idiv can get faster on ryzen. This is upper |
1425 | bound. */ | |
1426 | {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */ | |
1427 | COSTS_N_INSNS (22), /* HI. */ | |
1428 | COSTS_N_INSNS (30), /* SI. */ | |
1429 | COSTS_N_INSNS (45), /* DI. */ | |
1430 | COSTS_N_INSNS (45)}, /* other. */ | |
64766e8d JH |
1431 | COSTS_N_INSNS (1), /* cost of movsx. */ |
1432 | COSTS_N_INSNS (1), /* cost of movzx. */ | |
1433 | 8, /* "large" insn. */ | |
1434 | 9, /* MOVE_RATIO. */ | |
01118373 | 1435 | |
df41dbaf JH |
1436 | /* All move costs are relative to integer->integer move times 2 and thus |
1437 | they are latency*2. */ | |
1438 | ||
01118373 JH |
1439 | /* reg-reg moves are done by renaming and thus they are even cheaper than |
1440 | 1 cycle. Because reg-reg move cost is 2 and the following tables correspond | |
1441 | to doubles of latencies, we do not model this correctly. It does not | |
1442 | seem to make practical difference to bump prices up even more. */ | |
1443 | 6, /* cost for loading QImode using | |
64766e8d | 1444 | movzbl. */ |
01118373 | 1445 | {6, 6, 6}, /* cost of loading integer registers |
64766e8d JH |
1446 | in QImode, HImode and SImode. |
1447 | Relative to reg-reg move (2). */ | |
01118373 | 1448 | {8, 8, 8}, /* cost of storing integer |
64766e8d JH |
1449 | registers. */ |
1450 | 2, /* cost of reg,reg fld/fst. */ | |
01118373 | 1451 | {6, 6, 16}, /* cost of loading fp registers |
64766e8d | 1452 | in SFmode, DFmode and XFmode. */ |
01118373 | 1453 | {8, 8, 16}, /* cost of storing fp registers |
64766e8d JH |
1454 | in SFmode, DFmode and XFmode. */ |
1455 | 2, /* cost of moving MMX register. */ | |
01118373 | 1456 | {6, 6}, /* cost of loading MMX registers |
64766e8d | 1457 | in SImode and DImode. */ |
01118373 | 1458 | {8, 8}, /* cost of storing MMX registers |
64766e8d | 1459 | in SImode and DImode. */ |
df41dbaf JH |
1460 | 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */ |
1461 | {6, 6, 6, 10, 20}, /* cost of loading SSE registers | |
1462 | in 32,64,128,256 and 512-bit. */ | |
1463 | {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ | |
1464 | {8, 8, 8, 8, 16}, /* cost of storing SSE registers | |
1465 | in 32,64,128,256 and 512-bit. */ | |
1466 | {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ | |
1467 | 6, 6, /* SSE->integer and integer->SSE moves. */ | |
a4fe6139 JH |
1468 | /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, |
1469 | throughput 12. Approx 9 uops do not depend on vector size and every load | |
1470 | is 7 uops. */ | |
1471 | 18, 8, /* Gather load static, per_elt. */ | |
1472 | 18, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1473 | 32, /* size of l1 cache. */ |
1474 | 512, /* size of l2 cache. */ | |
1475 | 64, /* size of prefetch block. */ | |
1476 | /* New AMD processors never drop prefetches; if they cannot be performed | |
1477 | immediately, they are queued. We set number of simultaneous prefetches | |
1478 | to a large constant to reflect this (it probably is not a good idea not | |
1479 | to limit number of prefetches at all, as their execution also takes some | |
1480 | time). */ | |
1481 | 100, /* number of parallel prefetches. */ | |
1482 | 3, /* Branch cost. */ | |
6065f444 JH |
1483 | COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ |
1484 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ | |
1485 | /* Latency of fdiv is 8-15. */ | |
1486 | COSTS_N_INSNS (15), /* cost of FDIV instruction. */ | |
1487 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
1488 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
1489 | /* Latency of fsqrt is 4-10. */ | |
1490 | COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ | |
1491 | ||
c53c148c | 1492 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1493 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1494 | COSTS_N_INSNS (3), /* cost of MULSS instruction. */ | |
1495 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
1496 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1497 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1498 | COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ |
1499 | /* 9-13 */ | |
1500 | COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ | |
1501 | COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ | |
1502 | COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1503 | /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles |
1504 | and it can execute 2 integer additions and 2 multiplications thus | |
1505 | reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest | |
1506 | that 4 works better than 6 probably due to register pressure. | |
1507 | ||
1508 | Integer vector operations are taken by FP unit and execute 3 vector | |
1509 | plus/minus operations per cycle but only one multiply. This is adjusted | |
1510 | in ix86_reassociation_width. */ | |
1511 | 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ | |
1512 | znver1_memcpy, | |
1513 | znver1_memset, | |
f6fd8f2b JH |
1514 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
1515 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1516 | }; |
1517 | ||
c234d831 UB |
1518 | /* skylake_cost should produce code tuned for Skylake family of CPUs. */ |
1519 | static stringop_algs skylake_memcpy[2] = { | |
1520 | {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, | |
6e559c70 | 1521 | {libcall, {{16, loop, false}, {512, unrolled_loop, false}, |
c234d831 UB |
1522 | {-1, libcall, false}}}}; |
1523 | ||
1524 | static stringop_algs skylake_memset[2] = { | |
1525 | {libcall, {{6, loop_1_byte, true}, | |
1526 | {24, loop, true}, | |
1527 | {8192, rep_prefix_4_byte, true}, | |
1528 | {-1, libcall, false}}}, | |
6e559c70 | 1529 | {libcall, {{24, loop, true}, {512, unrolled_loop, false}, |
c234d831 UB |
1530 | {-1, libcall, false}}}}; |
1531 | ||
1532 | static const | |
1533 | struct processor_costs skylake_cost = { | |
1534 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1535 | COSTS_N_INSNS (1)+1, /* cost of a lea instruction */ | |
1536 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1537 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1538 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1539 | COSTS_N_INSNS (4), /* HI */ | |
1540 | COSTS_N_INSNS (3), /* SI */ | |
a2ef9558 MT |
1541 | COSTS_N_INSNS (3), /* DI */ |
1542 | COSTS_N_INSNS (3)}, /* other */ | |
c234d831 | 1543 | 0, /* cost of multiply per each bit set */ |
02308bd3 MT |
1544 | /* Expanding div/mod currently doesn't consider parallelism. So the cost |
1545 | model is not realistic. We compensate by increasing the latencies a bit. */ | |
1546 | {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ | |
1547 | COSTS_N_INSNS (11), /* HI */ | |
1548 | COSTS_N_INSNS (14), /* SI */ | |
c234d831 UB |
1549 | COSTS_N_INSNS (76), /* DI */ |
1550 | COSTS_N_INSNS (76)}, /* other */ | |
1551 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1552 | COSTS_N_INSNS (0), /* cost of movzx */ | |
1553 | 8, /* "large" insn */ | |
1554 | 17, /* MOVE_RATIO */ | |
1555 | ||
1556 | 6, /* cost for loading QImode using movzbl */ | |
1557 | {4, 4, 4}, /* cost of loading integer registers | |
1558 | in QImode, HImode and SImode. | |
1559 | Relative to reg-reg move (2). */ | |
001e7337 | 1560 | {6, 6, 3}, /* cost of storing integer registers */ |
c234d831 UB |
1561 | 2, /* cost of reg,reg fld/fst */ |
1562 | {6, 6, 8}, /* cost of loading fp registers | |
1563 | in SFmode, DFmode and XFmode */ | |
1564 | {6, 6, 10}, /* cost of storing fp registers | |
1565 | in SFmode, DFmode and XFmode */ | |
1566 | 2, /* cost of moving MMX register */ | |
1567 | {6, 6}, /* cost of loading MMX registers | |
1568 | in SImode and DImode */ | |
1569 | {6, 6}, /* cost of storing MMX registers | |
1570 | in SImode and DImode */ | |
1571 | 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ | |
1572 | {6, 6, 6, 10, 20}, /* cost of loading SSE registers | |
1573 | in 32,64,128,256 and 512-bit */ | |
1574 | {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ | |
001e7337 | 1575 | {8, 8, 8, 12, 24}, /* cost of storing SSE registers |
c234d831 UB |
1576 | in 32,64,128,256 and 512-bit */ |
1577 | {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ | |
1578 | 2, 2, /* SSE->integer and integer->SSE moves */ | |
1579 | 20, 8, /* Gather load static, per_elt. */ | |
1580 | 22, 10, /* Gather store static, per_elt. */ | |
1581 | 64, /* size of l1 cache. */ | |
1582 | 512, /* size of l2 cache. */ | |
1583 | 64, /* size of prefetch block */ | |
1584 | 6, /* number of parallel prefetches */ | |
1585 | 3, /* Branch cost */ | |
1586 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ | |
1587 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
1588 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ | |
1589 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
1590 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
1591 | COSTS_N_INSNS (20), /* cost of FSQRT instruction. */ | |
1592 | ||
1593 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ | |
1594 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ | |
1595 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
1596 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
1597 | COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ | |
1598 | COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ | |
1599 | COSTS_N_INSNS (11), /* cost of DIVSS instruction. */ | |
1600 | COSTS_N_INSNS (14), /* cost of DIVSD instruction. */ | |
1601 | COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */ | |
1602 | COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ | |
1603 | 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ | |
1604 | skylake_memcpy, | |
1605 | skylake_memset, | |
1606 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ | |
1607 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
1608 | }; | |
64766e8d JH |
1609 | /* BTVER1 has optimized REP instruction for medium sized blocks, but for |
1610 | very small blocks it is better to use loop. For large blocks, libcall can | |
1611 | do non-temporal accesses and beat inline considerably. */ | |
1612 | static stringop_algs btver1_memcpy[2] = { | |
1613 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1614 | {-1, rep_prefix_4_byte, false}}}, | |
1615 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1616 | {-1, libcall, false}}}}; | |
1617 | static stringop_algs btver1_memset[2] = { | |
1618 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1619 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1620 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1621 | {-1, libcall, false}}}}; | |
1622 | const struct processor_costs btver1_cost = { | |
1623 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1624 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
1625 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1626 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1627 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1628 | COSTS_N_INSNS (4), /* HI */ | |
1629 | COSTS_N_INSNS (3), /* SI */ | |
1630 | COSTS_N_INSNS (4), /* DI */ | |
1631 | COSTS_N_INSNS (5)}, /* other */ | |
1632 | 0, /* cost of multiply per each bit set */ | |
1633 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1634 | COSTS_N_INSNS (35), /* HI */ | |
1635 | COSTS_N_INSNS (51), /* SI */ | |
1636 | COSTS_N_INSNS (83), /* DI */ | |
1637 | COSTS_N_INSNS (83)}, /* other */ | |
1638 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1639 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1640 | 8, /* "large" insn */ | |
1641 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1642 | |
1643 | /* All move costs are relative to integer->integer move times 2 and thus | |
1644 | they are latency*2. */ | |
1645 | 8, /* cost for loading QImode using movzbl */ | |
1646 | {6, 8, 6}, /* cost of loading integer registers | |
64766e8d JH |
1647 | in QImode, HImode and SImode. |
1648 | Relative to reg-reg move (2). */ | |
df41dbaf | 1649 | {6, 8, 6}, /* cost of storing integer registers */ |
64766e8d | 1650 | 4, /* cost of reg,reg fld/fst */ |
df41dbaf | 1651 | {12, 12, 28}, /* cost of loading fp registers |
64766e8d | 1652 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1653 | {12, 12, 38}, /* cost of storing fp registers |
64766e8d | 1654 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1655 | 4, /* cost of moving MMX register */ |
1656 | {10, 10}, /* cost of loading MMX registers | |
64766e8d | 1657 | in SImode and DImode */ |
df41dbaf | 1658 | {12, 12}, /* cost of storing MMX registers |
64766e8d | 1659 | in SImode and DImode */ |
df41dbaf JH |
1660 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1661 | {10, 10, 12, 24, 48}, /* cost of loading SSE registers | |
1662 | in 32,64,128,256 and 512-bit */ | |
1663 | {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ | |
1664 | {10, 10, 12, 24, 48}, /* cost of storing SSE registers | |
1665 | in 32,64,128,256 and 512-bit */ | |
1666 | {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ | |
1667 | 14, 14, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1668 | 10, 10, /* Gather load static, per_elt. */ |
1669 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1670 | 32, /* size of l1 cache. */ |
1671 | 512, /* size of l2 cache. */ | |
1672 | 64, /* size of prefetch block */ | |
1673 | 100, /* number of parallel prefetches */ | |
1674 | 2, /* Branch cost */ | |
1675 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
1676 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
1677 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ | |
1678 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1679 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1680 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 1681 | |
c53c148c | 1682 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1683 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1684 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ | |
1685 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
1686 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1687 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1688 | COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ |
1689 | COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ | |
1690 | COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ | |
1691 | COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1692 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1693 | btver1_memcpy, | |
1694 | btver1_memset, | |
f6fd8f2b JH |
1695 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1696 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1697 | }; |
1698 | ||
1699 | static stringop_algs btver2_memcpy[2] = { | |
1700 | {libcall, {{6, loop, false}, {14, unrolled_loop, false}, | |
1701 | {-1, rep_prefix_4_byte, false}}}, | |
1702 | {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false}, | |
1703 | {-1, libcall, false}}}}; | |
1704 | static stringop_algs btver2_memset[2] = { | |
1705 | {libcall, {{8, loop, false}, {24, unrolled_loop, false}, | |
1706 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1707 | {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false}, | |
1708 | {-1, libcall, false}}}}; | |
1709 | const struct processor_costs btver2_cost = { | |
1710 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1711 | COSTS_N_INSNS (2), /* cost of a lea instruction */ | |
1712 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1713 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1714 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1715 | COSTS_N_INSNS (4), /* HI */ | |
1716 | COSTS_N_INSNS (3), /* SI */ | |
1717 | COSTS_N_INSNS (4), /* DI */ | |
1718 | COSTS_N_INSNS (5)}, /* other */ | |
1719 | 0, /* cost of multiply per each bit set */ | |
1720 | {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ | |
1721 | COSTS_N_INSNS (35), /* HI */ | |
1722 | COSTS_N_INSNS (51), /* SI */ | |
1723 | COSTS_N_INSNS (83), /* DI */ | |
1724 | COSTS_N_INSNS (83)}, /* other */ | |
1725 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1726 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1727 | 8, /* "large" insn */ | |
1728 | 9, /* MOVE_RATIO */ | |
df41dbaf JH |
1729 | |
1730 | /* All move costs are relative to integer->integer move times 2 and thus | |
1731 | they are latency*2. */ | |
1732 | 8, /* cost for loading QImode using movzbl */ | |
1733 | {8, 8, 6}, /* cost of loading integer registers | |
64766e8d JH |
1734 | in QImode, HImode and SImode. |
1735 | Relative to reg-reg move (2). */ | |
df41dbaf | 1736 | {8, 8, 6}, /* cost of storing integer registers */ |
64766e8d | 1737 | 4, /* cost of reg,reg fld/fst */ |
df41dbaf | 1738 | {12, 12, 28}, /* cost of loading fp registers |
64766e8d | 1739 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1740 | {12, 12, 38}, /* cost of storing fp registers |
64766e8d | 1741 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1742 | 4, /* cost of moving MMX register */ |
1743 | {10, 10}, /* cost of loading MMX registers | |
64766e8d | 1744 | in SImode and DImode */ |
df41dbaf | 1745 | {12, 12}, /* cost of storing MMX registers |
64766e8d | 1746 | in SImode and DImode */ |
df41dbaf JH |
1747 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
1748 | {10, 10, 12, 24, 48}, /* cost of loading SSE registers | |
1749 | in 32,64,128,256 and 512-bit */ | |
1750 | {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ | |
1751 | {10, 10, 12, 24, 48}, /* cost of storing SSE registers | |
1752 | in 32,64,128,256 and 512-bit */ | |
1753 | {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ | |
1754 | 14, 14, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1755 | 10, 10, /* Gather load static, per_elt. */ |
1756 | 10, 10, /* Gather store static, per_elt. */ | |
64766e8d JH |
1757 | 32, /* size of l1 cache. */ |
1758 | 2048, /* size of l2 cache. */ | |
1759 | 64, /* size of prefetch block */ | |
1760 | 100, /* number of parallel prefetches */ | |
1761 | 2, /* Branch cost */ | |
1762 | COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ | |
1763 | COSTS_N_INSNS (4), /* cost of FMUL instruction. */ | |
1764 | COSTS_N_INSNS (19), /* cost of FDIV instruction. */ | |
1765 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1766 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1767 | COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ | |
6065f444 | 1768 | |
c53c148c | 1769 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1770 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1771 | COSTS_N_INSNS (2), /* cost of MULSS instruction. */ | |
1772 | COSTS_N_INSNS (4), /* cost of MULSD instruction. */ | |
c53c148c JH |
1773 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
1774 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1775 | COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ |
1776 | COSTS_N_INSNS (19), /* cost of DIVSD instruction. */ | |
1777 | COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */ | |
1778 | COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1779 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1780 | btver2_memcpy, | |
1781 | btver2_memset, | |
f6fd8f2b JH |
1782 | COSTS_N_INSNS (2), /* cond_taken_branch_cost. */ |
1783 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1784 | }; |
1785 | ||
1786 | static stringop_algs pentium4_memcpy[2] = { | |
1787 | {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, | |
1788 | DUMMY_STRINGOP_ALGS}; | |
1789 | static stringop_algs pentium4_memset[2] = { | |
1790 | {libcall, {{6, loop_1_byte, false}, {48, loop, false}, | |
1791 | {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1792 | DUMMY_STRINGOP_ALGS}; | |
1793 | ||
1794 | static const | |
1795 | struct processor_costs pentium4_cost = { /* processor_costs instance for pentium4; one entry per commented field below */ | |
1796 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1797 | COSTS_N_INSNS (3), /* cost of a lea instruction */ | |
1798 | COSTS_N_INSNS (4), /* variable shift costs */ | |
1799 | COSTS_N_INSNS (4), /* constant shift costs */ | |
1800 | {COSTS_N_INSNS (15), /* cost of starting multiply for QI */ | |
1801 | COSTS_N_INSNS (15), /* HI */ | |
1802 | COSTS_N_INSNS (15), /* SI */ | |
1803 | COSTS_N_INSNS (15), /* DI */ | |
1804 | COSTS_N_INSNS (15)}, /* other */ | |
1805 | 0, /* cost of multiply per each bit set */ | |
1806 | {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */ | |
1807 | COSTS_N_INSNS (56), /* HI */ | |
1808 | COSTS_N_INSNS (56), /* SI */ | |
1809 | COSTS_N_INSNS (56), /* DI */ | |
1810 | COSTS_N_INSNS (56)}, /* other */ | |
1811 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1812 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1813 | 16, /* "large" insn */ | |
1814 | 6, /* MOVE_RATIO */ | |
df41dbaf JH |
1815 | |
1816 | /* All move costs are relative to integer->integer move times 2 and thus | |
1817 | they are latency*2. */ | |
1818 | 5, /* cost for loading QImode using movzbl */ | |
64766e8d JH |
1819 | {4, 5, 4}, /* cost of loading integer registers |
1820 | in QImode, HImode and SImode. | |
1821 | Relative to reg-reg move (2). */ | |
1822 | {2, 3, 2}, /* cost of storing integer registers */ | |
df41dbaf JH |
1823 | 12, /* cost of reg,reg fld/fst */ |
1824 | {14, 14, 14}, /* cost of loading fp registers | |
64766e8d | 1825 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1826 | {14, 14, 14}, /* cost of storing fp registers |
64766e8d | 1827 | in SFmode, DFmode and XFmode */ |
df41dbaf JH |
1828 | 12, /* cost of moving MMX register */ |
1829 | {16, 16}, /* cost of loading MMX registers | |
64766e8d | 1830 | in SImode and DImode */ |
df41dbaf | 1831 | {16, 16}, /* cost of storing MMX registers |
64766e8d | 1832 | in SImode and DImode */ |
df41dbaf JH |
1833 | 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */ |
1834 | {16, 16, 16, 32, 64}, /* cost of loading SSE registers | |
1835 | in 32,64,128,256 and 512-bit */ | |
1836 | {32, 32, 32, 64, 128}, /* cost of unaligned loads. */ | |
1837 | {16, 16, 16, 32, 64}, /* cost of storing SSE registers | |
1838 | in 32,64,128,256 and 512-bit */ | |
1839 | {32, 32, 32, 64, 128}, /* cost of unaligned stores. */ | |
1840 | 20, 12, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1841 | 16, 16, /* Gather load static, per_elt. */ |
1842 | 16, 16, /* Gather store static, per_elt. */ | |
64766e8d JH |
1843 | 8, /* size of l1 cache. */ |
1844 | 256, /* size of l2 cache. */ | |
1845 | 64, /* size of prefetch block */ | |
1846 | 6, /* number of parallel prefetches */ | |
1847 | 2, /* Branch cost */ | |
1848 | COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */ | |
1849 | COSTS_N_INSNS (7), /* cost of FMUL instruction. */ | |
1850 | COSTS_N_INSNS (43), /* cost of FDIV instruction. */ | |
1851 | COSTS_N_INSNS (2), /* cost of FABS instruction. */ | |
1852 | COSTS_N_INSNS (2), /* cost of FCHS instruction. */ | |
1853 | COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ | |
6065f444 | 1854 | |
c53c148c | 1855 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1856 | COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1857 | COSTS_N_INSNS (6), /* cost of MULSS instruction. */ | |
1858 | COSTS_N_INSNS (6), /* cost of MULSD instruction. */ | |
c53c148c JH |
1859 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
1860 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1861 | COSTS_N_INSNS (23), /* cost of DIVSS instruction. */ |
1862 | COSTS_N_INSNS (38), /* cost of DIVSD instruction. */ | |
1863 | COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */ | |
1864 | COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1865 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1866 | pentium4_memcpy, | |
1867 | pentium4_memset, | |
f6fd8f2b JH |
1868 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
1869 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1870 | }; |
1871 | ||
1872 | static stringop_algs nocona_memcpy[2] = { /* memcpy strategy table; referenced by nocona_cost */ | |
1873 | {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}}, | |
1874 | {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false}, | |
1875 | {100000, unrolled_loop, false}, {-1, libcall, false}}}}; | |
1876 | ||
1877 | static stringop_algs nocona_memset[2] = { /* memset strategy table; referenced by nocona_cost */ | |
1878 | {libcall, {{6, loop_1_byte, false}, {48, loop, false}, | |
1879 | {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1880 | {libcall, {{24, loop, false}, {64, unrolled_loop, false}, | |
1881 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
1882 | ||
1883 | static const | |
1884 | struct processor_costs nocona_cost = { /* processor_costs instance for nocona; one entry per commented field below */ | |
1885 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1886 | COSTS_N_INSNS (1), /* cost of a lea instruction */ | |
1887 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1888 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1889 | {COSTS_N_INSNS (10), /* cost of starting multiply for QI */ | |
1890 | COSTS_N_INSNS (10), /* HI */ | |
1891 | COSTS_N_INSNS (10), /* SI */ | |
1892 | COSTS_N_INSNS (10), /* DI */ | |
1893 | COSTS_N_INSNS (10)}, /* other */ | |
1894 | 0, /* cost of multiply per each bit set */ | |
1895 | {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */ | |
1896 | COSTS_N_INSNS (66), /* HI */ | |
1897 | COSTS_N_INSNS (66), /* SI */ | |
1898 | COSTS_N_INSNS (66), /* DI */ | |
1899 | COSTS_N_INSNS (66)}, /* other */ | |
1900 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1901 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1902 | 16, /* "large" insn */ | |
1903 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
1904 | |
1905 | /* All move costs are relative to integer->integer move times 2 and thus | |
1906 | they are latency*2. */ | |
64766e8d JH |
1907 | 4, /* cost for loading QImode using movzbl */ |
1908 | {4, 4, 4}, /* cost of loading integer registers | |
1909 | in QImode, HImode and SImode. | |
1910 | Relative to reg-reg move (2). */ | |
1911 | {4, 4, 4}, /* cost of storing integer registers */ | |
df41dbaf JH |
1912 | 12, /* cost of reg,reg fld/fst */ |
1913 | {14, 14, 14}, /* cost of loading fp registers | |
64766e8d | 1914 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1915 | {14, 14, 14}, /* cost of storing fp registers |
64766e8d | 1916 | in SFmode, DFmode and XFmode */ |
df41dbaf | 1917 | 14, /* cost of moving MMX register */ |
64766e8d JH |
1918 | {12, 12}, /* cost of loading MMX registers |
1919 | in SImode and DImode */ | |
1920 | {12, 12}, /* cost of storing MMX registers | |
1921 | in SImode and DImode */ | |
df41dbaf JH |
1922 | 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */ |
1923 | {12, 12, 12, 24, 48}, /* cost of loading SSE registers | |
1924 | in 32,64,128,256 and 512-bit */ | |
1925 | {24, 24, 24, 48, 96}, /* cost of unaligned loads. */ | |
1926 | {12, 12, 12, 24, 48}, /* cost of storing SSE registers | |
1927 | in 32,64,128,256 and 512-bit */ | |
1928 | {24, 24, 24, 48, 96}, /* cost of unaligned stores. */ | |
1929 | 20, 12, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
1930 | 12, 12, /* Gather load static, per_elt. */ |
1931 | 12, 12, /* Gather store static, per_elt. */ | |
64766e8d JH |
1932 | 8, /* size of l1 cache. */ |
1933 | 1024, /* size of l2 cache. */ | |
1934 | 64, /* size of prefetch block */ | |
1935 | 8, /* number of parallel prefetches */ | |
1936 | 1, /* Branch cost */ | |
1937 | COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ | |
1938 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ | |
1939 | COSTS_N_INSNS (40), /* cost of FDIV instruction. */ | |
1940 | COSTS_N_INSNS (3), /* cost of FABS instruction. */ | |
1941 | COSTS_N_INSNS (3), /* cost of FCHS instruction. */ | |
1942 | COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ | |
6065f444 | 1943 | |
c53c148c | 1944 | COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */ |
6065f444 JH |
1945 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
1946 | COSTS_N_INSNS (7), /* cost of MULSS instruction. */ | |
1947 | COSTS_N_INSNS (7), /* cost of MULSD instruction. */ | |
c53c148c JH |
1948 | COSTS_N_INSNS (7), /* cost of FMA SS instruction. */ |
1949 | COSTS_N_INSNS (7), /* cost of FMA SD instruction. */ | |
6065f444 JH |
1950 | COSTS_N_INSNS (32), /* cost of DIVSS instruction. */ |
1951 | COSTS_N_INSNS (40), /* cost of DIVSD instruction. */ | |
1952 | COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */ | |
1953 | COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
1954 | 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
1955 | nocona_memcpy, | |
1956 | nocona_memset, | |
f6fd8f2b JH |
1957 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
1958 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
1959 | }; |
1960 | ||
1961 | static stringop_algs atom_memcpy[2] = { /* memcpy strategy table; referenced by atom_cost */ | |
1962 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, | |
1963 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, | |
1964 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
1965 | static stringop_algs atom_memset[2] = { /* memset strategy table; referenced by atom_cost */ | |
1966 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, | |
1967 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
1968 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, | |
1969 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
1970 | static const | |
1971 | struct processor_costs atom_cost = { /* processor_costs instance for atom; one entry per commented field below */ | |
1972 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
1973 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
1974 | COSTS_N_INSNS (1), /* variable shift costs */ | |
1975 | COSTS_N_INSNS (1), /* constant shift costs */ | |
1976 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
1977 | COSTS_N_INSNS (4), /* HI */ | |
1978 | COSTS_N_INSNS (3), /* SI */ | |
1979 | COSTS_N_INSNS (4), /* DI */ | |
1980 | COSTS_N_INSNS (2)}, /* other */ | |
1981 | 0, /* cost of multiply per each bit set */ | |
1982 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
1983 | COSTS_N_INSNS (26), /* HI */ | |
1984 | COSTS_N_INSNS (42), /* SI */ | |
1985 | COSTS_N_INSNS (74), /* DI */ | |
1986 | COSTS_N_INSNS (74)}, /* other */ | |
1987 | COSTS_N_INSNS (1), /* cost of movsx */ | |
1988 | COSTS_N_INSNS (1), /* cost of movzx */ | |
1989 | 8, /* "large" insn */ | |
1990 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
1991 | |
1992 | /* All move costs are relative to integer->integer move times 2 and thus | |
1993 | they are latency*2. */ | |
1994 | 6, /* cost for loading QImode using movzbl */ | |
1995 | {6, 6, 6}, /* cost of loading integer registers | |
64766e8d JH |
1996 | in QImode, HImode and SImode. |
1997 | Relative to reg-reg move (2). */ | |
df41dbaf | 1998 | {6, 6, 6}, /* cost of storing integer registers */ |
64766e8d | 1999 | 4, /* cost of reg,reg fld/fst */ |
df41dbaf | 2000 | {6, 6, 18}, /* cost of loading fp registers |
64766e8d | 2001 | in SFmode, DFmode and XFmode */ |
df41dbaf | 2002 | {14, 14, 24}, /* cost of storing fp registers |
64766e8d JH |
2003 | in SFmode, DFmode and XFmode */ |
2004 | 2, /* cost of moving MMX register */ | |
2005 | {8, 8}, /* cost of loading MMX registers | |
2006 | in SImode and DImode */ | |
df41dbaf | 2007 | {10, 10}, /* cost of storing MMX registers |
64766e8d | 2008 | in SImode and DImode */ |
df41dbaf JH |
2009 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
2010 | {8, 8, 8, 16, 32}, /* cost of loading SSE registers | |
2011 | in 32,64,128,256 and 512-bit */ | |
2012 | {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ | |
2013 | {8, 8, 8, 16, 32}, /* cost of storing SSE registers | |
2014 | in 32,64,128,256 and 512-bit */ | |
2015 | {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ | |
2016 | 8, 6, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
2017 | 8, 8, /* Gather load static, per_elt. */ |
2018 | 8, 8, /* Gather store static, per_elt. */ | |
64766e8d JH |
2019 | 32, /* size of l1 cache. */ |
2020 | 256, /* size of l2 cache. */ | |
2021 | 64, /* size of prefetch block */ | |
2022 | 6, /* number of parallel prefetches */ | |
2023 | 3, /* Branch cost */ | |
2024 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ | |
2025 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ | |
2026 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ | |
2027 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ | |
2028 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ | |
2029 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ | |
6065f444 | 2030 | |
c53c148c | 2031 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
2032 | COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2033 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
2034 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
c53c148c JH |
2035 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
2036 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
2037 | COSTS_N_INSNS (31), /* cost of DIVSS instruction. */ |
2038 | COSTS_N_INSNS (60), /* cost of DIVSD instruction. */ | |
2039 | COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */ | |
2040 | COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
2041 | 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ |
2042 | atom_memcpy, | |
2043 | atom_memset, | |
f6fd8f2b JH |
2044 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2045 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
2046 | }; |
2047 | ||
2048 | static stringop_algs slm_memcpy[2] = { /* memcpy strategy table; referenced by slm_cost */ | |
2049 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, | |
2050 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, | |
2051 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
2052 | static stringop_algs slm_memset[2] = { /* memset strategy table; referenced by slm_cost */ | |
2053 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, | |
2054 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
2055 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, | |
2056 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
2057 | static const | |
2058 | struct processor_costs slm_cost = { /* processor_costs instance for slm; one entry per commented field below */ | |
2059 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
2060 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
2061 | COSTS_N_INSNS (1), /* variable shift costs */ | |
2062 | COSTS_N_INSNS (1), /* constant shift costs */ | |
2063 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
2064 | COSTS_N_INSNS (3), /* HI */ | |
2065 | COSTS_N_INSNS (3), /* SI */ | |
2066 | COSTS_N_INSNS (4), /* DI */ | |
2067 | COSTS_N_INSNS (2)}, /* other */ | |
2068 | 0, /* cost of multiply per each bit set */ | |
2069 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
2070 | COSTS_N_INSNS (26), /* HI */ | |
2071 | COSTS_N_INSNS (42), /* SI */ | |
2072 | COSTS_N_INSNS (74), /* DI */ | |
2073 | COSTS_N_INSNS (74)}, /* other */ | |
2074 | COSTS_N_INSNS (1), /* cost of movsx */ | |
2075 | COSTS_N_INSNS (1), /* cost of movzx */ | |
2076 | 8, /* "large" insn */ | |
2077 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
2078 | |
2079 | /* All move costs are relative to integer->integer move times 2 and thus | |
2080 | they are latency*2. */ | |
2081 | 8, /* cost for loading QImode using movzbl */ | |
2082 | {8, 8, 8}, /* cost of loading integer registers | |
64766e8d JH |
2083 | in QImode, HImode and SImode. |
2084 | Relative to reg-reg move (2). */ | |
df41dbaf JH |
2085 | {6, 6, 6}, /* cost of storing integer registers */ |
2086 | 2, /* cost of reg,reg fld/fst */ | |
2087 | {8, 8, 18}, /* cost of loading fp registers | |
64766e8d | 2088 | in SFmode, DFmode and XFmode */ |
df41dbaf | 2089 | {6, 6, 18}, /* cost of storing fp registers |
64766e8d JH |
2090 | in SFmode, DFmode and XFmode */ |
2091 | 2, /* cost of moving MMX register */ | |
2092 | {8, 8}, /* cost of loading MMX registers | |
2093 | in SImode and DImode */ | |
df41dbaf | 2094 | {6, 6}, /* cost of storing MMX registers |
64766e8d | 2095 | in SImode and DImode */ |
df41dbaf JH |
2096 | 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */ |
2097 | {8, 8, 8, 16, 32}, /* cost of loading SSE registers | |
2098 | in 32,64,128,256 and 512-bit */ | |
2099 | {16, 16, 16, 32, 64}, /* cost of unaligned loads. */ | |
2100 | {8, 8, 8, 16, 32}, /* cost of storing SSE registers | |
2101 | in 32,64,128,256 and 512-bit */ | |
2102 | {16, 16, 16, 32, 64}, /* cost of unaligned stores. */ | |
2103 | 8, 6, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
2104 | 8, 8, /* Gather load static, per_elt. */ |
2105 | 8, 8, /* Gather store static, per_elt. */ | |
64766e8d JH |
2106 | 32, /* size of l1 cache. */ |
2107 | 256, /* size of l2 cache. */ | |
2108 | 64, /* size of prefetch block */ | |
2109 | 6, /* number of parallel prefetches */ | |
2110 | 3, /* Branch cost */ | |
2111 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ | |
2112 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ | |
2113 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ | |
2114 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ | |
2115 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ | |
2116 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ | |
6065f444 | 2117 | |
c53c148c | 2118 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
2119 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2120 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
2121 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
c53c148c JH |
2122 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
2123 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
2124 | COSTS_N_INSNS (39), /* cost of DIVSS instruction. */ |
2125 | COSTS_N_INSNS (69), /* cost of DIVSD instruction. */ | |
2126 | COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */ | |
2127 | COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
2128 | 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
2129 | slm_memcpy, | |
2130 | slm_memset, | |
f6fd8f2b JH |
2131 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2132 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
2133 | }; |
2134 | ||
2135 | static stringop_algs intel_memcpy[2] = { /* memcpy strategy table; referenced by intel_cost */ | |
2136 | {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}}, | |
2137 | {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false}, | |
2138 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
2139 | static stringop_algs intel_memset[2] = { /* memset strategy table; referenced by intel_cost */ | |
2140 | {libcall, {{8, loop, false}, {15, unrolled_loop, false}, | |
2141 | {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}}, | |
2142 | {libcall, {{24, loop, false}, {32, unrolled_loop, false}, | |
2143 | {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}}; | |
2144 | static const | |
2145 | struct processor_costs intel_cost = { /* processor_costs instance for intel; one entry per commented field below */ | |
2146 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
2147 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
2148 | COSTS_N_INSNS (1), /* variable shift costs */ | |
2149 | COSTS_N_INSNS (1), /* constant shift costs */ | |
2150 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
2151 | COSTS_N_INSNS (3), /* HI */ | |
2152 | COSTS_N_INSNS (3), /* SI */ | |
2153 | COSTS_N_INSNS (4), /* DI */ | |
2154 | COSTS_N_INSNS (2)}, /* other */ | |
2155 | 0, /* cost of multiply per each bit set */ | |
2156 | {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ | |
2157 | COSTS_N_INSNS (26), /* HI */ | |
2158 | COSTS_N_INSNS (42), /* SI */ | |
2159 | COSTS_N_INSNS (74), /* DI */ | |
2160 | COSTS_N_INSNS (74)}, /* other */ | |
2161 | COSTS_N_INSNS (1), /* cost of movsx */ | |
2162 | COSTS_N_INSNS (1), /* cost of movzx */ | |
2163 | 8, /* "large" insn */ | |
2164 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
2165 | |
2166 | /* All move costs are relative to integer->integer move times 2 and thus | |
2167 | they are latency*2. */ | |
af863030 | 2168 | 6, /* cost for loading QImode using movzbl */ |
64766e8d JH |
2169 | {4, 4, 4}, /* cost of loading integer registers |
2170 | in QImode, HImode and SImode. | |
2171 | Relative to reg-reg move (2). */ | |
af863030 JH |
2172 | {6, 6, 6}, /* cost of storing integer registers */ |
2173 | 2, /* cost of reg,reg fld/fst */ | |
2174 | {6, 6, 8}, /* cost of loading fp registers | |
64766e8d | 2175 | in SFmode, DFmode and XFmode */ |
af863030 | 2176 | {6, 6, 10}, /* cost of storing fp registers |
64766e8d JH |
2177 | in SFmode, DFmode and XFmode */ |
2178 | 2, /* cost of moving MMX register */ | |
af863030 | 2179 | {6, 6}, /* cost of loading MMX registers |
64766e8d | 2180 | in SImode and DImode */ |
af863030 | 2181 | {6, 6}, /* cost of storing MMX registers |
64766e8d | 2182 | in SImode and DImode */ |
df41dbaf JH |
2183 | 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */ |
2184 | {6, 6, 6, 6, 6}, /* cost of loading SSE registers | |
2185 | in 32,64,128,256 and 512-bit */ | |
2186 | {10, 10, 10, 10, 10}, /* cost of unaligned loads. */ | |
2187 | {6, 6, 6, 6, 6}, /* cost of storing SSE registers | |
2188 | in 32,64,128,256 and 512-bit */ | |
2189 | {10, 10, 10, 10, 10}, /* cost of unaligned stores. */ | |
2190 | 4, 4, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
2191 | 6, 6, /* Gather load static, per_elt. */ |
2192 | 6, 6, /* Gather store static, per_elt. */ | |
64766e8d JH |
2193 | 32, /* size of l1 cache. */ |
2194 | 256, /* size of l2 cache. */ | |
2195 | 64, /* size of prefetch block */ | |
2196 | 6, /* number of parallel prefetches */ | |
2197 | 3, /* Branch cost */ | |
2198 | COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ | |
2199 | COSTS_N_INSNS (8), /* cost of FMUL instruction. */ | |
2200 | COSTS_N_INSNS (20), /* cost of FDIV instruction. */ | |
2201 | COSTS_N_INSNS (8), /* cost of FABS instruction. */ | |
2202 | COSTS_N_INSNS (8), /* cost of FCHS instruction. */ | |
2203 | COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ | |
6065f444 | 2204 | |
c53c148c | 2205 | COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */ |
6065f444 JH |
2206 | COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2207 | COSTS_N_INSNS (8), /* cost of MULSS instruction. */ | |
2208 | COSTS_N_INSNS (8), /* cost of MULSD instruction. */ | |
c53c148c JH |
2209 | COSTS_N_INSNS (6), /* cost of FMA SS instruction. */ |
2210 | COSTS_N_INSNS (6), /* cost of FMA SD instruction. */ | |
6065f444 JH |
2211 | COSTS_N_INSNS (20), /* cost of DIVSS instruction. */ |
2212 | COSTS_N_INSNS (20), /* cost of DIVSD instruction. */ | |
2213 | COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */ | |
2214 | COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
2215 | 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */ |
2216 | intel_memcpy, | |
2217 | intel_memset, | |
f6fd8f2b JH |
2218 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2219 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
2220 | }; |
2221 | ||
2222 | /* Generic should produce code tuned for Core-i7 (and newer chips) | |
2223 | and btver1 (and newer chips). */ | |
2224 | ||
2225 | static stringop_algs generic_memcpy[2] = { /* memcpy strategy table; referenced by generic_cost */ | |
2226 | {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, | |
2227 | {-1, libcall, false}}}, | |
2228 | {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, | |
2229 | {-1, libcall, false}}}}; | |
2230 | static stringop_algs generic_memset[2] = { /* memset strategy table; referenced by generic_cost */ | |
2231 | {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false}, | |
2232 | {-1, libcall, false}}}, | |
2233 | {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false}, | |
2234 | {-1, libcall, false}}}}; | |
2235 | static const | |
2236 | struct processor_costs generic_cost = { /* processor_costs instance for the generic tuning described above generic_memcpy */ | |
2237 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
ef9eec0b | 2238 | /* Setting cost to 2 makes our current implementation of synth_mult result in |
64766e8d JH |
2239 | use of unnecessary temporary registers causing regression on several |
2240 | SPECfp benchmarks. */ | |
2241 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
2242 | COSTS_N_INSNS (1), /* variable shift costs */ | |
2243 | COSTS_N_INSNS (1), /* constant shift costs */ | |
2244 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
2245 | COSTS_N_INSNS (4), /* HI */ | |
2246 | COSTS_N_INSNS (3), /* SI */ | |
2247 | COSTS_N_INSNS (4), /* DI */ | |
7c080ade | 2248 | COSTS_N_INSNS (4)}, /* other */ |
64766e8d | 2249 | 0, /* cost of multiply per each bit set */ |
7c080ade JH |
2250 | {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */ |
2251 | COSTS_N_INSNS (22), /* HI */ | |
2252 | COSTS_N_INSNS (30), /* SI */ | |
64766e8d JH |
2253 | COSTS_N_INSNS (74), /* DI */ |
2254 | COSTS_N_INSNS (74)}, /* other */ | |
2255 | COSTS_N_INSNS (1), /* cost of movsx */ | |
2256 | COSTS_N_INSNS (1), /* cost of movzx */ | |
2257 | 8, /* "large" insn */ | |
2258 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
2259 | |
2260 | /* All move costs are relative to integer->integer move times 2 and thus | |
2261 | they are latency*2. */ | |
d555138e JH |
2262 | 6, /* cost for loading QImode using movzbl */ |
2263 | {6, 6, 6}, /* cost of loading integer registers | |
64766e8d JH |
2264 | in QImode, HImode and SImode. |
2265 | Relative to reg-reg move (2). */ | |
af863030 | 2266 | {6, 6, 6}, /* cost of storing integer registers */ |
64766e8d | 2267 | 4, /* cost of reg,reg fld/fst */ |
af863030 | 2268 | {6, 6, 12}, /* cost of loading fp registers |
64766e8d | 2269 | in SFmode, DFmode and XFmode */ |
af863030 | 2270 | {6, 6, 12}, /* cost of storing fp registers |
64766e8d JH |
2271 | in SFmode, DFmode and XFmode */ |
2272 | 2, /* cost of moving MMX register */ | |
af863030 | 2273 | {6, 6}, /* cost of loading MMX registers |
64766e8d | 2274 | in SImode and DImode */ |
af863030 | 2275 | {6, 6}, /* cost of storing MMX registers |
64766e8d | 2276 | in SImode and DImode */ |
df41dbaf JH |
2277 | 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */ |
2278 | {6, 6, 6, 10, 15}, /* cost of loading SSE registers | |
2279 | in 32,64,128,256 and 512-bit */ | |
7c080ade | 2280 | {6, 6, 6, 10, 15}, /* cost of unaligned loads. */ |
df41dbaf JH |
2281 | {6, 6, 6, 10, 15}, /* cost of storing SSE registers |
2282 | in 32,64,128,256 and 512-bit */ | |
7c080ade JH |
2283 | {6, 6, 6, 10, 15}, /* cost of unaligned stores. */ |
2284 | 6, 6, /* SSE->integer and integer->SSE moves */ | |
2285 | 18, 6, /* Gather load static, per_elt. */ | |
2286 | 18, 6, /* Gather store static, per_elt. */ | |
64766e8d JH |
2287 | 32, /* size of l1 cache. */ |
2288 | 512, /* size of l2 cache. */ | |
2289 | 64, /* size of prefetch block */ | |
2290 | 6, /* number of parallel prefetches */ | |
2291 | /* Benchmarks show large regressions on K8 sixtrack benchmark when this | |
2292 | value is increased to perhaps more appropriate value of 5. */ | |
2293 | 3, /* Branch cost */ | |
ef9eec0b | 2294 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
7c080ade | 2295 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ |
e8e3054e | 2296 | COSTS_N_INSNS (17), /* cost of FDIV instruction. */ |
ef9eec0b JH |
2297 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ |
2298 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
e8e3054e | 2299 | COSTS_N_INSNS (14), /* cost of FSQRT instruction. */ |
6065f444 | 2300 | |
ef9eec0b JH |
2301 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
2302 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ | |
2303 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
2304 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
2305 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ | |
2306 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
e8e3054e JH |
2307 | COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ |
2308 | COSTS_N_INSNS (17), /* cost of DIVSD instruction. */ | |
2309 | COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ | |
2310 | COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */ | |
7c080ade | 2311 | 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */ |
64766e8d JH |
2312 | generic_memcpy, |
2313 | generic_memset, | |
e8e3054e JH |
2314 | COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ |
2315 | COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
2316 | }; |
2317 | ||
2318 | /* core_cost should produce code tuned for Core family of CPUs. */ | |
2319 | static stringop_algs core_memcpy[2] = { /* memcpy strategy table; referenced by core_cost */ | |
2320 | {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, | |
2321 | {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, | |
2322 | {-1, libcall, false}}}}; | |
2323 | static stringop_algs core_memset[2] = { /* memset strategy table; referenced by core_cost */ | |
2324 | {libcall, {{6, loop_1_byte, true}, | |
2325 | {24, loop, true}, | |
2326 | {8192, rep_prefix_4_byte, true}, | |
2327 | {-1, libcall, false}}}, | |
2328 | {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, | |
2329 | {-1, libcall, false}}}}; | |
2330 | ||
2331 | static const | |
2332 | struct processor_costs core_cost = { /* processor_costs instance for the Core tuning noted above core_memcpy */ | |
2333 | COSTS_N_INSNS (1), /* cost of an add instruction */ | |
2334 | /* On all chips taken into consideration lea is 2 cycles and more. With | |
2335 | this cost however our current implementation of synth_mult results in | |
2336 | use of unnecessary temporary registers causing regression on several | |
2337 | SPECfp benchmarks. */ | |
2338 | COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ | |
2339 | COSTS_N_INSNS (1), /* variable shift costs */ | |
2340 | COSTS_N_INSNS (1), /* constant shift costs */ | |
2341 | {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ | |
2342 | COSTS_N_INSNS (4), /* HI */ | |
2343 | COSTS_N_INSNS (3), /* SI */ | |
a2ef9558 MT |
2344 | /* Here we tune for Sandybridge or newer. */ |
2345 | COSTS_N_INSNS (3), /* DI */ | |
2346 | COSTS_N_INSNS (3)}, /* other */ | |
64766e8d | 2347 | 0, /* cost of multiply per each bit set */ |
02308bd3 MT |
2348 | /* Expanding div/mod currently doesn't consider parallelism. So the cost |
2349 | model is not realistic. We compensate by increasing the latencies a bit. */ | |
2350 | {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */ | |
2351 | COSTS_N_INSNS (11), /* HI */ | |
2352 | COSTS_N_INSNS (14), /* SI */ | |
ffa3ce53 JH |
2353 | COSTS_N_INSNS (81), /* DI */ |
2354 | COSTS_N_INSNS (81)}, /* other */ | |
64766e8d JH |
2355 | COSTS_N_INSNS (1), /* cost of movsx */ |
2356 | COSTS_N_INSNS (1), /* cost of movzx */ | |
2357 | 8, /* "large" insn */ | |
2358 | 17, /* MOVE_RATIO */ | |
df41dbaf JH |
2359 | |
2360 | /* All move costs are relative to integer->integer move times 2 and thus | |
2361 | they are latency*2. */ | |
ffa3ce53 | 2362 | 6, /* cost for loading QImode using movzbl */ |
64766e8d JH |
2363 | {4, 4, 4}, /* cost of loading integer registers |
2364 | in QImode, HImode and SImode. | |
2365 | Relative to reg-reg move (2). */ | |
ffa3ce53 JH |
2366 | {6, 6, 6}, /* cost of storing integer registers */ |
2367 | 2, /* cost of reg,reg fld/fst */ | |
2368 | {6, 6, 8}, /* cost of loading fp registers | |
64766e8d | 2369 | in SFmode, DFmode and XFmode */ |
af863030 | 2370 | {6, 6, 10}, /* cost of storing fp registers |
64766e8d JH |
2371 | in SFmode, DFmode and XFmode */ |
2372 | 2, /* cost of moving MMX register */ | |
ffa3ce53 | 2373 | {6, 6}, /* cost of loading MMX registers |
64766e8d | 2374 | in SImode and DImode */ |
ffa3ce53 | 2375 | {6, 6}, /* cost of storing MMX registers |
64766e8d | 2376 | in SImode and DImode */ |
df41dbaf JH |
2377 | 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */ |
2378 | {6, 6, 6, 6, 12}, /* cost of loading SSE registers | |
2379 | in 32,64,128,256 and 512-bit */ | |
2380 | {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ | |
2381 | {6, 6, 6, 6, 12}, /* cost of storing SSE registers | |
2382 | in 32,64,128,256 and 512-bit */ | |
2383 | {6, 6, 6, 6, 12}, /* cost of unaligned stores. */ | |
2384 | 2, 2, /* SSE->integer and integer->SSE moves */ | |
a4fe6139 JH |
2385 | /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops, |
2386 | rec. throughput 6. | |
2387 | So 5 uops statically and one uop per load. NOTE(review): the first line names VGATHERDPD twice; one is presumably a different gather variant -- confirm. */ | |
2388 | 10, 6, /* Gather load static, per_elt. */ | |
2389 | 10, 6, /* Gather store static, per_elt. */ | |
64766e8d JH |
2390 | 64, /* size of l1 cache. */ |
2391 | 512, /* size of l2 cache. */ | |
2392 | 64, /* size of prefetch block */ | |
2393 | 6, /* number of parallel prefetches */ | |
2394 | /* FIXME perhaps more appropriate value is 5. */ | |
2395 | 3, /* Branch cost */ | |
ef9eec0b JH |
2396 | COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ |
2397 | COSTS_N_INSNS (5), /* cost of FMUL instruction. */ | |
ffa3ce53 | 2398 | /* 10-24 */ |
ef9eec0b JH |
2399 | COSTS_N_INSNS (24), /* cost of FDIV instruction. */ |
2400 | COSTS_N_INSNS (1), /* cost of FABS instruction. */ | |
2401 | COSTS_N_INSNS (1), /* cost of FCHS instruction. */ | |
ffa3ce53 | 2402 | COSTS_N_INSNS (23), /* cost of FSQRT instruction. */ |
6065f444 | 2403 | |
c53c148c | 2404 | COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ |
6065f444 JH |
2405 | COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ |
2406 | COSTS_N_INSNS (4), /* cost of MULSS instruction. */ | |
2407 | COSTS_N_INSNS (5), /* cost of MULSD instruction. */ | |
c53c148c JH |
2408 | COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ |
2409 | COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ | |
6065f444 JH |
2410 | COSTS_N_INSNS (18), /* cost of DIVSS instruction. */ |
2411 | COSTS_N_INSNS (32), /* cost of DIVSD instruction. */ | |
2412 | COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */ | |
2413 | COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */ | |
64766e8d JH |
2414 | 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */ |
2415 | core_memcpy, | |
2416 | core_memset, | |
f6fd8f2b JH |
2417 | COSTS_N_INSNS (3), /* cond_taken_branch_cost. */ |
2418 | COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */ | |
64766e8d JH |
2419 | }; |
2420 |